In [None]:
# Toggle model parameters

# When True, datasets are downloaded from the HDX database and processed;
# when False, the pre-built Excel file is loaded instead.
create_dataset = False
# Number of dataset downloads attempted when create_dataset is True.
SAMPLE_NUMBER_OF_DATASETS = 150
# When True, only a random subset of the bag-of-words feature columns is kept
# to speed up feature vectorization and model fitting.
use_randomized_sample_for_BOW = True

In [None]:
from hdx.utilities.easy_logging import setup_logging
from hdx.hdx_configuration import Configuration
from hdx.data.dataset import Dataset
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import re
import itertools 
import pickle
from sklearn.neural_network import MLPClassifier
from fastText import load_model
from sklearn.model_selection import train_test_split
from nltk import ngrams
from sklearn.feature_extraction.text import CountVectorizer
import ast

In [None]:
# Set up logging and the connection to the HDX database.
setup_logging()
# Read-only configuration against the 'prod' HDX site.
Configuration.create(hdx_site='prod', user_agent='A_Quick_Example', hdx_read_only=True)

In [None]:
#HELPER FUNCTIONS

#Check if the dataset has at least 1 resource of the required file type(s).

def check_type(dataset, file_types=[]):
    """Report whether an HDX dataset has a resource of an accepted file type.

    ``dataset`` is handed to ``Dataset.read_from_hdx``. The result is False
    when the dataset has no resources, True when ``file_types`` is empty
    (any type is accepted), and otherwise True only if at least one of the
    dataset's file types appears in ``file_types``.
    """
    hdx_dataset = Dataset.read_from_hdx(dataset)
    hdx_dataset.separate_resources()
    if len(hdx_dataset.resources) == 0:
        return False
    if len(file_types) == 0:
        return True
    available_types = set(hdx_dataset.get_filetypes())
    return not available_types.isdisjoint(file_types)

#Report whether the dataset belongs to an organization other than HXL
def check_organization(dataset):
    """Return True unless the dataset's organization title is the HXL organization."""
    organization_title = dataset.get_organization()['title']
    return organization_title != 'Humanitarian Exchange Language(HXL)'

In [None]:
# Download the NLTK stop-word corpus that stopwords.words('english') reads.
nltk.download('stopwords')

#Helper functions to preprocess data

def lower_cols(lst):
    """Return a new list holding the lower-cased form of every string in ``lst``."""
    lowered = []
    for entry in lst:
        lowered.append(entry.lower())
    return lowered

def remove_chars(lst):
    """Strip punctuation from every string in ``lst``.

    Each run of characters that is not an ASCII letter, digit or whitespace
    (this includes ",", "(", ")", '"', ":", "/", "." and "_") is replaced by a
    space. Whitespace is then collapsed to single spaces and trimmed, so words
    stay separated but no padding is left behind.

    The previous pattern ``[[^A-Za-z0-9\\s]+]`` was a character class followed
    by a literal "]", so it never matched ordinary punctuation.

    Returns a new list of cleaned strings, in the same order as ``lst``.
    """
    cleaned = []
    for text in lst:
        # Punctuation first, so the spaces it leaves behind are collapsed below.
        text = re.sub(r'[^A-Za-z0-9\s]+', ' ', text)
        cleaned.append(re.sub(r'\s+', ' ', text).strip())
    return cleaned

# English stop words from NLTK, held in a set for the membership test in remove_stop_words.
stopWords = set(stopwords.words('english'))

def remove_stop_words(data_lst):
    """Return the entries of ``data_lst`` that are not in ``stopWords``.

    Entries are compared whole, so an entry is dropped only when it is
    exactly a stop word such as 'the' or 'and'.
    """
    return [entry for entry in data_lst if entry not in stopWords]

#Function to aggregate above preprocessing functions
def clean_cols(data):
    """Lower-case every string in ``data`` and then strip its punctuation."""
    return remove_chars(lower_cols(data))

In [None]:
# Download one dataset with required type(s), read it into Dataframe, 
# add all Headers, Tags, Attributes, Data, Relative Column Position, Dataset_name, and Organizations to our DataFrame,
# temporarily stores this data in the datasets folder,
# and subsequently deletes the dataset

def _split_hxl_tag(cell):
    """Split a tag cell such as '#adm1 +name' into ('adm1', ['name']).

    Returns None when ``cell`` is not a string or contains no tag text.
    """
    if not isinstance(cell, str):
        return None
    parts = [part for part in re.split(r'[(^\s+)+#]', cell) if part]
    if not parts:
        return None
    return parts[0], parts[1:]


def process_dataset(dataset, file_type, dataframe, download_path, index, row_limit = 10):
    """Download one resource of ``dataset`` and record its tagged columns.

    The resource is read as CSV (latin-1) and truncated to ``row_limit`` rows.
    The first data row is treated as the tag row. For every column with a
    tag, one row is appended to ``dataframe`` holding the cleaned header, the
    tag, its attributes, the remaining cell values, the relative column
    position, the file name, the organization title and ``index``.

    Parameters:
        dataset: object exposing ``resources``, ``get_filetypes()`` and
            ``get_organization()``.
        file_type: required file type (e.g. 'CSV'); ``None`` selects the
            first resource. Both cases are now processed the same way
            (previously the ``None`` case downloaded the file and stopped).
        dataframe: pandas DataFrame that is appended to in place.
        download_path: folder passed to the resource's ``download``.
        index: value stored in the 'Index' column of every appended row.
        row_limit: maximum number of rows kept from the file.

    Returns:
        ``None`` on success, or a string describing why nothing was recorded.

    Side effects:
        Increments the module-level ``count`` when a non-empty file was
        processed, and always deletes the downloaded file, even when
        parsing fails.
    """
    global count
    try:
        if file_type is None:
            resource = dataset.resources[0]
        else:
            filetypes = dataset.get_filetypes()
            if file_type not in filetypes:
                return 'Error: Required file type not in dataset OR dataset does not contain any resources.'
            resource = dataset.resources[filetypes.index(file_type)]
        # Looked up before downloading so a failure here cannot leave a file behind.
        organization = dataset.get_organization()['title']
        url, path = resource.download(download_path)
    except Exception:
        # Exception (not a bare except) so Ctrl-C can still interrupt a long run.
        return 'Unknown error.'
    print('Resource URL %s downloaded to %s' % (url, path))

    try:
        try:
            dataset_df = pd.read_csv(path, encoding='latin-1').head(row_limit)
        except Exception:
            return 'Unknown error.'

        # Add headers, tags and data to our DataFrame if current dataset not empty
        if not dataset_df.empty:
            headers = clean_cols(list(dataset_df.columns.values))
            tags = list(dataset_df.iloc[0, :])
            for i, header in enumerate(headers):
                parsed = _split_hxl_tag(tags[i])
                if parsed is None:
                    print("Error: no tag found for column %r" % header)
                    continue
                hashtag, attributes = parsed
                try:
                    dataframe.loc[len(dataframe)] = {
                        'Header': header,
                        'Tag': hashtag,
                        'Attributes': attributes,
                        'Data': list(dataset_df.iloc[1:, i]),
                        'Relative Column Position': (i+1) / len(dataset_df.columns),
                        'Dataset_name': os.path.basename(path),
                        'Organization': organization,
                        'Index': index}
                except Exception as err:
                    # Best effort: one bad column must not abort the whole dataset.
                    print("Error: could not record column %r: %s" % (header, err))
            count += 1
    finally:
        os.remove(path)
        print("File Removed!")
    return

In [None]:
# Search HDX for every dataset matching the query 'HXL'.
datasets_HXL = Dataset.search_in_hdx('HXL')
# Show how many datasets the search returned.
len(datasets_HXL)

In [None]:
# Create an empty DataFrame with one column per recorded field: Header, Tag, Attributes, Data,
# Relative Column Position, Dataset_name, Organization and Index.
# process_dataset appends one row per dataset column to this frame.

col_names = ['Header', 'Tag', 'Attributes','Data','Relative Column Position','Dataset_name', 'Organization','Index']
headers_and_tags= pd.DataFrame(columns = col_names)

In [None]:
#Reading in n tagged datasets either from HDX Database or pre-loaded excel file
count = 0
if (create_dataset):
    # Make sure the download folder exists before any resource is saved into it.
    os.makedirs('./datasets', exist_ok=True)
    for i in range(SAMPLE_NUMBER_OF_DATASETS):
        rand_dataset = np.random.randint(0, len(datasets_HXL))
        # process_dataset is the helper defined above; the former call to
        # process_dataset_2 referenced a name that is not defined in this notebook.
        status = process_dataset(datasets_HXL[rand_dataset], 'CSV', headers_and_tags, './datasets', count)
        if status is not None:
            # Surface the reason a dataset was skipped instead of dropping it silently.
            print(status)
        print(i)
        
    headers_and_tags.to_excel("headerandtag.xlsx")
else:
    headers_and_tags = pd.read_excel("headertag_fulldataset.xlsx")

In [None]:

# Preview up to the first 200 rows of the collected headers and tags.
headers_and_tags.head(200)


In [None]:
def _parse_attributes(raw):
    """Return the attribute list stored in one 'Attributes' cell.

    Cells loaded from Excel hold the list as text (e.g. "['name']") and are
    parsed with ast.literal_eval. Cells produced in this session already hold
    a list and are used as they are. Anything else yields an empty list.
    """
    if isinstance(raw, str):
        raw = ast.literal_eval(raw)
    return list(raw) if isinstance(raw, (list, tuple)) else []


# 'New Attributes' holds the first attribute of every column ('' when it has none);
# it is the label the classifiers below are trained on.
attr = headers_and_tags['Attributes']
new_attributes = attr.apply(_parse_attributes)
new_series = [attributes[0] if attributes else '' for attributes in new_attributes]

# Reuse the frame's own index so every value lands on its own row.
new_attr = pd.Series(new_series, index=headers_and_tags.index)
headers_and_tags['New Attributes'] = new_attr
headers_and_tags


In [None]:
#implementing n-grams Model
from nltk import ngrams

def generate_n_grams(data_lst, n):
    """Build n-grams over the cleaned entries of ``data_lst``.

    Every entry is stripped of punctuation, passed through ``clean_cols`` and
    filtered with ``remove_stop_words``. The n-grams are then taken over the
    resulting sequence of entries and returned as a list of tuples.
    """
    prepared = clean_cols(remove_chars(list(data_lst)))
    kept = remove_stop_words(prepared)
    return list(ngrams(kept, n))

In [None]:
#creating a n-gram frequency table 

def count_stats_grams(two_d_arr):
    """Count distinct n-grams and how many of them repeat.

    Parameters:
        two_d_arr: iterable of n-grams, each a sequence of items
            (for example the tuples produced by ``generate_n_grams``).

    Returns:
        A tuple ``(count, singles_count, multiples_count)`` where ``count`` is
        the number of distinct n-grams, ``singles_count`` the number of
        distinct n-grams that occur exactly once and ``multiples_count`` the
        number of distinct n-grams that occur more than once. Hence
        ``count == singles_count + multiples_count``.

    The previous version discarded the result of ``np.append``, so ``count``
    was the total number of n-grams rather than the number of distinct ones,
    and it rescanned the whole input for every element.
    """
    from collections import Counter

    # tuple() makes list-shaped n-grams hashable as well.
    occurrences = Counter(tuple(gram) for gram in two_d_arr)
    count = len(occurrences)
    singles_count = sum(1 for seen in occurrences.values() if seen == 1)
    multiples_count = count - singles_count
    return count, singles_count, multiples_count

def n_gram_freqs(dataframe, max_n = 4):
    """Tabulate n-gram statistics of ``dataframe['Header']`` for n = 1..``max_n``.

    Returns a DataFrame with one row per n holding the n-grams themselves and
    the three counts reported by ``count_stats_grams``.
    """
    table = pd.DataFrame(columns = ['n-gram', 'data' ,'unique ngrams', 'multiples', 'singles'])
    for size in range(1, max_n + 1):
        grams = generate_n_grams(dataframe['Header'], size)
        n_unique, n_single, n_multiple = count_stats_grams(grams)
        table.loc[len(table)] = {
            'n-gram': size,
            'data': grams,
            'unique ngrams': n_unique,
            'multiples': n_multiple,
            'singles': n_single,
        }
    return pd.DataFrame(table)

In [None]:
#Takes a data row and cleans it for model input
def word_extract(row):
    """Return the lower-cased words of one 'Data' cell.

    ``row`` may be a list of cell values, or the textual form of such a list
    as loaded from Excel. Text is split on non-word characters, the same way
    ``separate_words`` does; previously it was walked character by character,
    so the 'nan' filter could never match. Float entries (NaN) and the
    literal word 'nan' are skipped. A float ``row`` (an empty cell) yields an
    empty list.
    """
    ignore = ['nan']
    if isinstance(row, float):
        return []
    if isinstance(row, str):
        row = re.split(r"\W+", row)
    cleaned_text = []
    for item in row:
        if isinstance(item, float):
            continue
        word = str(item).strip().lower()
        if word and word not in ignore:
            cleaned_text.append(word)
    return cleaned_text

# One document per column. Words are joined with a space so the vectorizer sees
# them as separate tokens; joining with '' fused neighbouring values together.
long_string = [' '.join(word_extract(data_cell)) for data_cell in headers_and_tags['Data']]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

#Vectorizing the data to a frequency count for the Bag of Words model

vectorizer = CountVectorizer()
corpus = long_string
X_vecs = vectorizer.fit_transform(corpus)


#selecting random representative samples for BOW to expedite model generation
if (use_randomized_sample_for_BOW):
    # Read the vocabulary size from the sparse matrix shape; converting the
    # matrix to a dense array on every loop iteration was needlessly expensive.
    n_features = X_vecs.shape[1]
    num_of_words = n_features // 3
    print(num_of_words)

    # Column indices are drawn with replacement, exactly as before, so the
    # same feature column may be selected more than once.
    rand_indices = np.random.randint(0, n_features, size=num_of_words)

    X_vecs = X_vecs[:, rand_indices]

In [None]:
# Load the pre-trained fastText model from the file 'wiki.en.bin'.
# fmodel is used by later cells to turn text into sentence vectors.
fasttext_model = 'wiki.en.bin'
fmodel = load_model(fasttext_model)
print("Pre-trained model loaded successfully!\n")

In [None]:
#helper function to extract the top n most likely tags for a given column (3 top tags selected in default.)
#returns a list of (header, {tag: probability}) tuples, one per row of X_test

def top_tags(clf, X_test, series, num_of_top_tags = 3):
    """Return the most probable classes for every row of ``X_test``.

    Parameters:
        clf: fitted classifier exposing ``predict_proba`` and ``classes_``.
        X_test: numpy array, or a pandas object whose ``.values.tolist()``
            gives the feature rows.
        series: pandas Series of headers used as the key of each result.
        num_of_top_tags: how many classes to report per row.

    Returns:
        A list of ``(header, {class: probability})`` tuples in row order, with
        each dictionary ordered from most to least probable.

    When ``X_test`` is a pandas object, the header is looked up through its
    index labels, so rows coming from a shuffled train/test split are paired
    with their own header. For arrays, or labels that are missing from
    ``series``, the header is taken by position as before.
    """
    row_labels = None
    if (not isinstance(X_test, np.ndarray)):
        if isinstance(X_test, (pd.Series, pd.DataFrame)) and series.index.is_unique:
            row_labels = X_test.index
        X_test = X_test.values.tolist()
    probs = clf.predict_proba(X_test)
    values = []
    for i in range(len(X_test)):
        # Indices of the highest probabilities, best first.
        max_args = probs[i].argsort()[-num_of_top_tags:][::-1]
        top_suggested_tags = clf.classes_[max_args]
        sorted_probs = np.take(probs[i], max_args)
        if row_labels is not None and row_labels[i] in series.index:
            key = series.loc[row_labels[i]]
        else:
            key = series.iloc[i]
        values.append((key, dict(zip(top_suggested_tags, sorted_probs))))
    return values

In [None]:
#helper functions that split each data column into words and embed the first n of them as feature columns
# df is another name for headers_and_tags (the same object, not a copy),
# so columns added to df also appear on headers_and_tags.
df = headers_and_tags

def separate_words(series): 
    """Split one long data string into its words.

    The string is cut at every run of non-word characters and empty pieces
    are discarded.
    """
    return [piece for piece in re.split(r"\W+", series) if piece]
    
def vectorize_n_datapoints(number_of_datapoints_to_vectorize = 7):
    """Add 'Data_separated' and 'datapoint<i>' columns to the global ``df``.

    'Data_separated' holds the word list of every 'Data' cell. The number of
    'datapoint<i>' columns is capped at the number of words in the row
    labelled 0; column i holds the i-th word of every row.
    """
    df['Data_separated'] = df['Data'].apply(separate_words)
    words_in_first_row = len(df['Data_separated'][0])
    column_count = min(number_of_datapoints_to_vectorize, words_in_first_row)
    for position in range(column_count):
        df['datapoint' + str(position)] = df['Data_separated'].str[position]
        
def embedded_datapoints(number_of_data_point_to_vectorize = 7):
    """Add one 'embedded_datapoint<i>' column per data point to the global ``df``.

    The requested number of data points is now forwarded to
    ``vectorize_n_datapoints``; it used to be ignored, so any value above the
    default raised a KeyError. If ``vectorize_n_datapoints`` created fewer
    columns than requested, the missing 'datapoint<i>' columns are filled
    from 'Data_separated' (rows with fewer words get a missing value there).
    Every value is converted with ``str`` and embedded with
    ``fmodel.get_sentence_vector``.
    """
    vectorize_n_datapoints(number_of_data_point_to_vectorize)
    for i in range(number_of_data_point_to_vectorize):
        source_column = 'datapoint' + str(i)
        if source_column not in df.columns:
            df[source_column] = df['Data_separated'].str[i]
        df['embedded_datapoint' + str(i)] = df[source_column].map(lambda x: fmodel.get_sentence_vector(str(x)))
        
# Number of embedded data points per column that are combined into 'data_combined' further down.
number_of_data_point_to_vectorize = 7

In [None]:
#Classification accuracy using only headers
#Outputs: predicted_tags = 1D array including the most likely tags for each column
#         top_3_predicted_tags = list of (header, {attribute: probability}) with the 3 most likely attributes per column

df['Header_embedding'] = df['Header'].map(lambda x: fmodel.get_sentence_vector(str(x)))
print("Word embeddings extracted!\n")

X_train, X_test, y_train, y_test = train_test_split(df['Header_embedding'], 
                                                    df['New Attributes'], test_size=0.33, random_state=0)

clf = MLPClassifier(activation='relu', alpha=0.001, epsilon=1e-08, hidden_layer_sizes=150, solver='adam')

clf.fit(X_train.values.tolist(), y_train.values.tolist())
predicted_tags = clf.predict(X_test.values.tolist())
# Headers of the held-out rows, in the same order as X_test / y_test. The split
# shuffles the rows, so position 10 of the test set is not row 10 of df.
test_headers = df['Header'].loc[y_test.index]
top_3_predicted_tags = top_tags(clf, X_test, test_headers)
test_score = clf.score(X_test.tolist(), y_test.tolist())
print("Header: %s" %test_headers.iloc[10])
print("Actual attribute: %s" %y_test.tolist()[10])
print("Predicted attribute: %s" %predicted_tags[10])
print("top 3 predicted attributes:" + str(top_3_predicted_tags[10]))
print("Classification accuracy on test set: %s" %test_score)

In [None]:
# Inspect the Python type of a single test-set label.
type(y_test.tolist()[10])

In [None]:
#Classification accuracy using organization name
#Outputs: predicted_tags = 1D array including the most likely tags for each column
#         top_3_predicted_tags = list of (header, {tag: probability}) with the 3 most likely tags per column
df['Organization_embedded'] = df['Organization'].map(lambda x: fmodel.get_sentence_vector(str(x)))
print("Word embeddings extracted!\n")

X_train, X_test, y_train, y_test = train_test_split(df['Organization_embedded'], 
                                                    df['New Attributes'], test_size=0.33, random_state=0)

clf = MLPClassifier(activation='relu', alpha=0.001, epsilon=1e-08, hidden_layer_sizes=150, solver='adam')

clf.fit(X_train.values.tolist(), y_train.values.tolist())
predicted_tags = clf.predict(X_test.values.tolist())
# Headers of the held-out rows, in the same order as X_test / y_test. The split
# shuffles the rows, so position 10 of the test set is not row 10 of df.
test_headers = df['Header'].loc[y_test.index]
top_3_predicted_tags = top_tags(clf, X_test, test_headers)
test_score = clf.score(X_test.tolist(), y_test.tolist())
print("Header: %s" %test_headers.iloc[10])
print("Actual tag: %s" %y_test.tolist()[10])
print("Predicted tag: %s" %predicted_tags[10])
print("top 3 predicted tags:" + str(top_3_predicted_tags[10]))
print("Classification accuracy on test set: %s" %test_score)

In [None]:
#Classification accuracy using n data points
#Outputs: predicted_tags = 1D array including the most likely tags for each column
#         top_3_predicted_tags = list of (header, {tag: probability}) with the 3 most likely tags per column
embedded_datapoints()
print("Word embeddings extracted!\n")

# Collect the per-data-point embeddings of every row and flatten them into one vector.
df['data_combined'] = df.loc[:, 'embedded_datapoint0': 'embedded_datapoint' 
                                                           + str(number_of_data_point_to_vectorize-1)].values.tolist()
df['data_combined'] = df['data_combined'].apply(lambda x: [val for item in x for val in item])

X_train, X_test, y_train, y_test = train_test_split(df['data_combined'], 
                                                    df['New Attributes'], test_size=0.33, random_state=0)

clf = MLPClassifier(activation='relu', alpha=0.001, epsilon=1e-08, hidden_layer_sizes=150, solver='adam')

clf.fit(X_train.values.tolist(), y_train.values.tolist())
predicted_tags = clf.predict(X_test.values.tolist())
# Headers of the held-out rows, in the same order as X_test / y_test. The split
# shuffles the rows, so position 10 of the test set is not row 10 of df.
test_headers = df['Header'].loc[y_test.index]
top_3_predicted_tags = top_tags(clf, X_test, test_headers)
test_score = clf.score(X_test.tolist(), y_test.tolist())
print("Header: %s" %test_headers.iloc[10])
print("Actual tag: %s" %y_test.tolist()[10])
print("Predicted tag: %s" %predicted_tags[10])
print("top 3 predicted tags:" + str(top_3_predicted_tags[10]))
print("Classification accuracy on test set: %s" %test_score)

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
#Tokenizing n-grams
# The result is stored as header_trigrams rather than `ngrams`: rebinding
# `ngrams` replaced the nltk function that generate_n_grams calls, so any later
# call (or a re-run of this cell) failed.
header_trigrams = generate_n_grams(headers_and_tags['Header'], 3)
# Each document is already a tuple of tokens, so the tokenizer returns it unchanged.
vectorizer = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)
X_vec_grams = vectorizer.fit_transform(header_trigrams)
# .shape is read from the sparse matrices directly; no dense copy is needed.
print(X_vec_grams.shape)
print(X_vecs.shape)

In [None]:
#Testing accuracy of MLP Classifier on Bag of Words
#Outputs: predicted_tags = 1D array including the most likely tags for each column
#         top_3_predicted_tags = list of (header, {attribute: probability}) with the 3 most likely attributes per column
df_2 = headers_and_tags

X_train, X_test, y_train, y_test = train_test_split(X_vecs.toarray(), 
                                                    df['New Attributes'], test_size=0.33, random_state=0)
clf = MLPClassifier(activation='relu', alpha=0.001, epsilon=1e-08, hidden_layer_sizes=150, solver='adam')

clf.fit(X_train, y_train)
predicted_tags = clf.predict(X_test)
# X_test is a plain array here, so the headers are reordered to match it through
# the index kept by y_test. The split shuffles the rows, so position 10 of the
# test set is not row 10 of df.
test_headers = df['Header'].loc[y_test.index]
top_3_predicted_tags = top_tags(clf, X_test, test_headers)
test_score = clf.score(X_test, y_test)
print("Header: %s" %test_headers.iloc[10])
print("Actual attribute: %s" %y_test.tolist()[10])
print("Predicted attribute: %s" %predicted_tags[10])
print("top 3 predicted attributes:" + str(top_3_predicted_tags[10]))
print("Classification accuracy on test set: %s" %test_score)

In [None]:
##Testing accuracy of MLP Classifier on ngrams
#Outputs: predicted_tags = 1D array including the most likely tags for each column
#         top_3_predicted_tags = list of (header, {attribute: probability}) with the 3 most likely attributes per column
df_3 = headers_and_tags

# Labels are the first len(X_vec_grams) attributes, i.e. n-gram k is paired with row k of df.
X_train, X_test, y_train, y_test = train_test_split(X_vec_grams.toarray(), 
                                                    df['New Attributes'][0:X_vec_grams.shape[0]], test_size=0.33, random_state=0)
clf = MLPClassifier(activation='relu', alpha=0.001, epsilon=1e-08, hidden_layer_sizes=150, solver='adam')

clf.fit(X_train, y_train)
predicted_tags = clf.predict(X_test)
# X_test is a plain array here, so the headers are reordered to match it through
# the index kept by y_test (the split shuffles the rows).
test_headers = df['Header'].loc[y_test.index]
top_3_predicted_tags = top_tags(clf, X_test, test_headers)
test_score = clf.score(X_test, y_test)
#print("Header: %s" %df['Header'][7:14])
print("Actual attribute: %s" %y_test.tolist()[10])
print("Predicted attribute: %s" %predicted_tags[10])
print("top 3 predicted attributes:" + str(top_3_predicted_tags[10]))
print("Classification accuracy on test set: %s" %test_score)

In [None]:
#Aggregating embedded features into a single Dataframe

#df_target['BOW_counts'] = [item for item in X_vecs.toarray()]
# Only the first n rows have an n-gram count vector.
n = X_vec_grams.shape[0]
# Explicit copies: adding a column to a slice of df triggers pandas'
# SettingWithCopyWarning and leaves it unclear which frame is modified.
df_target = df.iloc[0:n, :].copy()
df_target['ngrams_counts'] = list(X_vec_grams.toarray())
df_target = df_target[['Header_embedding', 
                      'Organization_embedded',
                      'data_combined',
                      'ngrams_counts']].copy()

#df_target.to_csv('df_target.xslx')
df_target.head()

In [None]:
#Combine the three embedded features listed in cols into one flat feature vector per row.
#Note: 'ngrams_counts' is not listed in cols, so it is not part of 'features_combined'.

cols = ['Header_embedding', 'Organization_embedded', 'data_combined']
# First store the three per-feature vectors of every row as one list ...
df_target['features_combined'] = df_target[cols].values.tolist()
# ... then flatten that list of vectors into a single list of numbers.
df_target['features_combined'] = df_target['features_combined'].apply(lambda x: [val for item in x for val in item])

In [None]:
#Testing accuracy of MLP Classifier on all features
#This is the model that will be used in the API

#Outputs: predicted_tags = 1D array including the most likely tags for each column
#         top_3_predicted_tags = a dictionary of 3 most likely tags with their respective probabilities for a given column
#         pickle file = stores the classifier to be inputted into the API

#Helper function to add hashtags to the predicted tags
def add_hashtags(predicted_tags):
    """Prefix every predicted tag with '#'.

    Accepts a numpy array or any other iterable of tags and returns a list of
    strings. A single string is returned as one prefixed string. Previously
    anything other than a numpy array silently produced ``None``.
    """
    if isinstance(predicted_tags, str):
        return "#" + predicted_tags
    return ["#" + str(word) for word in predicted_tags]

X_train, X_test, y_train, y_test = train_test_split(df_target['features_combined'], 
                                                    df['New Attributes'][0:len(df_target['features_combined'])], 
                                                    test_size=0.33, random_state=0)

clf = MLPClassifier(activation='relu', alpha=0.001, epsilon=1e-08, hidden_layer_sizes=150, solver='adam')
clf.fit(list(X_train), y_train)
predicted_attributes = clf.predict(list(X_test))
predicted_attributes = add_hashtags(predicted_attributes)
# Headers of the held-out rows, in the same order as X_test / y_test
# (the split shuffles the rows).
test_headers = df['Header'].loc[y_test.index]
top_3_predicted_attributes = top_tags(clf, X_test, test_headers)
test_score = clf.score(list(X_test), y_test)
print("Classification accuracy on test set: %s" %test_score)
# Context manager so the file is flushed and closed even if pickling fails.
with open("model.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)

#### Testing The Accuracy for Other Models using all four features

#Accuracy of MLP: 95.10%

#Models Considered
#1) Gaussian Naive Bayes       
    #Accuracy: 88.03%
#2) RandomForest       
    #Accuracy: 98.36%

