In [None]:
# REQUIREMENTS TO RUN THIS NOTEBOOK
# 1) HDX Python API (provides the `hdx` package imported below)
# 2) pip install nltk

# When True, the loading cell further down samples datasets from HDX and writes
# "headerandtag.xlsx"; when False, that cell reads the saved file instead.
create_dataset = False

In [None]:
from hdx.utilities.easy_logging import setup_logging
from hdx.hdx_configuration import Configuration
from hdx.data.dataset import Dataset
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import re
import itertools 
from sklearn.neural_network import MLPClassifier
from fastText import load_model
from sklearn.model_selection import train_test_split
from nltk import ngrams
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Set up logging, then create the HDX configuration for the 'prod' site with
# hdx_read_only=True.
setup_logging()
Configuration.create(hdx_site='prod', user_agent='A_Quick_Example', hdx_read_only=True)

In [None]:
#Check if the dataset has at least 1 resource of the required file type(s).

def check_type(dataset, file_types=None):
    """Tell whether an HDX dataset has at least one resource of a wanted file type.

    Args:
        dataset: Value handed to ``Dataset.read_from_hdx`` (presumably a
            dataset name or id).
        file_types: Iterable of acceptable file types. ``None`` or an empty
            iterable means "any type": one resource of any kind is enough.

    Returns:
        bool: True when the dataset could be read, has at least one resource
        and, if ``file_types`` is non-empty, offers at least one listed type.
    """
    wanted = set(file_types) if file_types is not None else set()

    temp_dataset = Dataset.read_from_hdx(dataset)
    # read_from_hdx gives back None when the dataset cannot be read; treat
    # that as "no suitable resource" instead of failing on the next call.
    if temp_dataset is None:
        return False

    temp_dataset.separate_resources()
    if len(temp_dataset.resources) == 0:
        return False
    if not wanted:
        return True
    return not wanted.isdisjoint(temp_dataset.get_filetypes())

In [None]:
# Check that the dataset was not published by the HXL organization itself

def check_organization(dataset):
    """Return True unless the dataset's organization title is the HXL organization."""
    publisher = dataset.get_organization()['title']
    return publisher != 'Humanitarian Exchange Language(HXL)'

In [None]:
# Download the NLTK 'stopwords' corpus that stopwords.words('english') reads below.
nltk.download('stopwords')

#CLEANING AND GENERATING N-GRAMS

def lower_cols(lst):
    """Return a new list with the lowercase form of every entry of ``lst``."""
    # Open question kept from the original author: does lowercasing throw
    # away any important information?
    lowered = []
    for entry in lst:
        lowered.append(entry.lower())
    return lowered

#Question: is HXL Core Schema.csv something we can use for comparing words??
#This method is going to take up a lot of space and time. Is it worth it? Are there any other ways to go about it? 

def remove_chars(lst):
    """Replace punctuation in each string with spaces and tidy the whitespace.

    Every character that is not an ASCII letter, an ASCII digit or whitespace
    (for example , ( ) " : / . and _) becomes a space, so word boundaries are
    kept. Runs of whitespace are then collapsed to one space and the ends are
    stripped.

    Args:
        lst: Iterable of strings.

    Returns:
        list: The cleaned strings, in the same order as the input.
    """
    cleaned = []
    for text in lst:
        # The earlier pattern r'[[^A-Za-z0-9\s]+]' was a character class
        # followed by a literal ']', so it never matched plain punctuation.
        # The negated class below is what the comments describe. It also
        # covers '_', and non-ASCII letters are replaced as well.
        text = re.sub(r'[^A-Za-z0-9\s]+', ' ', text)
        # Normalise whitespace last so the substitutions above cannot leave
        # double or trailing spaces behind.
        text = re.sub(r'\s+', ' ', text).strip()
        cleaned.append(text)
    return cleaned

# English stop words from the NLTK corpus, kept in a set for membership tests.
stopWords = set(stopwords.words('english'))

def remove_stop_words(data_lst):
    """Return the entries of ``data_lst`` that are not English stop words ('the', 'and', ...).

    An entry is dropped only when the whole entry equals a stop word.
    """
    return [entry for entry in data_lst if entry not in stopWords]

def clean_cols(data):
    """Lowercase every entry of ``data`` and then pass the result through remove_chars."""
    return remove_chars(lower_cols(data))

In [None]:
# Download one dataset with certain type(s), read it into Dataframe, 
# add all headers, tags and dataset names to our DataFrame,
# and delete the dataset

def process_dataset(dataset, file_type, dataframe, download_path, index, row_limit = 10):
    """Download one resource of a dataset and append its columns to ``dataframe``.

    The resource is read as CSV (latin-1) and cut to ``row_limit`` rows. The
    first data row is taken as the tag row: each cell is split into a hashtag
    and its attributes. One row per column is appended to ``dataframe``, and
    the downloaded file is deleted afterwards, whatever happened.

    Args:
        dataset: HDX dataset object (uses ``resources``, ``get_filetypes()``
            and ``get_organization()``).
        file_type: Required file type such as 'CSV', or None to take the
            dataset's first resource.
        dataframe: DataFrame that receives the rows; modified in place.
        download_path: Folder the resource is downloaded into.
        index: Value stored in the 'Index' column of every appended row.
        row_limit: Maximum number of rows kept from the file (tag row included).

    Returns:
        None on success, or an error message string when the resource is
        missing or cannot be downloaded/read.

    Side effects:
        Increments the module-level ``count`` (which must already exist) once
        per non-empty file that was processed.
    """
    global count
    missing_resource = 'Error: Required file type not in dataset OR dataset does not contain any resources.'

    # Pick the resource to download.
    if file_type is None:
        if len(dataset.resources) == 0:
            return missing_resource
        resource = dataset.resources[0]
    else:
        filetypes = dataset.get_filetypes()
        if file_type not in filetypes:
            return missing_resource
        resource = dataset.resources[filetypes.index(file_type)]

    path = None
    try:
        # Download the resource and read it into a DataFrame.
        try:
            url, path = resource.download(download_path)
            organization = dataset.get_organization()['title']
            print('Resource URL %s downloaded to %s' % (url, path))
            pandas_dataset = pd.read_csv(path, encoding='latin-1').head(row_limit)
        except Exception as exc:
            # Best effort: report the problem and let the caller continue
            # with the next dataset.
            print('Could not download or read the resource: %r' % (exc,))
            return 'Unknown error.'

        # Add headers, tags and data to the DataFrame if the file is not empty.
        # This now runs for file_type=None as well; it used to be nested under
        # the else branch, so that path downloaded a file and did nothing.
        if not pandas_dataset.empty:
            headers = clean_cols(list(pandas_dataset.columns.values))
            tags = list(pandas_dataset.iloc[0, :])
            n_columns = len(pandas_dataset.columns)
            for i in range(len(headers)):
                try:
                    # Split the tag cell on whitespace, '+', '#', '(', ')' and '^';
                    # empty pieces are dropped.
                    splitted = list(filter(None, re.split(r'[(^\s+)+#]', tags[i])))
                    dic = {'Header': headers[i], 'Tag': splitted[0], 'Attributes': splitted[1:],
                           'Data': list(pandas_dataset.iloc[1:, i]),
                           'Relative Column Position': (i+1) / n_columns,
                           'Dataset_name': os.path.basename(path),
                           'Organization': organization,
                           'Index': index}
                    dataframe.loc[len(dataframe)] = dic
                except Exception:
                    # Typically a tag cell that is missing (NaN) or has no
                    # tokens; that column is skipped.
                    print("Error: different number of headers and tags")
            count += 1
    finally:
        # Always delete the download, including after a failed read.
        if path is not None and os.path.exists(path):
            os.remove(path)
            print("File Removed!")
    return None

In [None]:
# Search HDX with the query 'HXL' and keep the returned datasets
datasets_HXL = Dataset.search_in_hdx('HXL')
# Cell output: how many datasets the search returned
len(datasets_HXL)

In [None]:
# Create the empty DataFrame that process_dataset() fills with one row per
# dataset column: header, tag, attributes, data and where the column came from.

col_names = ['Header', 'Tag', 'Attributes','Data','Relative Column Position','Dataset_name', 'Organization','Index']
headers_and_tags= pd.DataFrame(columns = col_names)

In [None]:
#Reading in n tagged datasets from HDX (or reloading the saved result)
count = 0  # number of datasets processed; process_dataset() increments this global
n = 150 #NUMBER OF DATASETS
if create_dataset:
    for i in range(n):
        # Random pick with replacement: the same dataset can be drawn twice.
        rand_dataset = np.random.randint(0, len(datasets_HXL))
        # The original call named process_dataset_2, which this notebook never
        # defines; process_dataset() above takes exactly these arguments.
        process_dataset(datasets_HXL[rand_dataset], 'CSV', headers_and_tags, './datasets', count)
        print(i)

    headers_and_tags.to_excel("headerandtag.xlsx")
else:
    headers_and_tags = pd.read_excel("headerandtag.xlsx")

In [None]:
# Preview the first 200 rows of the headers-and-tags table
headers_and_tags.head(200)

In [None]:
#implementing n-grams Model

def generate_n_grams(data_lst, n):
    """Clean the entries of ``data_lst`` and return the list of n-grams over them.

    The entries go through remove_chars, clean_cols and remove_stop_words, in
    that order. The n-grams are built over the sequence of whole cleaned
    entries, not over the words inside each entry.
    """
    entries = remove_stop_words(clean_cols(remove_chars(list(data_lst))))
    # TODO (original author): make sure that n_grams 'refresh' when a new
    # dataset is encountered.
    return list(ngrams(entries, n))

In [None]:
#creating a n-gram frequency table 

def count_stats_grams(two_d_arr):
    """Count distinct, one-off and repeated n-grams.

    Args:
        two_d_arr: Iterable of n-grams (tuples, or lists of tokens).

    Returns:
        tuple: ``(unique, singles, multiples)`` where ``unique`` is the number
        of distinct n-grams, ``singles`` the number of distinct n-grams seen
        exactly once and ``multiples`` the number of distinct n-grams seen
        more than once, so ``unique == singles + multiples``.
    """
    from collections import Counter  # local import keeps this cell self-contained

    # The earlier version discarded the result of np.append, so its "seen"
    # array stayed empty: every item counted as new and the "unique" total was
    # just the input length. A Counter gives the real tallies in one pass.
    occurrences = Counter(
        tuple(gram) if isinstance(gram, list) else gram for gram in two_d_arr
    )
    unique_count = len(occurrences)
    singles_count = sum(1 for seen in occurrences.values() if seen == 1)
    multiples_count = unique_count - singles_count
    return unique_count, singles_count, multiples_count

def n_gram_freqs(dataframe, max_n = 4):
    """Build a table of n-gram statistics over ``dataframe['Header']`` for n = 1..max_n.

    Each row holds the n-gram size, the n-grams themselves and the three
    counts returned by count_stats_grams.
    """
    table = pd.DataFrame(columns=['n-gram', 'data' ,'unique ngrams', 'multiples', 'singles'])
    for size in range(1, max_n + 1):
        grams = generate_n_grams(dataframe['Header'], size)
        unique_total, single_total, multiple_total = count_stats_grams(grams)
        # Append the statistics for this n-gram size as the next row.
        table.loc[len(table)] = {
            'n-gram': size,
            'data': grams,
            'unique ngrams': unique_total,
            'multiples': multiple_total,
            'singles': single_total,
        }
    return pd.DataFrame(table)

In [None]:
#Takes a data row and cleans it for model input
def word_extract(row):
    """Turn one 'Data' entry into a list of lowercase text values.

    Args:
        row: A list of cell values, a single string (treated as one cell
            rather than iterated character by character), or None / a float
            such as NaN (treated as empty).

    Returns:
        list: Left-stripped, lowercased strings. Float cells (NaN included)
        and cells whose text is 'nan' in any letter case are left out.
    """
    ignore = {'nan'}
    if row is None or isinstance(row, float):
        return []
    if isinstance(row, str):
        row = [row]

    cleaned_text = []
    for cell in row:
        if isinstance(cell, float):
            continue
        # str() keeps non-string cells (e.g. ints) instead of failing on .lstrip().
        text = str(cell).lstrip().lower()
        # Compare after lowercasing so 'NaN' / 'NAN' are ignored as well.
        if text not in ignore:
            cleaned_text.append(text)
    return cleaned_text

# Build one text document per row of headers_and_tags['Data'].
long_string = []
for data_cell in headers_and_tags['Data']:
    if isinstance(data_cell, str):
        # Presumably what a row looks like after the Excel round trip: the
        # whole list stored as one string. Keep it as a single cell.
        cells = [data_cell]
    elif isinstance(data_cell, float):
        # Empty cell (NaN): no text for this row.
        cells = []
    else:
        cells = data_cell
    # Join with a space so neighbouring values stay separate tokens
    # (joining with '' glued e.g. 'kabul' and 'herat' into 'kabulherat').
    long_string.append(' '.join(word_extract(cells)))

In [None]:
# Show the first document as a quick sanity check
long_string[0]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer  # also imported in the imports cell

# Bag-of-words model with default CountVectorizer settings
vectorizer = CountVectorizer()

# Fit the vocabulary on the documents in long_string and build the count matrix
corpus = long_string
X_vecs = vectorizer.fit_transform(corpus)

In [None]:
# Load the fastText model file 'wiki.en.bin' from the working directory
fasttext_model = 'wiki.en.bin'
fmodel = load_model(fasttext_model)
print("Pre-trained model loaded successfully!\n")

In [None]:
#Classification accuracy using only headers
df = headers_and_tags  # alias, not a copy: the new column lands on headers_and_tags too
df['Header_embedding'] = df['Header'].map(lambda x: fmodel.get_sentence_vector(str(x)))
print("Word embeddings extracted!\n")

X_train, X_test, y_train, y_test = train_test_split(df['Header_embedding'], 
                                                    df['Tag'], test_size=0.33, random_state=0)

# random_state fixes the network's random initialisation so the reported
# accuracy is reproducible, matching the seeded train/test split above.
clf = MLPClassifier(activation='relu', alpha=0.001, epsilon=1e-08, hidden_layer_sizes=150,
                    solver='adam', random_state=0)

clf.fit(X_train.values.tolist(), y_train.values.tolist())
test_score = clf.score(X_test.tolist(), y_test.tolist())
print("Classification accuracy on test set: %s" %test_score)

In [None]:
#classification accuracy using organization
df = headers_and_tags  # alias, not a copy: the new column lands on headers_and_tags too
df['Organization_embedded'] = df['Organization'].map(lambda x: fmodel.get_sentence_vector(str(x)))
print("Word embeddings extracted!\n")

X_train, X_test, y_train, y_test = train_test_split(df['Organization_embedded'], 
                                                    df['Tag'], test_size=0.33, random_state=0)

# random_state fixes the network's random initialisation so the reported
# accuracy is reproducible, matching the seeded train/test split above.
clf = MLPClassifier(activation='relu', alpha=0.001, epsilon=1e-08, hidden_layer_sizes=150,
                    solver='adam', random_state=0)

clf.fit(X_train.values.tolist(), y_train.values.tolist())
test_score = clf.score(X_test.tolist(), y_test.tolist())
print("Classification accuracy on test set: %s" %test_score)

In [None]:
#Tokenizing n-grams
# Named header_trigrams rather than `ngrams`: assigning to `ngrams` would
# overwrite nltk's ngrams() function, which generate_n_grams() calls, and any
# later call to generate_n_grams() would then fail.
header_trigrams = generate_n_grams(headers_and_tags['Header'], 3)
# Each trigram is already a sequence of tokens, so the tokenizer passes it through.
vectorizer = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)
X_vec_grams = vectorizer.fit_transform(header_trigrams)
# .shape reports the same (rows, columns) without building dense copies.
print(X_vec_grams.shape)
print(X_vecs.shape)

In [None]:
#testing accuracy of MLP Classifier on BOW 
df_2 = headers_and_tags

# Labels are read from df_2, the frame named in this cell, instead of the `df`
# alias left behind by an earlier cell.
X_train, X_test, y_train, y_test = train_test_split(X_vecs.toarray(), 
                                                    df_2['Tag'], test_size=0.33, random_state=0)
# random_state fixes the network's random initialisation so the reported
# accuracy is reproducible, matching the seeded split above.
clf = MLPClassifier(activation='relu', alpha=0.001, epsilon=1e-08, hidden_layer_sizes=150,
                    solver='adam', random_state=0)

clf.fit(X_train, y_train)
test_score = clf.score(X_test, y_test)
print("Classification accuracy on test set: %s" %test_score)

In [None]:
#testing accuracy of MLP Classifier on ngrams
df_3 = headers_and_tags

ngram_features = X_vec_grams.toarray()
# There are fewer n-grams than headers, so only the first len(ngram_features)
# tags are used as labels.
# NOTE(review): n-gram k is paired with the tag of row k; confirm this is the
# intended alignment, since generate_n_grams() may drop entries first.
X_train, X_test, y_train, y_test = train_test_split(ngram_features, 
                                                    df_3['Tag'][0:len(ngram_features)], test_size=0.33, random_state=0)
# random_state fixes the network's random initialisation so the reported
# accuracy is reproducible, matching the seeded split above.
clf = MLPClassifier(activation='relu', alpha=0.001, epsilon=1e-08, hidden_layer_sizes=150,
                    solver='adam', random_state=0)

clf.fit(X_train, y_train)
test_score = clf.score(X_test, y_test)
print("Classification accuracy on test set: %s" %test_score)

In [None]:
#Aggregating embedded features into a single Dataframe

# One array of bag-of-words counts per row. This adds the column to
# headers_and_tags itself, as before.
headers_and_tags['BOW_counts'] = list(X_vecs.toarray())

# There are fewer n-gram rows than headers, so keep only the leading rows.
# A separate name is used so the dataset count `n` is not overwritten.
ngram_counts = X_vec_grams.toarray()
# .copy() gives an independent frame, so adding a column does not write to a
# slice of headers_and_tags (pandas SettingWithCopyWarning).
df_target = headers_and_tags.iloc[0:len(ngram_counts), :].copy()
df_target['ngrams_counts'] = list(ngram_counts)
df_target = df_target[['Header_embedding', 
                      'Organization_embedded',
                      'BOW_counts',
                      'ngrams_counts']]
df_target

In [None]:
# Concatenate the per-row feature arrays into one flat numeric vector per row.
# np.append(np.array([]), x) did not do that: each cell of the row is itself an
# array, so it returned a 4-element object array holding the original arrays.
flattened = df_target.apply(lambda x: np.concatenate([np.ravel(cell) for cell in x]), axis=1)

In [None]:
# TODO: add the hashtags to the predicted tags
# TODO: add the pickle.dump (save the trained model)
# TODO: test nearest-neighbors
# TODO: test random forests