In [1]:
#Toggle Model Parameters

create_dataset = False #boolean to determine whether to download datasets from HDX Database vs. pre-loaded excel file
SAMPLE_NUMBER_OF_DATASETS = 150 #number of training datasets to download
use_randomized_sample_for_BOW = True #boolean to determine whether to take a random sample from the data to expedite 
#feature vectorization

In [2]:
from hdx.utilities.easy_logging import setup_logging
from hdx.hdx_configuration import Configuration
from hdx.data.dataset import Dataset
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import re
import itertools 
import pickle
from sklearn.neural_network import MLPClassifier
from fastText import load_model
from sklearn.model_selection import train_test_split
from nltk import ngrams
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
#Setting up account connection to HDX Database
setup_logging()
Configuration.create(hdx_site='prod', user_agent='A_Quick_Example', hdx_read_only=True)

No logging configuration parameter. Using default.
Loading logging configuration from: c:\users\cheneli\anaconda3\lib\site-packages\hdx\utilities\logging_configuration.yml
INFO - 2019-04-18 23:24:13 - hdx.hdx_configuration - No HDX base configuration parameter. Using default base configuration file: c:\users\cheneli\anaconda3\lib\site-packages\hdx\hdx_base_configuration.yml.
INFO - 2019-04-18 23:24:13 - hdx.hdx_configuration - Loading HDX base configuration from: c:\users\cheneli\anaconda3\lib\site-packages\hdx\hdx_base_configuration.yml
INFO - 2019-04-18 23:24:13 - hdx.hdx_configuration - No HDX configuration parameter and no configuration file at default path: C:\Users\cheneli\.hdx_configuration.yml.
INFO - 2019-04-18 23:24:13 - hdx.hdx_configuration - Read only access to HDX: True


'https://data.humdata.org/'

In [4]:
#HELPER FUNCTIONS

#Check if the dataset has at least 1 resource of the required file type(s).

def check_type(dataset, file_types=[]):
    temp_dataset = Dataset.read_from_hdx(dataset)
    temp_dataset.separate_resources()
    if (len(temp_dataset.resources) > 0):
        if (len(file_types) > 0):
            if (not set(temp_dataset.get_filetypes()).isdisjoint(file_types)): 
                    return True
        else :
            return True
    return False

#check if organization from HXL
def check_organization(dataset):
    if dataset.get_organization()['title'] != 'Humanitarian Exchange Language(HXL)':
            return True
    return False

In [5]:
nltk.download('stopwords')

#Helper functions to preprocess data

def lower_cols(lst):
    #convert data to lowercases
    return [word.lower() for word in lst]

def remove_chars(lst):
    #remove punctuation characters such as ",", "(", ")", """, ":", "/", and "."
    #NOTE: PRESERVES WHITE SPACE.
    cleaned = [re.sub('\s+', ' ', mystring).strip() for mystring in lst]
    cleaned = [re.sub(r'[[^A-Za-z0-9\s]+]', ' ', mystr) for mystr in cleaned]
    cleaned = [mystr.replace('_', ' ') for mystr in cleaned]
    return cleaned

stopWords = set(stopwords.words('english'))

def remove_stop_words(data_lst):
    #remove stopwords from the data including 'the', 'and' etc. 
    wordsFiltered = []
    for w in data_lst:
        if w not in stopWords:
            wordsFiltered.append(w)
    return wordsFiltered

#Function to aggregate above preprocessing functions
def clean_cols(data):
    data = lower_cols(data)
    data = remove_chars(data)
    return data

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cheneli\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Download one dataset with required type(s), read it into Dataframe, 
# add all Headers, Tags, Attributes, Data, Relative Column Position, Dataset_name, and Organizations to our DataFrame,
# temporarily stores this data in the datasets folder,
# and subsequently deletes the dataset

def process_dataset(dataset, file_type, dataframe, download_path, index, row_limit = 10):
    global count
    organization = ""
# Download one dataset and read it into a DataFrame 
    if (file_type == None):
        url, path = dataset.resources[0].download(download_path)
        pandas_dataset = pd.read_csv(path)
    else:
        if (file_type not in dataset.get_filetypes()):
            return 'Error: Required file type not in dataset OR dataset does not contain any resources.'
        try:
            url, path = dataset.resources[dataset.get_filetypes().index(file_type)].download(download_path)
            organization = dataset.get_organization()['title']
            print('Resource URL %s downloaded to %s' % (url, path))
            pandas_dataset = pd.read_csv(path, encoding='latin-1')
            pandas_dataset = pandas_dataset.head(row_limit)
        except:
            return 'Unknown error.'
    
    # Add headers, tags and data to our DataFrame if current dataset not empty
        if (not pandas_dataset.empty):
            dataset_df = pandas_dataset
            headers = list(dataset_df.columns.values)
            headers = clean_cols(headers)
            tags = list(dataset_df.iloc[0,:])
            for i in range(len(headers)):
                try:
                    splitted = re.split('[(^\s+)+#]', tags[i])
                    splitted = list(filter(None, splitted))
                    hashtag = splitted[0]
                    attributes = splitted[1:]
                    dic = {'Header': headers[i], 'Tag': hashtag, 'Attributes': attributes, 
                           'Data': list(dataset_df.iloc[1:, i]), 
                           'Relative Column Position': (i+1) / len(dataset_df.columns), 
                           'Dataset_name': os.path.basename(path), 
                           'Organization': organization,
                           'Index': index}
                    dataframe.loc[len(dataframe)] = dic
                except:
                    print("Error: different number of headers and tags")
            count += 1
        os.remove(path)
        print("File Removed!")
        return

In [7]:
# Search for all datasets with HXL tags
datasets_HXL = Dataset.search_in_hdx('HXL')
len(datasets_HXL)

3612

In [8]:
# Create a DataFrame for all Headers, Tags, Attributes, Data, Relative Column Position, Dataset_name, and Organizations

col_names = ['Header', 'Tag', 'Attributes','Data','Relative Column Position','Dataset_name', 'Organization','Index']
headers_and_tags= pd.DataFrame(columns = col_names)

In [9]:
#Reading in n tagged datasets either from HDX Database or pre-loaded excel file
count = 0
if (create_dataset):
    for i in range(SAMPLE_NUMBER_OF_DATASETS):
        rand_dataset = np.random.randint(0, len(datasets_HXL))
        process_dataset_2(datasets_HXL[rand_dataset], 'CSV', headers_and_tags, './datasets', count)
        print(i)
        
    headers_and_tags.to_excel("headerandtag.xlsx")
else:
    headers_and_tags = pd.read_excel("headertag_fulldataset.xlsx")

In [10]:

headers_and_tags.head(200)


Unnamed: 0,Header,Tag,Attributes,Data,Relative Column Position,Dataset_name,Organization,Index
0,https://api.idmcdb.org/api/disaster data?ci=hd...,valid_tag,[],"['#access', '#activity', '#adm1', '#adm2', '#a...",0.100000,hxl-core-hashtag-schema.csv.CSV,Humanitarian Exchange Language (HXL),0
1,hashtag one-liner,description,"['short', 'en']","['Access ability/constraints', 'Programme, pro...",0.200000,hxl-core-hashtag-schema.csv.CSV,Humanitarian Exchange Language (HXL),0
2,hashtag long description,description,"['long', 'en']",['Accessiblity and constraints on access to a ...,0.300000,hxl-core-hashtag-schema.csv.CSV,Humanitarian Exchange Language (HXL),0
3,release status,status,[],"['Released', 'Released', 'Released', 'Released...",0.400000,hxl-core-hashtag-schema.csv.CSV,Humanitarian Exchange Language (HXL),0
4,data type restriction,valid_datatype,[],"[nan, nan, nan, nan, nan, nan, nan, 'number', ...",0.500000,hxl-core-hashtag-schema.csv.CSV,Humanitarian Exchange Language (HXL),0
5,first release,meta,['release'],"['1.1', '1.0', '1.0', '1.0', '1.0', '1.0', '1....",0.600000,hxl-core-hashtag-schema.csv.CSV,Humanitarian Exchange Language (HXL),0
6,default taxonomy,valid_vocab,['default'],"[nan, nan, '+v_pcode', '+v_pcode', '+v_pcode',...",0.700000,hxl-core-hashtag-schema.csv.CSV,Humanitarian Exchange Language (HXL),0
7,category,meta,['category'],"['1.3. Responses and other operations', '1.3. ...",0.800000,hxl-core-hashtag-schema.csv.CSV,Humanitarian Exchange Language (HXL),0
8,sample hxl,meta,"['example', 'hxl']","['#access +type', '#activity +project', '#adm1...",0.900000,hxl-core-hashtag-schema.csv.CSV,Humanitarian Exchange Language (HXL),0
9,sample description,meta,"['example', 'description', 'en']","['type of access being described', 'an aid pro...",1.000000,hxl-core-hashtag-schema.csv.CSV,Humanitarian Exchange Language (HXL),0


In [11]:
#implementing n-grams Model
from nltk import ngrams

def generate_n_grams(data_lst, n):
    cleaned = remove_chars(list(data_lst))
    cleaned = clean_cols(cleaned)
    cleaned = remove_stop_words(cleaned) 
    return list(ngrams(cleaned, n))

In [12]:
#creating a n-gram frequency table 

def count_stats_grams(two_d_arr):
    lst = np.array([])
    count = 0
    singles_count = 0
    multiples_count = 0
    for arr in two_d_arr:
        if arr not in lst:
            count += 1
            np.append(lst, arr)
        if two_d_arr.count(arr) == 1:
            singles_count += 1
        if two_d_arr.count(arr) > 1:
            multiples_count += 1
    check = count - singles_count
    assert(check == multiples_count)
    return count, singles_count, multiples_count

def n_gram_freqs(dataframe, max_n = 4):
    n_gram_cols = ['n-gram', 'data' ,'unique ngrams', 'multiples', 'singles']
    n_gram_freqs = pd.DataFrame(columns = n_gram_cols)
    for i in range(max_n):
        n = i+1
        n_grams = generate_n_grams(dataframe['Header'], n)
        unique_n_grams, singles, multiples = count_stats_grams(n_grams)
        row = {'n-gram': n, 
              'data': n_grams,
              'unique ngrams': unique_n_grams,
              'multiples': multiples,
              'singles': singles}
        n_gram_freqs.loc[len(n_gram_freqs)] = row
    return pd.DataFrame(n_gram_freqs)  

In [13]:
#Takes a data row and cleans it for model input
def word_extract(row):
    ignore = ['nan']
    no_white = [i.lstrip() for i in row if i not in ignore and not isinstance(i, float)]
    cleaned_text = [w.lower() for w in no_white if w not in ignore]
    return cleaned_text

long_string = []
for i in headers_and_tags['Data']:
    result_by_tag = word_extract(i)
    holder_list = ''.join(result_by_tag)
    long_string.append(holder_list)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

#Vectorizing the data to a frequency count for the Bag of Words model

vectorizer = CountVectorizer()
corpus = long_string
X_vecs = vectorizer.fit_transform(corpus)


#selecting random representative samples for BOW to expedite model generation
if (use_randomized_sample_for_BOW):
    rand_indices = []
    num_of_words = len(X_vecs.toarray()[1]) // 3
    print(num_of_words)
    
    for i in range(num_of_words):
        rand_index = np.random.randint(0, len(X_vecs.toarray()[1]))
        rand_indices.append(rand_index)
    
    X_vecs = X_vecs[:, rand_indices]

2587


In [15]:
#loading in the pre-determined word vectors
fasttext_model = 'wiki.en.bin'
fmodel = load_model(fasttext_model)
print("Pre-trained model loaded successfully!\n")

Pre-trained model loaded successfully!



In [16]:
#helper function to extract the top n most likely tags for a given column (3 top tags selected in default.)
#returns a dictionary where key = header and values = dictionary of num_of_top_tags tags with their respective probabilities

def top_tags(clf, X_test, series, num_of_top_tags = 3):
    if (not isinstance(X_test, np.ndarray)):
        X_test = X_test.values.tolist()
    probs = clf.predict_proba(X_test)
    values = []
    for i in range(len(X_test)):
        max_args = probs[i].argsort()[-num_of_top_tags:][::-1]
        top_suggested_tags = clf.classes_[max_args]
        dictionary = {}
        sorted_probs = np.take(probs[i], max_args)
        key = series.iloc[i]
        for j in range(len(top_suggested_tags)):
            dictionary[top_suggested_tags[j]]=sorted_probs[j]
        values.append((key, dictionary))
    return values

In [17]:
#helper function to extract n random data points from the data column and vectorizes them into columns
df = headers_and_tags

def separate_words(series): 
    #each series is a long string that contains all the data
    if (not isinstance(series, str)):
        series = str(series)
    lst = re.split(r"[^a-zA-Z0-9_.]", series)
    lst = list(filter(None, lst))
    return lst
    
def vectorize_n_datapoints(number_of_datapoints_to_vectorize = 7):
    df['Data_separated'] = df['Data'].apply(separate_words)
    if (number_of_datapoints_to_vectorize > len(df['Data_separated'][0])):
        number_of_datapoints_to_vectorize = len(df['Data_separated'][0])
    for i in range(number_of_datapoints_to_vectorize):
        df['datapoint' + str(i)] = df['Data_separated'].str[i]
        
def embedded_datapoints(number_of_data_point_to_vectorize = 7):
    vectorize_n_datapoints()
    for i in range(number_of_data_point_to_vectorize):
        df['embedded_datapoint' + str(i)] = df['datapoint' + str(i)].map(lambda x: fmodel.get_sentence_vector(str(x)))
        
number_of_data_point_to_vectorize = 7

In [18]:
#Classification accuracy using only headers
#Outputs: predicted_tags = 1D array including the most likely tags for each column
#         top_3_predicted_tags = a dictionary of 3 most likely tags with their respective probabilities for a given column

df['Header_embedding'] = df['Header'].map(lambda x: fmodel.get_sentence_vector(str(x)))
print("Word embeddings extracted!\n")

X_train, X_test, y_train, y_test = train_test_split(df['Header_embedding'], 
                                                    df['Tag'], test_size=0.33, random_state=0)

clf = MLPClassifier(activation='relu', alpha=0.001, epsilon=1e-08, hidden_layer_sizes=150, solver='adam')

clf.fit(X_train.values.tolist(), y_train.values.tolist())
predicted_tags = clf.predict(X_test.values.tolist())
top_3_predicted_tags = top_tags(clf, X_test, df['Header'])
test_score = clf.score(X_test.tolist(), y_test.tolist())
print("Header: %s" %df['Header'][10])
print("Actual tag: %s" %y_test.tolist()[10])
print("Predicted tag: %s" %predicted_tags[10])
print("top 3 predicted tags:" + str(top_3_predicted_tags[10]))
print("Classification accuracy on test set: %s" %test_score)

Word embeddings extracted!

Header: vocabulary identifier
Actual tag: affected
Predicted tag: affected
top 3 predicted tags:('vocabulary identifier', {'indicator': 0.0037819818899884742, 'affected': 0.99615770581916985, 'date': 1.6189153996241466e-05})
Classification accuracy on test set: 0.954378954379


In [19]:
#Classification accuracy using organization name
#Outputs: predicted_tags = 1D array including the most likely tags for each column
#         top_3_predicted_tags = a dictionary of 3 most likely tags with their respective probabilities for a given column
df['Organization_embedded'] = df['Organization'].map(lambda x: fmodel.get_sentence_vector(str(x)))
print("Word embeddings extracted!\n")

X_train, X_test, y_train, y_test = train_test_split(df['Organization_embedded'], 
                                                    df['Tag'], test_size=0.33, random_state=0)

clf = MLPClassifier(activation='relu', alpha=0.001, epsilon=1e-08, hidden_layer_sizes=150, solver='adam')

clf.fit(X_train.values.tolist(), y_train.values.tolist())
predicted_tags = clf.predict(X_test.values.tolist())
top_3_predicted_tags = top_tags(clf, X_test, df['Header'])
test_score = clf.score(X_test.tolist(), y_test.tolist())
print("Header: %s" %df['Header'][10])
print("Actual tag: %s" %y_test.tolist()[10])
print("Predicted tag: %s" %predicted_tags[10])
print("top 3 predicted tags:" + str(top_3_predicted_tags[10]))
print("Classification accuracy on test set: %s" %test_score)

Word embeddings extracted!

Header: vocabulary identifier
Actual tag: affected
Predicted tag: affected
top 3 predicted tags:('vocabulary identifier', {'country': 0.27494116257891105, 'affected': 0.51365898904278551, 'date': 0.15036626435293229})
Classification accuracy on test set: 0.457875457875


In [20]:
#Classification accuracy using n data points
#Outputs: predicted_tags = 1D array including the most likely tags for each column
#         top_3_predicted_tags = a dictionary of 3 most likely tags with their respective probabilities for a given column
embedded_datapoints()
print("Word embeddings extracted!\n")

df['data_combined'] = df.loc[:, 'embedded_datapoint0': 'embedded_datapoint' 
                                                           + str(number_of_data_point_to_vectorize-1)].values.tolist()
df['data_combined'] = df['data_combined'].apply(lambda x: [val for item in x for val in item])

X_train, X_test, y_train, y_test = train_test_split(df['data_combined'], 
                                                    df['Tag'], test_size=0.33, random_state=0)

clf = MLPClassifier(activation='relu', alpha=0.001, epsilon=1e-08, hidden_layer_sizes=150, solver='adam')

clf.fit(X_train.values.tolist(), y_train.values.tolist())
predicted_tags = clf.predict(X_test.values.tolist())
top_3_predicted_tags = top_tags(clf, X_test, df['Header'])
test_score = clf.score(X_test.tolist(), y_test.tolist())
print("Header: %s" %df['Header'][10])
print("Actual tag: %s" %y_test.tolist()[10])
print("Predicted tag: %s" %predicted_tags[10])
print("top 3 predicted tags:" + str(top_3_predicted_tags[10]))
print("Classification accuracy on test set: %s" %test_score)

Word embeddings extracted!

Header: vocabulary identifier
Actual tag: affected
Predicted tag: affected
top 3 predicted tags:('vocabulary identifier', {'affected': 0.99759830262514182, 'damaged': 0.00074177369692129025, 'x_applicants': 0.00071139804408135176})
Classification accuracy on test set: 0.788544788545


In [21]:
df.head()

Unnamed: 0,Header,Tag,Attributes,Data,Relative Column Position,Dataset_name,Organization,Index,Header_embedding,Organization_embedded,...,datapoint5,datapoint6,embedded_datapoint0,embedded_datapoint1,embedded_datapoint2,embedded_datapoint3,embedded_datapoint4,embedded_datapoint5,embedded_datapoint6,data_combined
0,https://api.idmcdb.org/api/disaster data?ci=hd...,valid_tag,[],"['#access', '#activity', '#adm1', '#adm2', '#a...",0.1,hxl-core-hashtag-schema.csv.CSV,Humanitarian Exchange Language (HXL),0,"[-0.0336763, 0.00615811, 0.0143248, 0.0602024,...","[-0.00215195, -0.0442978, -0.00501256, 0.01639...",...,adm4,adm5,"[0.028119, 0.0233844, 0.00579112, 0.0155053, -...","[0.0171477, 0.0587136, -0.107394, 0.0272542, -...","[0.0342421, -0.128727, -0.0253494, 0.0348104, ...","[0.0177757, -0.0548761, -0.0122221, 0.0553509,...","[-0.00978223, -0.0695783, -0.0589929, -0.00383...","[-0.0749834, 0.00684472, -0.150769, -0.0663059...","[-0.0156176, -0.0368736, -0.0534386, 0.120901,...","[0.028119, 0.0233844, 0.00579112, 0.0155053, -..."
1,hashtag one-liner,description,"['short', 'en']","['Access ability/constraints', 'Programme, pro...",0.2,hxl-core-hashtag-schema.csv.CSV,Humanitarian Exchange Language (HXL),0,"[-0.0570783, 0.0517114, -0.0836945, 0.0270065,...","[-0.00215195, -0.0442978, -0.00501256, 0.01639...",...,or,activity.,"[0.0713341, 0.0200936, 0.0177978, 0.1283, 0.01...","[-0.0253023, 0.0463502, -0.147774, 0.0188464, ...","[-0.0615845, 0.0362075, -0.119153, 0.0451412, ...","[-0.0207568, 0.0236098, -0.0683881, -0.0960696...","[-0.0137882, -0.0806513, -0.0075456, 0.0415596...","[0.0558548, 0.0977309, -0.052073, -0.0152659, ...","[-0.0223301, -0.0038519, -0.0660513, 0.027506,...","[0.0713341, 0.0200936, 0.0177978, 0.1283, 0.01..."
2,hashtag long description,description,"['long', 'en']",['Accessiblity and constraints on access to a ...,0.3,hxl-core-hashtag-schema.csv.CSV,Humanitarian Exchange Language (HXL),0,"[-0.0178264, 0.0423944, -0.0561291, 0.036485, ...","[-0.00215195, -0.0442978, -0.00501256, 0.01639...",...,to,a,"[0.0265239, 2.29938e-05, 0.0355017, 0.152221, ...","[-0.0170612, 0.0250391, -0.0678157, 0.103692, ...","[-0.0615845, 0.0362075, -0.119153, 0.0451412, ...","[-0.0107518, 0.0298298, -0.0147365, 0.0223539,...","[0.028119, 0.0233844, 0.00579112, 0.0155053, -...","[-0.0876177, 0.0630342, 0.0217104, -0.0451418,...","[0.0495569, 0.129446, -0.0491537, 0.0042916, -...","[0.0265239, 2.29938e-05, 0.0355017, 0.152221, ..."
3,release status,status,[],"['Released', 'Released', 'Released', 'Released...",0.4,hxl-core-hashtag-schema.csv.CSV,Humanitarian Exchange Language (HXL),0,"[0.0106982, -0.0216493, 0.0117078, -0.00184711...","[-0.00215195, -0.0442978, -0.00501256, 0.01639...",...,Released,Released,"[-0.0507323, 0.01016, 0.0787771, 0.0243171, -0...","[-0.0507323, 0.01016, 0.0787771, 0.0243171, -0...","[-0.0507323, 0.01016, 0.0787771, 0.0243171, -0...","[-0.0507323, 0.01016, 0.0787771, 0.0243171, -0...","[-0.0507323, 0.01016, 0.0787771, 0.0243171, -0...","[-0.0507323, 0.01016, 0.0787771, 0.0243171, -0...","[-0.0507323, 0.01016, 0.0787771, 0.0243171, -0...","[-0.0507323, 0.01016, 0.0787771, 0.0243171, -0..."
4,data type restriction,valid_datatype,[],"[nan, nan, nan, nan, nan, nan, nan, 'number', ...",0.5,hxl-core-hashtag-schema.csv.CSV,Humanitarian Exchange Language (HXL),0,"[-0.00368617, 0.0571897, 0.00329302, 0.0613524...","[-0.00215195, -0.0442978, -0.00501256, 0.01639...",...,,,"[0.0566073, 0.0195136, -0.00766985, -0.0078007...","[0.0566073, 0.0195136, -0.00766985, -0.0078007...","[0.0566073, 0.0195136, -0.00766985, -0.0078007...","[0.0566073, 0.0195136, -0.00766985, -0.0078007...","[0.0566073, 0.0195136, -0.00766985, -0.0078007...","[0.0566073, 0.0195136, -0.00766985, -0.0078007...","[0.0566073, 0.0195136, -0.00766985, -0.0078007...","[0.0566073, 0.0195136, -0.00766985, -0.0078007..."


In [17]:
#Tokenizing n-grams
ngrams = generate_n_grams(headers_and_tags['Header'], 3)
vectorizer = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)
X_vec_grams = vectorizer.fit_transform(ngrams)
print(np.shape(X_vec_grams.toarray()))
print(np.shape(X_vecs.toarray()))

(9094, 666)
(9098, 2587)


In [21]:
#Testing accuracy of MLP Classifier on Bag of Words
#Outputs: predicted_tags = 1D array including the most likely tags for each column
#         top_3_predicted_tags = a dictionary of 3 most likely tags with their respective probabilities for a given column
df_2 = headers_and_tags

X_train, X_test, y_train, y_test = train_test_split(X_vecs.toarray(), 
                                                    df['Tag'], test_size=0.33, random_state=0)
clf = MLPClassifier(activation='relu', alpha=0.001, epsilon=1e-08, hidden_layer_sizes=150, solver='adam')

clf.fit(X_train, y_train)
predicted_tags = clf.predict(X_test)
top_3_predicted_tags = top_tags(clf, X_test, df['Header'])
test_score = clf.score(X_test, y_test)
print("Header: %s" %df['Header'][10])
print("Actual tag: %s" %y_test.tolist()[10])
print("Predicted tag: %s" %predicted_tags[10])
print("top 3 predicted tags:" + str(top_3_predicted_tags[10]))
print("Classification accuracy on test set: %s" %test_score)

Header: vocabulary identifier
Actual tag: affected
Predicted tag: country
top 3 predicted tags:('vocabulary identifier', {'country': 0.41033625823834258, 'affected': 0.23055257535177306, 'date': 0.15533690291330776})
Classification accuracy on test set: 0.637362637363


In [22]:
##Testing accuracy of MLP Classifier on ngrams
#Outputs: predicted_tags = 1D array including the most likely tags for each column
#         top_3_predicted_tags = a dictionary of 3 most likely tags with their respective probabilities for a given column
df_3 = headers_and_tags

X_train, X_test, y_train, y_test = train_test_split(X_vec_grams.toarray(), 
                                                    df['Tag'][0:len(X_vec_grams.toarray())], test_size=0.33, random_state=0)
clf = MLPClassifier(activation='relu', alpha=0.001, epsilon=1e-08, hidden_layer_sizes=150, solver='adam')

clf.fit(X_train, y_train)
predicted_tags = clf.predict(X_test)
top_3_predicted_tags = top_tags(clf, X_test, df['Header'])
test_score = clf.score(X_test, y_test)
#print("Header: %s" %df['Header'][7:14])
print("Actual tag: %s" %y_test.tolist()[10])
print("Predicted tag: %s" %predicted_tags[10])
print("top 3 predicted tags:" + str(top_3_predicted_tags[10]))
print("Classification accuracy on test set: %s" %test_score)

Actual tag: date
Predicted tag: date
top 3 predicted tags:('vocabulary identifier', {'country': 0.0051356591713987388, 'affected': 0.0051233658039913522, 'date': 0.98959905105990842})
Classification accuracy on test set: 0.852431712192


In [23]:
#Aggregating embedded features into a single Dataframe

df_target = headers_and_tags
#df_target['BOW_counts'] = [item for item in X_vecs.toarray()]
n = len(X_vec_grams.toarray())
df_target = df.iloc[0:n, :]
df_target['ngrams_counts'] = [item for item in X_vec_grams.toarray()]
df_target = df_target[['Header_embedding', 
                      'Organization_embedded',
                      'data_combined',
                      'ngrams_counts']]

#df_target.to_csv('df_target.xslx')
df_target.head()

NameError: name 'X_vec_grams' is not defined

In [24]:
#using all three embedded features to predict tags
#cleaning the dataset by flattening the datastructure

cols = ['Header_embedding', 'Organization_embedded', 'data_combined']
df_target['features_combined'] = df_target[cols].values.tolist()
df_target['features_combined'] = df_target['features_combined'].apply(lambda x: [val for item in x for val in item])

In [25]:
#Testing accuracy of MLP Classifier on all features
#This is the model that will be used in the API

#Outputs: predicted_tags = 1D array including the most likely tags for each column
#         top_3_predicted_tags = a dictionary of 3 most likely tags with their respective probabilities for a given column
#         pickle file = stores the classifier to be inputted into the API

#Helper function to add hashtags to the predicted tags
def add_hashtags(predicted_tags):
    if (isinstance(predicted_tags, np.ndarray)):
        return ["#"+word for word in predicted_tags]

X_train, X_test, y_train, y_test = train_test_split(df_target['features_combined'], 
                                                    df['Tag'][0:len(df_target['features_combined'])], 
                                                    test_size=0.33, random_state=0)

clf = MLPClassifier(activation='relu', alpha=0.001, epsilon=1e-08, hidden_layer_sizes=150, solver='adam')
clf.fit(list(X_train), y_train)
predicted_tags = clf.predict(list(X_test))
predicted_tags = add_hashtags(predicted_tags)
top_3_predicted_tags = top_tags(clf, X_test, df['Header'])
test_score = clf.score(list(X_test), y_test)
print("Classification accuracy on test set: %s" %test_score)
pickle.dump(clf,open("model.pkl","wb"))

Classification accuracy on test set: 0.943056943057


In [31]:
#Testing The Accuracy for Other Models using all four features
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
#from sklearn.svm import SVC
from sklearn import model_selection
from sklearn.datasets import make_classification

names = ["Nearest Neighbors",
         "Random Forest", "Neural Net",
         "Naive Bayes"]

classifiers = [
    KNeighborsClassifier(3),
    #SVC(kernel="linear", C=0.025),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(activation='relu', alpha=0.001, epsilon=1e-08, hidden_layer_sizes=150, solver='adam'),
    GaussianNB()]

X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)

i = 1
for name, clf in zip(names, classifiers):
        X_train, X_test, y_train, y_test = train_test_split(df_target['features_combined'], 
                                                    df['Tag'][0:len(df_target['features_combined'])], 
                                                    test_size=0.33, random_state=0) 
        clf.fit(list(X_train), y_train)
        score = clf.score(list(X_test), y_test)
        print(name)
        print(score)
        

#Accuracy of MLP: 95.10%

#Models Considered
#1) Gaussian Naive Bayes       
    #Accuracy: 88.03%
#2) RandomForest       
    #Accuracy: 98.36%

Nearest Neighbors
0.904095904096
Random Forest
0.67698967699
Neural Net
0.94338994339
Naive Bayes
0.625041625042
