In [1]:
# Toggle model parameters

# True: download training datasets from the HDX database.
# False: load the pre-built Excel file instead.
create_dataset = False
# Number of datasets to download when create_dataset is True.
SAMPLE_NUMBER_OF_DATASETS = 150
# Whether to take a random sample from the data to expedite feature vectorization.
# NOTE(review): this flag is not read anywhere in the visible notebook -- confirm it is still needed.
use_randomized_sample_for_BOW = True
# Whether to cap the number of rows per tag (see select_randomly) before training.
use_randomized_sample_for_training_data = False

In [2]:
from hdx.utilities.easy_logging import setup_logging
from hdx.hdx_configuration import Configuration
from hdx.data.dataset import Dataset
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import re
import itertools 
import pickle
from sklearn.neural_network import MLPClassifier
from fastText import load_model
from sklearn.model_selection import train_test_split
from nltk import ngrams
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Set up logging, then create the HDX configuration for the 'prod' site in read-only mode.
setup_logging()
Configuration.create(hdx_site='prod', user_agent='A_Quick_Example', hdx_read_only=True)

No logging configuration parameter. Using default.
Loading logging configuration from: c:\users\cheneli\anaconda3\lib\site-packages\hdx\utilities\logging_configuration.yml
INFO - 2019-07-19 11:35:18 - hdx.hdx_configuration - No HDX base configuration parameter. Using default base configuration file: c:\users\cheneli\anaconda3\lib\site-packages\hdx\hdx_base_configuration.yml.
INFO - 2019-07-19 11:35:18 - hdx.hdx_configuration - Loading HDX base configuration from: c:\users\cheneli\anaconda3\lib\site-packages\hdx\hdx_base_configuration.yml
INFO - 2019-07-19 11:35:18 - hdx.hdx_configuration - No HDX configuration parameter and no configuration file at default path: C:\Users\cheneli\.hdx_configuration.yml.
INFO - 2019-07-19 11:35:18 - hdx.hdx_configuration - Read only access to HDX: True


'https://data.humdata.org/'

In [4]:
#HELPER FUNCTIONS

#Check if the dataset has at least 1 resource of the required file type(s).

def check_type(dataset, file_types=None):
    """Report whether an HDX dataset has at least one resource of a wanted file type.

    Args:
        dataset: Identifier handed to ``Dataset.read_from_hdx``.
        file_types: Optional collection of file-type strings. When omitted or
            empty, any dataset with at least one resource qualifies.

    Returns:
        True if the dataset has at least one resource and, when ``file_types``
        is non-empty, shares at least one file type with it; False otherwise
        (including when the dataset could not be read).
    """
    # None replaces the former mutable ``[]`` default; both mean "no filter".
    if file_types is None:
        file_types = []
    temp_dataset = Dataset.read_from_hdx(dataset)
    # read_from_hdx is documented to return None when the dataset cannot be read.
    if temp_dataset is None:
        return False
    temp_dataset.separate_resources()
    if len(temp_dataset.resources) == 0:
        return False
    if len(file_types) == 0:
        return True
    return not set(temp_dataset.get_filetypes()).isdisjoint(file_types)

#True when the dataset does NOT belong to the HXL standard's own organization
def check_organization(dataset):
    """Return True unless the dataset's organization is the HXL organization.

    Args:
        dataset: Object exposing ``get_organization()`` that returns a mapping
            with a ``'title'`` entry.

    Returns:
        False when the organization title is "Humanitarian Exchange Language (HXL)"
        (ignoring whitespace differences), True for every other organization.
    """
    hxl_title = 'Humanitarian Exchange Language (HXL)'
    title = dataset.get_organization()['title']
    # Compare with all whitespace removed: the previous literal lacked the space
    # before "(HXL)" and therefore never matched the title stored in the data.
    return ''.join(str(title).split()) != ''.join(hxl_title.split())

In [5]:
# Fetch the NLTK stopwords corpus that stopwords.words('english') reads further down in this cell.
nltk.download('stopwords')

#Helper functions to preprocess data

def lower_cols(lst):
    """Return a new list with the lowercase form of every string in ``lst``."""
    lowered = []
    for text in lst:
        lowered.append(text.lower())
    return lowered

def remove_chars(lst):
    """Replace punctuation and underscores with spaces in every string of ``lst``.

    Characters such as ",", "(", ")", '"', ":", "/", "." and "_" become a
    single space. Letters (including accented ones) and digits are kept.
    Whitespace between words is preserved but collapsed to one space, and
    leading/trailing whitespace is stripped.

    Args:
        lst: Iterable of strings.

    Returns:
        List of cleaned strings, in the same order.
    """
    cleaned = []
    for text in lst:
        # The previous pattern '[[^A-Za-z0-9\s]+]' was a malformed character class
        # that only matched runs ending in a literal ']', so punctuation survived.
        text = re.sub(r'(?:[^\w\s]|_)+', ' ', text)
        # Collapse last, so the spaces introduced above do not pile up.
        cleaned.append(re.sub(r'\s+', ' ', text).strip())
    return cleaned

# English stop words held in a set; remove_stop_words tests membership against it.
stopWords = set(stopwords.words('english'))

def remove_stop_words(data_lst):
    """Return the items of ``data_lst`` that are not in the module-level ``stopWords`` set.

    Order is preserved; the comparison is exact (no lowercasing is done here).
    """
    return [token for token in data_lst if token not in stopWords]

def separate_words(lst):
    """Split each string in ``lst`` at capital letters, e.g. projectNumber -> project Number.

    A string is returned unchanged when splitting yields nothing (empty or
    whitespace-only input) or yields exactly one piece per character (so an
    all-capitals word such as "ID" stays intact).
    """
    def _split_on_capitals(word):
        # Put a space before every capital, then split on whitespace.
        pieces = re.sub(r"([A-Z])", r" \1", word).split()
        if pieces and len(pieces) != len(word):
            return ' '.join(pieces)
        return word

    return [_split_on_capitals(word) for word in lst]

#Function to aggregate above preprocessing functions
def clean_cols(data):
    """Run the header clean-up pipeline: separate_words, then lower_cols, then remove_chars."""
    return remove_chars(lower_cols(separate_words(data)))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cheneli\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Download one dataset with required type(s), read it into Dataframe, 
# add all Headers, Tags, Attributes, Data, Relative Column Position, Dataset_name, and Organizations to our DataFrame,
# temporarily stores this data in the datasets folder,
# and subsequently deletes the dataset

def process_dataset(dataset, file_type, dataframe, download_path, index, row_limit = 10):
    """Download one resource of a dataset and append one row per column to ``dataframe``.

    The resource is read as CSV (latin-1) and truncated to ``row_limit`` rows.
    The first row of the file is treated as the tag row: each cell is split
    into a hashtag and its attributes. One record per column is appended to
    ``dataframe`` with the cleaned header, tag, attributes, remaining cell
    values, relative column position, file name, organization title and
    ``index``. The downloaded file is deleted afterwards.

    Args:
        dataset: Dataset object exposing ``resources``, ``get_filetypes()`` and
            ``get_organization()``.
        file_type: File type to look up in ``dataset.get_filetypes()``, or None
            to take the first resource.
        dataframe: DataFrame that receives the rows (mutated in place).
        download_path: Folder passed to the resource's ``download``.
        index: Value stored in the 'Index' column of every appended row.
        row_limit: Maximum number of rows read from the file.

    Returns:
        None on success, or an error string when the resource is missing or
        cannot be downloaded/read.

    Side effects:
        Increments the notebook-level ``count`` when the file was non-empty.
    """
    global count

    missing_resource = 'Error: Required file type not in dataset OR dataset does not contain any resources.'
    if file_type is None:
        if not dataset.resources:
            return missing_resource
        resource = dataset.resources[0]
    else:
        available_types = dataset.get_filetypes()
        if file_type not in available_types:
            return missing_resource
        resource = dataset.resources[available_types.index(file_type)]

    path = None
    try:
        try:
            url, path = resource.download(download_path)
            organization = dataset.get_organization()['title']
            print('Resource URL %s downloaded to %s' % (url, path))
            pandas_dataset = pd.read_csv(path, encoding='latin-1').head(row_limit)
        except Exception as exc:
            # Best effort: report the cause and skip this dataset instead of aborting the run.
            print('Error while downloading or reading resource: %r' % (exc,))
            return 'Unknown error.'

        # Previously this section was nested under the ``else`` branch, so a
        # ``file_type=None`` call downloaded a file and then did nothing with it.
        if not pandas_dataset.empty:
            headers = clean_cols(list(pandas_dataset.columns.values))
            tags = list(pandas_dataset.iloc[0, :])
            column_total = len(pandas_dataset.columns)
            for i in range(len(headers)):
                try:
                    # Split on whitespace, '#', '+', '(', ')' and '^'; drop empty pieces.
                    splitted = list(filter(None, re.split(r'[(^\s+)+#]', tags[i])))
                    hashtag = splitted[0]
                    attributes = splitted[1:]
                except (TypeError, IndexError):
                    # Non-string cell (e.g. NaN) or a cell without any tag text.
                    print("Error: column %d has no parsable HXL tag" % i)
                    continue
                dataframe.loc[len(dataframe)] = {
                    'Header': headers[i], 'Tag': hashtag, 'Attributes': attributes,
                    'Data': list(pandas_dataset.iloc[1:, i]),
                    'Relative Column Position': (i+1) / column_total,
                    'Dataset_name': os.path.basename(path),
                    'Organization': organization,
                    'Index': index}
            count += 1
        return
    finally:
        # Always clean up the download, including when reading it failed.
        if path is not None and os.path.exists(path):
            os.remove(path)
            print("File Removed!")

In [7]:
# Query HDX with the search term 'HXL' and show how many datasets came back.
datasets_HXL = Dataset.search_in_hdx('HXL')
len(datasets_HXL)

3684

In [8]:
# Create an empty DataFrame with one column per field that process_dataset records:
# Header, Tag, Attributes, Data, Relative Column Position, Dataset_name, Organization and Index.

col_names = ['Header', 'Tag', 'Attributes','Data','Relative Column Position','Dataset_name', 'Organization','Index']
headers_and_tags= pd.DataFrame(columns = col_names)

In [9]:
# Clean dataframe for model input
# NOTE(review): at this point headers_and_tags is still the empty frame created in the
# previous cell, and the next cell either fills it in place or replaces it via read_excel,
# so these two lines do not touch the real data -- confirm whether they should run after loading.
# NOTE(review): remove_stop_words tests each element against a set; if 'Data' holds lists
# (as process_dataset stores them) that membership test would presumably fail -- verify.
headers_and_tags['Header'] = clean_cols(headers_and_tags['Header'])
headers_and_tags['Data'] = remove_stop_words(headers_and_tags['Data'])

In [10]:
#Reading in n tagged datasets either from HDX Database or pre-loaded excel file
count = 0  # read and incremented by process_dataset through `global count`
if (create_dataset):
    # Harmless if the folder already exists; avoids depending on the downloader to create it.
    os.makedirs('./datasets', exist_ok=True)
    # Draw distinct positions so the same dataset is never processed twice
    # (randint sampled with replacement and could duplicate training rows).
    sample_size = min(SAMPLE_NUMBER_OF_DATASETS, len(datasets_HXL))
    sampled_positions = np.random.choice(len(datasets_HXL), size=sample_size, replace=False)
    for i, position in enumerate(sampled_positions):
        # process_dataset is the only download helper defined in this notebook;
        # the former call to process_dataset_2 raised NameError.
        process_dataset(datasets_HXL[position], 'CSV', headers_and_tags, './datasets', count)
        print(i)

    headers_and_tags.to_excel("headerandtag.xlsx")
else:
    headers_and_tags = pd.read_excel("headertag_fulldataset.xlsx")

In [11]:

# Preview up to the first 200 rows of the loaded training table.
headers_and_tags.head(200)


Unnamed: 0,Header,Tag,Attributes,Data,Relative Column Position,Dataset_name,Organization,Index
0,https://api.idmcdb.org/api/disaster data?ci=hd...,valid_tag,[],"['#access', '#activity', '#adm1', '#adm2', '#a...",0.100000,hxl-core-hashtag-schema.csv.CSV,Humanitarian Exchange Language (HXL),0
1,hashtag one-liner,description,"['short', 'en']","['Access ability/constraints', 'Programme, pro...",0.200000,hxl-core-hashtag-schema.csv.CSV,Humanitarian Exchange Language (HXL),0
2,hashtag long description,description,"['long', 'en']",['Accessiblity and constraints on access to a ...,0.300000,hxl-core-hashtag-schema.csv.CSV,Humanitarian Exchange Language (HXL),0
3,release status,status,[],"['Released', 'Released', 'Released', 'Released...",0.400000,hxl-core-hashtag-schema.csv.CSV,Humanitarian Exchange Language (HXL),0
4,data type restriction,valid_datatype,[],"[nan, nan, nan, nan, nan, nan, nan, 'number', ...",0.500000,hxl-core-hashtag-schema.csv.CSV,Humanitarian Exchange Language (HXL),0
5,first release,meta,['release'],"['1.1', '1.0', '1.0', '1.0', '1.0', '1.0', '1....",0.600000,hxl-core-hashtag-schema.csv.CSV,Humanitarian Exchange Language (HXL),0
6,default taxonomy,valid_vocab,['default'],"[nan, nan, '+v_pcode', '+v_pcode', '+v_pcode',...",0.700000,hxl-core-hashtag-schema.csv.CSV,Humanitarian Exchange Language (HXL),0
7,category,meta,['category'],"['1.3. Responses and other operations', '1.3. ...",0.800000,hxl-core-hashtag-schema.csv.CSV,Humanitarian Exchange Language (HXL),0
8,sample hxl,meta,"['example', 'hxl']","['#access +type', '#activity +project', '#adm1...",0.900000,hxl-core-hashtag-schema.csv.CSV,Humanitarian Exchange Language (HXL),0
9,sample description,meta,"['example', 'description', 'en']","['type of access being described', 'an aid pro...",1.000000,hxl-core-hashtag-schema.csv.CSV,Humanitarian Exchange Language (HXL),0


In [12]:
# Save the current training table to 'processed_input.xlsx'.
headers_and_tags.to_excel('processed_input.xlsx')

In [13]:
def select_randomly(dataset, threshold = 200):
    """Cap the number of rows per tag at ``threshold`` by dropping random surplus rows.

    For every value of the 'Tag' column that occurs more than ``threshold``
    times, randomly chosen rows (via ``np.random.choice`` without replacement)
    are dropped until exactly ``threshold`` remain. Tags at or below the
    threshold are untouched. The input frame itself is not modified.

    Returns:
        The pruned DataFrame.
    """
    tag_counts = dataset['Tag'].value_counts()
    over_threshold = tag_counts[tag_counts > threshold]
    pruned = dataset
    for tag, occurrences in over_threshold.items():
        surplus = occurrences - threshold
        candidates = pruned[pruned['Tag'] == tag].index
        doomed = np.random.choice(candidates, size=surplus, replace=False)
        pruned = pruned.drop(doomed)
    return pruned

In [14]:
# Optionally cap the number of rows per tag (select_randomly, default threshold) so that
# frequently occurring tags do not dominate, and save the pruned table to Excel.
if (use_randomized_sample_for_training_data):
    headers_and_tags = select_randomly(headers_and_tags)
    headers_and_tags.to_excel('processed_input_random_selection.xlsx')

In [12]:
#implementing n-grams Model (not used in this model.)
from nltk import ngrams

def generate_n_grams(data_lst, n):
    """Clean the items of ``data_lst`` and return their n-grams as a list.

    The items go through remove_chars, then clean_cols, then remove_stop_words;
    ``nltk.ngrams`` is applied to the resulting sequence.
    """
    prepared = remove_stop_words(clean_cols(remove_chars(list(data_lst))))
    return list(ngrams(prepared, n))

In [13]:
#creating a n-gram frequency table 

def count_stats_grams(two_d_arr):
    """Count distinct n-grams and how many of them occur once or more than once.

    Args:
        two_d_arr: Sequence of n-grams (each an iterable of hashable items,
            e.g. the tuples produced by ``nltk.ngrams``).

    Returns:
        Tuple ``(unique, singles, multiples)``: the number of distinct n-grams,
        the number of distinct n-grams occurring exactly once, and the number
        occurring more than once. ``unique == singles + multiples`` always holds.
    """
    from collections import Counter

    # The earlier version discarded the result of np.append, so its "unique"
    # figure was really the total number of n-grams; Counter does the
    # de-duplication in one pass instead of calling list.count per element.
    occurrences = Counter(tuple(gram) for gram in two_d_arr)
    count = len(occurrences)
    singles_count = sum(1 for times in occurrences.values() if times == 1)
    multiples_count = count - singles_count
    return count, singles_count, multiples_count

def n_gram_freqs(dataframe, max_n = 4):
    """Build a table of n-gram statistics for the 'Header' column, for n = 1..max_n.

    Each row holds n, the list of n-grams from generate_n_grams, and the three
    figures returned by count_stats_grams (stored as 'unique ngrams',
    'singles' and 'multiples').
    """
    stats_table = pd.DataFrame(columns = ['n-gram', 'data' ,'unique ngrams', 'multiples', 'singles'])
    for n in range(1, max_n + 1):
        grams = generate_n_grams(dataframe['Header'], n)
        unique_total, single_total, multiple_total = count_stats_grams(grams)
        stats_table.loc[len(stats_table)] = {
            'n-gram': n,
            'data': grams,
            'unique ngrams': unique_total,
            'multiples': multiple_total,
            'singles': single_total,
        }
    return pd.DataFrame(stats_table)

In [14]:
#Takes a data row and cleans it for model input
def word_extract(row):
    """Return the lowercase, left-stripped items of ``row``, skipping floats and 'nan'.

    An item is dropped when it is a float, when it equals 'nan', or when it
    equals 'nan' after leading whitespace is removed.
    """
    skipped = ['nan']
    words = []
    for item in row:
        if item in skipped or isinstance(item, float):
            continue
        stripped = item.lstrip()
        if stripped in skipped:
            continue
        words.append(stripped.lower())
    return words

# Build one string per row of the 'Data' column from the items kept by word_extract.
# NOTE(review): the pieces are joined with '' (no separator), so words run together.
# NOTE(review): when the table comes from read_excel each 'Data' cell is presumably a
# string, in which case word_extract iterates over characters rather than values -- confirm.
# NOTE(review): long_string is not referenced again in the visible notebook.
long_string = []
for i in headers_and_tags['Data']:
    result_by_tag = word_extract(i)
    holder_list = ''.join(result_by_tag)
    long_string.append(holder_list)

In [16]:
# Load the pre-trained fastText model from 'wiki.en.bin'; fmodel.get_sentence_vector is
# used further down to embed headers, organizations and data points.
fasttext_model = 'wiki.en.bin'
fmodel = load_model(fasttext_model)
print("Pre-trained model loaded successfully!\n")

Pre-trained model loaded successfully!



In [17]:
#helper function to extract the top n most likely tags for a given column (3 top tags selected in default.)
#returns a dictionary where key = header and values = dictionary of num_of_top_tags tags with their respective probabilities

def top_tags(clf, X_test, series, num_of_top_tags = 3):
    """Return, for every test row, its most probable tags with their probabilities.

    Args:
        clf: Fitted classifier exposing ``predict_proba`` and ``classes_``.
        X_test: Feature rows; anything that is not an ndarray is converted
            with ``.values.tolist()``.
        series: Series whose i-th positional entry labels the i-th row.
        num_of_top_tags: How many tags to keep per row.

    Returns:
        List of ``(label, {tag: probability})`` tuples, one per row, with each
        dictionary ordered from most to least probable.
    """
    if not isinstance(X_test, np.ndarray):
        X_test = X_test.values.tolist()
    probabilities = clf.predict_proba(X_test)
    results = []
    for row_number in range(len(X_test)):
        row = probabilities[row_number]
        # argsort is ascending: take the tail and reverse it for best-first order.
        best_first = row.argsort()[-num_of_top_tags:][::-1]
        tag_to_prob = dict(zip(clf.classes_[best_first], row[best_first]))
        results.append((series.iloc[row_number], tag_to_prob))
    return results

In [18]:
#helper functions to determine the confidence level of top predicted tag. Any tags with a confidence level < 0.5
#will be discarded and replaced by a blank tag instead. The function returns an array of booleans determining whether 
#a given tag will be blank or not. 

def tag_predicted(clf, X_test, series, threshold):
    """Flag the rows whose best predicted probability does not exceed ``threshold``.

    Returns a list of booleans, one per row: True means the tag should be left
    blank, False means the top prediction is confident enough. ``series`` is
    accepted but not used.
    """
    if not isinstance(X_test, np.ndarray):
        X_test = X_test.values.tolist()
    probabilities = clf.predict_proba(X_test)
    flags = []
    for row_number in range(len(X_test)):
        row = probabilities[row_number]
        best_probability = row[row.argsort()[-1]]
        flags.append(not (best_probability > threshold))
    return flags

#helper function to fill in the blanks for tags that have a confidence level less than the threshold
def fill_blank_tags(predicted_tags, clf, X_test, series, threshold = 0.5):
    """Blank out, in place, every predicted tag that tag_predicted flags as low confidence.

    Returns the same ``predicted_tags`` object after modification.
    """
    blank_flags = tag_predicted(clf, X_test, series, threshold)
    for position in range(len(predicted_tags)):
        if blank_flags[position]:
            predicted_tags[position] = ''
    return predicted_tags

In [19]:
#helper function to extract n random data points from the data column and vectorizes them into columns
def separate_words(series):
    """Split the text form of ``series`` into tokens of letters, digits, '_' and '.'.

    Non-string input is converted with ``str`` first; empty tokens are dropped.
    Note that this reuses the name of the capital-letter splitter defined
    earlier in the notebook and replaces it once this cell has run.
    """
    text = series if isinstance(series, str) else str(series)
    return [token for token in re.split(r"[^a-zA-Z0-9_.]", text) if token]
    
def vectorize_n_datapoints(number_of_datapoints_to_vectorize = 7):
    """Tokenise the 'Data' column of the notebook-level ``df`` and spread tokens into columns.

    Adds 'Data_separated' (token list per row) and 'datapoint0' .. 'datapointK-1',
    where K is the requested number capped at the token count of the first row.
    Rows with fewer tokens get missing values in the surplus columns.

    Args:
        number_of_datapoints_to_vectorize: Requested number of datapoint columns.
    """
    df['Data_separated'] = df['Data'].apply(separate_words)
    if len(df) > 0:
        # Positional lookup: the label 0 may be absent from the index (for
        # example after rows were dropped), which made ``[0]`` raise KeyError.
        first_row_tokens = len(df['Data_separated'].iloc[0])
        number_of_datapoints_to_vectorize = min(number_of_datapoints_to_vectorize, first_row_tokens)
    for i in range(number_of_datapoints_to_vectorize):
        df['datapoint' + str(i)] = df['Data_separated'].str[i]
        
def embedded_datapoints(number_of_data_point_to_vectorize = 7):
    """Add one fastText sentence-vector column per datapoint column of the notebook-level ``df``.

    Calls vectorize_n_datapoints for the same number of data points, then
    writes 'embedded_datapoint0' .. 'embedded_datapointN-1' using
    ``fmodel.get_sentence_vector`` on the string form of each datapoint.

    Args:
        number_of_data_point_to_vectorize: Number of datapoint columns to embed.
    """
    # Forward the argument: it used to be ignored, so any value other than the
    # default asked for columns that vectorize_n_datapoints never created.
    vectorize_n_datapoints(number_of_data_point_to_vectorize)
    for i in range(number_of_data_point_to_vectorize):
        source_column = 'datapoint' + str(i)
        if source_column not in df.columns:
            # vectorize_n_datapoints caps the column count at the first row's
            # token count; fill the gap with missing values, which is what rows
            # that are too short receive in the columns that do exist.
            df[source_column] = np.nan
        df['embedded_datapoint' + str(i)] = df[source_column].map(lambda x: fmodel.get_sentence_vector(str(x)))
        
# Number of embedded datapoint columns; used below to build the label slice
# 'embedded_datapoint0' .. 'embedded_datapoint<N-1>' when the features are combined.
number_of_data_point_to_vectorize = 7

In [24]:
#post-processing function that 1) checks tags with low confidence against mappings 2) fills in a blank prediction for 
#tags with a confidence level lower than threshold and had no obvious mappings associated with the predicted tag. 
#The function returns 1) the count of corrected tags 2) the count of blanked tags 3) the list of post-processed predicted tags

def check_mapping(header, predicted_tag):
    """Override a predicted tag when the header contains a keyword tied to another tag.

    Args:
        header: Column header; it is converted with ``str`` and split on whitespace.
        predicted_tag: Tag predicted by the model, including the leading '#'.

    Returns:
        Tuple ``(changed, tag)``. When no keyword matches, or when the predicted
        tag is itself one of the tags whose keywords match, the prediction is
        returned unchanged with ``changed`` False. Otherwise ``changed`` is True
        and ``tag`` is the last matching tag in mapping order.
    """
    MAPPINGS = {
    "#geo" : ['lon', 'lat', 'latitude', 'longitude'], #words that would likely appear for #geo tag
    "#admin" : ['county'], #words that would likely appear for #admin tag
    "#country" :  ['country'], #words that would likely appear for #country tag
    "#date" : ['year', 'date'], #words that would likely appear for #date tag
    "#funding": ['funding', 'funded'], #words that would likely appear for #funding tag
    "#value": ['percentfunded'], #words that would likely appear for #value tag
    "#org":['organization', 'funder ref', 'org'], #words that would likely appear for #org tag
    "#status":['status'], #words that would likely appear for #status tag
    "#sector":['sector'], #words that would likely appear for #sector tag
    "#adm1":['adm1', 'admin1'], #words that would likely appear for #adm1 tag
    "#adm2":['adm2', 'admin2'], #words that would likely appear for #adm2 tag
    "#adm3":['adm3', 'admin3'], #words that would likely appear for #adm3 tag
    "#adm4":['adm4', 'admin4']  #words that would likely appear for #adm4 tag
    }
    # str() keeps non-string headers (e.g. NaN) from raising on .split().
    header_words = str(header).split()

    def _mentions(keyword):
        # Whole-word match (substrings do not count). Multi-word keywords such as
        # 'funder ref' must appear as consecutive header words; comparing them
        # against single words, as before, could never succeed.
        keyword_words = keyword.split()
        span = len(keyword_words)
        return any(header_words[start:start + span] == keyword_words
                   for start in range(len(header_words) - span + 1))

    matching_tags = [tag for tag, keywords in MAPPINGS.items()
                     if any(_mentions(keyword) for keyword in keywords)]
    # A prediction already backed by one of the matching mappings is kept; the
    # old loop could overwrite it and then report a "correction" that ended on
    # the very tag it started with.
    if not matching_tags or predicted_tag in matching_tags:
        return False, predicted_tag
    return True, matching_tags[-1]
    

def post_processing(headers, predicted_tags, clf, X_test, mapping_threshold = 0.85, blank_threshold = 0.2):
    """Post-process predicted tags using keyword mappings and a blank-out threshold.

    For every row whose best predicted probability is below
    ``mapping_threshold``, check_mapping is consulted; if it changes the tag the
    row counts as corrected. Otherwise, if the probability is also below
    ``blank_threshold``, the tag is replaced by an empty string.

    Args:
        headers: Header values aligned positionally with ``X_test``.
        predicted_tags: Predicted tags aligned positionally with ``X_test``.
        clf: Fitted classifier exposing ``predict_proba``.
        X_test: Feature rows; anything that is not an ndarray is converted
            with ``.values.tolist()``.
        mapping_threshold: Probability below which mappings are consulted.
        blank_threshold: Probability below which an uncorrected tag is blanked.

    Returns:
        Tuple ``(corrected_count, blank_count, tags)`` where ``tags`` is a new list.
    """
    if (not isinstance(X_test, np.ndarray)):
        X_test = X_test.values.tolist()
    probs = clf.predict_proba(X_test)
    # Materialise once; this used to be rebuilt on every loop iteration.
    header_list = list(headers)
    values = []
    corrected_count = 0
    blank_count = 0
    for i in range(len(X_test)):
        prob = probs[i][probs[i].argsort()[-1]]
        predicted_tag = predicted_tags[i]
        if (prob < mapping_threshold):
            changed, predicted_tag = check_mapping(header_list[i], predicted_tag)
            if (changed):
                corrected_count += 1
            elif (prob < blank_threshold):
                predicted_tag = ''
                blank_count += 1
        values.append(predicted_tag)
    return corrected_count, blank_count, values

In [21]:
#Helper function to add hashtags to the predicted tags
def add_hashtags(predicted_tags):
    """Return a list with '#' prepended to every tag in ``predicted_tags``.

    Accepts any iterable of strings (ndarray, list, Series, ...). Previously
    anything other than an ndarray silently produced None.
    """
    return ["#" + word for word in predicted_tags]

In [22]:
#CREATING THE MODEL and TESTING ACCURACY OF MLP Classifer on all features. This is the model used in the API.

#using all three embedded features to predict tags
#cleaning the dataset by flattening the datastructure

# df is another name for headers_and_tags (no copy), so every column added below
# also appears on headers_and_tags.
df = headers_and_tags

# Embed header and organization text with fastText sentence vectors.
df['Header_embedding'] = df['Header'].map(lambda x: fmodel.get_sentence_vector(str(x)))
df['Organization_embedded'] = df['Organization'].map(lambda x: fmodel.get_sentence_vector(str(x)))
# Adds the datapoint<i> and embedded_datapoint<i> columns to df.
embedded_datapoints()
print("Word embeddings extracted!\n")

# Label slice from 'embedded_datapoint0' to the last embedded datapoint column; this relies
# on those columns being adjacent and in order in df.
df['data_combined'] = df.loc[:, 'embedded_datapoint0': 'embedded_datapoint' 
                                                           + str(number_of_data_point_to_vectorize-1)].values.tolist()
# Flatten the per-row list of vectors into one flat list of values.
df['data_combined'] = df['data_combined'].apply(lambda x: [val for item in x for val in item])

# df_target is again the same object as df, not a copy.
df_target = df
cols = ['Header_embedding', 'Organization_embedded', 'data_combined']
# Concatenate header, organization and data embeddings into one flat feature list per row.
df_target['features_combined'] = df_target[cols].values.tolist()
df_target['features_combined'] = df_target['features_combined'].apply(lambda x: [val for item in x for val in item])
print("Features created!")

Word embeddings extracted!

Features created!


In [25]:
#Outputs: predicted_tags = 1D array including the most likely tags for each column
#         top_3_predicted_tags = a dictionary of 3 most likely tags with their respective probabilities for a given column
#         pickle file = stores the classifier to be inputted into the API

# Split features, tags and headers in a single call so the three test-side
# pieces are guaranteed to stay aligned row for row.
X_train, X_test, y_train, y_test, _headers_train, headers = train_test_split(
    df_target['features_combined'], df['Tag'], df_target['Header'],
    test_size=0.33, random_state=0)

clf = MLPClassifier(activation='relu', alpha=0.001, epsilon=1e-08, hidden_layer_sizes=150, solver='adam')
clf.fit(list(X_train), y_train)
predicted_tags = clf.predict(list(X_test))
predicted_tags = add_hashtags(predicted_tags)

corrected_count, blank_count, predicted_tags = post_processing(headers, predicted_tags, clf, X_test)
print("Number of tags corrected: " + str(corrected_count))
print("Number of blank tags: " + str(blank_count))

# Key the top-3 suggestions by the test-split headers: df['Header'] is the
# unshuffled full column and does not line up with the rows of X_test.
top_3_predicted_tags = top_tags(clf, X_test, headers)
test_score = clf.score(list(X_test), y_test)
print("Classification accuracy on test set: %s" %test_score)

# Context managers make sure every pickle file is flushed and closed.
with open("model.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)
with open("predicted_tags.pkl", "wb") as tags_file:
    pickle.dump(predicted_tags, tags_file)
with open("top_three_predicted_tags.pkl", "wb") as top_tags_file:
    pickle.dump(top_3_predicted_tags, top_tags_file)

Number of tags corrected: 18
Number of blank tags: 35
Classification accuracy on test set: 0.943056943057


In [None]:
#CELLS BELOW ARE MEANT FOR TESTING PURPOSES. THEY SERVE NO PURPOSE FOR THE MODEL.

In [114]:
# post_processing returns three values (corrected count, blank count, tags);
# unpacking into two names raised ValueError.
corrected_count, blank_count, predicted_tags = post_processing(headers, predicted_tags, clf, X_test)

In [75]:
# NOTE(review): check_mappings (plural) is not defined anywhere in the visible notebook --
# only check_mapping(header, predicted_tag) is -- so this cell fails on a fresh kernel.
# It looks like a leftover from an earlier version; confirm and remove or update.
count, predicted_tags = check_mappings(headers, predicted_tags)
print("Number of tags corrected: " + str(count))

Number of tags corrected: 13


In [106]:
# Raise the pandas row display limit (a global setting that stays in effect for later
# cells) and show a positional slice of the headers.
pd.set_option('display.max_rows', 5000)
df_target['Header'][1056:1070]

10973            disp date
10974       orig prov code
10975       orig prov name
10976       orig dist code
10977       orig dist name
10978       disp prov code
10979       disp prov name
10980       disp dist code
10981       disp dist name
10982             disp ind
10983             disp fam
10984      disp adult male
10985    disp adult female
10986    disp children u18
Name: Header, dtype: object

In [108]:
# Scratch experiment: train a classifier on header embeddings only (rows from
# position 1500 onwards) and try it on a few unseen camelCase headers.
# The slices are used directly: train_test_split(..., test_size=0) only produced
# an empty test split, and newer scikit-learn releases reject that value.
x_test_train = df_target['Header_embedding'][1500:]
y_test_train = df['Tag'][1500:]
clf2 = MLPClassifier(activation='relu', alpha=0.001, epsilon=1e-08, hidden_layer_sizes=150, solver='adam')
clf2.fit(list(x_test_train), y_test_train)
# A Series (not a DataFrame): list() over a DataFrame yields its column labels,
# so the embeddings never reached predict().
test_arr = pd.Series(['PooledFundName', 'ExternalProjectCode', 'ProjectStatus', 'ProcessStatus'])
test = test_arr.map(lambda x: fmodel.get_sentence_vector(str(x)))
test_tags = clf2.predict(list(test))
test_tags

array(['date', 'adm2', 'adm2', 'adm3', 'adm3', 'adm2', 'adm2', 'adm3',
       'adm3', 'affected', 'affected', 'affected', 'affected', 'affected'],
      dtype='<U22')

In [78]:
# Use the test-split headers: df['Header'] is the full, unshuffled column and
# is not aligned with the rows of X_test.
top_3_predicted_tags = top_tags(clf, X_test, headers)
predicted_tags = fill_blank_tags(predicted_tags, clf, X_test, headers)
test_score = clf.score(list(X_test), y_test)
print("Classification accuracy on test set: %s" %test_score)

# Context managers make sure every pickle file is flushed and closed.
with open("model.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)
with open("predicted_tags.pkl", "wb") as tags_file:
    pickle.dump(predicted_tags, tags_file)
with open("top_three_predicted_tags.pkl", "wb") as top_tags_file:
    pickle.dump(top_3_predicted_tags, top_tags_file)

Classification accuracy on test set: 0.943722943723


In [31]:
#Testing The Accuracy for Other Models using all three features
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
#from sklearn.svm import SVC
from sklearn import model_selection
from sklearn.datasets import make_classification

names = ["Nearest Neighbors",
         "Random Forest", "Neural Net",
         "Naive Bayes"]

classifiers = [
    KNeighborsClassifier(3),
    #SVC(kernel="linear", C=0.025),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),#use graph search? toggle max_features
    MLPClassifier(activation='relu', alpha=0.001, epsilon=1e-08, hidden_layer_sizes=150, solver='adam'),
    GaussianNB()]

# The split does not depend on the classifier, so compute it once.
X_train, X_test, y_train, y_test = train_test_split(df_target['features_combined'],
                                                    df['Tag'][0:len(df_target['features_combined'])],
                                                    test_size=0.33, random_state=0)

# The loop variable is deliberately not called `clf`: reusing that name replaced
# the trained MLP model, so re-running an export cell afterwards pickled the
# last classifier of this list instead.
for name, candidate in zip(names, classifiers):
    candidate.fit(list(X_train), y_train)
    score = candidate.score(list(X_test), y_test)
    print(name)
    print(score)
        

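#NOTE: the figures below were noted during an earlier run and do not match the scores printed by this cell.
#Accuracy of MLP: 95.10%

#Models Considered
#1) Gaussian Naive Bayes       
    #Accuracy: 88.03%
#2) RandomForest       
    #Accuracy: 98.36%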

Nearest Neighbors
0.904095904096
Random Forest
0.67698967699
Neural Net
0.94338994339
Naive Bayes
0.625041625042
