In [1]:
from hdx.utilities.easy_logging import setup_logging
from hdx.hdx_configuration import Configuration
from hdx.data.dataset import Dataset
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import re
import itertools 
import pickle
from sklearn.neural_network import MLPClassifier
from fastText import load_model
from sklearn.model_selection import train_test_split
from nltk import ngrams
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
df = pd.read_csv("try.csv")

In [8]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stopWords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ruochen99/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
fasttext_model = 'wiki.en.bin'
fmodel = load_model(fasttext_model)

In [10]:
def lower_cols(lst):
    return [word.lower() for word in lst]


def remove_chars(lst):

    cleaned = [re.sub('\s+', ' ', mystring).strip() for mystring in lst]
    cleaned = [re.sub(r'[[^A-Za-z0-9\s]+]', ' ', mystr) for mystr in cleaned]
    cleaned = [mystr.replace('_', ' ') for mystr in cleaned]
    return cleaned

def clean_cols(data):
    data = lower_cols(data)
    data = remove_chars(data)
    return data

In [17]:
def preprocess(pandas_dataset, df_target):
    if (not pandas_dataset.empty):
    	organization = 'HDX'   #Replace if datasets contains organization
    	headers = list(pandas_dataset.columns.values)
    	headers = clean_cols(headers)
    for i in range(len(headers)):
        try:
            dic = {'Header': headers[i], 
                   'Data': list(pandas_dataset.iloc[1:, i]), 
                   'Relative Column Position': (i+1) / len(pandas_dataset.columns), 
                   'Organization': organization,
                   'Index': i}
            df_target.loc[len(df_target)] = dic
        except:
            raise Exception("Error: arguments not matched")

    df_result = transform_vectorizers(df_target)
    return df_result

def transform_vectorizers(df_target):
    cols = ['Header_embedding', 'Organization_embedded', 'BOW_counts', 'ngrams_counts']
    df = pd.DataFrame(columns = cols)
    long_string = []
    for i in df_target['Data']:
        result_by_tag = word_extract(i)
        holder_list = ''.join(result_by_tag)
        long_string.append(holder_list)
    bag_vectorizer = CountVectorizer()
    corpus = long_string
    X_vecs_bag = bag_vectorizer.fit_transform(corpus)
    df['BOW_counts'] = [item for item in X_vecs_bag.toarray()]
    ngrams = generate_n_grams(df_target['Header'], 3)
    ngrams_vectorizer = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)
    X_vec_grams = ngrams_vectorizer.fit_transform(ngrams)
    df['ngrams_counts'] = pd.Series([item for item in X_vec_grams.toarray()])
    df['Header_embedding'] = df_target['Header'].astype(str).apply(fmodel.get_sentence_vector)
    df['Organization_embedded'] = df_target['Organization'].astype(str).apply(fmodel.get_sentence_vector)
    cols = ['Header_embedding', 'Organization_embedded', 'BOW_counts', 'ngrams_counts']
#     df['features_combined'] = df[cols].values.tolist()
#     df['features_combined'] = df['features_combined'].apply(lambda x: [val for item in x for val in item])
    return df

In [18]:
def remove_stop_words(data_lst):
    #remove stopwords from the data including 'the', 'and' etc.
    wordsFiltered = []
    for w in data_lst:
        if w not in stopWords:
            wordsFiltered.append(w)
    return wordsFiltered

def word_extract(row):
    ignore = ['nan']
    no_white = [i.lstrip() for i in row if i not in ignore and not isinstance(i, float)]
    cleaned_text = [w.lower() for w in no_white if w not in ignore]
    return cleaned_text

def allowed_file(filename):
    return '.' in filename and \
           filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS 

from nltk import ngrams
def generate_n_grams(data_lst, n):
    # cleaned = remove_chars(list(data_lst))
    # cleaned = clean_cols(cleaned)
    cleaned = remove_stop_words(data_lst)
    #make sure that n_grams 'refresh' when a new dataset is encountered!!!!   
    return list(ngrams(cleaned, n))

In [19]:
processed_dataset = preprocess(df, 
                pd.DataFrame(columns=['Header','Data','Relative Column Position','Organization','Index']))

In [20]:
processed_dataset

Unnamed: 0,Header_embedding,Organization_embedded,BOW_counts,ngrams_counts
0,"[0.0328802, -0.0268485, 0.0519262, -0.0128513,...","[0.0122939, -0.0148763, -0.0858311, 0.0693863,...","[10, 1, 4, 2, 28, 16, 0, 13, 1, 0, 0, 0, 16, 0...","[0, 0, 1, 0, 0, 1, 1]"
1,"[0.0071931, 0.00791951, 0.0851453, 0.019095, -...","[0.0122939, -0.0148763, -0.0858311, 0.0693863,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 99,...","[0, 0, 0, 1, 0, 1, 1]"
2,"[0.0346546, 0.0293416, 0.0911392, -0.035828, -...","[0.0122939, -0.0148763, -0.0858311, 0.0693863,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 1, 0, 1, 0]"
3,"[0.00451584, 0.0267195, 0.0414496, -0.0346788,...","[0.0122939, -0.0148763, -0.0858311, 0.0693863,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 1, 1, 0, 0]"
4,"[0.0247415, -0.0548961, 0.0600528, -0.0703906,...","[0.0122939, -0.0148763, -0.0858311, 0.0693863,...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 3, 1, 1, 0, 0, 0, ...","[1, 1, 0, 0, 1, 0, 0]"
5,"[0.0328387, 0.0226398, 0.0369784, -0.0588113, ...","[0.0122939, -0.0148763, -0.0858311, 0.0693863,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
6,"[-0.00230675, 0.0241692, -0.0922038, -0.018988...","[0.0122939, -0.0148763, -0.0858311, 0.0693863,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",


In [57]:
processed_dataset['Header_embedding'][0]

array([ 0.03288024, -0.02684848,  0.05192618, -0.01285128, -0.0635905 ,
       -0.05403647,  0.08779965, -0.07167736,  0.03897271,  0.02293284,
       -0.02944689, -0.08673232, -0.09263594, -0.03888061,  0.0752315 ,
       -0.09787023, -0.01358463,  0.02958274, -0.04515599,  0.12325224,
       -0.12456893,  0.0937695 , -0.01050754, -0.0037347 , -0.00369096,
       -0.1762747 , -0.07706834, -0.08182441,  0.00241445,  0.07978925,
       -0.04256374,  0.01809001,  0.09561884, -0.06217436, -0.01019568,
        0.04137434,  0.04448066, -0.05133887,  0.02631863,  0.00505017,
       -0.01075878,  0.00626424,  0.03493123,  0.01084564, -0.0244076 ,
        0.07872323,  0.01787304, -0.02668435,  0.0023393 , -0.1359282 ,
        0.11748824, -0.08286507,  0.01465994,  0.00077083, -0.0296096 ,
       -0.02515159,  0.08357313,  0.06202837, -0.06070811,  0.03515776,
       -0.04565581, -0.03414272,  0.06185154, -0.00675017,  0.02223102,
        0.00688542,  0.01838031, -0.05574631, -0.03063917, -0.08

In [58]:
cols = ['Header_embedding', 'Organization_embedded', 'BOW_counts', 'ngrams_counts']
processed_dataset['features_combined'] = processed_dataset[cols].values.tolist()
processed_dataset['features_combined'] = processed_dataset['features_combined'].apply(lambda x: np.concatenate(x, axis=None))
processed_dataset['features_combined'].iloc[2]


array([  3.46545577e-02,   2.93416288e-02,   9.11392346e-02,
        -3.58279794e-02,  -2.52960641e-02,   5.75821511e-02,
         6.42661303e-02,   3.58346961e-02,  -6.35495111e-02,
         1.14948209e-02,  -4.86125275e-02,  -5.56523092e-02,
        -2.38003284e-02,  -1.01408973e-01,  -6.89931680e-03,
        -1.14249155e-01,  -6.82919025e-02,   2.58631222e-02,
        -6.42322674e-02,   1.05743043e-01,  -1.97368171e-02,
        -7.23236240e-03,  -3.66313532e-02,   8.94448534e-03,
        -4.02227081e-02,  -5.59153818e-02,  -7.78581277e-02,
        -9.00442600e-02,   4.57502715e-02,   8.45306888e-02,
         1.38753187e-02,   5.23300329e-03,  -4.16784808e-02,
         1.11593209e-01,   4.97127473e-02,  -4.95482832e-02,
         6.09176643e-02,   1.98979359e-02,  -4.19279095e-03,
        -4.08384651e-02,   4.98512425e-02,  -5.30964099e-02,
         2.26239748e-02,  -6.78759813e-02,   9.05484799e-03,
         3.24023962e-02,  -1.75457783e-02,  -1.25688821e-01,
        -1.91826113e-02,