In [1]:
from flask import Flask, redirect, request, render_template, url_for, send_from_directory, make_response
from werkzeug.wrappers import Request, Response
from werkzeug.serving import run_simple
from jsonrpc import JSONRPCResponseManager, dispatcher
import os
import hashlib
import pandas as pd
import numpy as np
from werkzeug.utils import secure_filename
import pickle
import re
from sklearn.neural_network import MLPClassifier
from fastText import load_model
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from flask_cors import CORS
from nltk import ngrams
import json

In [2]:
pd.__version__

'0.20.3'

In [3]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stopWords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ruochen99/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Install fastText by doing:<br>
#git clone https://github.com/facebookresearch/fastText.git<br>
#cd fastText<br>
#pip install .<br>
#'wiki.en.bin' needs to be in the same directory as server.py (can be downloaded from <br>
#https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip)<br>


In [4]:
fasttext_model = 'wiki.en.bin'
fmodel = load_model(fasttext_model)

In [5]:
UPLOAD_FOLDER = '\datasets'
ALLOWED_EXTENSIONS_CSV = set(['csv'])
ALLOWED_EXTENSIONS_JSON = set(['json'])

In [6]:
# app = Flask(__name__)
# # CORS(app)
# app.config['UPLOAD_FOLDER'] = os.path.join(app.instance_path)

In [7]:
def lower_cols(lst):
    #convert data to lowercases
    #QUESTION: will I miss any important information? 
    return [word.lower() for word in lst if isinstance(word,str)]

In [8]:
def remove_chars(lst):
    #remove punctuation characters such as ",", "(", ")", """, ":", "/", and "."
    #NOTE: PRESERVES WHITE SPACE.
    #QUESTION: any other characters we should be aware of? Is this a good idea? I'm inspecting each word individually.
    #Any potential pitfalls? 
    cleaned = [re.sub('\s+', ' ', mystring).strip() for mystring in lst]
    cleaned = [re.sub(r'[[^A-Za-z0-9\s]+]', ' ', mystr) for mystr in cleaned]
    cleaned = [mystr.replace('_', ' ') for mystr in cleaned]
    return cleaned

In [9]:
def clean_cols(data):
    data = lower_cols(data)
    data = remove_chars(data)
    return data

In [10]:
def fill_empty_cols(df):
    empty_cols = []
    for i in df.columns.values:
        if (len(df[i].dropna()) == 0):
            df.at[2,i] = 1
            empty_cols.append(df.columns.get_loc(i))
    return df, empty_cols

In [11]:
def preprocess(pandas_dataset, df_target):
    if (not pandas_dataset.empty):
        organization = 'HDX'   #Replace if datasets contains organization
        pandas_dataset.dropna(how = 'all', inplace = True)
        pandas_dataset, empty_cols = fill_empty_cols(pandas_dataset)
        print(empty_cols)
#         pandas_dataset.dropna(axis=1, how = 'all', subset=range(1,len(pandas_dataset)), inplace = True)
        headers = list(pandas_dataset.columns.values)        
        headers = clean_cols(headers)
    for i in range(len(headers)):
        try:
            dic = {'Header': headers[i], 
                   'Data': list(pandas_dataset.iloc[1:, i]), 
                   'Relative Column Position': (i+1) / len(pandas_dataset.columns), 
                   'Organization': organization,
                   'Index': i}
            df_target.loc[len(df_target)] = dic
        except:
            raise Exception("Error: arguments not matched")
    df_result = transform_vectorizers(df_target)
    return df_result, empty_cols

In [26]:
def transform_vectorizers(df_target):
    number_of_data_point_to_vectorize = 7
    cols = ['Header_embedding', 'Organization_embedded', 'features_combined']
    df = pd.DataFrame(columns = cols)
    df_target, number_of_data_point_to_vectorize = embedded_datapoints(df_target, 7)
    df['data_combined'] = df_target.loc[:, 'embedded_datapoint0': 'embedded_datapoint' 
                                                           + str(number_of_data_point_to_vectorize-1)].values.tolist()
    df['data_combined'] = df['data_combined'].apply(lambda x: [val for item in x for val in item])
    df['Header_embedding'] = df_target['Header'].astype(str).apply(fmodel.get_sentence_vector)
    df['Organization_embedded'] = df_target['Organization'].astype(str).apply(fmodel.get_sentence_vector)
    cols = ['Header_embedding', 'Organization_embedded', 'data_combined']
    df['features_combined'] = df[cols].values.tolist()
    df['features_combined'] = df['features_combined'].apply(lambda x: [val for item in x for val in item])
    diff = 2700 - len(df['features_combined'][0])
    for i in range(len(df)):
        for j in range(diff):
            df['features_combined'][i].append(0)
    df = df.dropna()
    return df

In [27]:
def separate_words(series): 
    #each series is a long string that contains all the data
    lst = []
    cleanlist = [str(x) for x in series if str(x) != 'nan']
    for i in cleanlist:
        lst = re.split(r"\W+", i)
        lst.extend(list(filter(None, lst)))
    return lst
    
def vectorize_n_datapoints(df, number_of_datapoints_to_vectorize = 7):
#     print(df['Data'].head())
#     print(df['Data'].iloc[0])
#     for i in range(len(df['Data'])):
#         df['Data_separated'].iloc[0] = separate_words(df['Data'].iloc[0])
    df['Data_separated'] = df['Data'].apply(separate_words)
    if (number_of_datapoints_to_vectorize > len(df['Data_separated'][0])):
        number_of_datapoints_to_vectorize = len(df['Data_separated'][0])
    for i in range(number_of_datapoints_to_vectorize):
        df['datapoint' + str(i)] = df['Data_separated'].str[i]
    return df, number_of_datapoints_to_vectorize

In [28]:
def embedded_datapoints(df, number_of_data_point_to_vectorize=7):
    df, number_of_data_point_to_vectorize = vectorize_n_datapoints(df)
    for i in range(number_of_data_point_to_vectorize):
        
        df['embedded_datapoint' + str(i)] = df['datapoint' + str(i)].map(lambda x: fmodel.get_sentence_vector(str(x)))
    return df, number_of_data_point_to_vectorize

In [29]:
def remove_stop_words(data_lst):
    #remove stopwords from the data including 'the', 'and' etc.
    wordsFiltered = []
    for w in data_lst:
        if w not in stopWords:
            wordsFiltered.append(w)
    return wordsFiltered

In [30]:
def word_extract(row):
    ignore = ['nan']
    no_white = [i.lstrip() for i in row if i not in ignore and not (isinstance(i, float) or isinstance(i,int))]
    cleaned_text = [w.lower() for w in no_white if w not in ignore]
    return cleaned_text

In [31]:
def allowed_file_csv(filename):
    return '.' in filename and \
           filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS_CSV

In [32]:
def allowed_file_json(filename):
    return '.' in filename and \
            filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS_JSON

In [33]:
def generate_n_grams(data_lst, n):
    # cleaned = remove_chars(list(data_lst))
    # cleaned = clean_cols(cleaned)
    cleaned = remove_stop_words(data_lst)
    #make sure that n_grams 'refresh' when a new dataset is encountered!!!!   
    return list(ngrams(cleaned, n))

In [185]:
input_dataset = pd.read_csv('test_files/palestine.csv', encoding = "ISO-8859-1", na_values=['nan',' nan'])
input_headers = input_dataset.columns.values
# input_dataset = input_dataset.rename(columns=input_dataset.iloc[0]).drop(input_dataset.index[0])

In [186]:
input_dataset.head()

Unnamed: 0,X,Y,source_url,what3words,upstream,name,completeness,uuid,date_modified,source,version,type,physical-address,phone,url,email
0,35.368699,32.454088,http://www.openstreetmap.org/node/432544821,storeroom.incinerated.spud,openstreetmapÂ¶n432544821,Abu Da'if Medical Clinic,35.29%,6a53505edb7448a0a67b1322b1dc301f,2015/11/17 09:43:57.838+00,OpenStreetMap,2,hospital,,,,
1,34.272789,31.273187,http://www.openstreetmap.org/way/41319501,testers.tree.conqueror,openstreetmapÂ¶w41319501,Abu Yousef Alnajaar Hospital,35.29%,2a418040b4c042fea6733cc30f43bc7f,2015/12/11 10:51:22.924+00,OpenStreetMap,2,hospital,,,,
2,35.174534,32.047521,http://www.openstreetmap.org/node/432260897,dispensable.bicycle.gazebos,openstreetmapÂ¶n432260897,Adnan Hospital,35.29%,a2c8aa28b6534dfb990342c98f299cec,2015/11/17 09:43:57.838+00,OpenStreetMap,2,hospital,,,,
3,35.350111,32.35011,http://www.openstreetmap.org/node/431708802,solutions.something.reward,openstreetmapÂ¶n431708802,Akkaba Medical Clinic,35.29%,f98690d64e04490baae0ff2177bfdabf,2015/11/17 09:43:57.838+00,OpenStreetMap,2,hospital,,,,
4,35.417349,32.336588,http://www.openstreetmap.org/node/2749369559,underpinned.bookcases.name,openstreetmapÂ¶n2749369559,Al Amal (Hope) Clinic,35.29%,3265d3d5f8604a94bf82e92038ce3c9b,2015/11/17 09:43:57.838+00,OpenStreetMap,2,hospital,,,,


In [187]:
# process the untagged dataset
processed_dataset, empty_cols = preprocess(input_dataset, 
                               pd.DataFrame(columns=['Header','Data','Relative Column Position','Organization','Index']))

[]


In [188]:
na = []
for i in range(len(processed_dataset)):
    na.append(' ')

In [189]:
model = pickle.load(open("model.pkl", "rb")) #Model needs be named model.pkl
output_dataset = pd.DataFrame(data = model.predict(list(processed_dataset['features_combined'])))
output_dataset.loc[empty_cols,0] = 'No Prediction. Column only had missing values'
output_dataset.insert(loc=0, column='Header', value=input_headers)
# output_dataset.insert(loc=1, column='Original tag', value=np.array(input_dataset.iloc[0,:]))
output_dataset.insert(loc=1, column='Original tag', value=na)
output_dataset.rename(index=str, columns={0: "Predicted tag"}, inplace=True)
output_dataset



Unnamed: 0,Header,Original tag,Predicted tag
0,X,,affected
1,Y,,affected
2,source_url,,meta
3,what3words,,country
4,upstream,,affected
5,name,,loc
6,completeness,,affected
7,uuid,,meta
8,date_modified,,date
9,source,,meta


In [190]:
import pandas
from openpyxl import load_workbook

book = load_workbook('compare.xlsx')
writer = pandas.ExcelWriter('compare.xlsx', engine='openpyxl')
writer.book = book
writer.sheets = {ws.title: ws for ws in book.worksheets}

for sheetname in writer.sheets:
    output_dataset.to_excel(writer,sheet_name=sheetname, startrow=writer.sheets[sheetname].max_row, index = False,header= False)

writer.save()

In [191]:
# @app.route('/', methods=['GET','POST'])
# def upload_file():
#     if request.method == 'POST':
#         # check if the post request has the file part
#         if 'file' not in request.files:
#             flash('No file part')
#             return redirect(request.url)
#         file = request.files['file']
        
#         if file.filename == '':
#             # flash('No selected file')
#             return redirect(request.url)
#         # file.save(os.getcwd())
#         if file and allowed_file_csv(file.filename):
#             filename = secure_filename(file.filename)
#             input_dataset = pd.read_csv(file)
                
#         if file and allowed_file_json(file.filename):
#             # filename = secure_filename(file.filename)
#             input_dataset = pd.read_json(file)
#             input_dataset = input_dataset.rename(columns=input_dataset.iloc[0]).drop(input_dataset.index[0])
#                 # process the untagged dataset
#         processed_dataset = preprocess(input_dataset, 
#             pd.DataFrame(columns=['Header','Data','Relative Column Position','Organization','Index']))
#         model = pickle.load(open("model.pkl", "rb")) #Model needs be named model.pkl, preferably using version 0.20.3
#         output_dataset = pd.DataFrame(data = model.predict(list(processed_dataset['features_combined'])))
#         resp = make_response(output_dataset.to_csv())
#         resp.headers["Content-Disposition"] = "attachment; filename=export.csv"
#         resp.headers["Content-Type"] = "text/csv"
#         return resp
        
          

    return 
<br>
    <!doctype html><br>
    <title>Upload new File</title><br>
    <h1>Upload new File (only CSV and JSON files accepted)</h1><br>
    <form method=post enctype=multipart/form-data><br>
      <input type=file name=file><br>
      <input type=submit value=Upload><br>
    # <form method=post><br>
    #   <input name=text><br>
    #   <input type=submit><br>
    </form><br>
  
 

In [81]:
model = pickle.load(open("model.pkl", "rb")) #Model needs be named model.pkl
output_dataset = pd.DataFrame(data = model.predict(list(processed_dataset['features_combined'])))
output_dataset.loc[empty_cols,0] = 'No Prediction. Column only had missing values'
output_dataset.insert(loc=0, column='Header', value=input_headers)
output_dataset.insert(loc=1, column='original tag', value=input_dataset.iloc[0,:].values)
output_dataset.rename(index=str, columns={0: "Predicted tag"}, inplace=True)
output_dataset



Unnamed: 0,Header,original tag,Predicted tag
0,Indicator,"Cumulative number of confirmed, probable and s...",indicator
1,Country,Guinea,country
2,Date,2015-03-10,date
3,value,3285,affected


In [None]:
# if __name__ == '__main__':
#      app.run(debug=True)
     

In [52]:
processed_dataset

Unnamed: 0,Header_embedding,Organization_embedded,features_combined,data_combined
0,"[0.00187157, -0.0188091, -0.00182546, -0.03487...","[0.0122939, -0.0148763, -0.0858311, 0.0693863,...","[0.00187157, -0.0188091, -0.00182546, -0.03487...","[-0.104381, -0.041732, 0.00798202, -0.0574861,..."
1,"[-0.0577098, 0.0106014, 0.076496, 0.0175093, 0...","[0.0122939, -0.0148763, -0.0858311, 0.0693863,...","[-0.0577098, 0.0106014, 0.076496, 0.0175093, 0...","[-0.0464195, -0.0562082, -0.0450554, 0.152472,..."
2,"[-0.0117924, 0.0334399, -0.0723287, 0.00917512...","[0.0122939, -0.0148763, -0.0858311, 0.0693863,...","[-0.0117924, 0.0334399, -0.0723287, 0.00917512...","[-0.109646, -0.104045, 0.0746975, 0.0679601, -..."
3,"[-0.0607706, -0.0288627, -0.0202762, -0.003495...","[0.0122939, -0.0148763, -0.0858311, 0.0693863,...","[-0.0607706, -0.0288627, -0.0202762, -0.003495...","[-0.0392919, -0.0336452, 0.157544, 0.0263585, ..."
4,"[-0.0170964, -0.0967856, -0.00234281, 0.035446...","[0.0122939, -0.0148763, -0.0858311, 0.0693863,...","[-0.0170964, -0.0967856, -0.00234281, 0.035446...","[0.0255279, -0.0862958, -0.012092, 0.00521915,..."
5,"[0.00443994, -0.0819114, 0.0289839, 0.0541079,...","[0.0122939, -0.0148763, -0.0858311, 0.0693863,...","[0.00443994, -0.0819114, 0.0289839, 0.0541079,...","[-0.0349723, -0.0828113, 0.0011545, -0.0050243..."
6,"[-0.0133045, 0.0682965, -0.0260522, -0.019703,...","[0.0122939, -0.0148763, -0.0858311, 0.0693863,...","[-0.0133045, 0.0682965, -0.0260522, -0.019703,...","[-0.0179946, 0.0290696, 0.0425825, -0.0150895,..."
7,"[-0.0160203, 0.0745997, -0.0198301, 0.063669, ...","[0.0122939, -0.0148763, -0.0858311, 0.0693863,...","[-0.0160203, 0.0745997, -0.0198301, 0.063669, ...","[-0.104381, -0.041732, 0.00798202, -0.0574861,..."
8,"[-0.0406963, 0.0416051, -0.0467667, 0.0659937,...","[0.0122939, -0.0148763, -0.0858311, 0.0693863,...","[-0.0406963, 0.0416051, -0.0467667, 0.0659937,...","[-0.104381, -0.041732, 0.00798202, -0.0574861,..."
9,"[-0.0716354, -0.00111947, 0.0113983, 0.0270508...","[0.0122939, -0.0148763, -0.0858311, 0.0693863,...","[-0.0716354, -0.00111947, 0.0113983, 0.0270508...","[-0.0365678, -0.0530042, 0.0081554, 0.201757, ..."


In [51]:
input_dataset

Unnamed: 0,countryCode,id,name,code,startDate,endDate,year,requirements,funding,percentFunded
0,#country+code,#activity+appeal+id+fts_internal,#activity+appeal+name,#activity+appeal+id+external,#date+start,#date+end,#date+year,#value+funding+required+usd,#value+funding+total+usd,#value+funding+pct
1,SAU,,Not specified,,,,2015,,1000,
2,SAU,,Not specified,,,,2008,,146647,
