In [1]:
from flask import Flask, redirect, request, render_template, url_for, send_from_directory, make_response
from werkzeug.wrappers import Request, Response
from werkzeug.serving import run_simple
from jsonrpc import JSONRPCResponseManager, dispatcher
import os
import hashlib
import pandas as pd
import numpy as np
from werkzeug.utils import secure_filename
import pickle
import re
from sklearn.neural_network import MLPClassifier
from fastText import load_model
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from flask_cors import CORS
from nltk import ngrams
import json

In [2]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stopWords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vanis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Install fastText by doing:<br>
#git clone https://github.com/facebookresearch/fastText.git<br>
#cd fastText<br>
#pip install .<br>
#'wiki.en.bin' needs to be in the same directory as server.py (can be downloaded from <br>
#https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip)<br>


In [3]:
fasttext_model = 'wiki.en.bin'
fmodel = load_model(fasttext_model)

In [4]:
UPLOAD_FOLDER = '\datasets'
ALLOWED_EXTENSIONS_CSV = set(['csv'])
ALLOWED_EXTENSIONS_JSON = set(['json'])

In [5]:
# app = Flask(__name__)
# # CORS(app)
# app.config['UPLOAD_FOLDER'] = os.path.join(app.instance_path)

In [19]:
def lower_cols(lst):
    #convert data to lowercases
    #QUESTION: will I miss any important information? 
    return [word.lower() for word in lst if isinstance(word,str)]

In [7]:
def remove_chars(lst):
    #remove punctuation characters such as ",", "(", ")", """, ":", "/", and "."
    #NOTE: PRESERVES WHITE SPACE.
    #QUESTION: any other characters we should be aware of? Is this a good idea? I'm inspecting each word individually.
    #Any potential pitfalls? 
    cleaned = [re.sub('\s+', ' ', mystring).strip() for mystring in lst]
    cleaned = [re.sub(r'[[^A-Za-z0-9\s]+]', ' ', mystr) for mystr in cleaned]
    cleaned = [mystr.replace('_', ' ') for mystr in cleaned]
    return cleaned

In [8]:
def clean_cols(data):
    data = lower_cols(data)
    data = remove_chars(data)
    return data

In [66]:
def preprocess(pandas_dataset, df_target):
    if (not pandas_dataset.empty):
        organization = 'HDX'   #Replace if datasets contains organization
        pandas_dataset.dropna(how = 'all', inplace = True)
        pandas_dataset.dropna(axis=1, how = 'all', subset=range(1,len(pandas_dataset)), inplace = True)
        headers = list(pandas_dataset.columns.values)        
        headers = clean_cols(headers)
    for i in range(len(headers)):
        try:
            dic = {'Header': headers[i], 
                   'Data': list(pandas_dataset.iloc[1:, i]), 
                   'Relative Column Position': (i+1) / len(pandas_dataset.columns), 
                   'Organization': organization,
                   'Index': i}
            df_target.loc[len(df_target)] = dic
        except:
            raise Exception("Error: arguments not matched")
    df_result = transform_vectorizers(df_target)
    return df_result

In [70]:
def transform_vectorizers(df_target):
    number_of_data_point_to_vectorize = 7
    cols = ['Header_embedding', 'Organization_embedded', 'features_combined']
    df = pd.DataFrame(columns = cols)
    print(df_target.head())
    df_target, number_of_data_point_to_vectorize = embedded_datapoints(df_target, 7)
    df['data_combined'] = df_target.loc[:, 'embedded_datapoint0': 'embedded_datapoint' 
                                                           + str(number_of_data_point_to_vectorize-1)].values.tolist()
    df['data_combined'] = df['data_combined'].apply(lambda x: [val for item in x for val in item])
    df['Header_embedding'] = df_target['Header'].astype(str).apply(fmodel.get_sentence_vector)
    df['Organization_embedded'] = df_target['Organization'].astype(str).apply(fmodel.get_sentence_vector)
    cols = ['Header_embedding', 'Organization_embedded', 'data_combined']
    df['features_combined'] = df[cols].values.tolist()
    df['features_combined'] = df['features_combined'].apply(lambda x: [val for item in x for val in item])
    diff = 2700 - len(df['features_combined'][0])
    for i in range(len(df)):
        for j in range(diff):
            df['features_combined'][i].append(0)
    df = df.dropna()
    return df

In [11]:
def separate_words(series): 
    #each series is a long string that contains all the data
    lst = []
    cleanlist = [str(x) for x in series if str(x) != 'nan']
    for i in cleanlist:
        lst = re.split(r"\W+", i)
        lst.extend(list(filter(None, lst)))
    return lst
    
def vectorize_n_datapoints(df, number_of_datapoints_to_vectorize = 7):
#     print(df['Data'].head())
#     print(df['Data'].iloc[0])
#     for i in range(len(df['Data'])):
#         df['Data_separated'].iloc[0] = separate_words(df['Data'].iloc[0])
    df['Data_separated'] = df['Data'].apply(separate_words)
    if (number_of_datapoints_to_vectorize > len(df['Data_separated'][0])):
        number_of_datapoints_to_vectorize = len(df['Data_separated'][0])
    for i in range(number_of_datapoints_to_vectorize):
        df['datapoint' + str(i)] = df['Data_separated'].str[i]
    return df, number_of_datapoints_to_vectorize

In [60]:
def embedded_datapoints(df, number_of_data_point_to_vectorize=7):
    df, number_of_data_point_to_vectorize = vectorize_n_datapoints(df)
    for i in range(number_of_data_point_to_vectorize):
        
        df['embedded_datapoint' + str(i)] = df['datapoint' + str(i)].map(lambda x: fmodel.get_sentence_vector(str(x)))
    return df, number_of_data_point_to_vectorize

In [13]:
def remove_stop_words(data_lst):
    #remove stopwords from the data including 'the', 'and' etc.
    wordsFiltered = []
    for w in data_lst:
        if w not in stopWords:
            wordsFiltered.append(w)
    return wordsFiltered

In [14]:
def word_extract(row):
    ignore = ['nan']
    no_white = [i.lstrip() for i in row if i not in ignore and not (isinstance(i, float) or isinstance(i,int))]
    cleaned_text = [w.lower() for w in no_white if w not in ignore]
    return cleaned_text

In [15]:
def allowed_file_csv(filename):
    return '.' in filename and \
           filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS_CSV

In [16]:
def allowed_file_json(filename):
    return '.' in filename and \
            filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS_JSON

In [17]:
def generate_n_grams(data_lst, n):
    # cleaned = remove_chars(list(data_lst))
    # cleaned = clean_cols(cleaned)
    cleaned = remove_stop_words(data_lst)
    #make sure that n_grams 'refresh' when a new dataset is encountered!!!!   
    return list(ngrams(cleaned, n))

In [68]:
input_dataset = pd.read_csv('Cluster 3ws - Protection.csv', na_values=['nan',' nan'])
# input_dataset = input_dataset.rename(columns=input_dataset.iloc[0]).drop(input_dataset.index[0])


In [71]:
#                 # process the untagged dataset
processed_dataset = preprocess(input_dataset, 
                               pd.DataFrame(columns=['Header','Data','Relative Column Position','Organization','Index']))
model = pickle.load(open("model.pkl", "rb")) #Model needs be named model.pkl, preferably using version 0.20.3
output_dataset = pd.DataFrame(data = model.predict(list(processed_dataset['features_combined'])))
output_dataset

                          Header  \
0              lead organization   
1    organization type lead org.   
2           implementing partner   
3  organization type imp.partner   
4                 sector/cluster   

                                                Data  \
0  [UNHCR, WFP, UN Women, UN Women, IOM, PNDH, PN...   
1  [Agências das Nações Unidas, Agências das Naçõ...   
2  [nan, nan, Gender Links, Gender Links, PGR, na...   
3  [nan, nan, ONGs internacionais, ONGs internaci...   
4  [Proteção, Proteção, Proteção, Proteção, Prote...   

   Relative Column Position Organization Index  
0                  0.047619          HDX     0  
1                  0.095238          HDX     1  
2                  0.142857          HDX     2  
3                  0.190476          HDX     3  
4                  0.238095          HDX     4  




Unnamed: 0,0
0,affected
1,affected
2,affected
3,affected
4,sector
5,affected
6,meta
7,affected
8,affected
9,adm2


In [None]:
# @app.route('/', methods=['GET','POST'])
# def upload_file():
#     if request.method == 'POST':
#         # check if the post request has the file part
#         if 'file' not in request.files:
#             flash('No file part')
#             return redirect(request.url)
#         file = request.files['file']
        
#         if file.filename == '':
#             # flash('No selected file')
#             return redirect(request.url)
#         # file.save(os.getcwd())
#         if file and allowed_file_csv(file.filename):
#             filename = secure_filename(file.filename)
#             input_dataset = pd.read_csv(file)
                
#         if file and allowed_file_json(file.filename):
#             # filename = secure_filename(file.filename)
#             input_dataset = pd.read_json(file)
#             input_dataset = input_dataset.rename(columns=input_dataset.iloc[0]).drop(input_dataset.index[0])
#                 # process the untagged dataset
#         processed_dataset = preprocess(input_dataset, 
#             pd.DataFrame(columns=['Header','Data','Relative Column Position','Organization','Index']))
#         model = pickle.load(open("model.pkl", "rb")) #Model needs be named model.pkl, preferably using version 0.20.3
#         output_dataset = pd.DataFrame(data = model.predict(list(processed_dataset['features_combined'])))
#         resp = make_response(output_dataset.to_csv())
#         resp.headers["Content-Disposition"] = "attachment; filename=export.csv"
#         resp.headers["Content-Type"] = "text/csv"
#         return resp
        
          

    return 
<br>
    <!doctype html><br>
    <title>Upload new File</title><br>
    <h1>Upload new File (only CSV and JSON files accepted)</h1><br>
    <form method=post enctype=multipart/form-data><br>
      <input type=file name=file><br>
      <input type=submit value=Upload><br>
    # <form method=post><br>
    #   <input name=text><br>
    #   <input type=submit><br>
    </form><br>
  
 

In [None]:
if __name__ == '__main__':
     app.run(debug=True)
     