In [None]:
# ********************** REFERENCES *****************************

# reading from a CSV file
# https://www.youtube.com/watch?v=eEIr70i8vbs

# writing to a CSV file
# https://www.youtube.com/watch?v=hmYdzvmcTD8
    
# ***************************************************************

In [None]:
# ************************ IMPORTS ******************************

# -------------- Modelling Packages --------------
# For modeling
from keras.models import Model
from keras.layers import Concatenate, Input, Dense, Embedding
from keras.layers import LSTM, Bidirectional, SpatialDropout1D
from keras.layers import TimeDistributed

# Callback Functions
from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping

# For Timestamping Models
import time

# -------------- Preprocessing Packages --------------
# For tokenizing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# -------------- General Packages --------------
# General Use
import pandas as pd
import numpy as np

# For Saving Files
import pickle
import os

# ***************************************************************

In [None]:
# ******************* LOADING THE DATASET ***********************

# Given the split dataset directory, return the train/test split
def load_dataset(split_data_dir):
    """Load the pickled train/test split stored in ``split_data_dir``.

    The directory must contain ``X_train.pickle``, ``X_test.pickle``,
    ``y_train.pickle`` and ``y_test.pickle``.

    Parameters
    ----------
    split_data_dir : str
        Directory holding the four pickle files. A trailing path separator
        is optional.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` exactly as they were pickled.

    Raises
    ------
    FileNotFoundError
        If one of the four files is missing.
    """
    def _load(name):
        # `with` guarantees the handle is closed even if unpickling fails.
        # SECURITY: pickle.load can execute arbitrary code -- only point this
        # at files produced by this project.
        with open(os.path.join(split_data_dir, name + '.pickle'), 'rb') as pickle_in:
            return pickle.load(pickle_in)

    return tuple(_load(name) for name in ('X_train', 'X_test', 'y_train', 'y_test'))

def load_tokenizer(tokenizer_dir):
    """Load a pickled tokenizer from disk.

    Parameters
    ----------
    tokenizer_dir : str
        Path of the pickle FILE (despite the name, not a directory),
        e.g. ``'tokenizer.pickle'``.

    Returns
    -------
    object
        Whatever object was pickled into the file.
    """
    # `with` closes the handle even if unpickling raises.
    # SECURITY: pickle.load can execute arbitrary code -- only load trusted files.
    with open(tokenizer_dir, 'rb') as pickle_in:
        return pickle.load(pickle_in)
    
# Where the pickled train/test split and the fitted tokenizer live on disk
split_data_dir = './split_data/'
tokenizer_dir = 'tokenizer.pickle'

# Restore the split and the tokenizer into the notebook namespace
(X_train, X_test, y_train, y_test) = load_dataset(split_data_dir)
t = load_tokenizer(tokenizer_dir)

# ***************************************************************

In [None]:
# ****************** PREPROCESSING FUNCTION *********************

# Filters corrupted, unusable/unused data from the dataset
def filter_dataset(df):
    """Return a cleaned copy of ``df`` containing only usable English title pairs.

    Steps, in order:
      1. drop the ``id``/``tid1``/``tid2`` and the Chinese title columns;
      2. strip every character that is not an ASCII letter, digit or space
         from ``title1_en`` and ``title2_en``;
      3. turn titles that became empty into NaN;
      4. keep only rows labelled ``unrelated``, ``agreed`` or ``disagreed``;
      5. drop any column whose name starts with ``Unnamed``;
      6. drop rows that still contain a null value.

    The caller's frame is not modified, and surviving rows keep their
    original index labels.

    Raises
    ------
    KeyError
        If one of the columns dropped in step 1 is missing.
    """
    df = df.drop(columns=['id','tid1','tid2']) # drop id columns
    df = df.drop(columns=['title1_zh','title2_zh']) # drop chinese columns

    for column in ('title1_en', 'title2_en'):
        # regex=True is required: since pandas 2.0 the default is a literal
        # match, which would leave every symbol in place.
        cleaned = df[column].str.replace('[^a-zA-Z0-9 ]', '', regex=True)
        # Assign the result back instead of calling replace(inplace=True) on a
        # column selection, which is not guaranteed to update the frame.
        df[column] = cleaned.replace('', np.nan)

    # Remove rows with no (or an unknown) label
    labels = ['unrelated','agreed','disagreed']
    df = df[df.label.isin(labels)]

    # Remove Unnamed columns
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

    # Drop rows with null values (this includes titles emptied above)
    return df.dropna()

# Returns a dataset with an equal sampling of each label
def equalize_dataset_labels(df, seed=1):
    """Down-sample ``df`` so that every label occurs equally often.

    The size of the rarest label present in ``df`` is used as the per-class
    sample size. Rows are drawn for 'unrelated', 'agreed' and 'disagreed'
    (in that order) with ``random_state=seed``, concatenated, and the
    combined frame is shuffled with the same seed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``label`` column.
    seed : int, default 1
        Random state used for both the sampling and the final shuffle.

    Returns
    -------
    pandas.DataFrame
        The balanced, shuffled rows; original index labels are kept.
    """
    # Size of the smallest class decides how many rows each class contributes
    per_class = df.label.value_counts().min()

    # One equally sized random draw per class
    balanced_parts = [
        df[df['label'] == class_name].sample(per_class, random_state=seed)
        for class_name in ('unrelated', 'agreed', 'disagreed')
    ]

    # Stitch the draws together and shuffle the row order
    return pd.concat(balanced_parts).sample(frac=1, random_state=seed)

# Convert labels to integers for predictions
def encode_labels(df):
    """Return a copy of ``df`` whose ``label`` column is integer encoded.

    Mapping: ``'unrelated' -> 0``, ``'agreed' -> 1``, ``'disagreed' -> 2``.
    Values that are not in the mapping are left unchanged.

    The index is reset to 0..n-1 and the previous index is kept as a new
    leading ``index`` column (``reset_index()`` without ``drop``).

    The caller's frame is not modified.
    """
    # encoding the labels
    labels = {'unrelated':0,'agreed':1,'disagreed':2}
    # Build a new frame rather than replace(inplace=True) on a column
    # selection: that either mutates the caller's data or, with pandas
    # copy-on-write, silently encodes nothing.
    encoded = df.assign(label=df['label'].replace(labels))
    return encoded.reset_index()

# ***************************************************************

In [None]:
# ******************* TOKENIZING FUNCTIONS **********************

# Create a word tokenizer given dataframe(s)
def create_tokenizer(*data, num_words=None, lower=True, split=' ', oov_token=None, filename='tokenizer'):
    """Fit a Keras ``Tokenizer`` on the English titles and pickle it to disk.

    Parameters
    ----------
    *data : pandas.DataFrame
        Any number of frames with ``title1_en`` and ``title2_en`` columns;
        the tokenizer is fitted on both columns of every frame.
    num_words, lower, split, oov_token
        Passed straight through to ``Tokenizer``.
    filename : str, default 'tokenizer'
        Output path WITHOUT extension; ``'.pickle'`` is appended. An existing
        file at that path is overwritten.

    Returns
    -------
    Tokenizer
        The fitted tokenizer (the same object that was pickled).
    """
    # create the tokenizer
    t = Tokenizer(num_words=num_words, lower=lower, split=split, oov_token=oov_token)

    # fit tokenizer
    for df in data:
        t.fit_on_texts(df['title1_en'])
        t.fit_on_texts(df['title2_en'])

    # save for future use; `with` closes the handle even if pickling fails
    with open(filename + '.pickle', 'wb') as pickle_out:
        pickle.dump(t, pickle_out)
    return t

# Converts both English titles to padded token-id sequences using an already fitted tokenizer
def tokenize(t, df, maxlen=20):
    """Turn the two title columns of ``df`` into fixed-length token-id columns.

    Parameters
    ----------
    t : fitted tokenizer
        Object exposing ``texts_to_sequences``. It is only applied here,
        never fitted or saved.
    df : pandas.DataFrame
        Frame with ``title1_en``, ``title2_en`` and ``label`` columns.
    maxlen : int, default 20
        Length each title is padded/truncated to by ``pad_sequences``.

    Returns
    -------
    pandas.DataFrame
        One row per input row with a fresh 0..n-1 index: columns
        ``0 .. maxlen-1`` hold title 1, ``maxlen .. 2*maxlen-1`` hold
        title 2, and the last column is ``label``.
    """
    # encode each title and pad it to a common length
    data1 = pad_sequences(sequences=t.texts_to_sequences(df['title1_en']), maxlen=maxlen)
    data2 = pad_sequences(sequences=t.texts_to_sequences(df['title2_en']), maxlen=maxlen)

    # recombine: title 1 tokens followed by title 2 tokens, side by side
    tokens = pd.DataFrame(np.concatenate((data1,data2),axis=1))

    # Attach labels by position. A `.join` would align on index labels and
    # produce NaN/misplaced labels whenever `df` is not indexed 0..n-1.
    tokens['label'] = df['label'].values
    return tokens

# ***************************************************************

In [None]:
# **************** MODELING SUMMARY *****************************

# NOTE(review): SENTENCE_SIZE, vocab_size, EMBED_SIZE, embedding_matrix,
# LSTM_SIZE, optimizer, loss and metrics are not defined in the cells shown
# here -- they must already exist in the kernel before this cell runs.

def _encode_title(title_input):
    """Run one title input through a frozen Embedding and a bidirectional LSTM."""
    embedded = Embedding(vocab_size,
                         EMBED_SIZE,
                         weights=[embedding_matrix],
                         input_length=SENTENCE_SIZE,
                         trainable=False)(title_input)
    recurrent = LSTM(LSTM_SIZE,dropout=0.2, recurrent_dropout=0.2)
    return Bidirectional(recurrent)(embedded)

# FIRST BRANCH: TITLE1_EN
first_input = Input(shape=(SENTENCE_SIZE,))
m1 = _encode_title(first_input)

# SECOND BRANCH: TITLE2_EN (same architecture, separate layers)
second_input = Input(shape=(SENTENCE_SIZE,))
m2 = _encode_title(second_input)

# MERGE: concatenate both title encodings and classify into the 3 labels
merged = Concatenate(axis=1)([m1, m2])
output_layer = Dense(3, activation='softmax')(merged)

model = Model(inputs=[first_input, second_input], outputs=output_layer)
model.compile(optimizer=optimizer, loss=loss,metrics=metrics)

model.summary()

# ***************************************************************

In [None]:
# **************** PREDICT FUNCTION *****************************

# Row of X_train to run through the model
index = 100

# NOTE(review): assumes X_train is a 2-D numpy array whose rows hold the two
# padded titles back to back (SENTENCE_SIZE tokens each) -- confirm.
first_sentence = X_train[index][:SENTENCE_SIZE].reshape(1,SENTENCE_SIZE)
second_sentence = X_train[index][SENTENCE_SIZE:SENTENCE_SIZE*2].reshape(1,SENTENCE_SIZE)

# One row of class probabilities from the softmax output layer
prediction = model.predict([first_sentence,second_sentence])

# Per-class probabilities rounded to 0/1 (kept for inspection only)
prediction_list = [int(round(p)) for p in prediction[0]]

# Pick the class from the RAW probabilities. Taking argmax of the rounded list
# returns 0 whenever no class exceeds 0.5 (e.g. [0.2, 0.45, 0.35] -> [0, 0, 0]),
# which would always report class 0 for uncertain predictions.
predicted_class = int(np.argmax(prediction[0]))
predicted_class

# ***************************************************************

In [None]:
# ************************ MAIN *********************************

input_test = './data/test.csv'
test_maxlen = 25
# NOTE(review): test_maxlen must equal the SENTENCE_SIZE the model was built
# with, otherwise model.predict below rejects the input shape -- confirm.

# load test.csv into a dataframe, skipping malformed rows
try:
    df_test = pd.read_csv(input_test, encoding='utf-8-sig', on_bad_lines='skip')
except TypeError:
    # pandas < 1.3 has no `on_bad_lines`; fall back to the older spelling
    # (`error_bad_lines` was removed in pandas 2.0).
    df_test = pd.read_csv(input_test, encoding='utf-8-sig', error_bad_lines=False)

# Keep a handle on the unfiltered rows: filter_dataset drops the `id` column,
# and the ids are needed again when writing the output file.
dataFrame_test = pd.DataFrame(df_test)

# load keras model
# NOTE(review): this cell uses the `model` built in the modelling cell. No
# training or weight-loading step is visible in this notebook section; make
# sure trained weights are in place before trusting the output.

# load the tokenizer that was fitted earlier. Fitting a new one on the test
# titles would produce different word ids and, with the default filename,
# overwrite 'tokenizer.pickle'.
t = load_tokenizer(tokenizer_dir)

# preprocess the csv and tokenize it
df_test = filter_dataset(df_test)
# filter_dataset keeps the original index labels, so they locate the ids
kept_ids = dataFrame_test.loc[df_test.index, 'id'].values
df_test_encoded = encode_labels(df_test)
df_test_tokenized = tokenize(t, df_test_encoded, test_maxlen)

# Saving the tokenized frame is not required in order to predict.

# predict: the first `test_maxlen` token columns are title 1, the rest title 2
tokens = df_test_tokenized.drop(columns=['label']).values
probabilities = model.predict([tokens[:, :test_maxlen], tokens[:, test_maxlen:]])

# decode class indices with the inverse of the encode_labels mapping
label_names = np.array(['unrelated', 'agreed', 'disagreed'])
predicted_labels = label_names[probabilities.argmax(axis=1)]

# create the output csv with one (id, predicted label) row per kept test row
df_test = pd.DataFrame({'id': kept_ids, 'label': predicted_labels})
# index=False: otherwise an extra unnamed index column is written
df_test.to_csv('./data/prediction.csv', index=False) # write dataframe to csv

# ***************************************************************