In [1]:
# ********************** REFERENCES *****************************

"""
reading from a CSV file
- https://www.youtube.com/watch?v=eEIr70i8vbs

writing to a CSV file
- https://www.youtube.com/watch?v=hmYdzvmcTD8
- https://stackoverflow.com/questions/34864695/saving-prediction-results-to-csv
"""

# ***************************************************************

'\nreading from a CSV file\n- https://www.youtube.com/watch?v=eEIr70i8vbs\n\nwriting to a CSV file\n- https://www.youtube.com/watch?v=hmYdzvmcTD8\n- https://stackoverflow.com/questions/34864695/saving-prediction-results-to-csv\n'

In [2]:
# ************************ IMPORTS ******************************

# -------------- Modelling Packages --------------
# For modeling
from keras.models import Model
from keras.layers import Concatenate, Input, Dense, Embedding
from keras.layers import LSTM, Bidirectional, SpatialDropout1D
from keras.layers import TimeDistributed
from keras.models import load_model

# Callback Functions
from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping

# For Timestamping Models
import time

# -------------- Preprocessing Packages --------------
# For tokenizing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# -------------- General Packages --------------
# General Use
import pandas as pd
import numpy as np

# For Saving Files
import pickle
import os

# ***************************************************************

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# ******************* LOADING THE DATASET ***********************

# Given the split dataset directory, return the train/test split
def load_dataset(split_data_dir):
    pickle_in = open(split_data_dir+'X_train.pickle','rb')
    X_train = pickle.load(pickle_in)
    
    pickle_in = open(split_data_dir+'X_test.pickle','rb')
    X_test = pickle.load(pickle_in)

    pickle_in = open(split_data_dir+'y_train.pickle','rb')
    y_train = pickle.load(pickle_in)

    pickle_in = open(split_data_dir+'y_test.pickle','rb')
    y_test = pickle.load(pickle_in)
    return X_train,X_test,y_train,y_test

def load_tokenizer(tokenizer_dir):
    pickle_in = open(tokenizer_dir,'rb')
    t = pickle.load(pickle_in)
    return t
    
split_data_dir = './split_data/'
X_train,X_test,y_train,y_test = load_dataset(split_data_dir)

tokenizer_dir = 'tokenizer.pickle'
t = load_tokenizer(tokenizer_dir)

# ***************************************************************

In [4]:
# ****************** PREPROCESSING FUNCTION *********************

# Filters corrupted, unusable/unused data from the dataset
def filter_dataset(df):
    df = df.drop(columns=['id','tid1','tid2']) # drop id columns
    df = df.drop(columns=['title1_zh','title2_zh']) # drop chinese columns

    # Remove symbols
    df['title1_en'] = df['title1_en'].str.replace('[^a-zA-Z0-9 ]','')
    df['title2_en'] = df['title2_en'].str.replace('[^a-zA-Z0-9 ]','')

    # Replace empty strings with NaN values
    df['title1_en'].replace('', np.nan, inplace=True)
    df['title2_en'].replace('', np.nan, inplace=True)
    
    # Remove rows with no label
    labels = ['unrelated','agreed','disagreed']
    df = df[df.label.isin(labels)]
    
    # Remove Unnamed columns
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

    # Drop rows with null values
    df = df.dropna()
    return df

# Returns a datset with an equal sampling of each label
def equalize_dataset_labels(df, seed=1):
    # Get minimum label count
    n = df.label.value_counts().min()
    
    # Grabbing equal amounts of training data from each class
    dfa = df[df['label']=='unrelated'].sample(n,random_state=seed)
    dfb = df[df['label']=='agreed'].sample(n,random_state=seed)
    dfc = df[df['label']=='disagreed'].sample(n,random_state=seed)
    
    # Recombine dataset and shuffle
    df = pd.concat([dfa,dfb,dfc])
    df = df.sample(frac=1,random_state=seed)
    return df

# Convert labels to integers for predictions
def encode_labels(df):
    # encoding the labels
    labels = {'unrelated':0,'agreed':1,'disagreed':2}
    df['label'].replace(labels,inplace=True)
    df = df.reset_index()
    return df
# ***************************************************************

In [5]:
# ******************* TOKENIZING FUNCTIONS **********************

# Create a word tokenizer given dataframe(s)
def create_tokenizer(*data, num_words=None, lower=True, split=' ', oov_token=None, filename='tokenizer'):
    # create the tokenizer
    t = Tokenizer(num_words=num_words, lower=lower, split=split, oov_token=oov_token)
    
    # fit tokenizer
    for df in data:
        t.fit_on_texts(df['title1_en'])  
        t.fit_on_texts(df['title2_en'])  
    
    # save for future use
    pickle_out = open(filename+'.pickle', 'wb')
    pickle.dump(t,pickle_out)
    pickle_out.close()
    return t

# Tokenizes titles and encodes labels, trains a word tokenizer that is saved to a file
def tokenize(t, df, maxlen=20):
    # fit the tokenizer on the documents  
    data1 = pad_sequences(sequences=t.texts_to_sequences(df['title1_en']), maxlen=maxlen)
    data2 = pad_sequences(sequences=t.texts_to_sequences(df['title2_en']), maxlen=maxlen)
    
    # recombine
    df = pd.DataFrame(np.concatenate((data1,data2),axis=1)).join(df['label'])
    return df

# ***************************************************************

In [6]:
"""
embeddings_index = dict()

EMBED_SIZE = 100
LSTM_SIZE = 2

# -------------- Compile Parameters --------------
activation = 'softmax'
optimizer = 'RMSProp'
loss = 'sparse_categorical_crossentropy'
metrics = ['accuracy']

SENTENCE_SIZE = int(X_train.shape[1]/2)
vocab_size = len(t.word_index) + 1

# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, EMBED_SIZE))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# FIRST MODEL: TITLE1_EN
first_input = Input(shape=(SENTENCE_SIZE,))
m1 = Embedding(vocab_size,
                EMBED_SIZE,
                weights=[embedding_matrix],
                input_length=SENTENCE_SIZE,
                trainable=False)(first_input)
m1 = Bidirectional(LSTM(LSTM_SIZE,dropout=0.2, recurrent_dropout=0.2))(m1)

# SECOND MODEL: TITLE2_EN
second_input = Input(shape=(SENTENCE_SIZE,))
m2 = Embedding(vocab_size,
                 EMBED_SIZE,
                 weights=[embedding_matrix],
                 input_length=SENTENCE_SIZE,
                 trainable=False)(second_input)
m2 = Bidirectional(LSTM(LSTM_SIZE,dropout=0.2, recurrent_dropout=0.2))(m2)

# MERGE MODEL
merged = Concatenate(axis=1)([m1, m2])
output_layer = Dense(3, activation='softmax')(merged)

model = Model(inputs=[first_input, second_input], outputs=output_layer)
model.compile(optimizer=optimizer, loss=loss,metrics=metrics)
"""

"\nembeddings_index = dict()\n\nEMBED_SIZE = 100\nLSTM_SIZE = 2\n\n# -------------- Compile Parameters --------------\nactivation = 'softmax'\noptimizer = 'RMSProp'\nloss = 'sparse_categorical_crossentropy'\nmetrics = ['accuracy']\n\nSENTENCE_SIZE = int(X_train.shape[1]/2)\nvocab_size = len(t.word_index) + 1\n\n# create a weight matrix for words in training docs\nembedding_matrix = np.zeros((vocab_size, EMBED_SIZE))\nfor word, i in t.word_index.items():\n    embedding_vector = embeddings_index.get(word)\n    if embedding_vector is not None:\n        embedding_matrix[i] = embedding_vector\n\n# FIRST MODEL: TITLE1_EN\nfirst_input = Input(shape=(SENTENCE_SIZE,))\nm1 = Embedding(vocab_size,\n                EMBED_SIZE,\n                weights=[embedding_matrix],\n                input_length=SENTENCE_SIZE,\n                trainable=False)(first_input)\nm1 = Bidirectional(LSTM(LSTM_SIZE,dropout=0.2, recurrent_dropout=0.2))(m1)\n\n# SECOND MODEL: TITLE2_EN\nsecond_input = Input(shape=(SENT

In [7]:
# ************************ MAIN *********************************
import pandas as pd

input_test = './data/test.csv'

# load test.csv into a dataframe and preprocess it
df_test = pd.read_csv(input_test,encoding='utf-8-sig',error_bad_lines=False)
dataFrame_test = pd.DataFrame(df_test)
dataFrame_test = dataFrame_test.drop(columns=['tid1', 'tid2', 'title1_en', 'title1_zh', 'title2_zh', 'title2_en']) # drop columns

# add predictions
"""
split_data_dir = './split_data/'
X_train,X_test,y_train,y_test = load_dataset(split_data_dir)

index = 100

first_sentence = X_train[index][:SENTENCE_SIZE].reshape(1,SENTENCE_SIZE)
second_sentence = X_train[index][SENTENCE_SIZE:SENTENCE_SIZE*2].reshape(1,SENTENCE_SIZE)

prediction = model.predict([first_sentence,second_sentence])

prediction_list = [int(round(p)) for p in prediction[0]]
print(prediction_list)
"""

# write dataframe to csv
print(dataFrame_test)
dataFrame_test.insert(1, 'label', 0) #replace 0 with prediction data
dataFrame_test.to_csv('./data/submission.csv', index = False)

"""
# preprocess the csv and tokenize it
new_t = create_tokenizer(dataFrame_test, num_words=40000, oov_token=None)
df_test_encoded = encode_labels(dataFrame_test)
df_test_tokenized = tokenize(t,df_test_encoded,25)

# create a new csv (just writing to one i created)
#prediction = model.predict(...)
prediction = pd.DataFrame(prediction)
"""

# ***************************************************************

b'Skipping line 47254: expected 7 fields, saw 9\n'


           id
0       41937
1      117063
2      306750
3        5069
4      218992
5       11569
6      319706
7      106194
8      212572
9       25238
10     197559
11     150201
12     308268
13     184546
14     189459
15     222176
16      59898
17     182649
18      37649
19      80201
20     198449
21     232173
22      84431
23      32877
24     179366
25     171745
26       4352
27     238881
28      93581
29     295716
...       ...
64112  302833
64113  271435
64114  166415
64115   31939
64116  139174
64117  102320
64118   19553
64119   34531
64120  204822
64121  194357
64122   93440
64123  229085
64124  318419
64125   61636
64126  193994
64127   81989
64128   23561
64129  161241
64130  213605
64131   58763
64132   31935
64133  315316
64134   65843
64135   40923
64136   59521
64137  214559
64138  229127
64139  277207
64140   33059
64141  144818

[64142 rows x 1 columns]


'\n# preprocess the csv and tokenize it\nnew_t = create_tokenizer(dataFrame_test, num_words=40000, oov_token=None)\ndf_test_encoded = encode_labels(dataFrame_test)\ndf_test_tokenized = tokenize(t,df_test_encoded,25)\n\n# create a new csv (just writing to one i created)\n#prediction = model.predict(...)\nprediction = pd.DataFrame(prediction)\n'