# Imports

In [1]:
# -------------- Preprocessing Packages --------------
# For tokenizing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# -------------- General Packages --------------
# General Use
import pandas as pd
import numpy as np

# For Saving Files
import pickle
import os

Using TensorFlow backend.


# Preprocessing Functions

In [2]:
# Filters corrupted, unusable/unused data from the dataset
def filter_dataset(df):
    df = df.drop(columns=['id','tid1','tid2']) # drop id columns
    df = df.drop(columns=['title1_zh','title2_zh']) # drop chinese columns

    # Remove symbols
    df['title1_en'] = df['title1_en'].str.replace('[^a-zA-Z0-9 ]','')
    df['title2_en'] = df['title2_en'].str.replace('[^a-zA-Z0-9 ]','')

    # Replace empty strings with NaN values
    df['title1_en'].replace('', np.nan, inplace=True)
    df['title2_en'].replace('', np.nan, inplace=True)
    
    # Remove rows with no label
    labels = ['unrelated','agreed','disagreed']
    df = df[df.label.isin(labels)]
    
    # Remove Unnamed columns
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

    # Drop rows with null values
    df = df.dropna()
    return df

# Returns a datset with an equal sampling of each label
def equalize_dataset_labels(df, seed=1):
    # Get minimum label count
    n = df.label.value_counts().min()
    
    # Grabbing equal amounts of training data from each class
    dfa = df[df['label']=='unrelated'].sample(n,random_state=seed)
    dfb = df[df['label']=='agreed'].sample(n,random_state=seed)
    dfc = df[df['label']=='disagreed'].sample(n,random_state=seed)
    
    # Recombine dataset and shuffle
    df = pd.concat([dfa,dfb,dfc])
    df = df.sample(frac=1,random_state=seed)
    return df

# Convert labels to integers for predictions
def encode_labels(df):
    # encoding the labels
    labels = {'unrelated':0,'agreed':1,'disagreed':2}
    df['label'].replace(labels,inplace=True)
    df = df.reset_index()
    return df

# Tokenizing Functions

In [3]:
# Create a word tokenizer given dataframe(s)
def create_tokenizer(*data, split=' ', filename='tokenizer'):
    # create the tokenizer
    t = Tokenizer(oov_token=True, split=split)
    
    # fit tokenizer
    for df in data:
        t.fit_on_texts(df['title1_en'])  
        t.fit_on_texts(df['title2_en'])  
    
    # save for future use
    pickle_out = open(filename+'.pickle', 'wb')
    pickle.dump(t,pickle_out)
    pickle_out.close()
    return t

# Tokenizes titles and encodes labels, trains a word tokenizer that is saved to a file
def tokenize(t, df, maxlen=20):
    # fit the tokenizer on the documents  
    data1 = pad_sequences(sequences=t.texts_to_sequences(df['title1_en']), maxlen=maxlen)
    data2 = pad_sequences(sequences=t.texts_to_sequences(df['title2_en']), maxlen=maxlen)
    
    # recombine
    df = pd.DataFrame(np.concatenate((data1,data2),axis=1)).join(df['label'])
    return df

# Save Preprocessed/Tokenized Data

In [4]:
# Given train and test data, split into features and labels and save
def save_data(train, test, maxlen=20, save_dir='./split_data/'):
    # Split the data into X and y
    X_train = train.iloc[:,:maxlen*2].to_numpy()
    X_test = test.iloc[:,:maxlen*2].to_numpy()
    y_train = train['label'].to_numpy()
    y_test = test['label'].to_numpy()

    # reshape the X train data for use
    X_train = X_train.reshape(X_train.shape[0],X_train.shape[1],1)
    X_test = X_test.reshape(X_test.shape[0],X_test.shape[1],1)
    
    split_data_dir=save_dir
    os.makedirs(os.path.dirname(split_data_dir), exist_ok=True)

    pickle_out = open(split_data_dir+'X_train.pickle', 'wb')
    pickle.dump(X_train,pickle_out)
    pickle_out.close()

    pickle_out = open(split_data_dir+'X_test.pickle', 'wb')
    pickle.dump(X_test,pickle_out)
    pickle_out.close()

    pickle_out = open(split_data_dir+'y_train.pickle', 'wb')
    pickle.dump(y_train,pickle_out)
    pickle_out.close()
    
    pickle_out = open(split_data_dir+'y_test.pickle', 'wb')
    pickle.dump(y_test,pickle_out)
    pickle_out.close()

# Creating Train/Test Data

In [5]:
input_train = './data/train.csv'
input_validation = './data/validation.csv'

df_train = pd.read_csv(input_train,encoding='utf-8-sig',error_bad_lines=False)
df_test = pd.read_csv(input_validation,encoding='utf-8-sig',error_bad_lines=False)

df_train = filter_dataset(df_train)
df_test = filter_dataset(df_test)

b'Skipping line 7026: expected 8 fields, saw 9\nSkipping line 44341: expected 8 fields, saw 9\nSkipping line 58784: expected 8 fields, saw 10\n'
b'Skipping line 99101: expected 8 fields, saw 10\nSkipping line 104716: expected 8 fields, saw 9\nSkipping line 127866: expected 8 fields, saw 10\n'
b'Skipping line 140436: expected 8 fields, saw 9\nSkipping line 152888: expected 8 fields, saw 11\n'


In [6]:
t = create_tokenizer(df_train,df_test)

In [7]:
df_train_equalized = equalize_dataset_labels(df_train)
df_test_equalized = equalize_dataset_labels(df_test)

df_train_encoded = encode_labels(df_train_equalized)
df_test_encoded = encode_labels(df_test_equalized)

df_train_tokenized = tokenize(t,df_train_encoded,20)
df_test_tokenized = tokenize(t,df_test_encoded,20)

In [8]:
save_data(train=df_train_tokenized,test=df_test_tokenized)

# Inverse Tokenization

Some experimentation with inverse tokenizing to double check if it's 

In [9]:
t.sequences_to_texts([df_train_tokenized.iloc[0,:20]])

['at last the loch ness monster was captured and it is said to be the most recognizable of all time']

In [25]:
t.sequences_to_texts([df_train_tokenized.iloc[0,27:40]])

['the american scientist has finally caught the clear loch ness monster its terrible']

In [11]:
df_train_tokenized.iloc[0,40]

1

In [21]:
df_train_encoded[:1]

Unnamed: 0,index,title1_en,title2_en,label
0,5040,At last the Loch Ness Monster was captured and...,The American scientist has finally caught the ...,1


In [13]:
df_train[:5]

Unnamed: 0,title1_en,title2_en,label
0,Some people say that cabbage is the king of po...,Maradona talks about health Im better than eve...,unrelated
1,Secretly the pharmacy has a cheap cream put on...,Dont be a fool to get the bags under your eyes...,agreed
2,Just at 23 28 the Chinese stock market broke a...,China stock market startling news the words of...,unrelated
3,Who was the most powerful legion in World War ...,The three great rumblings of the Second World ...,unrelated
4,Discovery of Miracles of Medical Miracles The ...,Why is red wine fighting cancer,unrelated


In [15]:
df_train_equalized[:5]

Unnamed: 0,title1_en,title2_en,label
5040,At last the Loch Ness Monster was captured and...,The American scientist has finally caught the ...,1
100367,Hair takes a lot of nourishment,A big ass kid A long shot to get nourishment ...,2
190245,Diabetics get addicted to insulin,Is it addictive to take insulin Its not too la...,1
17046,The survival of the jedi is brought with the r...,Tencent disbarredly PUBG national service anno...,0
46092,The first spoony girl in sports miss yao ming ...,China Womens Volleyball Team Announces 14 Peop...,0


In [20]:
df_train[df_train.title1_en.str.contains('Loch Ness')]

Unnamed: 0,title1_en,title2_en,label
963,A foreign internet friend captured shocking fo...,Amateurs take pictures of the Loch Ness Monster,agreed
5040,At last the Loch Ness Monster was captured and...,The American scientist has finally caught the ...,agreed
27978,The old man waited for the Loch Ness Monster f...,The old man waited for the Loch Ness monster f...,agreed
34735,The old man waited for the Loch Ness Monster f...,The old man waited for the Loch Ness monster f...,agreed
50927,The old man stayed for twentysix years with th...,The old man waited for the Loch Ness monster f...,agreed
51102,The old man waited for the Loch Ness Monster f...,The old man stayed with the Loch Ness monster ...,agreed
58844,The old man waited for the Loch Ness Monster f...,The old man waited for the Loch Ness monster f...,agreed
59720,The Loch Ness Monster Scientists will use DNA ...,The mysterious thing of legend finally appeare...,unrelated
75914,The old man waited for the Loch Ness Monster f...,The old man stayed with the Loch Ness Monster ...,agreed
83149,The old man stayed with the Loch Ness monster ...,The old man waited for the Loch Ness monster f...,agreed


# Resources Used

- https://keras.io/preprocessing/text/
- https://machinelearningmastery.com/prepare-text-data-deep-learning-keras/
- https://www.youtube.com/watch?v=j-3vuBynnOE - Loading in your own data - Deep Learning basics with Python, TensorFlow and Keras p.2

- http://faroit.com/keras-docs/1.2.2/preprocessing/text/#tokenizer
