# Resources Used

- https://keras.io/preprocessing/text/
- https://machinelearningmastery.com/prepare-text-data-deep-learning-keras/
- https://www.youtube.com/watch?v=j-3vuBynnOE - Loading in your own data - Deep Learning basics with Python, TensorFlow and Keras p.2

- http://faroit.com/keras-docs/1.2.2/preprocessing/text/#tokenizer

- https://datascience.stackexchange.com/questions/13490/how-to-set-class-weights-for-imbalanced-classes-in-keras


# Imports

In [1]:
# -------------- Preprocessing Packages --------------
# For tokenizing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# -------------- General Packages --------------
# General Use
import pandas as pd
import numpy as np

# For Saving Files
import pickle
import os

Using TensorFlow backend.


# Preprocessing Functions

In [2]:
# Filters corrupted, unusable/unused data from the dataset
def filter_dataset(df):
    df = df.drop(columns=['id','tid1','tid2']) # drop id columns
    df = df.drop(columns=['title1_zh','title2_zh']) # drop chinese columns

    # Remove symbols
    df['title1_en'] = df['title1_en'].str.replace('[^a-zA-Z0-9 ]','')
    df['title2_en'] = df['title2_en'].str.replace('[^a-zA-Z0-9 ]','')

    # Replace empty strings with NaN values
    df['title1_en'].replace('', np.nan, inplace=True)
    df['title2_en'].replace('', np.nan, inplace=True)
    
    # Remove rows with no label
    labels = ['unrelated','agreed','disagreed']
    df = df[df.label.isin(labels)]
    
    # Remove Unnamed columns
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

    # Drop rows with null values
    df = df.dropna()
    return df

# Returns a datset with an equal sampling of each label
def equalize_dataset_labels(df, seed=1):
    # Get minimum label count
    n = df.label.value_counts().min()
    
    # Grabbing equal amounts of training data from each class
    dfa = df[df['label']=='unrelated'].sample(n,random_state=seed)
    dfb = df[df['label']=='agreed'].sample(n,random_state=seed)
    dfc = df[df['label']=='disagreed'].sample(n,random_state=seed)
    
    # Recombine dataset and shuffle
    df = pd.concat([dfa,dfb,dfc])
    df = df.sample(frac=1,random_state=seed)
    return df

# Convert labels to integers for predictions
def encode_labels(df):
    # encoding the labels
    labels = {'unrelated':0,'agreed':1,'disagreed':2}
    df['label'].replace(labels,inplace=True)
    df = df.reset_index()
    return df

# Tokenizing Functions

In [3]:
# Create a word tokenizer given dataframe(s)
def create_tokenizer(*data, num_words=None, lower=True, split=' ', oov_token=None, filename='tokenizer'):
    # create the tokenizer
    t = Tokenizer(num_words=num_words, lower=lower, split=split, oov_token=oov_token)
    
    # fit tokenizer
    for df in data:
        t.fit_on_texts(df['title1_en'])  
        t.fit_on_texts(df['title2_en'])  
    
    # save for future use
    pickle_out = open(filename+'.pickle', 'wb')
    pickle.dump(t,pickle_out)
    pickle_out.close()
    return t

# Tokenizes titles and encodes labels, trains a word tokenizer that is saved to a file
def tokenize(t, df, maxlen=20):
    # fit the tokenizer on the documents  
    data1 = pad_sequences(sequences=t.texts_to_sequences(df['title1_en']), maxlen=maxlen)
    data2 = pad_sequences(sequences=t.texts_to_sequences(df['title2_en']), maxlen=maxlen)
    
    # recombine
    df = pd.DataFrame(np.concatenate((data1,data2),axis=1)).join(df['label'])
    return df

# Save Preprocessed/Tokenized Data

In [4]:
# Given train and test data, split into features and labels and save
def save_data(train, test, maxlen=20, save_dir='./split_data/'):
    # Split the data into X and y
    X_train = train.iloc[:,:maxlen*2].to_numpy()
    X_test = test.iloc[:,:maxlen*2].to_numpy()
    y_train = train['label'].to_numpy()
    y_test = test['label'].to_numpy()
    
    split_data_dir=save_dir
    os.makedirs(os.path.dirname(split_data_dir), exist_ok=True)

    pickle_out = open(split_data_dir+'X_train.pickle', 'wb')
    pickle.dump(X_train,pickle_out)
    pickle_out.close()

    pickle_out = open(split_data_dir+'X_test.pickle', 'wb')
    pickle.dump(X_test,pickle_out)
    pickle_out.close()

    pickle_out = open(split_data_dir+'y_train.pickle', 'wb')
    pickle.dump(y_train,pickle_out)
    pickle_out.close()
    
    pickle_out = open(split_data_dir+'y_test.pickle', 'wb')
    pickle.dump(y_test,pickle_out)
    pickle_out.close()

# Creating Train/Test Data

In [5]:
input_train = './data/train.csv'
input_validation = './data/validation.csv'

df_train = pd.read_csv(input_train,encoding='utf-8-sig',error_bad_lines=False)
df_test = pd.read_csv(input_validation,encoding='utf-8-sig',error_bad_lines=False)

df_train = filter_dataset(df_train)
df_test = filter_dataset(df_test)

b'Skipping line 7026: expected 8 fields, saw 9\nSkipping line 44341: expected 8 fields, saw 9\nSkipping line 58784: expected 8 fields, saw 10\n'
b'Skipping line 99101: expected 8 fields, saw 10\nSkipping line 104716: expected 8 fields, saw 9\nSkipping line 127866: expected 8 fields, saw 10\n'
b'Skipping line 140436: expected 8 fields, saw 9\nSkipping line 152888: expected 8 fields, saw 11\n'


In [6]:
t = create_tokenizer(df_train, num_words=40000, oov_token=None)

In [7]:
# df_train_equalized = equalize_dataset_labels(df_train)
# df_test_equalized = equalize_dataset_labels(df_test)

df_train_encoded = encode_labels(df_train)
df_test_encoded = encode_labels(df_test)

df_train_tokenized = tokenize(t,df_train_encoded,25)
df_test_tokenized = tokenize(t,df_test_encoded,25)

In [8]:
save_data(train=df_train_tokenized,
          test=df_test_tokenized,
          maxlen=25)

# Inverse Tokenization

Some experimentation with inverse tokenizing to double check if it's working.

In [9]:
t.sequences_to_texts([df_train_tokenized.iloc[0,:25]])

['some people say that cabbage is the king of poison in vegetables can you still']

In [16]:
t.sequences_to_texts([df_train_tokenized.iloc[0,25:50]])

['maradona talks about health im better than ever dont listen to rumour']

In [18]:
df_train_tokenized.iloc[0,50]

0

In [12]:
df_train_encoded[:1]

Unnamed: 0,index,title1_en,title2_en,label
0,0,Some people say that cabbage is the king of po...,Maradona talks about health Im better than eve...,0


In [13]:
df_train[:5]

Unnamed: 0,title1_en,title2_en,label
0,Some people say that cabbage is the king of po...,Maradona talks about health Im better than eve...,0
1,Secretly the pharmacy has a cheap cream put on...,Dont be a fool to get the bags under your eyes...,1
2,Just at 23 28 the Chinese stock market broke a...,China stock market startling news the words of...,0
3,Who was the most powerful legion in World War ...,The three great rumblings of the Second World ...,0
4,Discovery of Miracles of Medical Miracles The ...,Why is red wine fighting cancer,0
