# Imports

In [1]:
# -------------- Preprocessing Packages --------------
# For tokenizing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# -------------- General Packages --------------
# General Use
import pandas as pd
import numpy as np

# For Saving Files
import pickle
import os

Using TensorFlow backend.


# Preprocessing Functions

In [2]:
def filter_dataset(df):
    """Drop unused columns and unusable rows from a raw title-pair dataset.

    Steps, in order:
      1. Drop the id columns ('id', 'tid1', 'tid2') and the Chinese title
         columns ('title1_zh', 'title2_zh').
      2. Strip every character other than ASCII letters, digits and spaces
         from 'title1_en' and 'title2_en'.
      3. Turn titles that became empty strings into NaN.
      4. Keep only rows labelled 'unrelated', 'agreed' or 'disagreed'.
      5. Drop columns whose name starts with 'Unnamed'.
      6. Drop every row that still contains a null value.

    Args:
        df: DataFrame holding the columns named above plus 'label'.

    Returns:
        A new, filtered DataFrame. The frame passed in is not modified.

    Raises:
        KeyError: if any of the id / Chinese title columns is missing.
    """
    df = df.drop(columns=['id', 'tid1', 'tid2'])  # drop id columns
    df = df.drop(columns=['title1_zh', 'title2_zh'])  # drop chinese columns

    for col in ('title1_en', 'title2_en'):
        # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
        # literal string by default, which would silently remove nothing.
        cleaned = df[col].str.replace('[^a-zA-Z0-9 ]', '', regex=True)
        # Assign the result back instead of calling replace(inplace=True) on a
        # column selection, which does not reliably write through to the frame.
        df[col] = cleaned.replace('', np.nan)

    # Remove rows with no (or an unknown) label
    labels = ['unrelated', 'agreed', 'disagreed']
    df = df[df.label.isin(labels)]

    # Remove Unnamed columns
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

    # Drop rows with null values
    df = df.dropna()
    return df

def equalize_dataset_labels(df, seed=1):
    """Return a shuffled dataset holding the same number of rows per label.

    The size of the rarest label present in `df` decides how many rows are
    drawn (without replacement) from each of 'unrelated', 'agreed' and
    'disagreed'. The draws and the final shuffle all use `seed`, so the
    result is reproducible.

    Args:
        df: DataFrame with a 'label' column.
        seed: random_state used for every sampling step.

    Returns:
        A new DataFrame with the balanced rows in shuffled order; the
        original index values are kept.
    """
    # Size of the smallest class determines the per-class sample size
    per_class = df.label.value_counts().min()

    # One equally sized random sample per class, in a fixed class order
    samples = [
        df[df['label'] == name].sample(per_class, random_state=seed)
        for name in ('unrelated', 'agreed', 'disagreed')
    ]

    # Stitch the samples together, then shuffle all rows
    balanced = pd.concat(samples)
    return balanced.sample(frac=1, random_state=seed)

def encode_labels(df):
    """Convert the string labels to integers for predictions.

    Mapping: 'unrelated' -> 0, 'agreed' -> 1, 'disagreed' -> 2. Values that
    are not in the mapping are passed through unchanged.

    Args:
        df: DataFrame with a 'label' column.

    Returns:
        A new DataFrame with the encoded 'label' column and a fresh
        0..n-1 index; the previous index is kept as an 'index' column
        (reset_index() without drop). The frame passed in is not modified.
    """
    # encoding the labels
    labels = {'unrelated': 0, 'agreed': 1, 'disagreed': 2}

    # Build the encoded column on a new frame instead of calling
    # replace(inplace=True) on df['label']: that either mutates the caller's
    # frame or, under pandas copy-on-write, has no effect at all.
    encoded = df['label'].map(lambda value: labels.get(value, value))
    df = df.assign(label=encoded)

    df = df.reset_index()
    return df

# Tokenizing Functions

In [3]:
def create_tokenizer(*data, split=' ', filename='tokenizer', oov_token=True):
    """Fit a Keras word Tokenizer on the given dataframe(s) and pickle it.

    Args:
        *data: one or more DataFrames with 'title1_en' and 'title2_en'
            columns; the tokenizer is fitted on both columns of each.
        split: separator passed to the Tokenizer for splitting words.
        filename: path of the output file without extension; '.pickle'
            is appended.
        oov_token: token registered for out-of-vocabulary words. The default
            of True is kept for backward compatibility and shows up as the
            key `True` in `word_index`. Keras documents this argument as a
            string, so passing e.g. '<OOV>' is recommended for new tokenizers.

    Returns:
        The fitted Tokenizer.
    """
    # create the tokenizer
    t = Tokenizer(oov_token=oov_token, split=split)

    # fit tokenizer
    for df in data:
        t.fit_on_texts(df['title1_en'])
        t.fit_on_texts(df['title2_en'])

    # save for future use; the context manager closes the file even if
    # pickling fails
    with open(filename + '.pickle', 'wb') as pickle_out:
        pickle.dump(t, pickle_out)
    return t

def tokenize(t, df, maxlen=20):
    """Turn both title columns into fixed-length integer sequences.

    Each title is converted with the already fitted tokenizer `t` and passed
    through pad_sequences with `maxlen`; the two resulting arrays are placed
    side by side and the 'label' column is appended. The tokenizer is neither
    fitted nor saved here.

    Args:
        t: fitted tokenizer providing texts_to_sequences().
        df: DataFrame with 'title1_en', 'title2_en' and 'label' columns.
        maxlen: sequence length per title passed to pad_sequences.

    Returns:
        DataFrame with 2 * maxlen integer-named feature columns (title 1
        first, then title 2) followed by 'label', on a 0..n-1 index.
    """
    # convert the titles with the fitted tokenizer
    data1 = pad_sequences(sequences=t.texts_to_sequences(df['title1_en']), maxlen=maxlen)
    data2 = pad_sequences(sequences=t.texts_to_sequences(df['title2_en']), maxlen=maxlen)

    # recombine
    encoded = pd.DataFrame(np.concatenate((data1, data2), axis=1))

    # Attach labels by position. Joining on the index silently yields NaN or
    # misaligned labels whenever df does not carry a 0..n-1 index.
    encoded['label'] = df['label'].to_numpy()
    return encoded

# Main Function

In [16]:
def save_data(train, test, maxlen=20, save_dir='./split_data/'):
    """Split train/test frames into features and labels and pickle them.

    The first 2 * maxlen columns of each frame are taken as features and
    reshaped to (rows, 2 * maxlen, 1); the 'label' column is taken as the
    target. Four files are written into `save_dir`: X_train.pickle,
    X_test.pickle, y_train.pickle and y_test.pickle.

    Args:
        train: DataFrame with at least 2 * maxlen feature columns followed
            by a 'label' column.
        test: DataFrame with the same layout as `train`.
        maxlen: per-title sequence length used when the frames were built.
        save_dir: output directory, created if missing. A trailing
            separator is optional; an empty string means the current
            directory.

    Returns:
        None.
    """
    n_features = maxlen * 2

    # Split the data into X and y
    arrays = {
        'X_train': train.iloc[:, :n_features].to_numpy(),
        'X_test': test.iloc[:, :n_features].to_numpy(),
        'y_train': train['label'].to_numpy(),
        'y_test': test['label'].to_numpy(),
    }

    # reshape the X data for use: add a trailing axis of size 1
    for name in ('X_train', 'X_test'):
        features = arrays[name]
        arrays[name] = features.reshape(features.shape[0], features.shape[1], 1)

    # Create the directory itself and build paths with os.path.join, so a
    # save_dir without a trailing slash no longer ends up as a filename prefix.
    target_dir = save_dir or '.'
    os.makedirs(target_dir, exist_ok=True)

    for name, array in arrays.items():
        # the context manager closes each file even if pickling fails
        with open(os.path.join(target_dir, name + '.pickle'), 'wb') as pickle_out:
            pickle.dump(array, pickle_out)

# Creating Train/Test Data

In [6]:
input_train = './data/train.csv'
input_validation = './data/validation.csv'

# on_bad_lines='warn' skips malformed rows and reports them, which is what
# error_bad_lines=False did; that keyword was removed in pandas 2.0.
# Requires pandas >= 1.3.
df_train = pd.read_csv(input_train, encoding='utf-8-sig', on_bad_lines='warn')
df_test = pd.read_csv(input_validation, encoding='utf-8-sig', on_bad_lines='warn')

# Drop unused columns and unusable rows from both splits
df_train = filter_dataset(df_train)
df_test = filter_dataset(df_test)

b'Skipping line 7026: expected 8 fields, saw 9\nSkipping line 44341: expected 8 fields, saw 9\nSkipping line 58784: expected 8 fields, saw 10\n'
b'Skipping line 99101: expected 8 fields, saw 10\nSkipping line 104716: expected 8 fields, saw 9\nSkipping line 127866: expected 8 fields, saw 10\n'
b'Skipping line 140436: expected 8 fields, saw 9\nSkipping line 152888: expected 8 fields, saw 11\n'


In [7]:
# Fit one tokenizer on the titles of both the training and validation frames;
# create_tokenizer also pickles it to 'tokenizer.pickle' (its default filename).
t = create_tokenizer(df_train,df_test)

In [8]:
# Per split: balance the label classes, encode the labels as integers, then
# convert the titles to padded sequences of length 20 with the shared tokenizer.
df_train = tokenize(t, encode_labels(equalize_dataset_labels(df_train)), 20)
df_test = tokenize(t, encode_labels(equalize_dataset_labels(df_test)), 20)

In [13]:
# Pickle the feature/label arrays into save_data's default './split_data/'
# directory. maxlen is left at its default of 20, the same value that was
# passed to tokenize() for both splits.
save_data(train=df_train,test=df_test)

In [22]:
# Display the fitted vocabulary: a mapping from token to integer index.
t.word_index

{True: 1,
 'the': 2,
 'to': 3,
 'of': 4,
 'a': 5,
 'is': 6,
 'and': 7,
 'in': 8,
 'you': 9,
 'not': 10,
 'it': 11,
 'be': 12,
 'for': 13,
 'can': 14,
 'will': 15,
 'with': 16,
 'has': 17,
 'are': 18,
 'this': 19,
 'that': 20,
 'have': 21,
 'do': 22,
 'on': 23,
 'was': 24,
 'rumors': 25,
 'these': 26,
 'eat': 27,
 'new': 28,
 'rumor': 29,
 'no': 30,
 'people': 31,
 'years': 32,
 'by': 33,
 'how': 34,
 'at': 35,
 'been': 36,
 'your': 37,
 'its': 38,
 'days': 39,
 'one': 40,
 'three': 41,
 'dont': 42,
 'more': 43,
 'i': 44,
 'from': 45,
 'after': 46,
 'out': 47,
 'li': 48,
 'up': 49,
 'hair': 50,
 'weight': 51,
 'but': 52,
 'white': 53,
 'his': 54,
 'wang': 55,
 'all': 56,
 'as': 57,
 'get': 58,
 'two': 59,
 'what': 60,
 'her': 61,
 'good': 62,
 'cancer': 63,
 'who': 64,
 'old': 65,
 'he': 66,
 'only': 67,
 'police': 68,
 'zhang': 69,
 'about': 70,
 'an': 71,
 'day': 72,
 'blood': 73,
 'than': 74,
 'yang': 75,
 '2018': 76,
 's': 77,
 'love': 78,
 'so': 79,
 'rumour': 80,
 'know': 81,
 'yu

# Resources Used

- https://keras.io/preprocessing/text/
- https://machinelearningmastery.com/prepare-text-data-deep-learning-keras/
- https://www.youtube.com/watch?v=j-3vuBynnOE - Loading in your own data - Deep Learning basics with Python, TensorFlow and Keras p.2

- http://faroit.com/keras-docs/1.2.2/preprocessing/text/#tokenizer
