# Imports

In [1]:
# -------------- Preprocessing Packages --------------
# For tokenizing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# -------------- General Packages --------------
# General Use
import pandas as pd
import numpy as np

# For Saving Files
import pickle
import os

Using TensorFlow backend.


# Preprocessing Functions

In [3]:
# Filters corrupted, unusable/unused data from the dataset
def filter_dataset(df):
    """Drop unused columns, strip symbols from the English titles and remove unusable rows.

    Args:
        df: DataFrame containing at least the columns ``id``, ``tid1``,
            ``tid2``, ``title1_zh``, ``title2_zh``, ``title1_en``,
            ``title2_en`` and ``label``.

    Returns:
        A new DataFrame without the id/Chinese/``Unnamed*`` columns, keeping
        only rows whose label is ``unrelated``, ``agreed`` or ``disagreed``
        and that have no null values. The input frame is not modified.

    Raises:
        KeyError: if one of the columns to drop is missing from ``df``.
    """
    # Drop the id columns and the Chinese title columns
    df = df.drop(columns=['id', 'tid1', 'tid2', 'title1_zh', 'title2_zh'])

    for column in ('title1_en', 'title2_en'):
        # regex=True has to be explicit: pandas >= 2.0 treats the pattern as
        # a literal string by default, which would leave every symbol in place.
        cleaned = df[column].str.replace('[^a-zA-Z0-9 ]', '', regex=True)
        # Titles that became empty are marked as NaN so dropna() removes them.
        # Assigning back (instead of an inplace replace on the selected
        # column) also works when pandas Copy-on-Write is enabled.
        df[column] = cleaned.replace('', np.nan)

    # Remove rows with no (or an unknown) label
    labels = ['unrelated', 'agreed', 'disagreed']
    df = df[df['label'].isin(labels)]

    # Remove Unnamed columns before dropna() so they cannot cause row drops
    df = df.loc[:, ~df.columns.str.contains(r'^Unnamed')]

    # Drop rows with null values
    return df.dropna()

# Returns a dataset with an equal sampling of each label
def equalize_dataset_labels(df, seed=1):
    """Return a shuffled frame holding the same number of rows for each label.

    Args:
        df: DataFrame with a ``label`` column.
        seed: random state used for both the per-label sampling and the
            final shuffle.

    Returns:
        DataFrame made of ``n`` sampled rows for each of ``unrelated``,
        ``agreed`` and ``disagreed``, where ``n`` is the smallest label count
        found in ``df``; rows keep their original index values.
    """
    # The rarest label decides how many rows every class contributes
    per_class = df['label'].value_counts().min()

    # Sample each class separately, always in the same order
    samples = [
        df[df['label'] == label_name].sample(per_class, random_state=seed)
        for label_name in ('unrelated', 'agreed', 'disagreed')
    ]

    # Stitch the classes back together and shuffle the rows
    balanced = pd.concat(samples)
    return balanced.sample(frac=1, random_state=seed)

# Tokenizes the entries of a series, then pads/truncates each resulting sequence to a max length
def pad_data(t, data, maxlen):
    """Turn texts into integer sequences with ``t`` and pad them to ``maxlen``.

    Args:
        t: tokenizer object exposing ``texts_to_sequences``.
        data: the texts to convert.
        maxlen: length passed to ``pad_sequences`` for every sequence.

    Returns:
        The result of ``pad_sequences`` for the converted texts.
    """
    token_ids = t.texts_to_sequences(data)
    return pad_sequences(sequences=token_ids, maxlen=maxlen)

# Tokenizes titles and encodes labels, trains a word tokenizer that is saved to a file
def tokenize(df, output_dir='./', maxlen=20, save=True):
    """Encode the labels as integers and convert both title columns to padded token ids.

    A new ``Tokenizer`` is fitted on ``title1_en`` on every call and then
    applied to ``title1_en`` and ``title2_en``.

    Args:
        df: DataFrame with ``title1_en``, ``title2_en`` and ``label`` columns.
        output_dir: directory under which ``tokenizers/`` is created when
            ``save`` is true.
        maxlen: number of tokens kept per title (shorter titles are padded).
        save: when true, pickle the fitted tokenizer to
            ``<output_dir>/tokenizers/<maxlen>len-tokenizer.pickle``.

    Returns:
        DataFrame with ``2 * maxlen`` integer-named columns (title1 tokens
        followed by title2 tokens) plus the encoded ``label`` column. The
        input frame is not modified.
    """
    labels = {'unrelated': 0, 'agreed': 1, 'disagreed': 2}

    # Work on a re-indexed copy: the fresh 0..n-1 index is what lets the
    # token matrix be joined with the label column below, and the caller's
    # frame stays untouched.
    df = df.reset_index(drop=True)

    # Encode the labels by assignment; an inplace replace on the selected
    # column is a chained update that does not reach the frame when pandas
    # Copy-on-Write is enabled. Values outside the mapping are kept as-is.
    df['label'] = df['label'].map(lambda value: labels.get(value, value))

    # create the tokenizer; oov_token is the string placed in the word index
    # for out-of-vocabulary words (it still receives index 1)
    t = Tokenizer(oov_token='<OOV>', split=" ")

    # fit the tokenizer on the documents
    # NOTE(review): only title1_en builds the vocabulary, so words that occur
    # solely in title2_en are encoded as the OOV token - confirm this is intended.
    t.fit_on_texts(df['title1_en'])
    data1 = pad_data(t, df['title1_en'], maxlen)
    data2 = pad_data(t, df['title2_en'], maxlen)

    # recombine: title1 tokens, title2 tokens, then the label
    tokens = pd.DataFrame(np.concatenate((data1, data2), axis=1))
    result = tokens.join(df['label'])

    # save the tokenizer for future use
    if save:
        # os.path.join keeps the path valid when output_dir has no trailing slash
        tokenizer_dir = os.path.join(output_dir, 'tokenizers')
        os.makedirs(tokenizer_dir, exist_ok=True)

        tokenizer_path = os.path.join(
            tokenizer_dir, '{}len-tokenizer.pickle'.format(maxlen))
        # the context manager closes the file even if pickling fails
        with open(tokenizer_path, 'wb') as pickle_out:
            pickle.dump(t, pickle_out)

    return result

# Primary preprocessing function, calls other preprocessing functions and stores data
def preprocess(df, output_dir='./', name='', equalize=True, maxlen=20, save=True):
    """Run the full preprocessing pipeline and optionally pickle the X/y arrays.

    Calls ``filter_dataset``, optionally ``equalize_dataset_labels``, then
    ``tokenize``.

    Args:
        df: raw DataFrame as expected by ``filter_dataset``.
        output_dir: base directory for the saved tokenizer and split data.
        name: suffix inserted into the pickle file names
            (``X<name>.pickle`` / ``y<name>.pickle``).
        equalize: when true, sample an equal number of rows per label.
        maxlen: number of tokens kept per title.
        save: when true, write the tokenizer (via ``tokenize``) and the X/y
            arrays under ``output_dir``.

    Returns:
        The tokenized DataFrame returned by ``tokenize``.
    """
    df = filter_dataset(df)
    if equalize:
        df = equalize_dataset_labels(df)
    df = tokenize(df, output_dir=output_dir, maxlen=maxlen, save=save)

    if save:
        # Split the data into X and y; the first 2 * maxlen columns hold the
        # token ids of both titles
        X = df.iloc[:, :maxlen * 2].to_numpy()
        y = df['label'].to_numpy()

        # reshape X to (rows, tokens, 1) for use as model input
        X = X.reshape(X.shape[0], X.shape[1], 1)

        # os.path.join keeps the path valid when output_dir has no trailing slash
        split_data_dir = os.path.join(output_dir, 'split_data')
        os.makedirs(split_data_dir, exist_ok=True)

        for prefix, array in (('X', X), ('y', y)):
            target = os.path.join(split_data_dir, prefix + name + '.pickle')
            # the context manager closes the file even if pickling fails
            with open(target, 'wb') as pickle_out:
                pickle.dump(array, pickle_out)
    return df

# Creating Train/Test Data

In [4]:
input_train = './data/train.csv'
input_validation = './data/validation.csv'

# on_bad_lines='warn' (pandas >= 1.3) skips malformed rows and reports them;
# it replaces error_bad_lines=False, which was removed in pandas 2.0.
df = pd.read_csv(input_train, encoding='utf-8-sig', on_bad_lines='warn')
df = preprocess(df, name='_train')

# NOTE(review): this second call fits a new tokenizer on the validation titles
# and saves it under the same file name as the training one (same maxlen), so
# the training tokenizer is overwritten and the two splits are encoded with
# different vocabularies - confirm whether the training tokenizer should be reused.
df = pd.read_csv(input_validation, encoding='utf-8-sig', on_bad_lines='warn')
df = preprocess(df, name='_test')

b'Skipping line 7026: expected 8 fields, saw 9\nSkipping line 44341: expected 8 fields, saw 9\nSkipping line 58784: expected 8 fields, saw 10\n'
b'Skipping line 99101: expected 8 fields, saw 10\nSkipping line 104716: expected 8 fields, saw 9\nSkipping line 127866: expected 8 fields, saw 10\n'
b'Skipping line 140436: expected 8 fields, saw 9\nSkipping line 152888: expected 8 fields, saw 11\n'


# Resources Used

- https://keras.io/preprocessing/text/
- https://machinelearningmastery.com/prepare-text-data-deep-learning-keras/
- https://www.youtube.com/watch?v=j-3vuBynnOE - Loading in your own data - Deep Learning basics with Python, TensorFlow and Keras p.2
