# Imports

In [1]:
# -------------- Preprocessing Packages --------------
# For tokenizing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# -------------- General Packages --------------
# General Use
import pandas as pd
import numpy as np

# For Saving Files
import pickle
import os

Using TensorFlow backend.


# Preprocessing Functions

In [2]:
# Filters corrupted, unusable/unused data from the dataset
def filter_dataset(df):
    df = df.drop(columns=['id','tid1','tid2']) # drop id columns

    # Replace garbage/unusable symbols
    EN_replacements = {'-':' ',
                       '&':' and ',
                       '< / [a-zA-Z] >': '',
                       '< [a-zA-Z] >':'',
                       '[^a-zA-Z0-9 ]':'',
                       ' s ':' ',
                       '  ':' '}
    for replacee in EN_replacements:
        df['title1_en'] = df['title1_en'].str.replace(replacee,EN_replacements[replacee])
        df['title2_en'] = df['title2_en'].str.replace(replacee,EN_replacements[replacee])    
    df['title1_zh'] = df['title1_zh'].str.replace('[^㕛-马0-9 ]','')
    df['title2_zh'] = df['title2_zh'].str.replace('[^㕛-马0-9 ]','')
        
    # Replace empty strings with NaN values
    df['title1_en'].replace('', np.nan, inplace=True)
    df['title2_en'].replace('', np.nan, inplace=True)
    df['title1_zh'].replace('', np.nan, inplace=True)
    df['title2_zh'].replace('', np.nan, inplace=True)
    
    # Remove rows with no label
    labels = ['unrelated','agreed','disagreed']
    df = df[df.label.isin(labels)]
    
    # Remove Unnamed columns
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

    # Drop rows with null values
    df = df.dropna()
    
    # Split chinese characters for tokenization
    df['title1_zh'] = df['title1_zh'].str.replace('',' ')
    df['title2_zh'] = df['title2_zh'].str.replace('',' ')
    return df

# Returns a datset with an equal sampling of each label
def equalize_dataset_labels(df, seed=1):
    # Get minimum label count
    n = df.label.value_counts().min()
    
    # Grabbing equal amounts of training data from each class
    dfa = df[df['label']=='unrelated'].sample(n,random_state=seed)
    dfb = df[df['label']=='agreed'].sample(n,random_state=seed)
    dfc = df[df['label']=='disagreed'].sample(n,random_state=seed)
    
    # Recombine dataset and shuffle
    df = pd.concat([dfa,dfb,dfc])
    df = df.sample(frac=1,random_state=seed)
    return df

# Convert labels to integers for predictions
def encode_labels(df):
    # encoding the labels
    labels = {'unrelated':0,'agreed':1,'disagreed':2}
    df['label'].replace(labels,inplace=True)
    df = df.reset_index()
    return df

# Tokenizing Functions

In [3]:
# Create a word tokenizer given dataframe(s)
def old_create_tokenizer(*data, num_words=None, lower=True, split=' ', oov_token=None, directory='./', filename='tokenizer'):
    # create the tokenizer
    t = Tokenizer(num_words=num_words, lower=lower, split=split, oov_token=oov_token)
    
    # fit tokenizer
    for df in data:
        t.fit_on_texts(df['title1_en'])  
        t.fit_on_texts(df['title2_en'])  
    
    # save for future use
    os.makedirs(directory, exist_ok=True)
    pickle_out = open(os.path.join(directory+os.sep,filename+'.pickle'), 'wb')
    pickle.dump(t,pickle_out)
    pickle_out.close()
    return t

# Tokenizes titles and encodes labels, trains a word tokenizer that is saved to a file
def old_tokenize(t, df, maxlen=20):
    # fit the tokenizer on the documents  
    data1 = pad_sequences(sequences=t.texts_to_sequences(df['title1_en']), maxlen=maxlen)
    data2 = pad_sequences(sequences=t.texts_to_sequences(df['title2_en']), maxlen=maxlen)
    
    # recombine
    df = pd.DataFrame(np.concatenate((data1,data2),axis=1)).join(df['label'])
    return df

# Create a word tokenizer given dataframe(s)
def create_en_tokenizer(*data, num_words=None, lower=True, split=' ', oov_token=None, directory = './', filename='tokenizer'):
    # create the tokenizer
    t = Tokenizer(num_words=num_words, lower=lower, split=split, oov_token=oov_token)
    
    # fit tokenizer
    for df in data:
        t.fit_on_texts(df['title1_en'])  
        t.fit_on_texts(df['title2_en'])  
    
    # save for future use
    os.makedirs(directory, exist_ok=True)
    pickle_out = open(os.path.join(directory+os.sep,'en_'+filename+'.pickle'), 'wb')
    pickle.dump(t,pickle_out)
    pickle_out.close()
    return t

# Create a word tokenizer given dataframe(s)
def create_zh_tokenizer(*data, num_words=None, lower=True, split=' ', oov_token=None, directory = './', filename='tokenizer'):
    # create the tokenizer
    t = Tokenizer(num_words=num_words, lower=lower, split=split, oov_token=oov_token)
    
    # fit tokenizer
    for df in data:
        t.fit_on_texts(df['title1_zh'])  
        t.fit_on_texts(df['title2_zh'])  
    
    # save for future use
    os.makedirs(directory, exist_ok=True)
    pickle_out = open(os.path.join(directory+os.sep,'zh_'+filename+'.pickle'), 'wb')
    pickle.dump(t,pickle_out)
    pickle_out.close()
    return t

# Tokenizes titles and encodes labels, trains a word tokenizer that is saved to a file
def tokenize(t_en, t_zh, df, maxlen=20):
    # fit the tokenizer on the documents  
    data1 = pad_sequences(sequences=t_en.texts_to_sequences(df['title1_en']), maxlen=maxlen)
    data2 = pad_sequences(sequences=t_en.texts_to_sequences(df['title2_en']), maxlen=maxlen)
    data3 = pad_sequences(sequences=t_zh.texts_to_sequences(df['title1_zh']), maxlen=maxlen)
    data4 = pad_sequences(sequences=t_zh.texts_to_sequences(df['title2_zh']), maxlen=maxlen)
    
    # recombine
    df = pd.DataFrame(np.concatenate((data1,data2,data3,data4),axis=1)).join(df['label'])
    return df

# Save Preprocessed/Tokenized Data

In [4]:
# Given train and test data, split into features and labels and save
def save_data(train, test, maxlen=20, save_dir='./split_data/'):
    # Split the data into X and y
    X_train = train.iloc[:,:maxlen*2].to_numpy()
    X_test = test.iloc[:,:maxlen*2].to_numpy()
    y_train = train['label'].to_numpy()
    y_test = test['label'].to_numpy()
    
    split_data_dir=save_dir
    os.makedirs(os.path.dirname(split_data_dir), exist_ok=True)

    pickle_out = open(os.path.join(split_data_dir+'X_train.pickle'), 'wb')
    pickle.dump(X_train,pickle_out)
    pickle_out.close()

    pickle_out = open(os.path.join(split_data_dir+'X_test.pickle'), 'wb')
    pickle.dump(X_test,pickle_out)
    pickle_out.close()

    pickle_out = open(os.path.join(split_data_dir+'y_train.pickle'), 'wb')
    pickle.dump(y_train,pickle_out)
    pickle_out.close()
    
    pickle_out = open(os.path.join(split_data_dir+'y_test.pickle'), 'wb')
    pickle.dump(y_test,pickle_out)
    pickle_out.close()

# Creating Train/Test Data

In [5]:
input_train = './data/train.csv'
input_validation = './data/validation.csv'

SPLIT_DATA_DIR = os.path.join('./split_data'+os.sep)

OOV_TOKEN = 'UNK'
SENTENCE_SIZE = 25
NUM_WORDS = None

In [6]:
# Loading the data
df_train = pd.read_csv(input_train,encoding='utf-8-sig',error_bad_lines=False)
df_test = pd.read_csv(input_validation,encoding='utf-8-sig',error_bad_lines=False)

b'Skipping line 7026: expected 8 fields, saw 9\nSkipping line 44341: expected 8 fields, saw 9\nSkipping line 58784: expected 8 fields, saw 10\n'
b'Skipping line 99101: expected 8 fields, saw 10\nSkipping line 104716: expected 8 fields, saw 9\nSkipping line 127866: expected 8 fields, saw 10\n'
b'Skipping line 140436: expected 8 fields, saw 9\nSkipping line 152888: expected 8 fields, saw 11\n'


In [7]:
# Initial preprocessing
df_train_filtered = filter_dataset(df_train)
df_test_filtered = filter_dataset(df_test)

In [9]:
# Creatng tokenizers
t_en = create_en_tokenizer(df_train_filtered, oov_token=OOV_TOKEN)
t_zh = create_zh_tokenizer(df_train_filtered, oov_token=OOV_TOKEN)

FileNotFoundError: [WinError 3] The system cannot find the path specified: ''

In [None]:
# Encoding the labels
df_train_encoded = encode_labels(df_train_filtered)
df_test_encoded = encode_labels(df_test_filtered)

In [None]:
# Tokenizing the datasets
df_train_tokenized = tokenize(t_en,t_zh,df_train_encoded,SENTENCE_SIZE)
df_test_tokenized = tokenize(t_en,t_zh,df_test_encoded,SENTENCE_SIZE)

In [None]:
# Saving the feature/label split
save_data(train=df_train_tokenized,
          test=df_test_tokenized,
          maxlen=SENTENCE_SIZE,
          save_dir=SPLIT_DATA_DIR)

# Inverse Tokenization

Some experimentation with inverse tokenizing to double check if it's working.

In [None]:
t.sequences_to_texts([df_train_tokenized.iloc[0,:SENTENCE_SIZE]])

In [None]:
t.sequences_to_texts([df_train_tokenized.iloc[0,SENTENCE_SIZE:SENTENCE_SIZE*2]])

In [None]:
df_train_tokenized.iloc[0,SENTENCE_SIZE*2]

In [None]:
df_train_encoded[:1]

In [None]:
df_train[:5]