In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras import *
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.constraints import max_norm
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics.scorer import make_scorer
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
import matplotlib.pyplot as plt
import re
seed = 221

**Load data**

In [3]:
# Read the data
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
train

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


**All helper functions are listed below**

In [4]:
# Put all the helper functions here

def makeToken(txt, selected_txt):
    """
        replace the selected text inside a text with the same token
    """
    # Join a list of <token> based on the length of the selected text 
    tokens = " ".join("<token>" for i in range(len(selected_txt.split())))
    # replace the text where it matches with selected text with token
    txt = txt.replace(selected_txt, tokens)
    return txt

def build_embedding_from_glove(tokenizer, vocab_size):
    """
        turning the tweets in training data to word vectors
    """
    # Part 1: read in the GloVe word embedding
    # Empty dictionary
    embeddings_index = dict()
    f = open('glove.840B.300d.txt')
    # Write pretrained word embedding layer to a dictoray
    for line in f:
        # split up word and the vector
        values = line.split(" ")
        # Get an index for that word
        word = values[0]
        # Put the word vectors as a numpy array
        coefs = np.array(values[1:], dtype='float32')
        # Put each word vector to map to its index for that word
        embeddings_index[word] = coefs
    # Close the file
    f.close()

    # Part 2: Tranform our data based on the word embedding
    glove_embedding = np.stack(embeddings_index.values())

    # GloVe's word vectors is default to be 300 dimensional, we form a matrix based on sampling from the normal distribution with the same mean and std as the word embedding
    #embedding_matrix = np.random.normal(np.mean(glove_embedding), glove_embedding.std(), (vocab_size, 300))
    embedding_matrix= np.zeros((vocab_size, 300)) # try all zero matrix
    for word, idx in tokenizer.word_index.items():
        # Get the word, strip off all non-letter values from the word to hopefully get the corresponding vector from embedding
        embedding_vectors = embeddings_index.get(re.sub(r"[^A-Za-z]", "", word))
        # If we can find the corresponding word vector in GloVe, we add it to the embedding matrix
        if embedding_vectors is not None:
            # Add the vector to the corresponding row according to the index in the tokenizer
            embedding_matrix[idx] = embedding_vectors
    return embedding_matrix


def jaccard(str1, str2): 
    """
        an evaluation metric specified by Kaggle
    """
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

def pad(tokenizer, text, maxlen):
    """
        pad a list of integers with 0 up to the specified maximum length
    """
    # utilize the tokenizer to convert each tweet into a list of integer. These integers are assigned by the tokenizer based on the index of the word in it.
    sequence = tokenizer.texts_to_sequences(text)
    # Pad the list of integer with 0 to a specified length 
    padded_sequence  = pad_sequences(sequence,maxlen=maxlen) 
    return padded_sequence  

def decode(text_sequence, binary_sequence, tokenizer):
    """
        get the corresponding word based on the list of 0 and 1
    """
    # Get a list of indicies where the binary sequence has a 1
    index_list=[key for key, val in enumerate(binary_sequence) if val == 1] 
    # Get a list of strings from the tokenizer based on the indicies in text_sequence corresponding to those 1's positions
    res_list = [tokenizer.index_word[text_sequence[i]] for i in index_list if text_sequence[i]!=0] 
    # Get back the sentence
    decoded_sentence = " ".join(res_list)
    return decoded_sentence 

def evaluate(X, y_true, tokenizer, model, eps=0.1):
    """
        evaluate the overall test score
    """
    # predict based on y
    y_pred = model.predict_classes(X, verbose=0)
    # Transform the data dimension back to the one same as y_test
    y_pred = y_pred.reshape(y_true.shape[0],y_true.shape[1])
    # Likewise, we tranform X
    X = X.reshape(X.shape[0],X.shape[1])
    print(y_pred[0])
    print(y_true[0])
    
    ls = []
    for row in range(len(y_pred)):
        # Decode each row to get the predicted text 
        decoded_pred_text = decode(X[row], y_pred[row], tokenizer)
        # Decode each row to get the actual text 
        actual_text = decode(X[row], y_true[row], tokenizer)
        # compute the score for each row and average them
        ls.append(jaccard(decoded_pred_text, actual_text))
        overall_score = np.mean(ls)
    return overall_score

def build_model(num_node, activation_1, activation_2, optimizer, gru):
    """
        A function that builds either bidirectional GRU or LSTM architecture.
    """
    # Model specification
    model = Sequential()
    #if glove == True:
    # Embedding layer: vocab_size is the count of unique token + 1 for the 0 we added, embed size is the size of the vector space for which the words are embedded. 
    #    model.add(Embedding(vocab_size, 300, weights=[embedding_mat], input_length=(maxlen,), trainable=False))
    #else:
    #    model.add(Embedding(vocab_size, 300, input_length=(maxlen,), trainable=True))
    if gru == True:
        # Bidirectional with Graded Recurrent Unit
        model.add(Bidirectional(GRU(num_node, dropout = 0.3, return_sequences=True, recurrent_dropout=0.3),input_shape=(maxlen, 1)))
    else:
        # Bidirectional with LSTM 
        model.add(Bidirectional(LSTM(num_node, dropout = 0.3, return_sequences=True, recurrent_dropout=0.3),input_shape=(maxlen, 1)))
    model.add(TimeDistributed(Dropout(0.5)))
    model.add(TimeDistributed(Dense(num_node, activation=activation_1, kernel_constraint=max_norm(2))))
    model.add(TimeDistributed(Dropout(0.5)))
    model.add(TimeDistributed(Dense(num_node, activation=activation_2, kernel_constraint=max_norm(2))))
    model.add(TimeDistributed(Dropout(0.5)))
    model.add(TimeDistributed(Dense(1, activation="sigmoid")))
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

def tune(X, y, model, grid, num_fold, scoring, random_state):
    """
        this function wraps the methods for grid search cross validation for tuning parameters
    Arguements:
        X (array): the training samples
        y (array):  the response variable
        grid(dict): a dictionary contains information about which parameter to be tuned and the respective values involved.
        num_fold(int): the number of fold for cross-validation
        scoring (str): the scoring metrix
    Return:
        grid_result (dict) : dict of numpy (masked) ndarrays. See sklearn.model_selection.GridSearchCV for details
    """
    # Grid search cross validation
    kfold = KFold(n_splits=num_fold, random_state=random_state)
    # Utilize all CPUs and assign 0 score if the error occurs. 
    grid_search = GridSearchCV(estimator=model, param_grid=grid, cv= kfold, scoring=scoring, error_score=0)
    grid_result = grid_search.fit(X,y)
    # print the best result and respective parameter based on residual sum of squares 
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    return grid_result

def plotLoss(history):
    """
        plot the loss for the neural net training process
    """
    # summarize history for loss
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()
    
def plotAccuracy(history):
    """
        plot the accuracy of the neural net training process
    """
    # summarize history for accuracy
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

**Tuning the selected text inside the text with the same label**

In [5]:
# Preprocess: drop the missing value and tokenize all selected text in a text
train = train.dropna().reset_index()

# Replace the selected text in text with a specified token
storage = []
for row in range(train.shape[0]):
    list_of_targets = makeToken(train["text"][row], train["selected_text"][row])
    storage.append(list_of_targets)
train["tokenized_text"] = pd.DataFrame(storage)
train 

Unnamed: 0,index,textID,text,selected_text,sentiment,tokenized_text
0,0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,<token> <token> <token> <token> <token> <toke...
1,1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,<token> <token> I will miss you here in San D...
2,2,088c60f138,my boss is bullying me...,bullying me,negative,my boss is <token> <token>...
3,3,9642c003ef,what interview! leave me alone,leave me alone,negative,what interview! <token> <token> <token>
4,4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,<token> <token> <token> why couldn`t they put...
...,...,...,...,...,...,...
27475,27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,wish we could come see u on Denver husban<to...
27476,27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,I`ve wondered about rake to. The client has ...
27477,27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,<token> <token> <token> <token> <token> <toke...
27478,27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,<token> <token> <token> <token> <token> <token>


**Tokenization: Assign an integer index to all the words**

In [7]:
# Tokenize all text in the text column and the <token> in the tokenized text
tokenizer = Tokenizer(filters='') # specify filter so that it won't strip off the <> for token
tokenizer.fit_on_texts(pd.concat([train.text, train.tokenized_text], axis=0)) # fit in all text to be tokenized

**Build an embedding matrix based on GloVe**

In [9]:
# Build a matrix that utilize indicies from tokenizer and the learned word embedding from Glove
#vocab_size = len(tokenizer.word_index) + 1
#embedding_mat = build_embedding_from_glove(tokenizer, vocab_size)

In [10]:
# Set all the word that has "<token>" in it to have the same index since there are some "<token>.."
for word, idx in tokenizer.word_index.items():
    if "<token>" in word:
        # set any words that contain <token> to have the same index
        tokenizer.word_index[word] = tokenizer.word_index["<token>"]

In [11]:
# Find the longest text in the data for padding other shorter text
maxlen = max(train["text"].apply(lambda x: len(x.split())))

# Tranform the text into a list of integer and pad it with 0. The maximum length of padding is the length of the longest text in data
X = pad(tokenizer, train.text, maxlen)
y = pad(tokenizer, train.tokenized_text, maxlen)

# Binarize the signal in y
# If the text is a token we call it 1 and 0 otherwise
y[y != tokenizer.word_index["<token>"]] = 0
y[y == tokenizer.word_index["<token>"]] = 1

**Splitting Training and Testing Set**

In [12]:
# Split the training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
# Transform the data dimension
X_tr_v = np.array(X_train).reshape(X_train.shape[0],X_train.shape[1],1).astype(np.float32) # Change the data type
X_te_v = np.array(X_test).reshape(X_test.shape[0],X_test.shape[1],1).astype(np.float32) # Change the data type
y_tr_v = np.array(y_train).reshape(y_train.shape[0],y_train.shape[1],1)
y_te_v = np.array(y_test).reshape(y_test.shape[0],y_test.shape[1],1)

**Initial search for number epoch with some fixed parameters with the proposed model**

* 20 epoches is enough for training the model based on the loss.

In [None]:
# Define SGD optimizer
sgd = SGD(lr=0.1, momentum=0.9)
# Instantiate a GRU with embedding based on GloVe
glove_emd_GRU_model = build_model(20,"tanh", "relu", sgd, gru=True)
# Fit the model
history = glove_emd_GRU_model.fit(X_tr_v, y_tr_v, batch_size=32,epochs=100,validation_data=(X_te_v,y_te_v),verbose=0)
plotLoss(history)
# Evaluate the model with prob_threshold 0.1
print(evaluate(X_te_v, y_test,tokenizer, glove_emd_GRU_model))
print("--------------------------")

**Try a different optimizer**

In [None]:
# Define optimizer
adam = 'adam'
# Instantiate a GRU with embedding based on GloVe
glove_emd_GRU_model = build_model(20,"tanh", "relu", adam, gru=True)
# Fit the model
history = glove_emd_GRU_model.fit(X_tr_v, y_tr_v, batch_size=32,epochs=40,validation_data=(X_te_v,y_te_v),verbose=0)
plotLoss(history)
# Evaluate the model
print(evaluate(X_te_v, y_test, tokenizer, glove_emd_GRU_model))
print("--------------------------")

**Hyperparameter Tuning with 60 epochs**

We stick with SGD optimizer and use 60 epochs to tune all the parameters

In [None]:
# Parameters to tune
activation = ['tanh', 'relu', 'sigmoid'] # activations
node_num = [20,30,64] # width of the layer
gru = [True, False]
# Due to the large number of parameters to tune, we print the result out so that the memory won't get overloaded
for g in gru:
    for n in node_num:
        for a1 in activation:
            for a2 in activation:
                # Instantiate a GRU with embedding based on GloVe
                model = build_model(n, a1, a2, sgd, gru=g)
                # Fit the model
                history = model.fit(X_tr_v, y_tr_v, batch_size=32,epochs=60,validation_data=(X_te_v,y_te_v),verbose=0)
                # Evaluate the model 
                print("GRU?", g)
                print("Number of nodes: ",n)
                print("Activation 1:", a1)
                print("Activation 2:", a2)
                print("Jaccard Score: ", evaluate(X_te_v, y_test,tokenizer, model))
                print("--------------------------")

In [13]:
X_v = np.array(X).reshape(X.shape[0],X.shape[1],1).astype(np.float32) # Change the data type
y_v = np.array(y).reshape(y.shape[0],y.shape[1],1)

**K-Fold Cross-Validation**

In [None]:
# define 10-fold cross validation test harness
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
cvscores = []
for train, test in kfold.split(X_v, y_v):
    model = build_model(20, "tanh", "relu", sgd, gru=True)# finalize model
    model.fit(X_v[train], y_v[train], epochs=60, batch_size=32, verbose=0)
    # evaluate the model
    scores = model.evaluate(X_v[test], y_v[test], verbose=0)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)
print("%.2f%% (+/- %.2f%%)" % (numpy.mean(cvscores), numpy.std(cvscores)))

In [None]:
# define 10-fold cross validation test harness
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
cvscores = []
for train, test in kfold.split(X_v, y_v):
    model = build_model(20, "relu",  "tanh", sgd, gru=False)# finalize model
    model.fit(X_v[train], y_v[train], epochs=60, batch_size=32, verbose=0)
    # evaluate the model
    scores = model.evaluate(X_v[test], y_v[test], verbose=0)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)
print("%.2f%% (+/- %.2f%%)" % (numpy.mean(cvscores), numpy.std(cvscores)))

In [None]:
# Fit in all data to submit for 
lstm = build_model(20, "relu",  "tanh", sgd, gru=False)
lstm.fit(X_v, y_v, epochs=60, batch_size=32, verbose=0)

In [15]:
# Fit in all data to submit for 
sgd = SGD(lr=0.1, momentum=0.9)
gru = build_model(20, "relu",  "tanh", sgd, gru=True)
gru.fit(X_v, y_v, epochs=60, batch_size=32, verbose=0)

<tensorflow.python.keras.callbacks.History at 0x13ca92710>

In [16]:
# Make a submission using GRU
submission = pd.read_csv('data/sample_submission.csv')
sub_test = pad_sequences(tokenizer.texts_to_sequences(test.text), maxlen = maxlen)
# Transform the data dimension
sub_test = np.array(sub_test).reshape(sub_test.shape[0],sub_test.shape[1],1).astype(np.float32)
sub_test_pred = gru.predict_classes(sub_test)
sub_test_pred = sub_test_pred.reshape(sub_test_pred.shape[0],sub_test_pred.shape[1])

# Transform X back
sub_test = sub_test.reshape(sub_test.shape[0],sub_test.shape[1])


In [18]:
# Make a submission using GRU
submission = pd.read_csv('data/sample_submission.csv')
sub_test = pad_sequences(tokenizer.texts_to_sequences(test.text), maxlen = maxlen)
# Transform the data dimension
sub_test = np.array(sub_test).reshape(sub_test.shape[0],sub_test.shape[1],1).astype(np.float32)
sub_test_pred = gru.predict_classes(sub_test, verbose=0)
sub_test_pred = sub_test_pred.reshape(sub_test_pred.shape[0],sub_test_pred.shape[1])
# Transform X back
sub_test = sub_test.reshape(sub_test.shape[0],sub_test.shape[1])

for i in range(len(sub_test_pred)):
  # Decode and fill in the file
  submission["selected_text"][i] = decode(sub_test[i], sub_test_pred[i], tokenizer)
#Write to a csv file
submission.to_csv('submission.csv', index=False)

[[0 0 0 ... 1 1 1]
 [0 0 0 ... 1 1 1]
 [0 0 0 ... 1 1 1]
 ...
 [0 0 0 ... 1 1 1]
 [0 0 0 ... 1 1 1]
 [0 0 0 ... 1 1 1]]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
