# Model

This notebook presents the code for the models used in the dissertation project.

## Preprocessing

The first step is to create the Twitter lexicon used for spell correction. The lexicon is publicly available and was developed by S. Rosenthal, P. Nakov, S. Kiritchenko, S. M. Mohammad, A. Ritter, and V. Stoyanov for their paper Semeval-2015 task 10: Sentiment analysis in Twitter. Proceedings of the ninth international workshop on Semantic Evaluation Exercises (SemEval-2015). 2015.

In [None]:
import csv

def make_lexicon(lexicon_file):
    """Read the second column of a tab-separated lexicon file into a list.

    lexicon_file: path of the tab-delimited lexicon file.

    Returns a list holding the value of column index 1 of every row, in file
    order. Rows with fewer than two columns (e.g. blank lines) are skipped
    instead of raising IndexError.
    """
    # newline='' lets the csv module do its own line-ending handling
    with open(lexicon_file, newline='') as lexicon:
        lexicon_reader = csv.reader(lexicon, delimiter='\t')
        return [row[1] for row in lexicon_reader if len(row) > 1]

Next we load the tweets and perform the preprocessing that is implemented in the load_tweets.py file. The first time this is performed the data batches need unzipping.

In [None]:
import load_tweets
import os

# load training data
training_filepath = "./drive/My Drive/training_data/"

# first time running only, unzip each batch
first_time = False
if first_time:
    for r, d, f in os.walk(training_filepath):
        for filename in f:
            # only .tgz archives get a batch folder; creating one for every file
            # (e.g. the user-info csv) would leave empty folders that the
            # preprocessing loop below would then try to load as batches
            if not filename.endswith('.tgz'):
                continue
            # folder name is the archive name without its '.tgz' suffix
            batch_folder = os.path.join(training_filepath, filename[:-4])
            # exist_ok so that re-running the cell does not crash on existing folders
            os.makedirs(batch_folder, exist_ok=True)
            load_tweets.unzip_batch(os.path.join(r, filename), batch_folder)

# initialise lists for storing the data
# one data list and one label list per sub-task; the cd / cp / dp prefixes
# match the sub-task names used by the cross-validation cells
cd_training_data, cp_training_data, dp_training_data = [], [], []
cd_training_labels, cp_training_labels, dp_training_labels = [], [], []

# make the lexicon
lexicon = make_lexicon('./twitter-lexicon.txt')

# preprocess the data batch by batch
# NOTE(review): os.walk recurses, so directories nested inside a batch folder
# are also passed to load_batch - confirm batch folders contain no sub-folders
for r, d, f in os.walk(training_filepath):
    for batch in d:
        batch_folder = os.path.join(r, batch)
        # load_batch hands back three data lists followed by three label lists
        cd_batch_data, cp_batch_data, dp_batch_data, cd_batch_labels, cp_batch_labels, dp_batch_labels = load_tweets.load_batch(batch_folder, "drive/My Drive/training_data/anonymized_user_info_by_chunk_training.csv", lexicon)
        cd_training_data += cd_batch_data
        cp_training_data += cp_batch_data
        dp_training_data += dp_batch_data
        cd_training_labels += cd_batch_labels
        cp_training_labels += cp_batch_labels
        dp_training_labels += dp_batch_labels

Import necessary libraries for the remainder of the preprocessing and for building the model.

In [None]:
import numpy as np
import random
import keras
from keras import optimizers
from keras.models import Model, Sequential
from keras.layers import Input, Embedding, Flatten, Dense, Bidirectional, LSTM, Dropout, MaxPooling1D, Conv1D, TimeDistributed, concatenate
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, roc_curve

The final steps of the preprocessing are to tokenize the text, convert it to sequences, and pad so that all tweets are the same length. We then have a structured dataset where each user has 3000 tweets and each tweet is the same length (we decided on a token length of 30 due to the 140-character limit of tweets).

In [None]:
def load_pad_data(data, labels):
    """Tokenize every tweet and turn each user's tweets into padded sequences.

    data: iterable of users, each user an iterable of (tweet, score) pairs.
    labels: passed through unchanged as the second return value.

    Returns (train_X, train_y, word_index): train_X holds one pad_sequences
    result (maxlen=30) per user, train_y is `labels`, and word_index is the
    fitted tokenizer's word index, kept for building the embedding layers.
    """
    max_length = 30
    tokenizer = Tokenizer()

    # materialise the (tweet, score) pairs per user so they can be read twice
    users_train_X = [[(tweet, score) for (tweet, score) in user] for user in data]

    # first pass: grow the tokenizer vocabulary one tweet at a time
    # NOTE(review): fit_on_texts receives a single tweet, not a list of tweets;
    # if a tweet is a plain string this iterates it character by character -
    # confirm that load_tweets yields each tweet as a list of tokens
    for user_pairs in users_train_X:
        for tweet, _score in user_pairs:
            tokenizer.fit_on_texts(tweet)

    # second pass: text -> integer sequences -> fixed-length padded array per user
    train_X = [
        pad_sequences(
            tokenizer.texts_to_sequences([pair[0] for pair in user_pairs]),
            maxlen=max_length,
        )
        for user_pairs in users_train_X
    ]

    return train_X, labels, tokenizer.word_index

## Build Models

Object for building the baseline and few-shot learning (FSL) models. The FSL method currently implemented is method 3, where the data is partitioned pseudo-randomly across tweets for each user.

In [None]:
class BLSTM(object):
    """Bidirectional-LSTM classifier with optional few-shot-learning (FSL) modules.

    The baseline / FSL task model is stored in `self.model` (built by
    `build_model` or `build_fsl_task`); the FSL correction model is stored in
    `self.correction_model` (built by `build_fsl_correction`). `embed_twitter`
    has to run before any build method, because the builders read
    `self.embedding_matrix`.
    """

    # initialise the model with the training data and labels, and the word index
    def __init__(self, model_name, train_X, train_y, word_index):
        self.model_name = model_name
        self.train_X = train_X
        self.train_y = train_y
        self.word_index = word_index

    # custom embedding method, which uses the Twitter GloVE trained embeddings
    def embed_twitter(self):
        """Load the 100-d Twitter GloVe vectors and build the embedding matrix.

        Fills `self.embeddings_index` (word -> vector) from the GloVe text file
        and `self.embedding_matrix`, whose row `index` holds the vector of the
        word with that index in `self.word_index`. Words without a GloVe vector
        keep an all-zero row.
        """
        # create embedding dictionary
        self.embeddings_index = {}

        # open twitter-trained word embeddings; the context manager closes the
        # file even if a line fails to parse
        with open('./drive/My Drive/embeddings/glove.twitter.27B.100d.txt', 'r', encoding='utf-8') as txt:
            for line in txt:
                values = line.rstrip('\n').split(' ')
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                self.embeddings_index[word] = coefs

        # create embedding matrix for words in the word index
        self.embedding_matrix = np.zeros((len(self.word_index)+1, 100))
        for word, index in self.word_index.items():
            embedding_vector = self.embeddings_index.get(word)
            if embedding_vector is not None:
                self.embedding_matrix[index] = embedding_vector

    # build one shared embedding layer and apply it to n_branches tweet inputs
    def _embedding_branches(self, n_branches):
        """Return (inputs, outputs): one Input(shape=(30,)) per tweet and its embedding."""
        # define the custom embedding layer, initialised from the GloVe matrix
        embedding_layer = Embedding(len(self.word_index)+1, 100, input_length=(30), weights=[self.embedding_matrix], trainable=True)

        # create the embedding branches, one for each tweet
        embedding_inputs = []
        embedding_outputs = []
        for _ in range(n_branches):
            branch_input = Input(shape=(30,))
            embedding_inputs.append(branch_input)
            embedding_outputs.append(embedding_layer(branch_input))
        return embedding_inputs, embedding_outputs

    # callbacks for early stopping and reducing learning rate
    @staticmethod
    def _make_callbacks():
        """Return fresh callback objects for one fit() call."""
        # both callbacks watch the training loss: fit() is always run with
        # validation_split=0, so the default 'val_loss' metric is never logged
        return [keras.callbacks.EarlyStopping(monitor='loss', min_delta=0.0001, verbose=1, patience=4),
                keras.callbacks.ReduceLROnPlateau(monitor='loss', patience=2, verbose=1)]

    # Architecture of the baseline model and task module
    # baseline takes 3000 tweets, task module takes 1500
    def build_model(self, tweets=3000, pretrained=True):
        """Build and compile `self.model`.

        tweets: number of tweet inputs (one Input branch per tweet).
        pretrained: when true, load weights by layer name from
            'pretrained_weights.h5' before compiling.

        Also stores the compiled model's weights in `self.init_weights`.
        """
        embedding_inputs, embedding_outputs = self._embedding_branches(tweets)

        # merge layer combining all embeddings
        merged = concatenate(embedding_outputs)
        # bi-directional LSTM layer with size 50
        blstm = Bidirectional(LSTM(50))(merged)
        # Dense output layer, uses sigmoid activation function for binary classification problem
        out = Dense(1,  activation='sigmoid')(blstm)

        # Assemble the model layers and define inputs and output
        self.model = Model(inputs=embedding_inputs, outputs=[out])

        # if pretraining used load the weights for the desired layers, by name
        if pretrained:
            self.model.load_weights('pretrained_weights.h5', by_name=True)

        # compile model, and get weights if desired
        self.model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        self.init_weights = self.model.get_weights()

    # build the task module by calling the baseline method above with desired number of tweets
    # number of tweets is 1500 for FSL methods 2 and 3, 3000 for method 1 - currently implemented for method 3
    def build_fsl_task(self):
        self.build_model(tweets=1500)

    # build correction module
    def build_fsl_correction(self, correction_size=0.5):
        """Build and compile `self.correction_model`.

        correction_size: multiple of 3000 tweets fed to the correction module.
            The default of 0.5 gives 1500 tweet inputs, matching the 1500 used
            by `build_fsl_task`. Fractions are allowed; the branch count is
            rounded to an integer.

        The model takes the tweet inputs followed by one extra input of shape
        (1,) carrying the task module's prediction. Overwrites
        `self.init_weights` with the correction model's weights.

        Raises ValueError if the resulting number of tweet inputs is below 1.
        """
        n_branches = int(round(correction_size*3000))
        if n_branches < 1:
            raise ValueError('correction_size must give at least one tweet input')

        embedding_inputs, embedding_outputs = self._embedding_branches(n_branches)

        # input the prediction from the task module
        task_prediction = Input(shape=(1,))
        task_dense = Dense(1, activation='relu')(task_prediction)

        # merge all embeddings
        merged = concatenate(embedding_outputs)
        blstm = Bidirectional(LSTM(50))(merged)
        dense = Dense(1,  activation='relu')(blstm)

        # merge the BLSTM outputs with the task module prediction
        merged2 = concatenate([dense, task_dense])
        out = Dense(1, activation='linear')(merged2)

        # assemble model with inputs and outputs; the task prediction is the last input
        all_inputs = embedding_inputs + [task_prediction]
        self.correction_model = Model(inputs=all_inputs, outputs=[out])

        # define stochastic gradient descent optimiser and compile model
        sgd = optimizers.SGD(lr=0.01, decay=1, momentum=0.9, nesterov=True)
        self.correction_model.compile(loss='mean_squared_error', optimizer=sgd, metrics=['accuracy'])
        self.init_weights = self.correction_model.get_weights()

    def train(self):
        """Fit `self.model` on the stored training data; history goes to `self.training`."""
        print('Beginning model training.')
        self.training = self.model.fit(self.train_X, self.train_y, batch_size=10, validation_split=0, epochs=10, callbacks=self._make_callbacks(), verbose=1)

        # saving model weights for pretraining or saving the model.
        #self.model.save(self.model_name+'.h5')
        #print('Model weights saved.')

    # Training the FSL model, train_X2 feeds the task module, train_X3 trains the correction module
    def train_fsl(self, train_X3, train_X2, train_y2):
        """Train the task module, then train the correction module on its errors.

        train_X3: tweet inputs for the correction module, without the task
            prediction (it is added here as the final input).
        train_X2: inputs on which the trained task module makes the
            predictions that are to be corrected.
        train_y2: true labels aligned with the task predictions.

        Neither input list is modified. Histories are stored in
        `self.training` (task) and `self.training_2` (correction).
        """
        print('Training FSL Task Module')
        self.training = self.model.fit(self.train_X, self.train_y, batch_size=5, validation_split=0, epochs=10, callbacks=self._make_callbacks(), verbose=1)

        # make the task predictions
        task_predictions = self.model.predict(train_X2)

        # calculate the true 'corrections' needed on the task predictions
        # these are the training labels for the correction module
        train_y_prime = np.asarray(train_y2, dtype='float64').reshape(-1) - np.asarray(task_predictions, dtype='float64').reshape(-1)

        # the correction model's last input is the task prediction, mirroring
        # evaluate_fsl; build a new list so the caller's data is left untouched
        correction_inputs = list(train_X3) + [task_predictions]

        # train the correction module with the calculated correction labels
        print('Training FSL Correction Module')
        self.training_2 = self.correction_model.fit(correction_inputs, train_y_prime, batch_size=5, validation_split=0, epochs=10, callbacks=self._make_callbacks(), verbose=1)

    # Get the training histories for the models
    # baseline model has one history per training session, FSL has 2 - one for each module
    def get_training(self, fsl=True):
        if fsl:
            return self.training, self.training_2
        else:
            return self.training

    # make predictions for the test data
    def evaluate(self, test_X):
        """Return (rounded predictions as a list of 0/1, raw predictions of self.model)."""
        predictions = self.model.predict(test_X)
        baseline_y_round = [1 if prediction > 0.5 else 0 for prediction in predictions]  # round using threshold 0.5
        return baseline_y_round, predictions

    # make predictions for the test data with the FSL model
    def evaluate_fsl(self, test_X1, test_X2):
        """Predict with the task module and add the correction module's output.

        test_X1: inputs for the task module.
        test_X2: tweet inputs for the correction module; the task prediction is
            supplied as its final input without modifying the caller's list.

        Returns (list of 0/1 labels thresholded at 0.5, list of summed predictions).
        """
        # make task predictions
        task_predictions = self.model.predict(test_X1)
        # add predictions to test data for correction module
        correction_inputs = list(test_X2) + [task_predictions]
        # make correction prediction
        correction_predictions = self.correction_model.predict(correction_inputs)

        # assemble predictions by summing the task prediction and the correction
        fsl_predictions = [task + correction for task, correction in zip(task_predictions, correction_predictions)]
        fsl_y = [1 if prediction > 0.5 else 0 for prediction in fsl_predictions]  # round using threshold 0.5
        return fsl_y, fsl_predictions

## Depression vs. Control sub-task

## Baseline Model

First load the data, then set up the 5-fold cross validation for this task for the baseline.

In [None]:
# tokenise and pad the depression-vs-control data once; every fold slices from it
train_X, train_y, cd_word_index = load_pad_data(cd_training_data, cd_training_labels)

training = []
classification_reports = []

n_users = len(train_X)
for fold in range(1, 6):

    # the fold-th contiguous 20% of users is held out as the test fold; for
    # fold 1 the lower bound is 0 and for fold 5 the upper bound is n_users
    fold_start = int(0.2*(fold-1)*n_users)
    fold_end = int(0.2*(fold)*n_users)
    test_X1, test_y1 = train_X[fold_start:fold_end], train_y[fold_start:fold_end]
    train_X1 = train_X[:fold_start] + train_X[fold_end:]
    train_y1 = train_y[:fold_start] + train_y[fold_end:]

    # regroup from one entry per user to one array per tweet position, so
    # that each array feeds one of the model's tweet inputs
    train_X1 = [np.asarray(list(tweet_slot)) for tweet_slot in zip(*train_X1)]
    test_X1 = [np.asarray(list(tweet_slot)) for tweet_slot in zip(*test_X1)]
    train_y1 = np.asarray(train_y1)
    test_y1 = np.asarray(test_y1)

    # build model, create embedding and train model
    cd_model = BLSTM('Baseline_cd{}'.format(str(fold)), train_X1, train_y1, cd_word_index)
    cd_model.embed_twitter()
    cd_model.build_model()
    cd_model.train()

    # save training history to list
    training.append(cd_model.get_training(fsl=False))

    # get predictions on the test fold and keep its classification report
    rounded_predictions, predictions = cd_model.evaluate(test_X1)
    classification_reports.append(classification_report(test_y1, rounded_predictions))

Exploring the classification reports helps establish the performance of the model. Precision at a 10% false alarm rate uses the unrounded predictions.

## Few-Shot Learning Model

Same process as for the baseline, but need to partition the training data.

In [None]:
train_X, train_y, cd_word_index = load_pad_data(cd_training_data, cd_training_labels)

training = []
classification_reports = []

for i in range(1, 6):

    # separate the test fold (the i-th contiguous 20% of users) from the training data
    length1 = int(0.2*(i-1)*len(train_X))
    length2 = int(0.2*(i)*len(train_X))
    test_X1, test_y1 = train_X[length1:length2], train_y[length1:length2]
    train_X1 = train_X[:length1] + train_X[length2:]
    train_y1 = train_y[:length1] + train_y[length2:]

    # regroup from one entry per user to one array per tweet position
    train_X1 = [np.asarray(list(x)) for x in zip(*train_X1)]
    train_y1 = np.asarray(train_y1)

    test_X1 = [np.asarray(list(x)) for x in zip(*test_X1)]
    test_y1 = np.asarray(test_y1)

    # select indices for partitioning tweets: a random half of the tweet
    # positions goes to the task module, the remainder to the correction module
    n_tweets = len(train_X1)
    random_index = random.sample(range(n_tweets), int(0.5*n_tweets))
    random_index.sort()
    task_positions = set(random_index)  # set membership keeps the split linear-time
    correction_index = [j for j in range(n_tweets) if j not in task_positions]

    # partition tweets into two sets, using the same split for train and test
    train_X1, train_X2 = [train_X1[j] for j in random_index], [train_X1[j] for j in correction_index]
    test_X1, test_X2 = [test_X1[j] for j in random_index], [test_X1[j] for j in correction_index]

    cd_model = BLSTM('FSL_cd{}'.format(str(i)), train_X1, train_y1, cd_word_index)
    cd_model.embed_twitter()
    cd_model.build_fsl_task() # build task module
    cd_model.build_fsl_correction(correction_size=0.5) # build correction module for the other half of the tweets
    # train_fsl takes (correction inputs, task inputs, labels)
    cd_model.train_fsl(train_X2, train_X1, train_y1)
    training.append(cd_model.get_training())

    # evaluate with this fold's depression-vs-control model
    rounded_predictions, predictions = cd_model.evaluate_fsl(test_X1, test_X2)
    classification_reports.append(classification_report(test_y1, rounded_predictions)) # get classification report

## PTSD vs. Control sub-task

## Baseline

In [None]:
# tokenise and pad the PTSD-vs-control data once; every fold slices from it
train_X, train_y, cp_word_index = load_pad_data(cp_training_data, cp_training_labels)

training = []
classification_reports = []

n_users = len(train_X)
for fold in range(1, 6):

    # the fold-th contiguous 20% of users is held out as the test fold; for
    # fold 1 the lower bound is 0 and for fold 5 the upper bound is n_users
    fold_start = int(0.2*(fold-1)*n_users)
    fold_end = int(0.2*(fold)*n_users)
    test_X1, test_y1 = train_X[fold_start:fold_end], train_y[fold_start:fold_end]
    train_X1 = train_X[:fold_start] + train_X[fold_end:]
    train_y1 = train_y[:fold_start] + train_y[fold_end:]

    # regroup from one entry per user to one array per tweet position, so
    # that each array feeds one of the model's tweet inputs
    train_X1 = [np.asarray(list(tweet_slot)) for tweet_slot in zip(*train_X1)]
    test_X1 = [np.asarray(list(tweet_slot)) for tweet_slot in zip(*test_X1)]
    train_y1 = np.asarray(train_y1)
    test_y1 = np.asarray(test_y1)

    # build model, create embedding and train model
    cp_model = BLSTM('Baseline_cp{}'.format(str(fold)), train_X1, train_y1, cp_word_index)
    cp_model.embed_twitter()
    cp_model.build_model()
    cp_model.train()

    # save training history to list
    training.append(cp_model.get_training(fsl=False))

    # get predictions on the test fold and keep its classification report
    rounded_predictions, predictions = cp_model.evaluate(test_X1)
    classification_reports.append(classification_report(test_y1, rounded_predictions))

## Few-Shot Learning Model

In [None]:
train_X, train_y, cp_word_index = load_pad_data(cp_training_data, cp_training_labels)

training = []
classification_reports = []

for i in range(1, 6):

    # separate the test fold (the i-th contiguous 20% of users) from the training data
    length1 = int(0.2*(i-1)*len(train_X))
    length2 = int(0.2*(i)*len(train_X))
    test_X1, test_y1 = train_X[length1:length2], train_y[length1:length2]
    train_X1 = train_X[:length1] + train_X[length2:]
    train_y1 = train_y[:length1] + train_y[length2:]

    # regroup from one entry per user to one array per tweet position
    train_X1 = [np.asarray(list(x)) for x in zip(*train_X1)]
    train_y1 = np.asarray(train_y1)

    test_X1 = [np.asarray(list(x)) for x in zip(*test_X1)]
    test_y1 = np.asarray(test_y1)

    # select indices for partitioning tweets: a random half of the tweet
    # positions goes to the task module, the remainder to the correction module
    n_tweets = len(train_X1)
    random_index = random.sample(range(n_tweets), int(0.5*n_tweets))
    random_index.sort()
    task_positions = set(random_index)  # set membership keeps the split linear-time
    correction_index = [j for j in range(n_tweets) if j not in task_positions]

    # partition tweets into two sets, using the same split for train and test
    train_X1, train_X2 = [train_X1[j] for j in random_index], [train_X1[j] for j in correction_index]
    test_X1, test_X2 = [test_X1[j] for j in random_index], [test_X1[j] for j in correction_index]

    cp_model = BLSTM('FSL_cp{}'.format(str(i)), train_X1, train_y1, cp_word_index)
    cp_model.embed_twitter()
    cp_model.build_fsl_task() # build task module
    cp_model.build_fsl_correction(correction_size=0.5) # build correction module for the other half of the tweets
    # train_fsl takes (correction inputs, task inputs, labels)
    cp_model.train_fsl(train_X2, train_X1, train_y1)
    training.append(cp_model.get_training())

    rounded_predictions, predictions = cp_model.evaluate_fsl(test_X1, test_X2)
    classification_reports.append(classification_report(test_y1, rounded_predictions)) # get classification report

## Depression vs. PTSD sub-task

## Baseline model

In [None]:
# tokenise and pad the depression-vs-PTSD data once; every fold slices from it
train_X, train_y, dp_word_index = load_pad_data(dp_training_data, dp_training_labels)

training = []
classification_reports = []

n_users = len(train_X)
for fold in range(1, 6):

    # the fold-th contiguous 20% of users is held out as the test fold; for
    # fold 1 the lower bound is 0 and for fold 5 the upper bound is n_users
    fold_start = int(0.2*(fold-1)*n_users)
    fold_end = int(0.2*(fold)*n_users)
    test_X1, test_y1 = train_X[fold_start:fold_end], train_y[fold_start:fold_end]
    train_X1 = train_X[:fold_start] + train_X[fold_end:]
    train_y1 = train_y[:fold_start] + train_y[fold_end:]

    # regroup from one entry per user to one array per tweet position, so
    # that each array feeds one of the model's tweet inputs
    train_X1 = [np.asarray(list(tweet_slot)) for tweet_slot in zip(*train_X1)]
    test_X1 = [np.asarray(list(tweet_slot)) for tweet_slot in zip(*test_X1)]
    train_y1 = np.asarray(train_y1)
    test_y1 = np.asarray(test_y1)

    # build model, create embedding and train model
    dp_model = BLSTM('Baseline_dp{}'.format(str(fold)), train_X1, train_y1, dp_word_index)
    dp_model.embed_twitter()
    dp_model.build_model()
    dp_model.train()

    # save training history to list
    training.append(dp_model.get_training(fsl=False))

    # get predictions on the test fold and keep its classification report
    rounded_predictions, predictions = dp_model.evaluate(test_X1)
    classification_reports.append(classification_report(test_y1, rounded_predictions))

## Few-Shot Learning

In [None]:
train_X, train_y, dp_word_index = load_pad_data(dp_training_data, dp_training_labels)

training = []
classification_reports = []

for i in range(1, 6):

    # separate the test fold (the i-th contiguous 20% of users) from the training data
    length1 = int(0.2*(i-1)*len(train_X))
    length2 = int(0.2*(i)*len(train_X))
    test_X1, test_y1 = train_X[length1:length2], train_y[length1:length2]
    train_X1 = train_X[:length1] + train_X[length2:]
    train_y1 = train_y[:length1] + train_y[length2:]

    # regroup from one entry per user to one array per tweet position
    train_X1 = [np.asarray(list(x)) for x in zip(*train_X1)]
    train_y1 = np.asarray(train_y1)

    test_X1 = [np.asarray(list(x)) for x in zip(*test_X1)]
    test_y1 = np.asarray(test_y1)

    # select indices for partitioning tweets: a random half of the tweet
    # positions goes to the task module, the remainder to the correction module
    n_tweets = len(train_X1)
    random_index = random.sample(range(n_tweets), int(0.5*n_tweets))
    random_index.sort()
    task_positions = set(random_index)  # set membership keeps the split linear-time
    correction_index = [j for j in range(n_tweets) if j not in task_positions]

    # partition tweets into two sets, using the same split for train and test
    train_X1, train_X2 = [train_X1[j] for j in random_index], [train_X1[j] for j in correction_index]
    test_X1, test_X2 = [test_X1[j] for j in random_index], [test_X1[j] for j in correction_index]

    dp_model = BLSTM('FSL_dp{}'.format(str(i)), train_X1, train_y1, dp_word_index)
    dp_model.embed_twitter()
    dp_model.build_fsl_task() # build task module
    dp_model.build_fsl_correction(correction_size=0.5) # build correction module for the other half of the tweets
    # train_fsl takes (correction inputs, task inputs, labels)
    dp_model.train_fsl(train_X2, train_X1, train_y1)
    training.append(dp_model.get_training())

    rounded_predictions, predictions = dp_model.evaluate_fsl(test_X1, test_X2)
    classification_reports.append(classification_report(test_y1, rounded_predictions)) # get classification report