In [23]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
import random
from torch.optim.lr_scheduler import ExponentialLR
from torch.optim.lr_scheduler import StepLR
import os
from torch.nn.utils.rnn import pad_sequence
from collections import OrderedDict

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

################################################################################
############################ DATA PREPROCESSING ################################
################################################################################


#transforms the data of given dataframe
def data_preprocessing(df):
#   remove_links(df)
#   remove_hashtags(df)
#   remove_mentions(df)
#   replace_links(df)
#   replace_hashtags(df)
#   replace_mentions(df)
#   replace_numbers(df)
#   replace_upper_words(df)
  remove_non_alpha(df)
  to_lowercase(df)
  lemmatize(df)
  remove_stop_words(df)

def remove_links(df):
  df.update(df.apply(lambda x: re.sub(r'(https?:\/\/[A-Za-z0-9\/.]*)|(bit.ly\/[A-Za-z0-9\/.]*)', '', x, flags=re.MULTILINE)))

def remove_non_alpha(df):
  df.update(df.apply(lambda x: re.sub(r'[^ a-zA-Z]', ' ', x, flags=re.MULTILINE)))

def to_lowercase(df):
  df.update(df.apply(lambda x: x.lower()))

def lemmatize(df):
  lemmatizer = WordNetLemmatizer() 
  df.update(df.apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in tokenizer(x)])))

def remove_stop_words(df):
  stop = set(stopwords.words('english'))
  df.update(df.apply(lambda x: ' '.join([word for word in tokenizer(x) if word not in (stop)])))

def remove_hashtags(df):
  df.update(df.apply(lambda x: re.sub(r'(#[A-Za-z0-9]*)', '',x, flags=re.MULTILINE)))

def remove_mentions(df):
  df.update(df.apply(lambda x: re.sub(r'(@[A-Za-z0-9]*)', '',x, flags=re.MULTILINE)))
  
def replace_mentions(df):
  df.update(df.apply(lambda x: re.sub(r'(@[A-Za-z0-9]*)', '<user>',x, flags=re.MULTILINE)))
  
def replace_hashtags(df):
  df.update(df.apply(lambda x: re.sub(r'(#[A-Za-z0-9]*)', '<hashtag>',x, flags=re.MULTILINE)))
  
def replace_numbers(df):
  df.update(df.apply(lambda x: re.sub(r"[-+]?\d*\.\d+|\d+", '<number>',x, flags=re.MULTILINE)))

def replace_upper_words(df):
  df.update(df.apply(lambda x: re.sub(r'( [A-Z]* )', ' <allcaps> ',x, flags=re.MULTILINE)))

def replace_links(df):
  df.update(df.apply(lambda x: re.sub(r'(https?:\/\/[A-Za-z0-9\/.]*)|(bit.ly\/[A-Za-z0-9\/.]*)', '<url>', x, flags=re.MULTILINE)))


################################################################################
############################## UTILITY FUNCTIONS ###############################
################################################################################

def set_seed(seed = 1234):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    

#a tokenizer that recognizes the special_tokens list as tokens
def tokenizer(input_str):
    final_tokens = []
    special_tokens = ['<user>','<allcaps>','<hashtag>','<number>','<url>']
    for token in special_tokens:
        for i in range(input_str.count(token)):
            final_tokens.append(token)
        input_str = input_str.replace(token,'')
    final_tokens += word_tokenize(input_str)
    return final_tokens


#load glove model
#got ideas from https://stackoverflow.com/questions/37793118/load-pretrained-glove-vectors-in-python
def load_pre_trained_model(filename):
  pretrained_model=dict()
  with open(filename,'r') as fd:
    for line in fd:
      tokens = line.split()
      pretrained_model[tokens[0]]=np.array(tokens[1:],dtype=np.float32)

  return pretrained_model


#vectorize df using word embeddings
def vectorize(df,vocabulary):
  document_embeddings = df.apply(
      lambda x: np.array([vocabulary[word] for word in tokenizer(x) if word in vocabulary.keys()])
  )
  return document_embeddings


def ROCplot(testy,predy):
    classesc=[0,1,2]
    test_y = label_binarize(testy,classes=classesc)
    pred_y = label_binarize(predy,classes=classesc)
    n_classes=3

    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(test_y[:, i], pred_y[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    # Plot all ROC curves
    plt.figure()
    colors = ['red','blue', 'green']
    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color,
                 label='ROC curve of class {0} (area = {1:0.2f})'
                 ''.format(classesc[i], roc_auc[i]))

    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()


def learning_curve_plot(epochs, train_scores, test_scores, yaxis='Loss'):
    plt.grid()
    plt.title("Learning curve Result")
    plt.xlabel("Epochs")
    plt.ylabel(yaxis)
    plt.fill_between(
        epochs,
        train_scores,
        alpha=0.1,
        color="r",
    )
    plt.fill_between(
        epochs,
        test_scores,
        alpha=0.1,
        color="g",
    )
    plt.plot(
        epochs, train_scores, "o-", color="r", label="Training score"
    )
    plt.plot(
        epochs, test_scores, "o-", color="g", label="Validation score"
    )
    plt.legend(loc="best")
    plt.show()


#returns a dataframe holding precision,recall,f1,accurasy scores
#test on training and validation sets
def get_scores(trainTrue,trainPred,validationTrue,validationPred):

    scores={}
    scores['set']=[]
    scores['precision']=[]
    scores['f1']=[]
    scores['recall']=[]
    scores['accuracy']=[]

    scores['set'].append('Training')
    scores['precision'].append(metrics.precision_score(trainTrue,trainPred,average='weighted',zero_division=0))
    scores['f1'].append(metrics.f1_score(trainTrue,trainPred,average='weighted',zero_division=0))
    scores['recall'].append(metrics.recall_score(trainTrue,trainPred,average='weighted',zero_division=0))
    scores['accuracy'].append(metrics.accuracy_score(trainTrue,trainPred))

    scores['set'].append('Validation')
    scores['precision'].append(metrics.precision_score(validationTrue,validationPred,average='weighted',zero_division=0))
    scores['f1'].append(metrics.f1_score(validationTrue,validationPred,average='weighted',zero_division=0))
    scores['recall'].append(metrics.recall_score(validationTrue,validationPred,average='weighted',zero_division=0))
    scores['accuracy'].append(metrics.accuracy_score(validationTrue,validationPred))

    return  pd.DataFrame(data=scores).set_index('set')


def display_results(Y_train,Y_validation,Pred_train,Pred_validation,Loss_train,
                    Loss_validation,Score_train,Score_validation,epochs):
    display(get_scores(Y_train,Pred_train,Y_validation,Pred_validation))
    learning_curve_plot(epochs, Loss_train, Loss_validation, yaxis='Loss')
    learning_curve_plot(epochs, Score_train, Score_validation, yaxis='f1')
    ROCplot(Y_validation,Pred_validation)


################################################################################
############################ NEURAL NETWORK ####################################
################################################################################

#got ideas from https://pytorch.org/tutorials/beginner/basics/optimization_tutorial.html
class NeuralNetwork(nn.Module):
    def __init__(self, feature_size):
        super(NeuralNetwork, self).__init__()
        self.layer1 = nn.LSTM(feature_size, 100, batch_first=True, bidirectional=True)
        self.layer2 = nn.LSTM(200, 50, batch_first=True, bidirectional=True)
        self.layer3 = nn.LSTM(feature_size, 50, batch_first=True, bidirectional=True)
        self.layer4 = nn.GRU(100, 25, batch_first=True, bidirectional=True)
        self.layer5 = nn.GRU(50, 5, batch_first=True, bidirectional=True)
        self.layer6 = nn.GRU(10, 3, batch_first=True)
        self.layer7 = nn.GRU(50, 3, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        # x = nn.BatchNorm1d(x.shape[1])(x)

    def forward(self, x):
        a = self.layer1(x)[0]
        a = nn.BatchNorm1d(a.shape[1])(a)
        a = self.dropout(a)
        a = self.layer2(a)[0]
        b = self.layer3(x)[0]

        x = torch.add(a,b)
        x = self.layer4(x)[0]
        x = nn.BatchNorm1d(x.shape[1])(x)
        x = self.dropout(x)
        
        a = self.layer5(x)[0]
        a = nn.BatchNorm1d(x.shape[1])(a)
        a = self.dropout(a)
        a = self.layer6(a)[0]

        b = self.layer7(x)[0]

        x = torch.add(a,b)
        return x.mean(dim=1)


class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return torch.tensor(self.X[idx],dtype=torch.float32),self.y[idx]

def sequence_padding(batch):
    (xx, yy) = zip(*batch)
    xx = pad_sequence(xx, batch_first=True, padding_value=0)
    return xx, torch.tensor(yy)


def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    loss_sum, score_sum = 0,0

    for X, y in dataloader:
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)
        score_sum += metrics.f1_score(y,pred.argmax(1),average='weighted',zero_division=0)
        loss_sum += loss.item()

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(),2)

        optimizer.step()

    return loss_sum/num_batches, score_sum/num_batches


def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    loss_sum, score_sum = 0, 0
    final_pred = torch.zeros(1)

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            final_pred = torch.cat((final_pred, pred.argmax(dim=1)),0)
            loss_sum += loss_fn(pred, y).item()
            score_sum += metrics.f1_score(y,pred.argmax(1),average='weighted',zero_division=0)

    return loss_sum/num_batches, score_sum/num_batches, final_pred[1:]


#init dataloader and train the model
def run_model(X_train, Y_train, X_validation, Y_validation, batch_size, epochs,
              lr_scheduler, loss_fn, optimizer, model):
    Set_train = CustomDataset(X_train,Y_train)
    Set_validation = CustomDataset(X_validation,Y_validation)

    Dataloader_train = DataLoader(Set_train, batch_size=batch_size, collate_fn=sequence_padding)
    Dataloader_validation = DataLoader(Set_validation, batch_size=batch_size, collate_fn=sequence_padding)

    Loss_train=[]
    Loss_validation=[]
    Score_train=[]
    Score_validation=[]

    for current_epoch in range(epochs):
        print("-------------------------------------------------------------------------------")
        print("Epoch ", current_epoch+1, ":")
        print("Learning rate: ", lr_scheduler.get_last_lr())

        epoch_loss_train, epoch_score_train = train_loop(Dataloader_train, model, loss_fn, optimizer)
        epoch_loss_validation, epoch_score_validation, _ = test_loop(Dataloader_validation, model, loss_fn)
        lr_scheduler.step()

        Loss_train.append(epoch_loss_train)
        Loss_validation.append(epoch_loss_validation)
        Score_train.append(epoch_score_train)
        Score_validation.append(epoch_score_validation)

        print("Training: Loss: ", epoch_loss_train, " Score: ", epoch_score_train)
        print("Test    : Loss: ", epoch_loss_validation, " Score: ", epoch_score_validation)

    _, _, Pred_train = test_loop(Dataloader_train, model, loss_fn)
    _, _, Pred_validation = test_loop(Dataloader_validation, model, loss_fn)

    display_results(Y_train,Y_validation,Pred_train,Pred_validation,Loss_train,
                    Loss_validation,Score_train,Score_validation,range(epochs))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dinos\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dinos\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dinos\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
training_set_path = "C:\\Users\\dinos\\Desktop\\vaccine_train_set.csv"
validation_set_path = "C:\\Users\\dinos\\Desktop\\vaccine_validation_set.csv"


#set manual seeds
set_seed()

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device} device')


#importing datasets

train = pd.read_csv(training_set_path,usecols=['tweet','label'])
validation = pd.read_csv(validation_set_path,usecols=['tweet','label'])


#data preprocessing

data_preprocessing(train['tweet'])
data_preprocessing(validation['tweet'])

Using cuda device


In [24]:
#model hyperparameters

learning_rate = 0.1
batch_size = 32
epochs = 50


#download the glove pre trained embeddings

# !wget https://nlp.stanford.edu/data/glove.6B.zip
# !unzip glove.6B.zip


#load the pre-trained model

# glove_pretrained_model = load_pre_trained_model("C:\\Users\\dinos\\Desktop\\glove.6B.300d.txt")


#vectorize data

# X_train = vectorize(train['tweet'],glove_pretrained_model)
# X_validation = vectorize(validation['tweet'],glove_pretrained_model)
# Y_train,Y_validation = train['label'], validation['label']


#initialize neural network with prerequisite layers below

model = NeuralNetwork(len(X_train[0][0]))


#Loss function: Cross entropy 
#using label smoothing to avoid minor overfitting

loss_fn = nn.CrossEntropyLoss()


#Optimizer: SGD using momentum
#Regularizer: L2 regularization via weight_decay

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


#LR scheduler: exponential decrease

lr_scheduler = ExponentialLR(optimizer, gamma=0.9)


#train the model and print results

run_model(X_train, Y_train, X_validation, Y_validation, batch_size, epochs,
              lr_scheduler, loss_fn, optimizer, model)

-------------------------------------------------------------------------------
Epoch  1 :
Learning rate:  [0.1]
Training: Loss:  0.9947507845163346  Score:  0.31852726886270444
Test    : Loss:  0.9913745845357577  Score:  0.3634640436550124
-------------------------------------------------------------------------------
Epoch  2 :
Learning rate:  [0.09000000000000001]
Training: Loss:  0.9909972252845765  Score:  0.3269253829507746
Test    : Loss:  0.9905143173204528  Score:  0.3130411472222374
-------------------------------------------------------------------------------
Epoch  3 :
Learning rate:  [0.08100000000000002]
Training: Loss:  0.9901564345359802  Score:  0.30468565433851846
Test    : Loss:  0.9899065759446886  Score:  0.3145009658268488
-------------------------------------------------------------------------------
Epoch  4 :
Learning rate:  [0.07290000000000002]
Training: Loss:  0.9901964709758758  Score:  0.36679078029194123
Test    : Loss:  0.9889249983761046  Score:  0.40

Training: Loss:  0.9879660187959671  Score:  0.3021090345488728
Test    : Loss:  0.9890670677026113  Score:  0.3039686620167521
-------------------------------------------------------------------------------
Epoch  34 :
Learning rate:  [0.0030903154382632653]
Training: Loss:  0.9880283560752868  Score:  0.3021400742725047
Test    : Loss:  0.9867851030495431  Score:  0.3039686620167521
-------------------------------------------------------------------------------
Epoch  35 :
Learning rate:  [0.002781283894436939]
Training: Loss:  0.9881607784032822  Score:  0.30198502883882056
Test    : Loss:  0.9865527177850405  Score:  0.3039686620167521
-------------------------------------------------------------------------------
Epoch  36 :
Learning rate:  [0.002503155504993245]
Training: Loss:  0.9881955523490906  Score:  0.3021104289835914
Test    : Loss:  0.9869197193119261  Score:  0.3039686620167521
-------------------------------------------------------------------------------
Epoch  37 :
L

KeyboardInterrupt: 

In [None]:
print("Model structure: ", model, "\n\n")

for name, param in model.named_parameters():
    print(f"Layer: {name} | Size: {param.size()} | Values : {param[:2]} \n")