In [1]:
import numpy as np
import pandas as pd
import random
import copy
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext import data
import torch.utils.data as d
import tokenization_dim_reduction as tdr

In [2]:
data_dir = r'D:\Researching Data\Youtube data\USvideos.csv'
torch.backends.cudnn.deterministic = True
TEXT = data.Field(tokenize = 'spacy')
LABEL = data.LabelField(dtype = torch.float)

In [3]:
_, dtext, dlabel = tdr.select_col(data_dir, tdr.cols_t4)
new_TEXT = tdr.combine_text(dtext, 1, [0,2])
#new_label = tdr.multi_to_binary(dlabel, 25) # politics
new_label = tdr.multi_to_binary(dlabel, 24) # entertainments
new_arr = np.concatenate((new_TEXT.reshape([len(new_TEXT),1]), new_label), axis=1)

In [4]:
print((new_label[new_label == 1].shape[0] / new_label.shape[0]) * 100,
      " percent of videos are labelled as the selected category")
print("the baseline precision is ", 
      (new_label[new_label == 1].shape[0] / new_label.shape[0]) * 100,
     " in this model")

25.492048496299795  percent of videos are labelled as the selected category
the baseline precision is  25.492048496299795  in this model


In [5]:
# split train, validation, test
def split_train_test(dt_size, train_valid_test_r):
    '''
    The function randomly selects the indices for
    training, validation, and testing sets
    Inputs:
        dt_size: number of rows
        train_valid_test_r: tuple of ratios
    Return: indices for each subset
    '''
    train_size = int(dt_size * train_valid_test_r[0] // 1)
    valid_size = int(dt_size * train_valid_test_r[1] // 1)
    test_size = int(dt_size - train_size - valid_size)
    print("The size of train, valid and test data are", train_size, valid_size, test_size)
    
    full_indices = np.arange(0, dt_size, 1)
    train_indices = np.random.permutation(full_indices)[:train_size]
    
    sub_indices = set(full_indices) - set(train_indices)
    valid_indices = np.random.permutation(list(sub_indices))[:valid_size]
    
    sub_indicest = set(sub_indices) - set(valid_indices)
    test_indices = np.array(list(sub_indicest))
    
    return train_indices, valid_indices, test_indices 

In [6]:
def split_data(path, arr, train_valid_test_r=(0.4, 0.4, 0.2)):
    '''
    The function split the data to train, validation and test
    sets with randomly selected indices and save them to seperated
    csv files
    Inputs:
        path: directory of the saved files
        arr: the whole dataset
        train_valid_test_r: tuple of ratios
    '''
    train_indices, valid_indices, test_indices = split_train_test(arr.shape[0], train_valid_test_r)
    pd.DataFrame(arr[train_indices]).to_csv(path + "\\train.csv", header=None, index=None)
    pd.DataFrame(arr[valid_indices]).to_csv(path + "\\valid.csv", header=None, index=None)
    pd.DataFrame(arr[test_indices]).to_csv(path + "\\test.csv", header=None, index=None)

In [7]:
path = r'D:\Researching Data\Youtube data'
split_data(path, new_arr, train_valid_test_r=(0.4, 0.4, 0.2))
fields = [("text", TEXT), ("label", LABEL)]
train_data, valid_data, test_data = data.TabularDataset.splits(
                                        path = path,
                                        train = 'train.csv',
                                        validation = 'valid.csv',
                                        test = 'test.csv',
                                        format = 'csv',
                                        fields = fields,
                                        skip_header = True)
MAX_VOCAB_SIZE = 25000
TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')
print(f'Number of validation examples:{len(test_data)}')

The size of train, valid and test data are 2540 2540 1271
Number of training examples: 2539
Number of testing examples: 1270
Number of validation examples:1270


In [8]:
BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
        (train_data, valid_data, test_data), 
         batch_size = BATCH_SIZE,
         sort_key = lambda x: len(x.text),
         sort_within_batch = True, 
         device = device)

The following code is inspired by and modified from the PyTorch Tutorial of Ben Trevett, and assignment code of CAPP 30255, part of the modification will be marked with comments
################################################################################### <br>
First Source: <br>
Topic: Tutorials on getting started with PyTorch and TorchText for sentiment analysis <br>
Source: https://github.com/bentrevett/pytorch-sentiment-analysis <br>
Author: Ben Trevett <br>
Date: 2019 <br>

Second Source: <br>
Topic: Assignment 2 of CAPP 30255, The University of Chicago <br>
Author: Amitabh Chaudhary <br>
Date: 2020 <br>
####################################################################################

In [9]:
def binary_accuracy(preds, y):
    """
    Return accuracy per batch
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [10]:
def binary_precision(preds, y):
    '''
    Return precision per batch
    '''
    rounded_preds = torch.round(torch.sigmoid(preds))
    prec_correct = ((rounded_preds == y) & (rounded_preds == 1)).float()
    prec_total = (rounded_preds == 1).float()
    precision = prec_correct.sum() / prec_total.sum()
    return precision

In [11]:
def binary_recall(preds, y):
    '''
    Return recall per batch
    '''
    rounded_preds = torch.round(torch.sigmoid(preds))
    rec_correct = ((rounded_preds == y) & (rounded_preds == 1)).float()
    rec_total = (y == 1).float()
    recall = rec_correct.sum() / rec_total.sum()
    return recall

In [12]:
class SimpleRNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)
        self.RNN = nn.RNN(embedding_dim, hidden_dim)
        self.linear = nn.Linear(hidden_dim, output_dim)  
        self.relu = nn.ReLU()
        
    def forward(self, text):
        
        #print("simpleRNN text:, ", text.size())
        emb = self.embedding(text)
        ot1, hidden = self.RNN(emb)
        ot2 = self.relu(hidden.squeeze(0))
        output = self.linear(ot2)
        
        return output

In [13]:
class LSTM(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, 
                 n_layers, bidirectional, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)
        self.RNN = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout)
        self.linear = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, text):
        
        # Simplify the original version of LSTM implementation
        embedded = self.dropout(self.embedding(text))
        output, (hidden, cell) = self.RNN(embedded)
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
            
        return self.linear(hidden)

In [14]:
class CNN(nn.Module):

    def __init__(self, input_dim, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
                
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx = pad_idx)
        self.convs = nn.ModuleList([nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
                
        embedded = self.embedding(torch.transpose(text, 0, 1))
        embedded = embedded.unsqueeze(1)
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim = 1))
            
        return self.fc(cat)

In [15]:
class WordEmbAvg_2linear(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, pad_idx):
        
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)
        self.linear1 = nn.Linear(embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, output_dim)  
        self.relu = nn.ReLU()
                                 
    def forward(self, text):
        
        # Modify the original version of the CAPP 30255 assignment
        emb = self.embedding(text)
        emb = torch.mean(emb, dim=0).squeeze(1)
        ot1 = self.linear1(emb)
        ot2 = self.relu(ot1)
        output = self.linear2(ot2)
        
        return output

In [16]:
class Training_module( ):

    def __init__(self, model):
        
        self.model = model
        self.loss_fn = (nn.BCEWithLogitsLoss()).to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.01)
    
    def train_epoch(self, iterator):
        '''
        Train the model for one epoch. For this repeat the following, 
        going through all training examples.
        1. Get the next batch of inputs from the iterator.
        2. Determine the predictions using a forward pass.
        3. Compute the loss.
        4. Compute gradients using a backward pass.
        5. Execute one step of the optimizer to update the model paramters.
        '''
        epoch_loss = 0
        epoch_acc = 0
    
        for batch in iterator:

            self.optimizer.zero_grad()
            
            predictions = self.model(batch.text).squeeze(1)
            loss = self.loss_fn(predictions, batch.label)
            accuracy = binary_accuracy(predictions, batch.label)
        
            loss.backward()
            self.optimizer.step()
            epoch_loss += loss.item()
            epoch_acc += accuracy.item()
        
        return epoch_loss / len(iterator), epoch_acc / len(iterator)
    
    def train_model(self, train_iterator, dev_iterator):
        """
        Train the model for multiple epochs, and after each evaluate on the
        development set.  Return the best performing model.
        """  
        dev_accs = [0.]
        for epoch in range(5):
            self.train_epoch(train_iterator)
            dev_acc = self.evaluate(dev_iterator)
            print(f"Epoch {epoch}: Dev Accuracy: {dev_acc[1]} Dev Loss:{dev_acc[0]}")
            if dev_acc[1] > max(dev_accs):
                best_model = copy.deepcopy(self)
            dev_accs.append(dev_acc[1])
        return best_model.model
                
    def evaluate(self, iterator):
        '''
        Evaluate the performance of the model on the given examples.
        '''
        epoch_loss = 0
        epoch_acc = 0
        epoch_prec = 0
        epoch_rec = 0
    
        with torch.no_grad():
    
            for batch in iterator:

                predictions = self.model(batch.text).squeeze(1)
                loss = self.loss_fn(predictions, batch.label)
                acc = binary_accuracy(predictions, batch.label)
                precision = binary_precision(predictions, batch.label)
                recall = binary_recall(predictions, batch.label)
        
                epoch_loss += loss.item()
                epoch_acc += acc.item()
                epoch_prec += precision.item()
                epoch_rec += recall.item()

        return epoch_loss / len(iterator), epoch_acc / len(iterator), \
               epoch_prec / len(iterator), epoch_rec / len(iterator)

In [17]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 50
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

N_FILTERS = 100
FILTER_SIZES = [3,4,5]

model_wordem = WordEmbAvg_2linear(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, PAD_IDX)
model_rnn = SimpleRNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, PAD_IDX)
model_BLSTM = LSTM(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, 
                   N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX)
model_CNN = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
MODEL_DICT = {"avg_embedding": model_wordem, "SimpleRNN": model_rnn, "BLSTM": model_BLSTM, "CNN": model_CNN}

In [18]:
def model_selection(model_dict, model_txt="avg_embedding"):
    '''
    Helper function for model selection
    '''
    return model_dict[model_txt]

# Comparing the performance of different models, the original 
# version of the CAPP 30255 assignment
def compare_models(model_dict):
    '''
    The function presents and compare the performances of
    different neural network models and store the best
    models of each model type in the output dictionary
    
    Inputs: model_dict: dictionary of model types used in training
    Return: dictionary of best models of each model type
    '''
    best_models = {}
    for key, value in model_dict.items():
        print("currently training the model: ", key)

        model = model_selection(model_dict, key)
        model = model.to(device)
        tm = Training_module(model)
        best_model = tm.train_model(train_iterator, valid_iterator)
        best_models[key] = best_model
        
        tm.model = best_model
        test_loss, test_acc, test_prec, test_rec = tm.evaluate(test_iterator)
        print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
        print(f'Test Prec: {test_prec*100:.3f}% | Test Rec: {test_rec*100:.3f}%')
        
    return best_models

# Searching for phrases with highest norm values, modify the original 
# version of the CAPP 30255 assignment
def get_effective_norms(best_models, selected_mkey="avg_embedding"):
    '''
    The model presents 10 most effective and 10 less effective
    phrases used in the classification
    Inputs: 
        best_models: dictionary of best model of each model type
        selected_mkey: selected model type
    '''
    best_model = best_models[selected_mkey]
    strong_words = []
    weak_words = []
    emb_weight = best_model.embedding.weight.data
    top_indices = torch.norm(emb_weight, p=2, dim=1).detach().topk(10).indices
    bottom_indices = torch.norm(emb_weight, p=2, dim=1).detach().topk(10, largest=False).indices

    for idx in top_indices:
        strong_words.append(TEXT.vocab.itos[idx])
    
    for idx in bottom_indices:
        weak_words.append(TEXT.vocab.itos[idx])
    
    print("most effective words: ", strong_words)
    print("less effective words: ", weak_words)

In [19]:
best_models = compare_models(MODEL_DICT)
get_effective_norms(best_models)

currently training the model:  avg_embedding
Epoch 0: Dev Accuracy: 0.7498637348413467 Dev Loss:0.5612356200814247
Epoch 1: Dev Accuracy: 0.8765897527337074 Dev Loss:0.3659035500138998
Epoch 2: Dev Accuracy: 0.8795058146119118 Dev Loss:0.3773972377181053
Epoch 3: Dev Accuracy: 0.8787336483597755 Dev Loss:0.46051947474479676
Epoch 4: Dev Accuracy: 0.8769712939858436 Dev Loss:0.4851145945489407
Test Loss: 0.402 | Test Acc: 86.63%
Test Prec: 76.760% | Test Rec: 64.411%
currently training the model:  SimpleRNN
Epoch 0: Dev Accuracy: 0.7467387348413468 Dev Loss:0.5705512657761573
Epoch 1: Dev Accuracy: 0.7416606098413467 Dev Loss:0.5794503778219223
Epoch 2: Dev Accuracy: 0.7467387348413468 Dev Loss:0.5686986550688744
Epoch 3: Dev Accuracy: 0.7467387348413468 Dev Loss:0.5735377579927444
Epoch 4: Dev Accuracy: 0.7400981098413467 Dev Loss:0.5935619696974754
Test Loss: 0.585 | Test Acc: 73.53%
Test Prec: nan% | Test Rec: 0.000%
currently training the model:  BLSTM
Epoch 0: Dev Accuracy: 0.74986