In [1]:
import numpy as np
import pandas as pd
import random
import copy
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext import data
import torch.utils.data as d
import tokenization_dim_reduction as tdr
import ngrams as ng

data_dir = r'D:\Researching Data\Youtube data\USvideos.csv'
torch.backends.cudnn.deterministic = True
TEXT = data.Field(tokenize = 'spacy')
LABEL = data.LabelField(dtype = torch.float)

In [2]:
_, dtext, dlabel = tdr.select_col(data_dir, tdr.cols_t4)
new_TEXT = tdr.combine_text(dtext, 1, [0,2])
#new_label = tdr.multi_to_binary(dlabel, 25) # politics
new_label = tdr.multi_to_binary(dlabel, 24) # entertainments
new_arr = np.concatenate((new_TEXT.reshape([len(new_TEXT),1]), new_label), axis=1)

In [3]:
def ngram_creater(n, sentence, whole_grams):
    '''
    The function creates n-grams dictionary with its count
    Input:
        n: n for n-grams
        sentence: single sentence passed in
        whole_grams: dictionary with accumulated count
    Returns:
        grams: n-grams with its counts for the sentence
        updated whole_grams
    '''
    grams = {}
    word_lst = ng.clean_punctuation(sentence).split()
    for i in range(len(word_lst) - n + 1):
        gram = " ".join(word_lst[i:i + n])
        if not grams.get(gram):
            grams[gram] = 0
            whole_grams[gram] = 0
        grams[gram] += 1
        whole_grams[gram] += 1
    
    return grams, whole_grams

In [4]:
# with n-grams, used in simple machine learning models
def word_ngrams(n, txt_arr):
    '''
    The function creates the ngrams with its corresponding
    counts for the whole dataset
    Inputs:
        n: n for n-grams
        txt_arr: all array with train, valid and test sets
    Returns:
        d-grams: dictionary of ngrams and counts of each row
        whole_grams: dictionary with accumulated count
    '''
    whole_grams = {}
    d_grams = {}
    for didx, txt in enumerate(txt_arr):
        grams, whole_grams = ngram_creater(n, txt, whole_grams)
        d_grams[didx] = grams

    return d_grams, whole_grams

In [5]:
def word_to_index(n, txt_arr):
    '''
    The function assigns index to each unique words
    or n-grams.
    Inputs:
        n: n for n-grams
        txt_arr: all array with train, valid and test sets
    Returns:
        word_to_idx: dictionary mapping word to index
        wtorch: dictionary mapping word to counts
    '''
    count = 0
    word_to_idx = {}
    _, whole_grams = word_ngrams(n, txt_arr)
    wtorch = torch.zeros(len(whole_grams))
    
    for ngrams, ct in whole_grams.items():
        word_to_idx[ngrams] = count
        wtorch[count] = ct
        count += 1
    
    return word_to_idx, wtorch

In [13]:
def sentence_torch(grams, word_to_idx, wtorch):
    '''
    The function transfers a list of phrase to an
    embedding vector.
    Inputs:
        grams: list of splitted sentence
        word_to_idx: dictionary mapping word to index
        wtorch: dictionary mapping word to counts
    Return: an embedding vector 
    '''
    bow_vec = torch.zeros(len(word_to_idx))
    for gram in grams:
        bow_vec[word_to_idx[gram]] = wtorch[word_to_idx[gram]]
        
    return bow_vec

In [14]:
# split train, validation, test
def split_train_test(dt_size, train_valid_test_r):
    '''
    The function randomly selects the indices for
    training, validation, and testing sets
    Inputs:
        dt_size: number of rows
        train_valid_test_r: tuple of ratios
    Return: indices for each subset
    '''
    train_size = int(dt_size * train_valid_test_r[0] // 1)
    valid_size = int(dt_size * train_valid_test_r[1] // 1)
    test_size = int(dt_size - train_size - valid_size)
    print("the size of train, valid and test data are", train_size, valid_size, test_size)
    
    full_indices = np.arange(0, dt_size, 1)
    train_indices = np.random.permutation(full_indices)[:train_size]
    
    sub_indices = set(full_indices) - set(train_indices)
    valid_indices = np.random.permutation(list(sub_indices))[:valid_size]
    
    sub_indicest = set(sub_indices) - set(valid_indices)
    test_indices = np.array(list(sub_indicest))
    
    return train_indices, valid_indices, test_indices 

In [15]:
train_indices, valid_indices, test_indices = split_train_test(new_arr.shape[0], (0.4, 0.4, 0.2))
X_train = new_TEXT[train_indices]
y_train = new_label[train_indices]
X_valid = new_TEXT[valid_indices]
y_valid = new_label[valid_indices]
X_test = new_TEXT[test_indices]
y_test = new_label[test_indices]

the size of train, valid and test data are 2540 2540 1271


In [16]:
word_to_idx, wtorch = word_to_index(2, new_TEXT)
dgrams = word_ngrams(2, X_train)[0]
VOCAB_SIZE = len(word_to_idx)
NUM_LABELS = 2
n = 2

The following code is inspired by and modified from the PyTorch Tutorial of Robert Guthrie, part of the modification will be marked with comments.
####################################################################################### <br>
Topic: DEEP LEARNING WITH PYTORCH <br>
Author: Robert Guthrie <br>
Source: https://pytorch.org/tutorials/beginner/nlp/deep_learning_tutorial.html#sphx-glr-beginner-nlp-deep-learning-tutorial-py <br>
Date: 2017 <br>
########################################################################################

In [17]:
label_to_ix = {0.0: 0, 1.0: 1}
def make_target(label, label_to_ix):
    return torch.LongTensor([label_to_ix[label]])

In [18]:
class BoWNN(nn.Module):

    def __init__(self, input_size, output_size):

        super(BoWNN, self).__init__()
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, bow_vec):
        
        return self.linear(bow_vec)
    
model = BoWNN(VOCAB_SIZE, NUM_LABELS)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.1)

In [19]:
best_acc = 0
for epoch in range(5):
    
    for idx in range(len(train_indices)):
        
        model.zero_grad()
        
        sentence = X_train[idx]
        grams, _ = ngram_creater(n, sentence, {})
        bow_vec = sentence_torch(grams, word_to_idx, wtorch)
        
        pred = model(bow_vec.view(1,-1))
        loss = loss_function(pred, make_target(y_train[idx, 0], label_to_ix))
        loss.backward()
        optimizer.step()
    
    acc_count = 0
    with torch.no_grad():
        for idx in range(len(valid_indices)):
            sentence = X_valid[idx]
            grams, _ = ngram_creater(n, sentence, {})
            bow_vec = sentence_torch(grams, word_to_idx, wtorch)
            pred = model(bow_vec.view(1,-1))

            y_pred = np.argmax(pred[0].detach().numpy())
            if y_valid[idx, 0] == y_pred:
                acc_count += 1

    print("For epoch number ", epoch, ", the accuracy for validation set is ", 
          acc_count / len(valid_indices))
    
    if (acc_count / len(valid_indices)) > best_acc:
        best_model = model

acc_count = 0
with torch.no_grad():
    for idx in range(len(test_indices)):
        sentence = X_test[idx]
        grams, _ = ngram_creater(n, sentence, {})
        bow_vec = sentence_torch(grams, word_to_idx, wtorch)
        pred = best_model(bow_vec.view(1,-1))
        y_pred = np.argmax(pred[0].detach().numpy())
        if y_test[idx, 0] == y_pred:
            acc_count += 1

print("the accuracy for test set is ", acc_count / len(test_indices))

For epoch number  0 , the accuracy for validation set is  0.8484251968503937
For epoch number  1 , the accuracy for validation set is  0.8842519685039371
For epoch number  2 , the accuracy for validation set is  0.8818897637795275
For epoch number  3 , the accuracy for validation set is  0.8842519685039371
For epoch number  4 , the accuracy for validation set is  0.8818897637795275
the accuracy for test set is  0.8662470495672698
