# Function Definitions:

In [55]:
import numpy as np
import random
import csv
import torch
import torch.nn as nn

#source: https://stackoverflow.com/questions/37793118/load-pretrained-glove-vectors-in-python

#This function takes in the path to the GloVe pretrained vectors, which are
#stored as a .txt file, and converts them into a dictionary. Each non-blank
#line is split on whitespace: the first token is the word and the remaining
#tokens are the components of its vector. To access the vector of a certain
#word, use the word as the key, and the dictionary will return the vector.
#
#@ Params: gloveFile - the path to the GloVe .txt file (read as UTF-8)
#
#@ Return: model - a dictionary mapping each word to a numpy array of floats
def loadGloveModel(gloveFile):
    print("Loading Glove Model")
    model = {}
    #the context manager closes the file even if a line fails to parse
    with open(gloveFile, 'r', encoding="utf-8") as f:
        for line in f:
            splitLine = line.split()
            #skip blank lines (e.g. a stray empty line at the end of the file)
            if not splitLine:
                continue
            word = splitLine[0]
            model[word] = np.array([float(val) for val in splitLine[1:]])
    print("Done.",len(model)," words loaded!")
    return model

#This function takes in the path to our modified csv file and returns individual
#lists containing title, summary, and score, respectively. The first row of the
#file is treated as a header and skipped. For every other non-empty row,
#column 0 is the title, column 1 is the summary and column 2 is the score.
#
#Example: title[k] returns the movie at index k's title in the form of a string
#
#@ Params: csvFile - the path to our modified csv file containing only the usable
#                    movies
#
#@ Return: title - a list of all the movie titles inside of csvFile
#          summary - a list of all the summaries inside of csvFile
#          score - a list of all the scores (voter averages) inside of csvFile,
#                  converted to floats
def parseCSV(csvFile):
    #stores the respective data in lists
    title = []
    summary = []
    score = []

    #newline='' lets the csv module handle line endings itself, so quoted
    #fields that contain line breaks are read back unchanged
    with open(csvFile, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)

        #the first row is the header; the default keeps an empty file from raising
        next(reader, None)

        for row in reader:
            #csv.reader yields an empty list for a blank line; nothing to store
            if not row:
                continue
            title.append(row[0])
            summary.append(row[1])
            score.append(float(row[2]))

    return title, summary, score

#This function takes in the summary list and then randomly generates a test
#set and a training set. The hardcoded split is 80 percent training and 20
#percent test. The test set holds int((len(summary) - 1) * (1 - 0.8)) distinct
#randomly drawn summaries; every remaining summary goes into the training set,
#so each index of summary ends up in exactly one of the two index lists.
#
#@ Params: summary - the summary list generated by the function parseCSV
#@ Return: trainingSet - the summaries inside of the parameter summary to be used
#                        as training data
#          testSet - the summaries inside of the parameter summary to be used for
#                    testing purposes
#          testSetIndex - a list of integers that represent which indices inside
#                         summary are being used for testing purposes (in the
#                         order they were drawn)
#          trainingSetIndex - a list of integers that represent which indices inside
#                             summary are being used for training purposes
#                             (in ascending order)
def generateSets(summary):
    #definition of constants throughout this function
    training_percentage = .8
    #highest valid index into summary; random.randint includes both endpoints
    last_index = len(summary) - 1
    test_size = int(last_index * (1 - training_percentage))

    #Steps to generate the test set:
    #1. Randomly generate an integer between 0 and last_index
    #2. If it was already drawn, keep drawing until we find one that was not
    #3. Record the index and repeat until test_size indices were collected
    testSetIndex = []
    #set copy of testSetIndex so the membership checks are O(1)
    chosen = set()
    for _ in range(test_size):
        rand_index = random.randint(0, last_index)

        #keep generating until it is NOT already part of the test set
        while rand_index in chosen:
            rand_index = random.randint(0, last_index)

        chosen.add(rand_index)
        testSetIndex.append(rand_index)

    testSet = [summary[index] for index in testSetIndex]

    #Training set is everything else. Iterate over ALL indices of summary,
    #including the last one, so no summary is silently dropped.
    trainingSetIndex = [k for k in range(len(summary)) if k not in chosen]
    trainingSet = [summary[k] for k in trainingSetIndex]

    return trainingSet, testSet, testSetIndex, trainingSetIndex

#taken and modified from: 
#https://stackoverflow.com/questions/9797357/dividing-a-string-at-various-punctuation-marks-using-split

#This function splits the text by whitespace and punctuation marks. Punctuation
#marks refer to non-alphanumeric characters. Consecutive punctuation marks are
#joined into a single token (for example "..." or "?!"). One exception to this
#rule is the string "'s", as this has its own vector inside of the GloVe
#pretrained vector set. Therefore, an apostrophe directly followed by the token
#"s" (in either case) always becomes its own "'s" token and is never joined
#with neighbouring punctuation.
#
#@Param: text - the text to be parsed
#@Return: sentence - a list containing the parsed, lowercased version of the
#                    input text. Each index of this will contain either a run
#                    of punctuation marks, the string "'s" by itself, or a run
#                    of alphanumeric characters (a word or number).
def parseSentence(text):
    #pad every non-alphanumeric character with spaces so split() isolates it
    raw = "".join(
        char if char.isalnum() else " " + char + " " for char in text
    ).split()

    sentence = []
    i = 0
    while i < len(raw):
        token = raw[i]

        #join the instances of "'s" and ONLY "'s"; the case of the "s" is
        #ignored because everything is lowercased at the end anyway
        if token == "'" and i + 1 < len(raw) and raw[i + 1] in ("s", "S"):
            token = "'s"
            i += 1

        is_punctuation = token != "'s" and not token.isalnum()
        previous_is_punctuation = (
            bool(sentence)
            and sentence[-1] != "'s"
            and not sentence[-1].isalnum()
        )

        if is_punctuation and previous_is_punctuation:
            #extend the current run of punctuation marks
            sentence[-1] += token
        else:
            sentence.append(token)
        i += 1

    #Because the glove database is all lowercase, we need to lowercase everything
    return [token.lower() for token in sentence]

#Multi-layer perceptron that maps one vector of size glove_dim to a single
#number, our predicted user rating (voter average). The network is a stack of
#five Linear + ReLU pairs, each producing 100 features, followed by a final
#Linear layer that reduces the 100 features to 1 output.
#
#@Param: glove_dim - the dimensionality of the GloVe dataset we are currently
#                    using, i.e. the size of the input vector
class MLP(nn.Module):
    def __init__(self, glove_dim):
        super().__init__()
        hidden_width = 100

        #the first Linear consumes the GloVe-sized input, every later one
        #consumes the 100 features produced by the layer before it
        stack = []
        in_features = glove_dim
        for _ in range(5):
            stack.append(nn.Linear(in_features, hidden_width))
            stack.append(nn.ReLU())
            in_features = hidden_width

        #output layer: 1
        stack.append(nn.Linear(hidden_width, 1))

        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        #run the input through the whole stack and hand back the result
        return self.layers(x)
    
#This function defines a LSTM and a MLP and trains the MLP in order to generate
#predicted ratings for a given summary. It takes in the dictionary that contains
#all the GloVe pre-trained word embeddings. It first calls parseCSV to parse the
#CSV file containing all the movies. Then it calls generateSets to define our
#test and training sets. For each summary inside of our training set, it feeds
#the summary into the LSTM and then uses the final hidden state as the vector
#representation for the entire summary. The MLP takes in this vector and generates
#a predicted rating. We perform one SGD step per summary on the MSE loss in order
#to train the MLP. Only the MLP's parameters are given to the optimizer, so the
#LSTM keeps its initial weights and acts as a fixed encoder.
#After training, for every summary in the test set, the MLP predicts a rating
#and the absolute difference to the actual rating is appended to a list. The
#list is returned after going through the whole test set.
#
#@Param: vec_dict - the dictionary containing the GloVe pre-trained vector
#                   representations of words
#        vec_dict_dim - the dimensionality of the GloVe dataset you are using.
#                       Should be one of the following numbers: 50, 100, 200, 300
#        csv_path - the path to the csv file handed to parseCSV. Defaults to
#                   'tmdb_5000_movies_modified.csv'
#@Return: differences - A list containing tuples of:
#                       (abs(predicted_value - actual_value),
#                       the index of where this difference occurred)
def train(vec_dict, vec_dict_dim, csv_path='tmdb_5000_movies_modified.csv'):
    #Define our constants
    loss_fn = torch.nn.MSELoss()
    _, summary, score = parseCSV(csv_path)
    trainingSet, testSet, testSetIndex, trainingSetIndex = generateSets(summary)
    torch.manual_seed(1)

    #Define our models. The creation order is kept (MLP first, LSTM second)
    #because it determines which seeded random numbers initialise each model.
    mlp_model = MLP(vec_dict_dim)
    optimizer = torch.optim.SGD(mlp_model.parameters(), lr=0.001)
    lstm = nn.LSTM(vec_dict_dim, vec_dict_dim)

######################### START TRAINING ######################################

    #For each summary in the training set
    for i, (text, associated_index) in enumerate(zip(trainingSet, trainingSetIndex)):
        features = _encode_summary(text, vec_dict, lstm, vec_dict_dim)

        #clears the gradient as Pytorch accumulates gradients
        optimizer.zero_grad()

        predicted_value = mlp_model(features)

        #use associated_index to find the true value inside of our score list.
        #The target gets the same shape as the prediction so MSELoss does not
        #have to broadcast, and it is a constant, so it needs no gradient.
        actual_value = torch.full_like(predicted_value, score[associated_index])

        #find our total loss
        loss = loss_fn(predicted_value, actual_value)

        #gradient descent
        loss.backward()
        optimizer.step()

        if i % 50 == 0:
            print("Progress", "{0:.2f}".format((i*100) / len(trainingSet)), "%", end="\r")

######################### END TRAINING #######################################

######################### START TESTING ######################################

    #This list will store the difference between our predicted value vs actual
    differences = []

    #no gradients are needed while evaluating
    with torch.no_grad():
        #For each summary in the test set
        for text, associated_index in zip(testSet, testSetIndex):
            features = _encode_summary(text, vec_dict, lstm, vec_dict_dim)
            predicted_value = mlp_model(features).item()
            actual_value = score[associated_index]
            differences.append((abs(predicted_value - actual_value), associated_index))

######################### END TESTING ########################################

    return differences


#Private helper of train(): turns one summary into the vector that is fed into
#the MLP. The summary is parsed with parseSentence, every token that exists
#inside of vec_dict is replaced by its embedding, and the sequence of embeddings
#is run through the LSTM starting from all-ones hidden and cell states.
#
#@Param: text - the summary to encode
#        vec_dict - the dictionary containing the GloVe vectors
#        lstm - the nn.LSTM used as the encoder
#        glove_dim - the dimensionality of the GloVe vectors / LSTM hidden state
#@Return: the final hidden state of the LSTM, a tensor of shape
#         (1, 1, glove_dim). If no token is found in vec_dict this is the
#         initial all-ones hidden state.
def _encode_summary(text, vec_dict, lstm, glove_dim):
    #Make sure the string exists inside of the glove model
    vectors = [vec_dict[word] for word in parseSentence(text) if word in vec_dict]

    #initialize hidden states
    hidden = (torch.ones(1, 1, glove_dim), torch.ones(1, 1, glove_dim))

    #The LSTM is never optimised, so no autograd graph is needed for it.
    with torch.no_grad():
        if vectors:
            #shape (sequence length, batch of 1, glove_dim); one call processes
            #the whole sequence instead of one call per token
            inputs = torch.from_numpy(np.stack(vectors)).float().unsqueeze(1)
            _, hidden = lstm(inputs, hidden)

    #we do not use the cell state, so only hidden[0] is handed to the MLP
    return hidden[0]

#This function takes in the differences list from the train() function and
#outputs the accuracy of the testing for four different margins of error,
#together with the average difference. A prediction counts as accurate for a
#margin when its difference is <= that margin (exactly 0 for the margin 0).
#
#@Param: differences - the list returned by the function train(). This should
#                      be a non-empty list containing tuples of the form:
#                      (abs(predicted_value - actual_value),
#                       index at where this difference occurred)
#@Return: final_accuracy_0 - fraction of predictions with a difference of 0
#         final_accuracy_25 - fraction of predictions with a difference <= 0.25
#         final_accuracy_5 - fraction of predictions with a difference <= 0.5
#         final_accuracy_1 - fraction of predictions with a difference <= 1
#         average difference - the mean of all differences, rounded to 3 decimals
#@Raises: ValueError - if differences is empty, as no accuracy can be computed
def calculate_accuracy(differences):
    if not differences:
        raise ValueError("differences must contain at least one (difference, index) tuple")

    #initialize our constant
    size = len(differences)

    #this gets the difference in predicted vs actual for every tuple
    diffs = [element[0] for element in differences]

    #accuracy = (accurate predictions / size of list)
    final_accuracy_0 = sum(1 for diff in diffs if diff == 0) / size
    final_accuracy_25 = sum(1 for diff in diffs if diff <= .25) / size
    final_accuracy_5 = sum(1 for diff in diffs if diff <= .5) / size
    final_accuracy_1 = sum(1 for diff in diffs if diff <= 1) / size

    #column 0 of the array holds the differences, column 1 the indices
    array = np.array(differences)
    average_difference = round(np.average(array, axis=0)[0], 3)

    return final_accuracy_0, final_accuracy_25, final_accuracy_5, final_accuracy_1, average_difference
        

# Run Functions:

In [5]:
# timer() is used below; import it here so this cell runs on its own instead of
# depending on a later cell having been executed first
from timeit import default_timer as timer

# Define path variables
GLOVE_PATH= './glove.6B.50d.txt'
MOVIE_CSV = 'tmdb_5000_movies_modified.csv'
MARGIN_ERROR = 1 # How large of difference we are willing to accept to count as good prediction

# Load the GloVe vectors into a dictionary and report how long it took
start = timer()
vect_dict = loadGloveModel(GLOVE_PATH)
end = timer()
elapsed_sec = end - start # Time in seconds, e.g. 5.38091952400282
print("Finished loading vect_dict in", elapsed_sec, "seconds \n")

Loading Glove Model
Done. 400000  words loaded!
Finished loading vect_dict in 14.713267134231355 seconds 



# Run a Bunch of Train()

In [56]:
from timeit import default_timer as timer
print("Running train() .... \n")
start = timer()

# Take the embedding dimensionality from the loaded vectors instead of
# hard-coding it, so it cannot get out of sync with the GloVe file in use
glove_dim = len(next(iter(vect_dict.values())))

# Train and evaluate five times; every run prints one tuple of accuracies
print("margin 0, margin, .25, margin .5 margin 1, average difference")
for _ in range(5):
    result_list = train(vect_dict, glove_dim) # (predicted_value - actual_value, the index at where this difference occurred)
    print(calculate_accuracy(result_list))


end = timer()
elapsed_sec = end - start # Time in seconds, e.g. 5.38091952400282
print("Finished running train() in ", elapsed_sec, "seconds \n")
# result_list holds the differences of the last run only
print("Number of tests: ",len(result_list))


Running train() .... 

margin 0, margin, .25, margin .5 margin 1, average difference
(0.0, 0.22597676874340022, 0.41816261879619854, 0.7106652587117213, 0.759)
(0.0, 0.21647307286166842, 0.4033790918690602, 0.7096092925026399, 0.774)
(0.0, 0.20802534318901794, 0.40971488912354803, 0.7001055966209081, 0.77)
(0.0, 0.21119324181626187, 0.4192185850052798, 0.7296726504751848, 0.75)
(0.0, 0.21647307286166842, 0.40549102428722283, 0.7074973600844773, 0.764)
Finished running train() in  1732.6820447935934 seconds 

Number of tests:  947


In [38]:
# Dump the (difference, index) tuples that train() returned in the last run
print(result_list)

[(0.04676160812377894, 72), (0.6944161415100094, 129), (0.261024761199951, 552), (0.03531847000122035, 318), (0.24917917251586896, 645), (0.7661668777465822, 326), (0.5672202110290527, 248), (0.24793081283569318, 9), (0.47242059707641637, 474), (0.08882951736450195, 488), (0.5497303962707516, 869), (0.6482645988464357, 705), (1.9735659599304203, 634), (0.7537967681884767, 730), (0.9493054389953617, 181), (0.23553447723388654, 224), (0.8555500984191893, 390), (0.5705685615539551, 524), (0.38662748336791974, 773), (0.2318656921386717, 53), (1.3706443786621092, 343), (1.2720825195312502, 912), (0.4417523384094242, 708), (0.2132426261901852, 383), (0.7628609657287599, 417), (0.7565648078918459, 671), (0.7627913475036623, 475), (0.1541954994201662, 807), (1.1146456718444826, 668), (0.04058504104614258, 424), (0.8642809867858885, 930), (0.41985559463500977, 132), (1.7619869232177736, 493), (0.4546193122863773, 546), (0.255088520050049, 18), (0.030128574371337535, 890), (1.744834232330322, 85