# Function Definitions:

In [43]:
import numpy as np
import random
import csv
import torch
import torch.nn as nn
import torch.optim as optim
from timeit import default_timer as timer # For timing runtime of function
from torch.autograd import Variable
#source: https://stackoverflow.com/questions/37793118/load-pretrained-glove-vectors-in-python

#This function takes in the path to the glove 50d pretrained vectors that is
#originally stored as a .txt and then converts it into a dictionary. To access
#a certain vector of a certain word, use the word as the key, and the dictionary
#will return the vector.
def loadGloveModel(gloveFile):
    """Load GloVe vectors from a text file into a dictionary.

    Each non-blank line is split on whitespace: the first field is used as
    the word and the remaining fields are converted to floats and stored as
    a numpy array.

    @ Params: gloveFile - path to the GloVe .txt file (read as UTF-8)

    @ Return: model - dict mapping each word to its numpy embedding vector
    """
    print("Loading Glove Model")
    model = {}
    # The context manager closes the file even if a line fails to parse.
    with open(gloveFile, 'r', encoding="utf-8") as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Blank line: there is no word to use as a key, so skip it.
                continue
            word = splitLine[0]
            model[word] = np.array([float(val) for val in splitLine[1:]])
    print("Done.",len(model)," words loaded!")
    return model

#This function takes in the path to our modified csv file and returns individual
#lists containing title, summary, and score, respectively. 
#
#Example: title[k] returns the movie at index k's title in the form of a string
#
#@ Params: csvFile - the path to our modified csv file containing only the usable
#                    movies
#
#@ Return: title - a list of all the movie titles inside of csvFile
#          summary - a list of all the summaries inside of csvFile
#          score - a list of all the scores (voter averages) inside of csvFile
def parseCSV(csvFile, max_rows=49):
    """Read titles, summaries and scores from the movie CSV file.

    The first row of the file is treated as the header and skipped. Column 0
    is read as the title, column 1 as the summary and column 2 as the score
    (converted to float).

    @ Params: csvFile - path to the CSV file
              max_rows - maximum number of movies to read. The default of 49
                         matches the previously hard-coded debugging limit;
                         pass None to read every movie in the file.

    @ Return: title - list of movie titles
              summary - list of movie summaries
              score - list of scores as floats
    """
    # newline='' lets the csv module handle newlines embedded in quoted fields.
    with open(csvFile, 'r', encoding='utf-8', newline='') as f:
        rows = list(csv.reader(f))

    # Drop the header row and any completely empty rows.
    data_rows = [row for row in rows[1:] if row]

    # Slicing clamps to the rows available, so short files no longer raise
    # IndexError.
    if max_rows is not None:
        data_rows = data_rows[:max_rows]

    title = []
    summary = []
    score = []
    for row in data_rows:
        title.append(row[0])
        summary.append(row[1])
        score.append(float(row[2]))

    return title, summary, score

#This function takes in the summary list and then randomly generates a test
#set and a training set. The hardcoded value is 80 percent training and 20
#percent test. This returns a trainingSet, a testSet, a testSetIndex, and
#a trainingSetIndex list. 
#
#@ Params: summary - the summary list generated by the function parseCSV
#@ Return: trainingSet - the summaries inside of the parameter summary to be used
#                        as training data
#          testSet - the summaries inside of the parameter summary to be used for
#                    testing purposes
#          testSetIndex - a list of integers that represent which indices inside
#                         summary are being used for testing purposes
#          trainingSetIndex - a list of integers that represent which indices inside
#                             summary are being used for training purposes.
def generateSets(summary):
    """Randomly split the summaries into a training set and a test set.

    Every index of `summary` ends up in exactly one of the two sets. The
    previous implementation never placed the last summary in the training
    set, so it was silently dropped unless it happened to be drawn for the
    test set.

    @ Params: summary - the summary list generated by the function parseCSV

    @ Return: trainingSet - the summaries to be used as training data
              testSet - the summaries to be used for testing, in the order
                        their indices were drawn
              testSetIndex - indices inside summary used for testing
              trainingSetIndex - indices inside summary used for training,
                                 in ascending order
    """
    training_percentage = .8
    total = len(summary)

    # The test-set size keeps the original formula (based on the highest
    # valid index) so the size of the split is unchanged.
    num_test = max(int((total - 1) * (1 - training_percentage)), 0)

    # random.sample draws unique indices, replacing the retry-until-unused loop.
    testSetIndex = random.sample(range(total), num_test)
    testSet = [summary[index] for index in testSetIndex]

    # Set membership keeps the complement computation linear.
    held_out = set(testSetIndex)
    trainingSetIndex = [index for index in range(total) if index not in held_out]
    trainingSet = [summary[index] for index in trainingSetIndex]

    return trainingSet, testSet, testSetIndex, trainingSetIndex


#taken and modified from: 
#https://stackoverflow.com/questions/9797357/dividing-a-string-at-various-punctuation-marks-using-split

#This function splits the sentence by spaces and punctuation marks. Punctuation
#marks refer to non-alphanumeric characters that can appear validly inside
#of an English sentence. One exception to this rule is the string "'s", as this
#has its own vector inside of the glove pretrained vector set. Therefore, this
#particular string will get its own index. 
#@Param: text - the text to be parsed. This must be a valid English block of text,
#              as in, it must be readable to the everyday person
#@Return: sentence - a list containing the parsed version of the input text.
#                    Each index of this will contain either a punctuation mark
#                    (or a run of adjacent punctuation marks joined together),
#                    the string "'s" by itself, or an English word.
def parseSentence(text):
    """Split text into lowercase word, punctuation and "'s" tokens.

    Runs of alphanumeric characters become word tokens. An apostrophe
    immediately followed by the token "s" becomes the single token "'s".
    Adjacent punctuation tokens are joined into one token (e.g. "..."). A
    "'s" token is never joined to neighbouring punctuation; previously a
    "'s" that followed punctuation was absorbed into it.

    @ Params: text - the text to be parsed

    @ Return: a list of lowercase tokens, each either a word, a run of
              punctuation characters, or the string "'s"
    """
    # Pad every non-alphanumeric character with spaces so split() isolates it.
    raw_tokens = "".join(
        (char if char.isalnum() else (" " + char + " ")) for char in text
    ).split()

    # Pass 1: fuse "'" followed by "s" into the single token "'s".
    fused = []
    position = 0
    count = len(raw_tokens)
    while position < count:
        token = raw_tokens[position]
        if token == "'" and position + 1 < count and raw_tokens[position + 1] == 's':
            fused.append("'s")
            position += 2
        else:
            fused.append(token)
            position += 1

    # Pass 2: join adjacent punctuation tokens, leaving "'s" on its own.
    sentence = []
    previous_was_punctuation = False
    for token in fused:
        is_punctuation = token != "'s" and not token.isalnum()
        if is_punctuation and previous_was_punctuation:
            sentence[-1] += token
        else:
            sentence.append(token)
        previous_was_punctuation = is_punctuation

    # Because the glove database is all lowercase, lowercase everything.
    return [token.lower() for token in sentence]


    
def LSTM(vec_dict):
    """Run every training-set word embedding through one LSTM as a single sequence.

    The summaries come from parseCSV(MOVIE_CSV) and are split by
    generateSets; only the training summaries are used. Tokens that are not
    keys of vec_dict are skipped.

    @ Params: vec_dict - dictionary mapping a token to its numpy embedding
                         (assumed to be 50-dimensional to match the LSTM
                         input size -- confirm against the loaded vectors)

    @ Return: the LSTM hidden state after the last embedding has been fed in
    """
    _, summary, _ = parseCSV(MOVIE_CSV)
    trainingSet = generateSets(summary)[0]

    # One flat list of embeddings across all training summaries, in order.
    embeddings = [
        torch.from_numpy(vec_dict[token]).float()
        for text in trainingSet
        for token in parseSentence(text)
        if token in vec_dict
    ]

    torch.manual_seed(1)
    lstm = nn.LSTM(50, 50)

    # Random initial (hidden, cell) state.
    state = (torch.randn(1, 1, 50), torch.randn(1, 1, 50))

    for vector in embeddings:
        _, state = lstm(vector.view(1, 1, -1), state)

    # Only the hidden state is returned; the cell state is discarded.
    return state[0]

    
# https://pytorch.org/tutorials/beginner/examples_nn/two_layer_net_module.html
# https://www.kaggle.com/pinocookie/pytorch-simple-mlp
class MLP(nn.Module):
    """Fully connected network mapping a 50-feature input to a single value.

    Layer widths are 50 -> 40 -> 30 -> 20 -> 10 -> 1, with Sigmoid and ReLU
    activations alternating between the linear layers and no activation
    after the final layer.
    """

    def __init__(self):
        super().__init__()
        widths = (50, 40, 30, 20, 10)
        activations = (nn.Sigmoid, nn.ReLU, nn.Sigmoid, nn.ReLU)

        # Build the hidden stack: one Linear followed by its activation.
        stack = []
        for n_in, n_out, activation in zip(widths, widths[1:], activations):
            stack.append(nn.Linear(n_in, n_out))
            stack.append(activation())

        # Output layer: a single unit without an activation.
        stack.append(nn.Linear(widths[-1], 1))
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Apply the layer stack to x and return the result."""
        return self.layers(x)
    
    
"""LSTM and MLP working

Added: train(vec_dict), vec_dict is the loaded glove embeddings. train contains our LSTM (defined inside function) and 
       MLP (defined outside of function, see the class definition of MLP right above the train function). 

What this function does:

1) For each of the summaries inside trainingSet, it inputs each word embedding inside of a summary into the LSTM
   with randomized initial hidden states.

2) After the LSTM returns the final hidden state, it inputs the final hidden state into the MLP

3) Using what the MLP returns and the true vote_average for a certain movie, 
   a loss function is used to calculate the loss. This loss function can be changed,
   I chose one arbitrarily. Performs gradient descent.

4) After training, it goes through the testSet and uses the same LSTM (with randomized initial hidden states) 
   and the same MLP to obtain a predicted value.

5) This data is stored inside a list. The list differences contain the tuple:
   (predicted_value - actual_value, the index at where this difference occurred)

TODO: We need to define when we deem a prediction to be "accurate". We need to represent the data in a meaningful way(histogram? trainingPercentage vs accuracy?)

NOTE: Only the first 49 movies are used for the sake of speed during debugging. To use the entire movie list:

Go to the parseCSV function and raise its row limit so that every row of read_list is parsed.
"""
def _embed_summary(text, vec_model):
    """Return float tensors for the tokens of `text` that are keys of `vec_model`."""
    return [
        torch.from_numpy(vec_model[token]).float()
        for token in parseSentence(text)
        if token in vec_model
    ]


def _final_hidden_state(lstm, inputs):
    """Feed `inputs` through `lstm` one at a time from a random initial state.

    Returns only the hidden state; the cell state is not used. If `inputs`
    is empty, the random initial hidden state is returned unchanged.
    """
    hidden = (torch.randn(1, 1, 50), torch.randn(1, 1, 50))
    for vector in inputs:
        _, hidden = lstm(vector.view(1, 1, -1), hidden)
    return hidden[0]


def train(vec_dict):
    """Train the MLP on LSTM summary encodings, then evaluate on the test set.

    Summaries and scores come from parseCSV(MOVIE_CSV) and are split by
    generateSets. Each summary is encoded by feeding its word embeddings
    through an LSTM; the final hidden state is passed to the MLP, which
    predicts the score. Training makes one pass over the training set with
    an MSE loss and the Adam optimizer.

    @ Params: vec_dict - dictionary mapping a token to its numpy embedding
                         (assumed to be 50-dimensional to match the LSTM
                         input size -- confirm against the loaded vectors)

    @ Return: differences - list of tuples
                            (predicted_value - actual_value, index of movie)
                            with one entry per test-set summary
    """
    mlp_model = MLP()
    # NOTE(review): only the MLP parameters are given to the optimizer, so the
    # LSTM keeps its initial weights. Confirm whether that is intended.
    optimizer = torch.optim.Adam(mlp_model.parameters(), lr=0.001)
    loss_fn = torch.nn.MSELoss()
    _, summary, score = parseCSV(MOVIE_CSV)
    trainingSet, testSet, testSetIndex, trainingSetIndex = generateSets(summary)
    # NOTE(review): the seed is set after MLP() is built, so it fixes the LSTM
    # weights and initial states but not the MLP weights.
    torch.manual_seed(1)
    lstm = nn.LSTM(50, 50)

######################### START TRAINING ######################################

    for text, associated_index in zip(trainingSet, trainingSetIndex):
        features = _final_hidden_state(lstm, _embed_summary(text, vec_dict))
        predicted_value = mlp_model(features)

        # The target is a constant, so it must not require gradients. It is
        # reshaped to match the prediction so MSELoss does not rely on
        # broadcasting.
        actual_value = torch.tensor(
            score[associated_index], dtype=torch.float32
        ).view_as(predicted_value)

        # clears the gradient as Pytorch accumulates gradients
        optimizer.zero_grad()
        loss = loss_fn(predicted_value, actual_value)

        # gradient descent
        loss.backward()
        optimizer.step()

######################### END TRAINING #######################################

######################### START TESTING ######################################

    # Stores the difference between the predicted and the actual value.
    differences = []

    # No gradients are needed for evaluation, so skip building autograd graphs.
    with torch.no_grad():
        for text, associated_index in zip(testSet, testSetIndex):
            features = _final_hidden_state(lstm, _embed_summary(text, vec_dict))
            predicted_value = mlp_model(features).item()
            actual_value = score[associated_index]
            differences.append((predicted_value - actual_value, associated_index))

    return differences
######################### END TESTING ######################################
    
def calculate_accuracy(differences, margin=None):
    """Return the fraction of predictions whose absolute error is within the margin.

    @ Params: differences - list of tuples whose first element is
                            (predicted_value - actual_value)
              margin - largest absolute difference that still counts as a
                       correct prediction. When omitted, the module-level
                       MARGIN_ERROR is used.

    @ Return: number of correct predictions divided by the number of
              predictions, or 0.0 when differences is empty
    """
    if margin is None:
        margin = MARGIN_ERROR

    # An empty result list has no correct predictions; avoid dividing by zero.
    if not differences:
        return 0.0

    # abs() makes under-predictions count as errors too, and each hit adds one.
    num_correct = sum(1 for element in differences if abs(element[0]) <= margin)
    return num_correct / len(differences)
        
def yolo():
    """Print the string "yolo" to standard output."""
    message = "yolo"
    print(message)

# Run Functions:

In [46]:
# Input paths and the tolerance used when scoring predictions
GLOVE_PATH = './glove.6B.50d.txt'
MOVIE_CSV = 'tmdb_5000_movies_modified.csv'
MARGIN_ERROR = 1  # How large a difference we accept and still count as a good prediction

vect_dict = loadGloveModel(GLOVE_PATH)

print("Running train() .... \n")

# Time the call to train() in seconds, e.g. 5.38091952400282
start = timer()
result_list = train(vect_dict)  # list of (predicted_value - actual_value, movie index)
end = timer()
elapsed_sec = end - start

print("Finished running train() in  {} seconds \n".format(elapsed_sec))
print("Number of tests:  {} \n {}".format(len(result_list), result_list))

Loading Glove Model
Done. 400000  words loaded!
Running train() .... 

Finished running train() in  5.717856443490746 seconds 

Number of tests:  9 
 [(-5.3791069984436035, 35), (-5.780598664283753, 32), (-5.28012444972992, 5), (-6.878690123558044, 46), (-6.979372715950012, 42), (-5.879568338394165, 38), (-6.779308104515076, 47), (-5.479766941070556, 4), (-5.681228387355804, 33)]


In [47]:
# Print the score that calculate_accuracy computes from the
# (difference, index) pairs returned by train().
print(calculate_accuracy(result_list))

0.0
