In [2]:
#This cell downloads the data needed for this notebook from Kaggle and stores it in the
#"the-movies-dataset" folder inside the current directory.
#If that folder already exists and contains the files, then this cell does nothing and requires no credentials.
import opendatasets as od

#Data Source Information:
#https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset?select=movies_metadata.csv

od.download("https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset")

Skipping, found downloaded files in ".\the-movies-dataset" (use force=True to force download)


In [1]:
#This cell combines selected columns from the necessary csv files into a single dataframe (complete_df)
import pandas as pd


#Rows are removed from each dataframe when a required column is missing or holds an empty list ("[]").
#This row removal is done before the merges, i.e. before the same movie data gets copied into multiple rows,
#which saves time and space.
#The filtering is vectorised (a boolean mask built with .str.endswith) instead of iterating over rows,
#so the dataframes never have to be converted to lists and back before they are merged.


pd.set_option('display.max_colwidth', None)


def _drop_incomplete_rows(frame, list_columns):
    """
    Return the rows of `frame` that have no missing value in any column and whose
    `list_columns` entries do not end with "[]" (a serialized empty list).
    """
    frame = frame.dropna()
    keep = pd.Series(True, index=frame.index)
    for column in list_columns:
        #no missing values remain after dropna, so the mask converts cleanly to plain bool
        keep &= ~frame[column].str.endswith("[]").astype(bool)
    return frame[keep]


movies_columns = ["genres", "id", "title", "tagline", "overview", "production_companies"]
movies_df = pd.read_csv('./the-movies-dataset/movies_metadata.csv', usecols=movies_columns,
                        dtype="string")[movies_columns]
movies_df = _drop_incomplete_rows(movies_df, ["genres", "production_companies"])


ratings_columns = ["userId", "movieId", "rating"]
ratings_df = pd.read_csv('./the-movies-dataset/ratings.csv', usecols=ratings_columns,
                         dtype="string")[ratings_columns]
ratings_df = ratings_df.rename(columns={"movieId": "id"}).dropna()


#Question: What if the removal of duplicate movie ids per user was processed here instead of the cell below???
#Answer: The duplicate removal could be run here, but the complete_list in the cell below is iterated over anyway,
#and duplicates can be removed during that same pass at little extra cost.
#That iteration also populates the gaps list, which must be built directly before the step that determines
#the bounds of each user's rated movies.
#So, leaving duplicate removal to the next cell avoids a redundant iteration.


#Question: What if the test and train ratings bounds were enforced here instead of the cell below???
#Answer: The merges below need to be executed before determining test and train users, because an inner merge
#removes rows (and therefore user ratings) that have no match. Enforcing the bounds after the merges ensures
#that the final users are within the chosen train or test bounds.


keywords_columns = ["id", "keywords"]
keywords_df = pd.read_csv('./the-movies-dataset/keywords.csv', usecols=keywords_columns,
                          dtype="string")[keywords_columns]
keywords_df = _drop_incomplete_rows(keywords_df, ["keywords"])


credits_columns = ["cast", "id"]
credits_df = pd.read_csv("./the-movies-dataset/credits.csv", usecols=credits_columns,
                         dtype="string")[credits_columns]
credits_df = _drop_incomplete_rows(credits_df, ["cast"])


#default merge is inner: this only keeps movies that have the id existing in both dataframes
complete_df = pd.merge(movies_df, ratings_df, on="id")
complete_df = pd.merge(complete_df, keywords_df, on="id")
complete_df = pd.merge(complete_df, credits_df, on="id")

#for testing
print(complete_df.head())

#the next cell relies on all rows of a user being consecutive
complete_df = complete_df.sort_values(by='userId')


#master dataframe: For each (user id, movie id) row combination there is the combined movie data from movies_df, ratings_df, keywords_df, and credits_df for the movie id in question
#reorder the columns
complete_df = complete_df.loc[:, ['userId', 'id', 'rating', "title", "genres", "production_companies", "keywords", "cast", "tagline", "overview"]]


del movies_df
del ratings_df
del keywords_df
del credits_df

#for testing
print(complete_df.head())


#historical timings on a personal machine (measured before the filtering was vectorised):
#run with dataframe iteration: 1m 5.7
#run with list conversion for iteration: 37.1 seconds


                                                                                              genres  \
0  [{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]   
1  [{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]   
2  [{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]   
3  [{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]   
4  [{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]   

     id    title                                    tagline  \
0  8844  Jumanji  Roll the dice and unleash the excitement!   
1  8844  Jumanji  Roll the dice and unleash the excitement!   
2  8844  Jumanji  Roll the dice and unleash the excitement!   
3  8844  Jumanji  Roll the dice and unleash the excitement!   
4  8844  Jumanji  Roll the dice and unleash the excitement! 

In [2]:
import ast
import random
import time

# wall-clock start of this cell; the elapsed minutes are printed at the end of the cell
start_time = time.time()


#TODO: review commenting and docstring conventions:
#https://www.askpython.com/python/python-comments

# seed Python's random module so the random user sampling in this cell is repeatable across runs
seed_int = 2
random.seed(seed_int)


def populate_names(item):
    """
    Extract the "name" values from a serialized list of dictionaries.

    Args:
        item: string holding a Python-literal list of dicts, each with a "name" key,
            e.g. "[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}]".

    Returns:
        The names converted with str() and joined by single spaces
        ("Adventure Fantasy" for the example above); "" for an empty list ("[]").

    Raises:
        ValueError / SyntaxError: if `item` is not a valid Python literal.
        KeyError: if an entry has no "name" key.
    """
    #parse the whole literal at once; splitting the text on "}, " by hand breaks
    #as soon as a value itself contains that character sequence
    entries = ast.literal_eval(item)
    return " ".join(str(entry["name"]) for entry in entries)


def provide_data(row):
    """
    Convert one raw row of complete_list into typed / cleaned movie data.

    Positions 0 and 1 are converted to int, position 2 to float, position 3 is passed
    through unchanged, positions 4-7 are reduced to their names with populate_names and
    positions 8 and 9 are converted to str. A new 10-element list is returned.
    """
    user_id = int(row[0])
    movie_id = int(row[1])
    rating = float(row[2])
    title = row[3]

    #columns 4..7 hold serialized lists of dicts; keep only the names
    name_fields = [populate_names(row[column]) for column in range(4, 8)]

    return [user_id, movie_id, rating, title, *name_fields, str(row[8]), str(row[9])]
    


#The list of rows with user ids and raw movie data
#Note: It is sorted by user_id, so all rows of one user are consecutive
complete_list = complete_df.values.tolist()

print("Complete number of users:", len(complete_df["userId"].unique())) #260788

del complete_df


#inclusive bounds on the number of (de-duplicated) rated movies that make a user a train or a test user
TRAIN_MIN_RATINGS, TRAIN_MAX_RATINGS = 50, 70
TEST_MIN_RATINGS, TEST_MAX_RATINGS = 5, 10

#the complete list of user rows without ratings of the same movie more than once for a given user
complete_list_no_dups = []

#distinguishes the user the current row belongs to
last_id = complete_list[0][0]

#the set of movies that the current user has rated
#used to omit later ratings of a movie that the user has already rated
movie_set = set()

#the number of rows of movie data each user takes up in complete_list_no_dups, in user order
gaps = []

#running row count of the current user; appended to gaps once all of that user's rows have been counted
gap_len = 0


#populates gaps and complete_list_no_dups, omitting movies that already have a rating from the same user
for row in complete_list:
    if last_id != row[0]:
        #a new user starts: close the previous user's count and forget their movies
        gaps.append(gap_len)
        gap_len = 0
        movie_set = set()
        last_id = row[0]
    if row[1] not in movie_set:
        movie_set.add(row[1])
        complete_list_no_dups.append(row)
        gap_len += 1

#add the last gap_len
gaps.append(gap_len)


del complete_list


#index in the complete_list_no_dups list
full_index = 0
#[start, end) indices into complete_list_no_dups of each train user's rated movies
#(train users satisfy TRAIN_MIN_RATINGS <= number of rows <= TRAIN_MAX_RATINGS)
bounds_train = []
#[start, end) indices into complete_list_no_dups of each test user's rated movies
#(test users satisfy TEST_MIN_RATINGS <= number of rows <= TEST_MAX_RATINGS)
bounds_test = []


#populates bounds_train and bounds_test
for gap in gaps:
    if TRAIN_MIN_RATINGS <= gap <= TRAIN_MAX_RATINGS:
        bounds_train.append([full_index, full_index + gap])
    elif TEST_MIN_RATINGS <= gap <= TEST_MAX_RATINGS:
        bounds_test.append([full_index, full_index + gap])
    full_index += gap



#Question: Why save bounds of the users movie indices instead of just storing movie information for each movie rated by each user???
#Answer: Because it saves memory and it doesn't make sense to save all the movie information when only a...
#subset of the train and test users will be selected below (in this case 5000 and 1000 respectively)

#14314 train users in the train range
print("Number of train users before random selection:",len(bounds_train))
#68048 test users in the test range
print("Number of test users before random selection:", len(bounds_test))


del gaps


nof_train_bounds_to_select = 5000
nof_test_bounds_to_select = 1000

#random.sample raises ValueError if fewer users are available than requested
bounds_train_sample = random.sample(bounds_train, nof_train_bounds_to_select)
bounds_test_sample = random.sample(bounds_test, nof_test_bounds_to_select)


del bounds_train
del bounds_test


#transformed data from movie data of the selected train users and test users (in that order)
sampled_data = []

#avg counts every sampled row, cnt counts the sampled users; cnt also serves as the new consecutive user id
avg = 0
cnt = 0

#train users first, then test users; each user's rows stay consecutive
for bound in bounds_train_sample + bounds_test_sample:
    for movie in complete_list_no_dups[bound[0]:bound[1]]:
        movie_data = provide_data(movie)
        #replace the original user id with the consecutive id
        movie_data[0] = cnt
        sampled_data.append(movie_data)
        avg += 1
    cnt += 1



del complete_list_no_dups
del bounds_train_sample
del bounds_test_sample



print("Average number of ratings for the users chosen:", avg/cnt)


print("Minutes taken:", (time.time()-start_time)/60)


#results

# Complete number of users: 260788
# Number of train users before random selection: 14314
# Number of test users before random selection: 68048
# Average number of ratings for the users chosen: 50.236
# Minutes taken: 4.779916667938233

Complete number of users: 260788
Number of train users before random selection: 14314
Number of test users before random selection: 68048
Average number of ratings for the users chosen: 50.236
Minutes taken: 4.841017043590545


In [3]:
#save in a constructed_data/constructed_data.csv file so that cells below can run without running this cell and above
import csv
import os

current_directory = os.getcwd()
final_directory = os.path.join(current_directory, 'constructed_data')
#exist_ok avoids the check-then-create race and makes re-running the cell safe
os.makedirs(final_directory, exist_ok=True)

#write into the directory that was just ensured to exist
#newline='' lets the csv module control line endings itself
with open(os.path.join(final_directory, "constructed_data.csv"), "w", encoding="utf-8", newline='') as f:
    writer = csv.writer(f)
    #header row; the column order matches the rows stored in sampled_data
    writer.writerow(['userId','id','rating',"title", "genres","production_companies","keywords", "cast", "tagline", "overview"])
    writer.writerows(sampled_data)

In [4]:
#this is a starting point if the data is already saved to the constructed_data/constructed_data.csv
import csv

data_list = []

#newline='' is what the csv module expects for files it parses; it keeps line breaks embedded
#in quoted fields exactly as they were written by the csv writer
with open("constructed_data/constructed_data.csv", 'r', encoding="utf-8", newline='') as read_obj:
    csv_reader = csv.reader(read_obj)
    #skip the header row (a no-op on an empty file)
    next(csv_reader, None)
    #every remaining row is a list of strings
    data_list = list(csv_reader)


In [5]:
import random
from itertools import groupby
from operator import itemgetter

# seed for consistent results across runtime
seed_int = 2
random.seed(seed_int)

#A user whose number of movie rows lies in this inclusive range is a test user; every other user is a train user
TEST_MIN_RATINGS = 5
TEST_MAX_RATINGS = 10

#How many users of each type are kept for the model
NOF_TRAIN_USERS = 5000
NOF_TEST_USERS = 1000

#These are lists of user data
#Each user in each list contains movie data for all the movies they rated
user_to_data_train = []
user_to_data_test = []

#For each user, distinguish whether they are a train or test user by the number of their corresponding movie rows.
#The movie rows of a user are consecutive in data_list because data_list is sorted by user,
#so grouping consecutive rows on the user id column (row[0]) yields exactly one group per user.
#Note: The cell that builds sampled_data already knows which users are train and which are test users,
#so this classification is arguably redundant.
#However, this cell executes instantly so there is no need to persist any user type labels.
for _, user_rows in groupby(data_list, key=itemgetter(0)):
    ratings = list(user_rows)
    if TEST_MIN_RATINGS <= len(ratings) <= TEST_MAX_RATINGS:
        user_to_data_test.append(ratings)
    else:
        user_to_data_train.append(ratings)



#This is where smaller or equal samples of train and test users are selected from constructed_data.csv

#Note: suppose the sampling cell was executed for an excessive amount of train and test users and they are saved
#in constructed_data.csv; then the loading cell and this cell can be executed to select a smaller subset of
#those for the model without having to run the relatively slow sampling cell over again.
#random.sample raises ValueError if fewer users are available than requested.

user_to_data_train = random.sample(user_to_data_train, NOF_TRAIN_USERS)
user_to_data_test = random.sample(user_to_data_test, NOF_TEST_USERS)


In [6]:
from gensim.parsing.preprocessing import remove_stopwords
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
import random
from ordered_set import OrderedSet
from collections import Counter
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import linear_kernel

# seed Python's random module so the random target-movie choices made in this cell are repeatable across runs
seed_int = 2
random.seed(seed_int)


class user_type_vars:
    """Container for the data structures that belong to one user type (train users or test users)"""

    def __init__(self):
        #one dict per user: movie_id -> rating for every movie that user watched
        self.user_to_movie_id_to_rating = list()

        #one movie_id per user: the randomly chosen movie that acts as the target movie
        self.user_to_target_movie_id = list()

        #one index per user: position of the user's target movie within the train movies_in_order
        #(train_users only)
        self.user_to_target_index_full = list()

        #one list per user with a rating for every movie of the entire train set
        #missing ratings and target movie ratings are set to that movie's average rating
        #(train_users only)
        self.user_to_ratings_full = list()

        #one list per user with a rating for every movie of the entire train set,
        #after the movie's mean rating has been subtracted from each rating
        #missing ratings and target movie ratings are set to zero
        #(train_users only)
        self.user_to_ratings_full_transform = list()

        #every unique target movie of the train set
        #used to check whether a movie is in the train set only as a target movie
        #(train_users only)
        self.target_movies = set()

        #movie_id -> list of ratings, for every movie watched by this user type
        self.movie_id_to_ratings = {}

        #all movie ids, ordered by where they first appear in the list of user ratings (either train or test users)
        self.movies_in_order = OrderedSet()

        #model input features x, one value per user
        self.feature_1 = list()
        self.feature_2 = list()
        self.feature_3 = list()

        #model output feature y, one value per user
        self.user_to_target_rating = list()


#one user_type_vars instance holds the data structures of the train users, the other those of the test users
train_users = user_type_vars()
test_users = user_type_vars()


#shared lemmatizer instance, used by load_feature_1_and_2 to lemmatize each word of a movie's text
wnl = WordNetLemmatizer()



def load_feature_1_and_2(target_movies, movies_in_order, user_to_data, movie_id_to_ratings, user_to_movie_id_to_rating, user_to_target_movie_id, user_to_target_rating, feature_1, feature_2):
    """
    Populate feature_1 and feature_2, plus the supporting structures, for one group of users.

    This is run once to populate features 1 and 2 for the train_users
    and run again to populate features 1 and 2 for the test_users.
    Every list / set / dict argument except user_to_data is filled in place.
    The populated structures are used in the load_feature_3 function.

    Args:
        target_movies: set that receives each user's randomly chosen target movie id.
        movies_in_order: ordered set that receives every movie id encountered (via .add).
        user_to_data: list of users; each user is a list of movie rows where row[1] is the
            movie id, row[2] the rating and row[4] the genre text.
        movie_id_to_ratings: dict that receives, per movie id, the list of its non-target ratings.
        user_to_movie_id_to_rating: list that receives one dict per user (movie id -> rating,
            the target movie included).
        user_to_target_movie_id: list that receives one target movie id per user.
        user_to_target_rating: list that receives one target rating per user.
        feature_1: list that receives, per user, the unweighted mean of the non-target ratings.
        feature_2: list that receives, per user, the similarity-weighted mean of the non-target
            ratings (falls back to the feature_1 value when the weights sum to zero).

    Returns:
        float: the mean of all non-target ratings over all users of this call.

    NOTE(review): a user with a single movie row has no non-target ratings, so predict() would
    divide by zero; likewise the return statement divides by zero when no non-target rating was
    seen at all. The callers presumably guarantee at least 5 rows per user - confirm.
    """
    #these are used to calculate the overall average rating of this call
    #only the value returned by the "train" call is kept by the caller (as overall_average_train)
    #it is used to fill in ratings for movies that are only target movies for a certain set of users
    #this is because in practice and full application of the program/model, the target movie ratings are unknown (they need to be predicted)
    overall_rating_sum = 0
    overall_rating_count = 0

    for i in range(len(user_to_data)):
        movie_id_to_words = dict()
        movie_id_to_rating = dict()
        index = 0
        #position (within this user's rows) of the movie that becomes the target movie
        rand_index = random.randint(0, len(user_to_data[i])-1)
        for movie_data in user_to_data[i]:
            if index == rand_index:    
                target_movies.add(movie_data[1])
                user_to_target_movie_id.append(movie_data[1])
            else:
                #the program should train and test while simulating the possibility of
                #some new movies having no existing rating data in the database.
                #This is critical for training and testing if the resulting model were realistically to be tried on completely new data...
                #where target ratings are unknown (they need to be predicted)
                
                #this is why target ratings are omitted from movie_id_to_ratings
                #the same logic stands for overall_average_train which is formed by overall_rating_sum and overall_rating_count
                overall_rating_sum += float(movie_data[2])
                overall_rating_count += 1

                if movie_data[1] in movie_id_to_ratings.keys():
                    movie_id_to_ratings[movie_data[1]].append(float(movie_data[2]))
                else:
                    movie_id_to_ratings[movie_data[1]] = [float(movie_data[2])]

            movie_string = ""

            #use this to apply all the text data and combine it into a single list of words (repeats allowed):
            # for j in range (3,len(movie_data)):
            #     if(j!= len(movie_data)-1):
            #         movie_string+= movie_data[j]+" "
            #     else:
            #         movie_string+= movie_data[j]

            #all of the text columns and a few combinations of certain text columns were tested but they were not helpful in...
            #increasing model performance (see README.md)

            #Use this truncated code to only include the genre column strings:
            movie_string = movie_data[4]

            #lemmatization and conversion to lists of words
            cleaned_string = remove_stopwords(movie_string)
            cleaned_list = [wnl.lemmatize(word) for word in cleaned_string.split(" ")]
            #strip one trailing "." from words that end with it; note that those words are moved to the front of the list
            cleaned_list = [word[:-1] for word in cleaned_list if word.endswith(".")] + [word for word in cleaned_list if not word.endswith(".")]

            #the target movie is included in these per-user dictionaries
            movie_id_to_words[movie_data[1]] = cleaned_list
            movie_id_to_rating[movie_data[1]] = float(movie_data[2])
            movies_in_order.add(movie_data[1])
            index+=1

        user_to_movie_id_to_rating.append(movie_id_to_rating)

        #Question: This function assumes that all movies have their corpus information somewhere in the entire "the-movies-dataset"
        #in order to make a word count vector.
        #What happens if, with the application of this program/model, a completely new movie to the database is found???
        #Answer: Then feature_2 would not function as a model parameter
        #Note: For the runtime of this notebook, since the train and test data all come from "the-movies-dataset" there is no risk of this happening
        #Note: The model below is currently fed feature_1 and feature_3 which do not need this corpus data

        #The current user's set of words from all the movies they rated
        users_words_in_order = OrderedSet()
        for movie_id in movie_id_to_words.keys():
            for word in movie_id_to_words[movie_id]:
                users_words_in_order.add(word)


        word_counts = [] #list of word counts in the order of users_words_in_order for each movie (excluding target)
        target_word_counts = [] #word counts in the order of users_words_in_order for the target movie


        #populate word_counts and target_word_counts
        #movie_id_to_words and movie_id_to_rating were filled in the same order, so row k of word_counts
        #lines up with entry k of the ratings list built further below
        for movie_id in movie_id_to_words.keys():
            if movie_id != user_to_target_movie_id[-1]:
                temp_dict = Counter(movie_id_to_words[movie_id])
                temp_list = []
                for word in users_words_in_order:
                    if word in temp_dict.keys():
                        temp_list.append(temp_dict[word])
                    else:
                        temp_list.append(0)  
                word_counts.append(temp_list)  
            else:
                temp_dict = Counter(movie_id_to_words[movie_id])
                temp_list = []
                for word in users_words_in_order:
                    if word in temp_dict.keys():
                        temp_list.append(temp_dict[word])             
                    else:
                        temp_list.append(0) 
                target_word_counts = temp_list
        

        #construct the normalized tf-idf before applying cosine similarity/linear kernel
        #this places value on terms that are uncommon across the documents,
        #while still placing value on how common they are in the document at hand
        #in this case a document is the word counts of a single movie the user rated
        #this should lead to a more powerful quantifier for cosine similarity/linear kernel between documents

        #the target movie's counts are appended last, so it is the last row of the tf-idf matrix
        complete_word_counts = word_counts.copy()
        complete_word_counts.append(target_word_counts)
        transformed_word_counts = TfidfTransformer().fit_transform(complete_word_counts).toarray()


        #populate ratings without the target rating
        ratings = []
        for movie_id in movie_id_to_rating.keys():
            if movie_id != user_to_target_movie_id[-1]:
                ratings.append(movie_id_to_rating[movie_id])
            else:
                #add the target movie rating for a single user (each user has only one target movie rating)
                user_to_target_rating.append(movie_id_to_rating[movie_id])
    

        def predict():
            """Return (pred_1, pred_2) for the current user, computed from ratings and transformed_word_counts"""
            #pred_1 is the unweighted average of all of the user's non-target movie ratings
            pred_1 = 0 
            #pred_2 is the weighted average of all the user's non-target movie ratings (weights are based on cosine similarity/linear kernel)
            #unless the denominator is zero (see below)
            pred_2 = 0

            #NOTE(review): this local name shadows the builtin sum() inside predict()
            sum = 0
            for i in range(len(ratings)):
                sum += ratings[i]
            pred_1 = float(sum/len(ratings))

            #similarity of every non-target movie (all rows but the last) to the target movie (last row)
            cosine_sim = linear_kernel(X = transformed_word_counts[0:-1],Y = [transformed_word_counts[-1]])
            #flatten the single-column result into a one-dimensional array
            cosine_sim = np.reshape(cosine_sim,  (len(cosine_sim)))
            numerator = 0
            denominator = 0
            #default used when every similarity is zero
            pred_2 = pred_1
            for i in range(len(ratings)):
                numerator += float(cosine_sim[i]*ratings[i])
                denominator += cosine_sim[i]
    
            #in case of potential division by zero
            if denominator != 0:
                pred_2 = float(numerator/denominator)
    
            return (pred_1, pred_2)
        
        predictions = predict()

        feature_1.append(predictions[0])
        feature_2.append(predictions[1])
            
        
    return float(overall_rating_sum/overall_rating_count)

#populate train data (feature 1 and feature 2)
#overall_average_train (overall_rating_sum/overall_rating_count) is taken only from the return value of this "train" call
#it is used to fill in ratings for movies that are only target movies for a certain set of users
overall_average_train = load_feature_1_and_2(train_users.target_movies, train_users.movies_in_order, user_to_data_train, train_users.movie_id_to_ratings, train_users.user_to_movie_id_to_rating, 
                                                         train_users.user_to_target_movie_id, train_users.user_to_target_rating, train_users.feature_1, train_users.feature_2)

#populate test data (feature 1 and feature 2)
#the return value is ignored and a throwaway set() is passed for target_movies, so test_users.target_movies stays empty
load_feature_1_and_2(set(), test_users.movies_in_order, user_to_data_test, test_users.movie_id_to_ratings, test_users.user_to_movie_id_to_rating, 
               test_users.user_to_target_movie_id,
               test_users.user_to_target_rating, test_users.feature_1, test_users.feature_2)


def pre_svd(movie_id_to_average_rating, movies_in_order, user_to_ratings_full_transform, user_to_ratings_full, user_to_target_index_full, 
               user_to_movie_id_to_rating, user_to_target_movie_id):
    """
    Fill user_to_ratings_full and user_to_ratings_full_transform with one row per user.

    Each row holds one entry per movie of movies_in_order. In user_to_ratings_full the entry is
    the user's own rating, or the movie's average rating when the movie is unrated by the user
    or is the user's target movie. user_to_ratings_full_transform holds the same values with the
    movie's average rating subtracted, so unrated and target positions become zero; this is the
    matrix intended for svd.
    The position of each user's target movie within movies_in_order is appended to
    user_to_target_index_full.
    """
    for user_index, own_ratings in enumerate(user_to_movie_id_to_rating):
        full_row = []
        centered_row = []

        #position doubles as the index of the movie within the entire movies_in_order ordered set
        for position, movie_id in enumerate(movies_in_order):
            is_target = movie_id == user_to_target_movie_id[user_index]
            if is_target:
                user_to_target_index_full.append(position)

            if not is_target and movie_id in own_ratings:
                #a real, non-target rating of this user
                rating = own_ratings[movie_id]
                full_row.append(rating)
                centered_row.append(rating - movie_id_to_average_rating[movie_id])
            else:
                #target or unrated movie: use the movie average, which centers to zero
                average = movie_id_to_average_rating[movie_id]
                full_row.append(average)
                centered_row.append(average - average)

        #user_to_ratings_full is just for demonstration
        user_to_ratings_full.append(full_row)
        #per movie averages have been subtracted (data is ready for svd)
        user_to_ratings_full_transform.append(centered_row)



def svd_full(user_to_ratings_full_transform, n, movie_id_to_average_rating):
    """
    Build a rank-n approximation of the mean-centered rating matrix and shift it back
    onto the original rating scale.

    1. get the svd of user_to_ratings_full_transform
    2. truncate each factor to its first n components
    3. multiply the truncated factors together: (U x S) x Vh
    4. add each movie's average rating back to its column and return the result

    Args:
        user_to_ratings_full_transform: 2-D array-like (one row per user, one column per movie)
            of ratings from which the per-movie averages have been subtracted.
        n: number of singular components to keep.
        movie_id_to_average_rating: dict whose values are the per-movie average ratings.
            Its iteration order is used as the column order - presumably it must match the
            movie order of the rating matrix; verify against the caller.

    Returns:
        list with one numpy array (row of reconstructed ratings) per user.
    """
    #np.linalg.svd returns the right factor already transposed (Vh)
    U, S, V = np.linalg.svd(user_to_ratings_full_transform, full_matrices=False)

    #scaling the first n columns of U by the first n singular values equals U[:, :n] @ diag(S[:n])
    #without materialising the full square diagonal matrix first
    US = U[:, :n] * S[:n]
    USV = np.dot(US, V[:n, :])

    #add the movie averages column-wise (broadcast over all user rows) to return to the original rating scale
    movie_averages = np.asarray(list(movie_id_to_average_rating.values()), dtype=float)
    USV = USV + movie_averages

    #be consistent with the list-based data structures used elsewhere
    return list(USV)



def load_feature_3():
    """
    Fill feature_3 of train_users and test_users with svd based predictions of every user's target rating.

    Outline:
    1. find an average rating per movie, once for the train set and once for the train+test (full) set
    2. pre_svd writes a rating for every movie for every user, plus a transformed version of those ratings made with the averages above
    3. svd_full is run on the transformed ratings and its output is read by row for the user and by column for the target movie

    Works on the notebook level train_users and test_users objects and returns nothing.
    """

    def add_ratings(ratings, start):
        """Add the ratings to start one by one, in order, and return the total"""
        total = start
        for rating in ratings:
            total += rating
        return total


    #Every movie of either the train users or the test users
    all_movies_in_order = train_users.movies_in_order|test_users.movies_in_order


    #How the two kinds of averages are formed:
    #
    #Only the non-target ratings of a movie are used to form its average rating.
    #movie_id_to_average_rating_train considers the non-target ratings of the train set,
    #movie_id_to_average_rating_full considers the non-target ratings of the train and the test set.
    #
    #A movie that has only target ratings takes the value of overall_average_train instead of the mean of those target ratings.
    #This simulates applying the model to movies that have to be rated for a new user but have no ratings in the existing data.
    #
    #The train averages normalize the ratings of the train users in the first pre_svd call.
    #The full averages normalize the ratings of the train+test users in the second pre_svd call.

    movie_id_to_average_rating_train = dict()
    movie_id_to_average_rating_full = dict()

    train_ratings_by_movie = train_users.movie_id_to_ratings
    test_ratings_by_movie = test_users.movie_id_to_ratings

    for movie in all_movies_in_order:
        rated_by_train = movie in train_ratings_by_movie
        rated_by_test = movie in test_ratings_by_movie

        if rated_by_train:
            train_ratings = train_ratings_by_movie[movie]
            train_total = add_ratings(train_ratings, 0)
            movie_id_to_average_rating_train[movie] = float(train_total/len(train_ratings))

            if rated_by_test:
                #the test ratings are added on top of the train total to form the full average
                test_ratings = test_ratings_by_movie[movie]
                full_total = add_ratings(test_ratings, train_total)
                movie_id_to_average_rating_full[movie] = float(full_total/(len(train_ratings)+len(test_ratings)))
            else:
                movie_id_to_average_rating_full[movie] = movie_id_to_average_rating_train[movie]
            continue

        #no train ratings from here on: a target movie of the train users falls back to the overall train average
        if movie in train_users.target_movies:
            movie_id_to_average_rating_train[movie] = overall_average_train

        if rated_by_test:
            test_ratings = test_ratings_by_movie[movie]
            movie_id_to_average_rating_full[movie] = float(add_ratings(test_ratings, 0)/len(test_ratings))
        else:
            movie_id_to_average_rating_full[movie] = overall_average_train


    #Note: the three lists below mirror the ones kept in the train_users object,
    #but they cover the full set (all_movies_in_order)

    #per user, the ratings for the movies in the order of all_movies_in_order
    full_user_to_ratings_full = []
    #per user, the transformed ratings for the movies in the order of all_movies_in_order
    full_user_to_ratings_full_transform = []
    #per user, the index of the target movie in the order of all_movies_in_order
    full_user_to_target_index_full = []

    #the watched movies per user and the target movie id per user, train users first and then test users
    full_user_to_movie_id_to_rating = train_users.user_to_movie_id_to_rating + test_users.user_to_movie_id_to_rating
    full_user_to_target_movie_id = train_users.user_to_target_movie_id + test_users.user_to_target_movie_id


    #first call: fills the train_users lists, scaled with the train averages
    pre_svd(
        movie_id_to_average_rating_train,
        train_users.movies_in_order,
        train_users.user_to_ratings_full_transform,
        train_users.user_to_ratings_full,
        train_users.user_to_target_index_full,
        train_users.user_to_movie_id_to_rating,
        train_users.user_to_target_movie_id,
    )

    #second call: fills the full (train+test) lists, scaled with the full averages
    pre_svd(
        movie_id_to_average_rating_full,
        all_movies_in_order,
        full_user_to_ratings_full_transform,
        full_user_to_ratings_full,
        full_user_to_target_index_full,
        full_user_to_movie_id_to_rating,
        full_user_to_target_movie_id,
    )


    #In practice the train set is a selection of what the database has on record,
    #while the test data is usually unseen data that can include any number of test users.
    #svd_out_train (built from the train users only) produces the predictions used to train the model.
    #svd_out_full (built from the train+test users) produces the predictions used to test the model.

    #author's tuning notes:
    #n = 20 is close to the highest performing constant for 100 min ratings for train and test users
    #n = 10 is close to the highest performing constant for 50-75 min ratings per train user and 5-10 min ratings per test user
    #different values of n per call were tested with 50-75 min ratings per train user and 5-10 min ratings per test user,
    #this did not lead to performance benefits, it was best that both values of n were close to 10

    svd_out_train = svd_full(train_users.user_to_ratings_full_transform, 10, movie_id_to_average_rating_train)
    svd_out_full = svd_full(full_user_to_ratings_full_transform, 10, movie_id_to_average_rating_full)

    nof_train_users = len(train_users.user_to_ratings_full_transform)
    nof_all_users = len(full_user_to_ratings_full_transform)

    #the smaller svd provides the predictions used to train the model
    for user in range(nof_train_users):
        target_index = train_users.user_to_target_index_full[user]
        train_users.feature_3.append(svd_out_train[user][target_index])

    #the larger svd provides the predictions used to test the model (the test users come after the train users)
    for user in range(nof_train_users, nof_all_users):
        target_index = full_user_to_target_index_full[user]
        test_users.feature_3.append(svd_out_full[user][target_index])

#fill feature_3 for the train and the test users
load_feature_3()


#show how the features approximate the target rating:
#every print call writes a heading, the first five feature values and the first five target ratings, each on its own line
print("Feature_1 to target comparison (train):", train_users.feature_1[0:5], train_users.user_to_target_rating[0:5], sep="\n")

print("Feature_1 to target comparison (test):", test_users.feature_1[0:5], test_users.user_to_target_rating[0:5], sep="\n")

print("Feature_2 to target comparison (train):", train_users.feature_2[0:5], train_users.user_to_target_rating[0:5], sep="\n")

print("Feature_2 to target comparison (test):", test_users.feature_2[0:5], test_users.user_to_target_rating[0:5], sep="\n")

print("Feature_3 to target comparison (train):", train_users.feature_3[0:5], train_users.user_to_target_rating[0:5], sep="\n")

print("Feature_3 to target comparison (test):", test_users.feature_3[0:5], test_users.user_to_target_rating[0:5], sep="\n")



#drop the two user data names from the notebook namespace
del user_to_data_train, user_to_data_test


#tested on personal machine
#corpus: genres
#runtime: 39.6 seconds

#corpus: all columns
#runtime: 7 minutes


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jackson\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Feature_1 to target comparison (train):
[3.6440677966101696, 3.830188679245283, 3.925, 4.292307692307692, 2.694915254237288]
[4.0, 4.0, 4.0, 4.0, 1.0]
Feature_1 to target comparison (test):
[3.875, 2.5714285714285716, 4.25, 2.5, 3.5714285714285716]
[4.0, 3.0, 5.0, 2.0, 5.0]
Feature_2 to target comparison (train):
[3.745144639601795, 3.8076768875564437, 3.9182777682591516, 4.335568335872769, 2.8208415705890246]
[4.0, 4.0, 4.0, 4.0, 1.0]
Feature_2 to target comparison (test):
[3.3499243716804292, 2.432058151190643, 4.485467687220292, 2.0, 3.197617080180566]
[4.0, 3.0, 5.0, 2.0, 5.0]
Feature_3 to target comparison (train):
[4.030184398469862, 4.205698935046528, 3.793402238618873, 4.174767174650713, 3.0054013377449036]
[4.0, 4.0, 4.0, 4.0, 1.0]
Feature_3 to target comparison (test):
[3.239058885293259, 4.197714366341513, 3.269048587799359, 3.51315448216471, 4.222877788398437]
[4.0, 3.0, 5.0, 2.0, 5.0]


In [7]:
#Build models based on multiple features
#The features themselves are reasonably accurate predictors of the target rating for a (movie, user) combination

from sklearn.linear_model import LinearRegression
#the mlp model is not currently being used
from sklearn.neural_network import MLPRegressor
#feature scaling is not necessary because linear regression converges fast enough
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance
from sklearn.metrics import r2_score
#the alternative evaluation metric
from sklearn.metrics import mean_squared_error



def test_parameters(nof_runs, layers, train_input_features, test_input_features):
    """
    Convert the grouped feature values into lists and return the averaged performance results.

    train_input_features and test_input_features are iterables (e.g. zip objects) that yield one group of feature values per entry.
    nof_runs and layers are handed to average_results unchanged, and its result is returned as is.
    """
    #materialise every group as a list, train inputs first and then test inputs
    train_rows = list(map(list, train_input_features))
    test_rows = list(map(list, test_input_features))
    return average_results(nof_runs, layers, train_rows, test_rows)
    

def average_results(nof_runs, layers, train_inputs, text_inputs):
    """
    Run train_and_test nof_runs times on identical inputs and average its two scores.

    Returns a pair of floats: (mean of the first score, mean of the second score).
    text_inputs holds the test inputs and is passed on as the test argument of train_and_test.
    """
    score_totals = [0, 0]
    for _ in range(nof_runs):
        scores = train_and_test(layers, train_inputs, text_inputs)
        score_totals[0] += scores[0]
        score_totals[1] += scores[1]

    mean_without_rounding = float(score_totals[0]/nof_runs)
    mean_with_rounding = float(score_totals[1]/nof_runs)
    return mean_without_rounding, mean_with_rounding


def train_and_test(layers, train_inputs, test_inputs):
    """
    Fit a model on the train inputs, predict the test inputs, and return two accuracy scores.

    The model is trained against train_users.user_to_target_rating and scored against test_users.user_to_target_rating.
    layers is only needed by the MLPRegressor alternative noted below; the linear model does not use it.
    Returns a pair: (r2_score of the raw predictions, r2_score of the predictions rounded to the nearest .5)
    """
    train_targets = train_users.user_to_target_rating

    #linear regression model (better performance)
    #nn alternative (worse performance):
    # reg = MLPRegressor(hidden_layer_sizes = layers, solver = "adam",  max_iter = 1000)
    reg = LinearRegression()
    reg.fit(train_inputs, train_targets)

    #report how important each of the two input features is to the fitted model
    importances = permutation_importance(reg, train_inputs, train_targets)["importances_mean"]
    print("Feature Importance scores:", "First feature:", importances[0],"Second feature:", importances[1])

    predictions = reg.predict(test_inputs)

    #scores are taken with and without rounding...
    #note: the actual ratings a user makes must be divisible by .5
    rounded_predictions = [float(round(item*2)/2.0) for item in predictions]

    #evaluation metric 1 is r2_score
    #evaluation metric 2 (alternative): use mean_squared_error in place of r2_score in both calls below
    test_targets = test_users.user_to_target_rating
    return (r2_score(test_targets, predictions),
            r2_score(test_targets, rounded_predictions))



# The current test averages the accuracy scores (currently r2_score) of 100 models trained on the same inputs.
# The hidden layers argument is (10,10,10), but the linear model currently in use does not use layers.
# The highest scoring input features (feature_1 and feature_3) are used here.
# NOTE(review): LinearRegression presumably gives the same fit on every run, so the 100 runs would mostly
# matter for the MLP alternative - confirm before relying on the averaging.

# each zip pairs feature_1 with feature_3 per user; test_parameters converts the pairs to lists
avg_scores = test_parameters(100, (10,10,10), 
    zip(train_users.feature_1, train_users.feature_3),
      zip(test_users.feature_1, test_users.feature_3))


# avg_scores[0] is the average score without rounding, avg_scores[1] the average score with rounded predictions
print("Average r2_score without rounding:",avg_scores[0])
print("Average r2_score with rounded prediction to nearest .5 (note: the actual ratings a user makes must be divisable by .5):",avg_scores[1])



# misc... 
# results:

# linear regression, train users with 50-75 ratings, test users with 5-10 ratings, using feature_1 and feature_3, n = 20 for svd_full
# Average r2_score without rounding:  
# Average r2_score with rounded prediction to nearest .5 (note: actual users ratings must be divisibl by .5):  

# BEST:
# linear regression, train users with 50-75 ratings, test users with 5-10 ratings, using feature_1 and feature_3, n = 10 for svd_full
# Average r2_score without rounding:  0.15453027266541675
# Average r2_score with rounded prediction to nearest .5 (note: actual users ratings must be divisibl by .5):  0.13160695282014914

# linear regression, train users with 50-75 ratings, test users with 5-10 ratings, using feature_1 and feature_3, n = 5 for svd_full
# Average r2_score without rounding:  0.1531792173890736
# Average r2_score with rounded prediction to nearest .5 (note: actual users ratings must be divisibl by .5):  0.13107484923731846

# linear regression, train users with 50-75 ratings, test users with 5-10 ratings, using feature_1 and feature_3, n = 15 for svd_full
# Average r2_score without rounding:  0.14856750114822856
# Average r2_score with rounded prediction to nearest .5 (note: actual users ratings must be divisibl by .5):  0.13160695282014914

# linear regression, train users with 50-75 ratings, test users with 5-10 ratings, using feature_1 and feature_3, n = (20,10) for svd_full
# Average r2_score without rounding:  0.14704697205805803
# Average r2_score with rounded prediction to nearest .5 (note: actual users ratings must be divisibl by .5):  0.12415750266051795
#Note: two different n values leads to worse results...

# linear regression, train users with 50-75 ratings, test users with 5-10 ratings, using feature_1 and feature_3, n = (15,10) for svd_full
# Average r2_score without rounding:  0.1497370871224573
# Average r2_score with rounded prediction to nearest .5 (note: actual users ratings must be divisibl by .5):  0.12841433132316396

# new feature_1 and feature_3
# Average r2_score without rounding: 0.25426664596078186
# Average r2_score with rounded prediction to nearest .5 (note: the actual ratings a user makes must be divisable by .5): 0.22858769671550502

# new feature_2 and feature_3
# Average r2_score without rounding: 0.2268076644799909
# Average r2_score with rounded prediction to nearest .5 (note: the actual ratings a user makes must be divisable by .5): 0.20028436068912575



Feature Importance scores: First feature: 0.15922848810167606 Second feature: 0.3443379214071639
Feature Importance scores: First feature: 0.15920975402650253 Second feature: 0.34748315780016703
Feature Importance scores: First feature: 0.1585245890160587 Second feature: 0.3459012612097386
Feature Importance scores: First feature: 0.1564854881388439 Second feature: 0.34798739801720097
Feature Importance scores: First feature: 0.15221589967514887 Second feature: 0.3393385459704432
Feature Importance scores: First feature: 0.1604417690861475 Second feature: 0.3486921564579412
Feature Importance scores: First feature: 0.15911192296395937 Second feature: 0.3381394206220282
Feature Importance scores: First feature: 0.1650248419319896 Second feature: 0.3478245584873635
Feature Importance scores: First feature: 0.1606293062788783 Second feature: 0.34536289373413853
Feature Importance scores: First feature: 0.15618506724862175 Second feature: 0.3450539224577714
Feature Importance scores: First