In [1]:
# This cell downloads the data needed for this jupyter notebook from kaggle and stores it in a new folder (the-movies-dataset) in the current directory.

# Upon running this cell, the user will be asked for their username and key which can be found in a fresh api token from kaggle.

# Instructions to get api token to authenticate the data request (Note: kaggle account required):
# 1. Sign into kaggle.
# 2. Go to the 'Account' tab of your user profile and select 'Create New Token'. 
# 3. This will trigger the download of kaggle.json, a file containing your API credentials.

# If the folder has been created and the files are already in that folder, than this cell does nothing and requires no credentials.
import opendatasets as od

# Data Source Information:
# https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset?select=movies_metadata.csv

od.download("https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset")

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username:

Your Kaggle Key:Downloading the-movies-dataset.zip to ./the-movies-dataset


100%|██████████| 228M/228M [00:23<00:00, 10.2MB/s] 





In [1]:
# This cell is for combining certain data from the necessary csv files into a single dataframe (complete_df).
import pandas as pd
import time

start_time = time.time()

# Rows are removed from each dataframe when they do not have sufficent data for a column or the data from a column does not exist.
# This kind of row removal is done before multiple copies of the same movie data becomes present in multple rows, to save time and space.
# Iteration through rows of a dataframe at this level is inefficient compared to list iteration.
# This is why the dataframes are converted into lists before iteration and then back again to dataframes,... 
# so the merge function can be applied to combine the data into a single dataframe (complete_df).


pd.set_option('display.max_colwidth', None)

movies_df = pd.read_csv('./the-movies-dataset/movies_metadata.csv',usecols=("genres","id" ,"title","tagline", "overview","production_companies"),
                          dtype={'genres':"string","id":"string","title": "string", "tagline": "string","overview":"string",
                                    "production_companies" :"string"})[["genres","id" ,"title","tagline", "overview","production_companies"]]
movies_df.dropna(inplace = True)
movies_lst = [row for row in movies_df.values.tolist() if not (row[0][len(row[0])  - 2:] == "[]" or row[5][len(row[5]) - 2:] == "[]")]
movies_df = pd.DataFrame(movies_lst, columns = ("genres","id" ,"title","tagline", "overview","production_companies"), dtype = str)



ratings_df = pd.read_csv('./the-movies-dataset/ratings.csv', usecols = ("userId", "movieId", "rating"),
                       dtype={"userId": "string","movieId": "string","rating": "string"})[["userId", "movieId", "rating"]]
ratings_df.rename(columns={"movieId": "id"}, inplace = True)
ratings_df.dropna(inplace = True)


# Question: What if the removal of duplicate movie ids per user was processed here instead of the cell below???
# Answer: The duplicate removal function can be ran here,...
# but the complete_list in the cell below can also be iterated over with relative complexity in order to remove duplicates.
# The iteration in the next cell also populates the gap list...
# which is critical to be ran directly before the function that determines bounds for users rated movies.
# So, omitting the no duplicate function in this cell and making it run in the next cell avoids redundant iteration.


# Question: What if the test and train ratings bounds was enforced here instead of the cell below???
# Answer: The merge functions below needs to be executed before determining test and train users, because merge will remove rows and ratings from users...
# before enforcing the users to be in a certain bounds for the number of their ratings. 
# The current timing of this function will ensure that the final users are within the set train or test bounds.


keywords_df = pd.read_csv('./the-movies-dataset/keywords.csv', usecols = ("id", "keywords"), dtype={"id": "string","keywords":"string"})[["id", "keywords"]]
keywords_df.dropna(inplace = True)
keywords_lst = [row for row in keywords_df.values.tolist() if not (row[1][len(row[1])  - 2:] == "[]")]
keywords_df = pd.DataFrame(keywords_lst, columns = ("id", "keywords"), dtype = str)


credits_df = pd.read_csv("./the-movies-dataset/credits.csv", usecols = ("cast", "id"), dtype={"cast": "string", "id": "string"})[["cast", "id"]]
credits_df.dropna(inplace = True)
credits_lst = [row for row in credits_df.values.tolist() if (not row[0][len(row[0])  - 2:] == "[]")]
credits_df = pd.DataFrame(credits_lst, columns = ("cast", "id"), dtype = str)


# Default merge is inner: This only keeps movies that have the id existing in both dataframes.
complete_df =  pd.merge(movies_df, ratings_df, on ="id")
complete_df =  pd.merge(complete_df,keywords_df, on ="id")
complete_df  = pd.merge(complete_df,credits_df, on ="id")


complete_df.sort_values(by = 'userId', inplace = True)


# Master dataframe: For each (user id, movie id) row combination there is the combined movie data from movies_df, ratings_df, keywords_df, and credits_df for the movie id in question.
# The columns are reordered.
complete_df  = complete_df.loc[:,['userId','id','rating',"title", "genres","production_companies","keywords", "cast", "tagline", "overview" ]]



# For testing:
print("Minutes taken:", (time.time()-start_time)/60)
print(complete_df.head())



# Tested on personal machine:
# Old run with dataframe iteration (old code): 1 minute and 5.7 seconds
# New run with list conversion before iteration (current code): 37.1 seconds

Minutes taken: 0.5768450776735942
        userId    id rating               title  \
6566765      1  1246    5.0        Rocky Balboa   
6880303      1  2959    4.0      License to Wed   
2083077      1  2762    4.5  Young and Innocent   
1492304      1  1968    4.0       Fools Rush In   
2638962      1   147    4.5       The 400 Blows   

                                                                                                genres  \
6566765                                                                  [{'id': 18, 'name': 'Drama'}]   
6880303                                                                 [{'id': 35, 'name': 'Comedy'}]   
2083077                                     [{'id': 18, 'name': 'Drama'}, {'id': 80, 'name': 'Crime'}]   
1492304  [{'id': 18, 'name': 'Drama'}, {'id': 35, 'name': 'Comedy'}, {'id': 10749, 'name': 'Romance'}]   
2638962                                                                  [{'id': 18, 'name': 'Drama'}]   

                      

In [59]:
import ast
import random
import time

start_time = time.time()


# Seed for consistent results across runtimes:
seed_int = 2
random.seed(seed_int)


def populate_names(item):
    """Extract names from the syntax of certain data entries:"""
    string  = item[1:-1]
    jsons = string.split("}, ")   
    names = ""
    index = 0
    for item in jsons:
        if(index == len(jsons)-1):
            temp_dict = ast.literal_eval(item)
            names+=str(temp_dict["name"])
        else:
            temp_dict = ast.literal_eval(item+"}")
            names+=str(str(temp_dict["name"])+" ")
        index += 1
    return names


def provide_data(row):
    """Extract data from row of complete_list:"""
    movie_data = []
    movie_data.append(int(row[0]))
    movie_data.append(int(row[1]))
    movie_data.append(float(row[2]))
    movie_data.append(row[3])  

    movie_data.append(populate_names(row[4]))
    movie_data.append(populate_names(row[5]))
    movie_data.append(populate_names(row[6]))
    movie_data.append(populate_names(row[7]))

    movie_data.append(str(row[8]))
    movie_data.append(str(row[9]))
    return movie_data
    


# The list of rows with users id, the users rating for the movie, and raw data for the movie:
# Note: It is sorted by user_id.
complete_list = complete_df.values.tolist()

print("Complete number of users:", len(list(complete_df["userId"].unique()))) # 260788


# The complete list of user rows without ratings of the same movie more than once for a given user:
complete_list_no_dups = []

# Distinquish the user the row belongs to:
last_id = complete_list[0][0]

# The set of movies that a user has rated:
# It is used to omit later ratings of a movie that the user has already rated.
movie_set = set()

# The number of rows of movie data a single user takes up for each user:
gaps = []

# Appended to gaps when all of a users rows of movie data have been counted:
gap_len = 0


# Populates gaps and complete_list_no_dups by omitting movies that already have a rating in respect to each user:
# Note: This code is faster than using dataframe methods.
# Example: Filter data by user and then remove duplicate movie ids for each user.
# This avoids slow dataframe iteration, but the filter method is also slow.
for row in complete_list:
    if last_id != row[0]:
        movie_set= set()
        complete_list_no_dups.append(row)
        movie_set.add(row[1])
        gaps.append(gap_len)
        gap_len = 1
    else:
        if row[1] not in movie_set:
            complete_list_no_dups.append(row)
            gap_len+=1
            movie_set.add(row[1])
    last_id = row[0]

# Add the last gap_len:
gaps.append(gap_len)





# Index in the complete_list_no_dups list:
full_index = 0 
# The start and end indices+1 of the users movies in complete_list_no_dups for each train user:
high_bounds = [] 
# The start and end indices+1 of the users movies in complete_list_no_dups for each test user:
low_bounds = [] 


# Populates bounds_train and bounds_test by testing each user if they are a valid train or test user:
for user_index in range(len(gaps)):
    if 30 <= gaps[user_index] and 50 >= gaps[user_index]:
        high_bounds.append([full_index, full_index+gaps[user_index]])

    elif 5 <= gaps[user_index] and 10 >= gaps[user_index]:
        low_bounds.append([full_index, full_index+gaps[user_index]])

    full_index+=gaps[user_index]    



# Question: Why save bounds of the users movie indices instead of just storing movie information for each movie rated by each user???
# Answer: The current code saves time and memory.
# It is redundant to save all the movie information when only a subset of the train and test users will be selected below.

print("Number of highbound users before random selection:",len(high_bounds)) 
print("Number of lowbound users before random selection:", len(low_bounds)) 

fill_in_users = 6000
filler_user_bounds = random.sample(high_bounds, fill_in_users)



random.shuffle(low_bounds)

train_bound_max = 6000
test_bound_max = 3000

low_bounds_train_sample = low_bounds[test_bound_max:train_bound_max]
low_bounds_test_sample = low_bounds[0:test_bound_max]


# Transformed data of the selected train users and test users (in that order):
sampled_data = []

avg = 0
cnt = 0

for bound in low_bounds_train_sample:
    for movie in complete_list_no_dups[bound[0]:bound[1]]:
        movie_data = provide_data(movie)
        movie_data[0] = cnt
        movie_data.append("train")
        sampled_data.append(movie_data)
        avg += 1
    cnt+=1


for bound in filler_user_bounds:
    for movie in complete_list_no_dups[bound[0]:bound[1]]:
        movie_data = provide_data(movie)
        movie_data[0] = cnt
        movie_data.append("filler")
        sampled_data.append(movie_data)
        avg += 1
    cnt+=1

for bound in low_bounds_test_sample:
    for movie in complete_list_no_dups[bound[0]:bound[1]]:
        movie_data = provide_data(movie)
        movie_data[0] = cnt
        movie_data.append("test")
        sampled_data.append(movie_data)
        avg += 1
    cnt+=1



print("Average number of ratings for the users chosen:", avg/cnt)

print("Minutes taken:", (time.time()-start_time)/60)


# Results, tested on personal machine:

# With train users in range 50 <= gaps[user_index] and 70 >= gaps[user_index]
# With test users in range 5 <= gaps[user_index] and 10 >= gaps[user_index]
# Complete number of users: 260788
# Number of train users before random selection: 14314
# Number of test users before random selection: 68048
# Average number of ratings for the users chosen: 50.236
# Minutes taken: 4.779916667938233

# With train users in range 30 <= gaps[user_index] and 50 >= gaps[user_index]
# With test users in range 5 <= gaps[user_index] and 10 >= gaps[user_index]
# Complete number of users: 260788
# Number of train users before random selection: 27256
# Number of test users before random selection: 68048
# Average number of ratings for the users chosen: 33.36266666666667
# Minutes taken: 3.207816179593404

# With train users in range 11 <= gaps[user_index] and 31 >= gaps[user_index]
# With test users in range 5 <= gaps[user_index] and 10 >= gaps[user_index]
# Complete number of users: 260788
# Number of train users before random selection: 70880
# Number of test users before random selection: 68048
# Average number of ratings for the users chosen: 16.7075
# Minutes taken: 1.6583004673322042


Complete number of users: 260788
Number of highbound users before random selection: 27256
Number of lowbound users before random selection: 68048
Average number of ratings for the users chosen: 22.88625
Minutes taken: 4.955366675059


In [60]:
# Save in a constructed_data/constructed_data.csv file so that cells below can run without running this cell and above:
import csv
import os

current_directory = os.getcwd()
final_directory = os.path.join(current_directory, 'constructed_data')
if not os.path.exists(final_directory):
   os.makedirs(final_directory)

with open("constructed_data/constructed_data.csv", "w", encoding="utf-8", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['userId','id','rating',"title", "genres","production_companies","keywords", "cast", "tagline", "overview", "group"])
    writer.writerows(sampled_data)

In [61]:
# This is the starting cell to run if the data is already saved to the constructed_data/constructed_data.csv. 
import csv

data_list =[]

with open("constructed_data/constructed_data.csv", 'r', encoding="utf-8") as f:
    csv_reader = csv.reader(f)
    data_list = list(csv_reader)

data_list = data_list[1:]


In [62]:
import random

# Seed for consistent results across runtimes:
seed_int = 2
random.seed(seed_int)

# These are lists of user data:
# Each user in each list contains movie data for all the movies they rated.
user_to_data_train = []
user_to_data_filler = []
user_to_data_test = []

#The group type of the last movie
last_type = "train"

# Distinquish the user the row belongs to:
user_id = data_list[0][0]

# Ratings of movies for a user:
ratings = []

# For each user, distinuish whether they are a train or test user by the number of their corresponding movie rows:
# Each movie entry for a user will be in consecutive rows in data_list becuase data_list is sorted by user.
# Note: Technically in the third cell there is clearly distinquished test and train users before combining them in sampled_data...
# so there is an arguably redundant processes here.
# However, this cell executes instantly so there is no need to persist any user type labels.
for row in data_list:
    if last_type =="train" and user_id != row[0]:
        user_to_data_train.append(ratings)
        ratings = [row]
    elif last_type =="filler" and user_id != row[0]:
        user_to_data_filler.append(ratings)
        ratings = [row]
    elif last_type =="test" and user_id != row[0]:
        user_to_data_test.append(ratings)
        ratings = [row]
    else:
        ratings.append(row)
    user_id = row[0]
    last_type = row[10]


# Distinguish whether the last user is a test or train user:
if(last_type == "train"):
    user_to_data_train.append(ratings)
elif(last_type == "filler"):
    user_to_data_filler.append(ratings)
else:
    user_to_data_test.append(ratings)



# This is where smaller or equal samples of train and test users are selcted from constructed_data.csv.
# Note: Suppose you execute (cell 3) for a excessive amount of train and test users and they are saved in constructed_data.csv.
# Then the cell above (cell 5) and this cell(cell 6) can be executed to select a smaller subset of those to use for the model...
# without having to run the relatively slow (cell 3) over again.
user_to_data_train = random.sample(user_to_data_train, 3000)
user_to_data_filler = random.sample(user_to_data_filler, 6000)
user_to_data_test = random.sample(user_to_data_test, 3000)


In [67]:
from gensim.parsing.preprocessing import remove_stopwords
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
import random
from ordered_set import OrderedSet
from collections import Counter
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import linear_kernel
import time

start_time = time.time()

# Seed for consistent results across runtimes:
seed_int = 2
random.seed(seed_int)


class user_type_vars():
    """Each of the variables in this class represent the data structures for a user type."""
    def __init__(self):
        # For each user, a dictionary of movie_id to the movies rating for each movie the user watched:
        self.user_to_movie_id_to_rating = [] 

        # For every movie watched by the user_type, a list of ratings:
        self.movie_id_to_ratings = dict()

        # All the movie ids in the order of where they appear first in the list of user ratings: 
        self.movies_in_order = OrderedSet()

        # Model input features x for each user:
        # train and test users only
        self.feature_1 = []
        self.feature_2 = []
        self.feature_3 = []

        # movie id of the users target movie
        # train and test users only
        self.user_to_target_movie_id = []

        # For each user, this is the index of the users target movie in the order of movies_in_order.
        # train and test users only
        self.user_to_target_index_full = [] 

        # rating of the users traget movie
        # train and test users only
        self.user_to_target_rating  = [] 


# "user_type_vars" can represent a group of train users, a group of filler users, and a group of test users.
train_users = user_type_vars()
filler_users = user_type_vars()
test_users = user_type_vars()

wnl = WordNetLemmatizer()


def load_feature_1_and_2(overall_rating_sum, overall_rating_count, movies_in_order, user_to_data, movie_id_to_ratings,  user_to_movie_id_to_rating, user_to_target_movie_id, user_to_target_rating, feature_1, feature_2):
    """
    This is ran once to be used to populate features 1 and 2 for the train_users...
    and ran again to be used to populate features 1 and 2 for the test_users.
    It also populates the train and test version of these variables: target_movies, movies_in_order, movie_id_to_ratings,...
    user_to_movie_id_to_rating, user_to_target_rating.
    These variables are used in the load_feature_3 function.
    """ 
    for i in range(len(user_to_data)):
        movie_id_to_words = dict()
        movie_id_to_rating = dict()
        index = 0
        rand_index = random.randint(0, len(user_to_data[i])-1)

        for movie_data in user_to_data[i]:
            if index == rand_index:    
                user_to_target_movie_id.append(movie_data[1])
            else:
                # The program should train and test while simulating the possibility of...
                # some new movies having no existing rating data in the database.
                # This is critical training and testing if the resulting model were to realistically be tried on completely new data...
                # where target ratings are unknown (they need to be predicted).
                
                # This is why target ratings are omitted from movie_id_to_ratings.
                # The same logic stands for overall_average_train which is formed by overall_rating_sum and overall_rating_count.
                overall_rating_sum += float(movie_data[2])
                overall_rating_count += 1

                if movie_data[1] in movie_id_to_ratings.keys():
                    movie_id_to_ratings[movie_data[1]].append(float(movie_data[2]))
                else:
                    movie_id_to_ratings[movie_data[1]] = [float(movie_data[2])]

            movie_string = ""

            # Use this code to apply all the text data and combine in to a single list of words (repeats allowed).
            # for j in range (3,len(movie_data)-1):
            #     if(j!= len(movie_data)-2):
            #         movie_string+= movie_data[j]+" "
            #     else:
            #         movie_string+= movie_data[j]


            # Use this truncated code to only include the genre column strings.
            movie_string = movie_data[4]

            # Lematization and conversion to lists of words:
            cleaned_string = remove_stopwords(movie_string)
            cleaned_list = [wnl.lemmatize(word) for word in cleaned_string.split(" ")]
            cleaned_list = [word[:-1] for word in cleaned_list if word.endswith(".")] + [word for word in cleaned_list if not word.endswith(".")]

            movie_id_to_words[movie_data[1]] = cleaned_list
            movie_id_to_rating[movie_data[1]] = float(movie_data[2])
            movies_in_order.add(movie_data[1])
            index+=1

        user_to_movie_id_to_rating.append(movie_id_to_rating)

        # Question: This function assumes that all movies have their corpus information somewhere in selected portion of the "the-movies-dataset",...
        # inorder to make a word count vector.
        # What happens if with the application of this program/model, a completely new movie to the data is found???
        # Answer: Then feature_2 would not function as a model parameter and feature 3 would give poor performance, but feature 1 should still work fine.
        # Note: For the runtime of this notebook, since the train and test data all come from "the-movies-dataset" there is no risk of this happening.

        # The current users set of words from all the movies they rated:
        users_words_in_order = OrderedSet()
        for movie_id in movie_id_to_words.keys():
            users_words_in_order.update(movie_id_to_words[movie_id])

        # List of word counts in the order of users_words_in_order for each movie (excluding target movie):
        word_counts = []

        # Word counts in the order of users_words_in_order for the target movie:
        target_word_counts = [] 


        # Populate words_counts and target_word_counts
        # for movie_id in movie_id_to_words.keys():
        #     if movie_id != target_movie_id:
        #         temp_dict = Counter(movie_id_to_words[movie_id])
        #         temp_list = []
        #         for word in users_words_in_order:
        #             if word in temp_dict.keys():
        #                 temp_list.append(temp_dict[word])
        #             else:
        #                 temp_list.append(0)  
        #         word_counts.append(temp_list)  
        #     else:
        #         temp_dict = Counter(movie_id_to_words[movie_id])
        #         temp_list = []
        #         for word in users_words_in_order:
        #             if word in temp_dict.keys():
        #                 temp_list.append(temp_dict[word])             
        #             else:
        #                 temp_list.append(0) 
        #         target_word_counts = temp_list


        # Question: What if a list comprehension method was used instead of the above method???
        # Answer: The performance benefits remain indifferent even when the corpus columns are maxed out.
        # This code portion is not the bottleneck.

        # Populate words_counts and target_word_counts
        for movie_id in movie_id_to_words.keys():
            temp_dict = Counter(movie_id_to_words[movie_id])
            if movie_id != user_to_target_movie_id[len(user_to_target_movie_id)-1]:
                word_counts.append([(lambda x : temp_dict[x] if x in temp_dict.keys() else 0)(word) for word in users_words_in_order])  
            else:
                target_word_counts = [(lambda x : temp_dict[x] if x in temp_dict.keys() else 0)(word) for word in users_words_in_order]


        # Construct the normalized tf-idf before applying cossine similairity/linear kernel: 
        # This places value on terms that are un-common in alot of documents,...
        # while still placing value on how common they are in the document at hand.
        # In this case documents are word counts for the corpuses of a single movie the user rated.
        # This should lead to a more powerful quantifier for cossine similairity/linear kernel between documents.

        complete_word_counts = word_counts.copy()
        complete_word_counts.append(target_word_counts)
        transformed_word_counts = TfidfTransformer().fit_transform(complete_word_counts).toarray()


        # Populate ratings without the target rating:
        ratings = []
        for movie_id in movie_id_to_rating.keys():
            if movie_id != user_to_target_movie_id[len(user_to_target_movie_id)-1]:
                ratings.append(movie_id_to_rating[movie_id])
            else:
                # Add the target movie rating for the current user:
                # Each user has only one target movie rating.
                user_to_target_rating.append(movie_id_to_rating[movie_id])
    

        def predict():
            "Use the word counts and ratings to add predictions to feature_1 list and feature_2 list:"
            # Pred_1 is unweighted average of all of the users movies.
            pred_1 = sum(ratings) / len(ratings)
            # Pred_2 is weighted average of all the users movies (weights are based on (cossine similarity/linear kernel)),...
            # unless denominator is zero (see below).
            pred_2 = 0

            cosine_sim = linear_kernel(X = transformed_word_counts[0:-1],Y = [transformed_word_counts[-1]])
            cosine_sim = np.reshape(cosine_sim,  (len(cosine_sim)))
            numerator = 0
            denominator = 0
            pred_2 = pred_1
            for i in range(len(ratings)):
                numerator += cosine_sim[i]*ratings[i]
                denominator += cosine_sim[i]
    
            # In case of potential division by zero:
            if denominator != 0:
                pred_2 = numerator/denominator
    
            return (pred_1, pred_2)
        
        predictions = predict()

        feature_1.append(predictions[0])
        feature_2.append(predictions[1])

    return(overall_rating_sum, overall_rating_count)
            
        

#overall_rating_sum, overall_rating_count, filler_users.movies_in_order, user_to_data_filler, filler_users.movie_id_to_ratings, filler_users.user_to_movie_id_to_rating

def load_filler(overall_rating_sum, overall_rating_count, movies_in_order, user_to_data, movie_id_to_ratings, user_to_movie_id_to_rating):
    for i in range(len(user_to_data)):
        movie_id_to_rating = dict()
        index = 0
        rand_index = random.randint(0, len(user_to_data[i])-1)
        for movie_data in user_to_data[i]:
            if index != rand_index:    
                overall_rating_sum += float(movie_data[2])
                overall_rating_count += 1

                if movie_data[1] in movie_id_to_ratings.keys():
                    movie_id_to_ratings[movie_data[1]].append(float(movie_data[2]))
                else:
                    movie_id_to_ratings[movie_data[1]] = [float(movie_data[2])]

           
            movie_id_to_rating[movie_data[1]] = float(movie_data[2])
            movies_in_order.add(movie_data[1])
            index+=1

        user_to_movie_id_to_rating.append(movie_id_to_rating)

    return(overall_rating_sum, overall_rating_count)



    
# "overall_rating_sum" and "overall_rating_count" are used to calculate the overall train rating.
# The overall_average_train (which is overall_rating_sum/overall_rating_count) is only set to the output of the "train" function call.
# This is used to fill in ratings for movies that are only target movies for a certain set of users.
# This is because in practice and full application of the program/model, the target movie ratings are unknown (they need to be predicted).

overall_rating_sum = 0.0
overall_rating_count = 0.0

# Populate train data (feature 1 and feature 2):
overall_rating_sum,overall_rating_count = load_feature_1_and_2(overall_rating_sum, overall_rating_count, train_users.movies_in_order, user_to_data_train, train_users.movie_id_to_ratings, train_users.user_to_movie_id_to_rating, train_users.user_to_target_movie_id, train_users.user_to_target_rating, train_users.feature_1, train_users.feature_2)


overall_rating_sum,overall_rating_count = load_filler(overall_rating_sum, overall_rating_count, filler_users.movies_in_order, user_to_data_filler, filler_users.movie_id_to_ratings, filler_users.user_to_movie_id_to_rating)

overall_average = overall_rating_sum/overall_rating_count

# Populate test data (feature 1 and feature 2):
load_feature_1_and_2(0, 0, test_users.movies_in_order, user_to_data_test, test_users.movie_id_to_ratings, test_users.user_to_movie_id_to_rating, 
                test_users.user_to_target_movie_id, test_users.user_to_target_rating, test_users.feature_1, test_users.feature_2)


def pre_svd(movie_id_to_average_rating, movies_in_order, user_to_ratings_full_transform, user_to_ratings_full, user_to_target_index_full, 
               user_to_movie_id_to_rating_filler,  user_to_movie_id_to_rating, user_to_target_movie_id):
    """
    Populate the lists user_to_ratings_full and user_to_ratings_full_transform:
    User_to_ratings_full_transform is used for svd because it includes entries from all movies in movies_in_order...
    and transforms the data in user_to_ratings_full by subtracting the movie rating mean.
    This means that the transformed value at the indices of unwatched movies and index coresponding to target movies are zero.
    """
    for i in range(len(user_to_movie_id_to_rating_filler)):
        ratings = []
        transformed_ratings = []

        for movie_id in movies_in_order:
            if movie_id in user_to_movie_id_to_rating_filler[i].keys():
                ratings.append(user_to_movie_id_to_rating_filler[i][movie_id])
                transformed_ratings.append(user_to_movie_id_to_rating_filler[i][movie_id] - movie_id_to_average_rating[movie_id])
            else:
                ratings.append(movie_id_to_average_rating[movie_id])
                transformed_ratings.append(movie_id_to_average_rating[movie_id] - movie_id_to_average_rating[movie_id])

        user_to_ratings_full.append(ratings)
        user_to_ratings_full_transform.append(transformed_ratings)

    for i in range(len(user_to_movie_id_to_rating)):
        ratings = []
        transformed_ratings = []
        index = 0
        for movie_id in movies_in_order:
            if movie_id == user_to_target_movie_id[i]:
                user_to_target_index_full.append(index)
                ratings.append(movie_id_to_average_rating[movie_id])
                transformed_ratings.append(movie_id_to_average_rating[movie_id] - movie_id_to_average_rating[movie_id])
            elif movie_id in user_to_movie_id_to_rating[i].keys():
                ratings.append(user_to_movie_id_to_rating[i][movie_id])
                transformed_ratings.append(user_to_movie_id_to_rating[i][movie_id] - movie_id_to_average_rating[movie_id])
            else:
                ratings.append(movie_id_to_average_rating[movie_id])
                transformed_ratings.append(movie_id_to_average_rating[movie_id] - movie_id_to_average_rating[movie_id])
            index+=1
            
        user_to_ratings_full.append(ratings)
        user_to_ratings_full_transform.append(transformed_ratings)



def svd_full(user_to_ratings_full_transform, n, movie_id_to_average_rating):
    """
    1. Get the svd of the user_to_ratings_full_transform 
    2. Truncate each factor to n components
    3. Multiply the truncated components together (U X s) X V 
    4. Scale back the values to the orginal rating scale (1-5) and return result
    """
    U, S, V = np.linalg.svd(user_to_ratings_full_transform, full_matrices=False)
    
    # Simplify factors to n components:
    U=U[:,0:n]
    S=np.diag(S)
    S=S[0:n,0:n]
    V=V[0:n,:]

    # Reconstruct to a new array:
    US = np.dot(U,S)
    USV = np.dot(US,V)

    # This tranforms the UsV row by row into the original rating scale (1-5).
    USV = USV + np.tile(list(movie_id_to_average_rating.values()), (USV.shape[0],1))

    # Be consistent with data structures:
    return list(USV)



def load_feature_3():
    """
    Populate feature_3 with a method loosely outlined here:
    1. Find the average ratings for movies 
    2. Pre_svd writes a rating for every movie for every user as well as a transformed version of those rating using the averages found above
    3. Then use the output of the svd_full function by row for user and by column for the target movie rating prediction
    """

    # Every movie ever seen by any user in either the test and train sets:
    all_movies_in_order_filler_and_train = filler_users.movies_in_order | train_users.movies_in_order
    all_movies_in_order_filler_and_test = filler_users.movies_in_order | test_users.movies_in_order

    # When a movie has a number of target ratings and non-target ratings, then only the non-target ratings are used...
    # to form the movies average rating.

    # There is a difference between non-target ratings between "movie_id_to_average_rating_train" and "movie_id_to_average_rating_full".
    # "movie_id_to_average_rating_train" considers the train set and "movie_id_to_average_rating_full" considers the train and test set.

    # When a movie has only target ratings in either the train of full dataset,...
    # instead of using the mean of the actual target ratings for "movie_id_to_average_rating_train" or "movie_id_to_average_rating_full",...
    # the movies average rating takes on the value of overall_average_train.
    # This is used to simlulate the potential application of this model when there are movies to be rated for a new user that have no ratings in the existing data.

    # The code below deliniates two different averages for valid movies, a train average and a train+test or full average.
    # The train average is used to normalize the ratings of the movies for train users in the first pre_svd call.
    # The train+test averages are used to normalize the ratings of the movies for train+test users in the second pre_svd call.

    movie_id_to_average_rating_filler_and_train = dict()
    for movie in all_movies_in_order_filler_and_train:
        temp = 0
        if(movie in filler_users.movie_id_to_ratings and movie in train_users.movie_id_to_ratings):
            for rating in filler_users.movie_id_to_ratings[movie]:
                temp+=rating
            for rating in train_users.movie_id_to_ratings[movie]:
                temp+=rating
            movie_id_to_average_rating_filler_and_train[movie] = temp/(len(filler_users.movie_id_to_ratings[movie])+len(train_users.movie_id_to_ratings[movie]))

        elif(movie in filler_users.movie_id_to_ratings):
            for rating in filler_users.movie_id_to_ratings[movie]:
                temp+=rating
            movie_id_to_average_rating_filler_and_train[movie] = temp/(len(filler_users.movie_id_to_ratings[movie]))

        elif(movie in train_users.movie_id_to_ratings):        
            for rating in train_users.movie_id_to_ratings[movie]:
                temp+=rating
            movie_id_to_average_rating_filler_and_train[movie] = temp/(len(train_users.movie_id_to_ratings[movie]))
        else:
            movie_id_to_average_rating_filler_and_train[movie] = overall_average


    movie_id_to_average_rating_filler_and_test = dict()
    for movie in all_movies_in_order_filler_and_test:
        temp = 0
        if(movie in filler_users.movie_id_to_ratings and movie in test_users.movie_id_to_ratings):
            for rating in filler_users.movie_id_to_ratings[movie]:
                temp+=rating
            for rating in test_users.movie_id_to_ratings[movie]:
                temp+=rating
            movie_id_to_average_rating_filler_and_test[movie] = temp/(len(filler_users.movie_id_to_ratings[movie])+len(test_users.movie_id_to_ratings[movie]))

        elif(movie in filler_users.movie_id_to_ratings):
            for rating in filler_users.movie_id_to_ratings[movie]:
                temp+=rating
            movie_id_to_average_rating_filler_and_test[movie] = temp/(len(filler_users.movie_id_to_ratings[movie]))

        elif(movie in test_users.movie_id_to_ratings):        
            for rating in test_users.movie_id_to_ratings[movie]:
                temp+=rating
            movie_id_to_average_rating_filler_and_test[movie] = temp/(len(test_users.movie_id_to_ratings[movie]))
        else:
            movie_id_to_average_rating_filler_and_test[movie] = overall_average



    filler_train_user_to_ratings_full = []
    filler_test_user_to_ratings_full = []  

    filler_train_user_to_ratings_full_transform = []
    filler_test_user_to_ratings_full_transform = []  


    #LOOK: In svd need a way to determine when the users stop being filler users and start being train and test users

    #movie_id_to_average_rating, movies_in_order, user_to_ratings_full_transform, user_to_ratings_full, user_to_target_index_full, user_to_movie_id_to_rating_filler,  user_to_movie_id_to_rating, user_to_target_movie_id
    pre_svd(movie_id_to_average_rating_filler_and_train, all_movies_in_order_filler_and_train, filler_train_user_to_ratings_full_transform, filler_train_user_to_ratings_full, 
            train_users.user_to_target_index_full, filler_users.user_to_movie_id_to_rating,  train_users.user_to_movie_id_to_rating, train_users.user_to_target_movie_id)

    pre_svd(movie_id_to_average_rating_filler_and_test, all_movies_in_order_filler_and_test, filler_test_user_to_ratings_full_transform, filler_test_user_to_ratings_full, 
            test_users.user_to_target_index_full, filler_users.user_to_movie_id_to_rating,  test_users.user_to_movie_id_to_rating, test_users.user_to_target_movie_id)


    # In practice, there is a train and a test set, the train set is a selection of what the database has on record.
    # The test data will usually be data that hasn't been seen before that can include any number of test users.
    # When "train_users.user_to_ratings_full_transform" is used as the input of the svd function below,...
    # "svd_out_train" is used to produce predictions used to train the model.
    # When "full_user_to_ratings_full_transform" is used as the input of the svd function below,...
    # "svd_out_full" is used to produce predictions used to test the models

    # Note: The second parameter to this function is refered to as n and depending on other parameters to the model, like the rating bounds of train users,...
    # the highest perfroming values of n can vary.

    svd_out_train = svd_full(filler_train_user_to_ratings_full_transform, 100, movie_id_to_average_rating_filler_and_train)
    svd_out_test = svd_full(filler_test_user_to_ratings_full_transform, 100, movie_id_to_average_rating_filler_and_test)


    # The smaller svd provides predictions used to train the model.
    train_users.feature_3 = [svd_out_train[i+len(filler_users.user_to_movie_id_to_rating)]
                             [train_users.user_to_target_index_full[i]] 
                            for i in range(len(train_users.user_to_target_index_full))]
    
    # The larger svd provides predictions used to test the model.
    test_users.feature_3 = [svd_out_test[i+len(filler_users.user_to_movie_id_to_rating)]
                            [test_users.user_to_target_index_full[i]] 
                            for i in range(len(test_users.user_to_target_index_full))]


# populate train and test data (feature 3):
load_feature_3()


print("Minutes taken:", (time.time()-start_time)/60)

# This is used to show how the features approximate the target rating.
print("Feature_1 to target comparison (train):")
print(train_users.feature_1[0:5])
print(train_users.user_to_target_rating[0:5])

print("Feature_1 to target comparison (test):")
print(test_users.feature_1[0:5])
print(test_users.user_to_target_rating[0:5])

print("Feature_2 to target comparison (train):")
print(train_users.feature_2[0:5])
print(train_users.user_to_target_rating[0:5])

print("Feature_2 to target comparison (test):")
print(test_users.feature_2[0:5])
print(test_users.user_to_target_rating[0:5])

print("Feature_3 to target comparison (train):")
print(train_users.feature_3[0:5])
print(train_users.user_to_target_rating[0:5])

print("Feature_3 to target comparison (test):")
print(test_users.feature_3[0:5])
print(test_users.user_to_target_rating[0:5])



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jackson\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Minutes taken: 0.6868159969647726
Feature_1 to target comparison (train):
[3.857142857142857, 3.375, 3.5, 3.9, 4.5625]
[4.0, 1.0, 4.0, 4.0, 4.0]
Feature_1 to target comparison (test):
[3.142857142857143, 4.444444444444445, 2.4285714285714284, 4.666666666666667, 3.75]
[3.0, 3.0, 3.0, 4.0, 5.0]
Feature_2 to target comparison (train):
[3.6572754869592856, 3.423819225214699, 3.0, 5.0, 4.801648271913668]
[4.0, 1.0, 4.0, 4.0, 4.0]
Feature_2 to target comparison (test):
[3.0017647420497133, 5.000000000000001, 3.0, 4.373114811571005, 3.983298161747997]
[3.0, 3.0, 3.0, 4.0, 5.0]
Feature_3 to target comparison (train):
[4.146995238866574, 3.486141970036194, 3.7390765217618602, 3.776869517518682, 4.10754810238034]
[4.0, 1.0, 4.0, 4.0, 4.0]
Feature_3 to target comparison (test):
[4.028600531576326, 4.2982594791375925, 3.7476635063382386, 4.129001311105638, 3.9932482980535267]
[3.0, 3.0, 3.0, 4.0, 5.0]


In [68]:
# Build models based off multiple features: 
# The features themselves are reasonably accuracte predictors of the traget rating for a (movie, user) combination.
import random
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
# From testing, feature scaling was found not to improve performance in model accuracy or runtime
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance
from sklearn.metrics import r2_score
# The alternative evaluation metric:
from sklearn.metrics import mean_squared_error

start_time  = time.time()

seed_int = 2
random.seed(seed_int)

def test_parameters(nof_runs, layers, train_input_features, test_input_features):
    """Test_parameters for a number of runs and return performance results:"""
    train_inputs = [list(pair) for pair in train_input_features]
    test_inputs = [list(pair) for pair in test_input_features]
    return average_results(nof_runs, layers, train_inputs, test_inputs)
    

def average_results(nof_runs, layers, train_inputs, test_inputs):
    """Average the performance results for a number of models with identical inputs:"""
    no_rounding = 0
    rounding = 0
    for _ in range(nof_runs):
        pair = train_and_test(layers, train_inputs, test_inputs)
        no_rounding+=pair[0]
        rounding+=pair[1]
    return no_rounding/nof_runs, rounding/nof_runs


def train_and_test(layers, train_inputs, test_inputs):
    """Build, train, and test a model, then return accuracy scores:"""

    # MLP model:
    # reg = MLPRegressor(hidden_layer_sizes = layers, solver = "adam", max_iter = 1000, random_state = seed_int)

    # Linear regression model:
    reg = LinearRegression()

    # Train model:
    reg.fit(train_inputs, train_users.user_to_target_rating)

    # Print importance of the different input features to the model:
    results = permutation_importance(reg, train_inputs, train_users.user_to_target_rating)
    importances = results["importances_mean"]
    print("Feature Importance scores:", "First feature:", importances[0],"Second feature:", importances[1])


    # Make predictions for test inputs:
    predictions = reg.predict(test_inputs)

    # Test with and without roundings:
    # Note: The actual ratings a user makes must be divisable by .5: 
    rounded_predictions = [round(item*2)/2 for item in predictions]

    # Evaluation metric 1:
    return(r2_score(test_users.user_to_target_rating, predictions), 
        r2_score(test_users.user_to_target_rating, rounded_predictions))

    # Evaluation metric 2:
    # return(mean_squared_error(test_users.user_to_target_rating, predictions), 
    #    mean_squared_error(test_users.user_to_target_rating, rounded_predictions))



# The current test is to average accuracy scores (currently r2_score) from 100 models trained on the same inputs.
# The hidden layers are (10,10,5).
# Note: layers only work when the mlp model is used.

avg_scores = test_parameters(100, (10,10,5), 
    zip(train_users.feature_1, train_users.feature_3),
      zip(test_users.feature_1, test_users.feature_3))


print("Average r2_score without rounding:",avg_scores[0])
print("Average r2_score with rounded prediction to nearest .5 (note: the actual ratings a user makes must be divisable by .5):",avg_scores[1])

print("Minutes taken:", (time.time()-start_time)/60)

#LOOK: there is slight variation in results between running on ubuntu and windows but it is very accurat to a certaun decimal



#All 50 70, lineare regression, feature 1 and feature 3
#Note: may need to try re-test since varaibles were changed during runtime

# (2000, 7000, 1000) n = 10, 10
# Average r2_score without rounding: 0.23844596811914204
# Average r2_score with rounded prediction to nearest .5 (note: the actual ratings a user makes must be divisable by .5): 0.22079020222171086

# (2000, 7000, 1000) n = 5, 5
# Average r2_score without rounding: 0.23385044162563054
# Average r2_score with rounded prediction to nearest .5 (note: the actual ratings a user makes must be divisable by .5): 0.21526009605616025

# (2000, 7000, 1000) n = 20,20
# Average r2_score without rounding: 0.23900341093359118
# Average r2_score with rounded prediction to nearest .5 (note: the actual ratings a user makes must be divisable by .5): 0.22346606004375172

# (2000, 7000, 1000) n = 50,50
# Average r2_score without rounding: 0.24181639491410348
# Average r2_score with rounded prediction to nearest .5 (note: the actual ratings a user makes must be divisable by .5): 0.23113685246693558

# (2000, 7000, 1000) n = 100,100
# Average r2_score without rounding: 0.2406656837381301
# Average r2_score with rounded prediction to nearest .5 (note: the actual ratings a user makes must be divisable by .5): 0.23131524298840492

# (2000, 7000, 1000) n = 50, 75
# Average r2_score without rounding: 0.23998759527682625
# Average r2_score with rounded prediction to nearest .5 (note: the actual ratings a user makes must be divisable by .5): 0.2245364031725681

# (2000, 7000, 1000) n = 75, 50
# Average r2_score without rounding: 0.24206648265211061
# Average r2_score with rounded prediction to nearest .5 (note: the actual ratings a user makes must be divisable by .5): 0.2330991482030988

# (2000, 7000, 1000) n = 75, 75
# Average r2_score without rounding: 0.24024903320249208
# Average r2_score with rounded prediction to nearest .5 (note: the actual ratings a user makes must be divisable by .5): 0.22578513682285387

# (200, 7000, 100) n = 75, 75:
# Average r2_score without rounding: 0.2282563762053039
# Average r2_score with rounded prediction to nearest .5 (note: the actual ratings a user makes must be divisable by .5): 0.2325581395348841

# (200, 7000, 100) n = 100, 100:
# Average r2_score without rounding: 0.22792673429229293
# Average r2_score with rounded prediction to nearest .5 (note: the actual ratings a user makes must be divisable by .5): 0.2271914132379251

# (200, 7000, 100) n = 50,50:
# Average r2_score without rounding: 0.23090533781837752
# Average r2_score with rounded prediction to nearest .5 (note: the actual ratings a user makes must be divisable by .5): 0.22182468694096613

# (5000, 5000, 1000) n = 50,50:
# Average r2_score without rounding: 0.25437009692571544
# Average r2_score with rounded prediction to nearest .5 (note: the actual ratings a user makes must be divisable by .5): 0.24149609887569934

# (5000, 5000, 1000) n = 100, 100:
# Average r2_score without rounding: 0.2582537822276442
# Average r2_score with rounded prediction to nearest .5 (note: the actual ratings a user makes must be divisable by .5): 0.24368093968546722

# (5000, 5000, 1000) n = 150, 150:
# Average r2_score without rounding: 0.2591925290672041
# Average r2_score with rounded prediction to nearest .5 (note: the actual ratings a user makes must be divisable by .5): 0.24058574853829628


# filer users 100+, linear regression, feature 1 and feature 3:

# (5000, 5000, 1000) n = 100, 100:
# Average r2_score without rounding: 0.2565874236965786
# Average r2_score with rounded prediction to nearest .5 (note: the actual ratings a user makes must be divisable by .5): 0.24465759221383723


#LOOK: This doesn't seem to be an improvement...


# filter users 30-50, linear regression, feature 1 and feature 3:

# (5000, 5000, 1000) n = 150, 150:

# Average r2_score without rounding: 0.23203987404576865
# Average r2_score with rounded prediction to nearest .5 (note: the actual ratings a user makes must be divisable by .5): 0.2137118576030375

# (5000, 5000, 1000) n = 20, 20:
# Average r2_score without rounding: 0.2277894327499794
# Average r2_score with rounded prediction to nearest .5 (note: the actual ratings a user makes must be divisable by .5): 0.21227242850940453

# 3000, 6000, 3000 n = 50,50:
# Average r2_score without rounding: 0.2288480459483372
# Average r2_score with rounded prediction to nearest .5 (note: the actual ratings a user makes must be divisable by .5): 0.21135503347035947

# 3000, 6000, 3000 n = 100,100:
# Average r2_score without rounding: 0.22824082576769877
# Average r2_score with rounded prediction to nearest .5 (note: the actual ratings a user makes must be divisable by .5): 0.21123777156172363


Feature Importance scores: First feature: 0.19613035777976978 Second feature: 0.17941474161489337
Feature Importance scores: First feature: 0.19623517913930844 Second feature: 0.17439255734489775
Feature Importance scores: First feature: 0.1990057983840691 Second feature: 0.1710636843170918
Feature Importance scores: First feature: 0.18745787426199906 Second feature: 0.1762687711055277
Feature Importance scores: First feature: 0.19558314666489565 Second feature: 0.17636757265018896
Feature Importance scores: First feature: 0.19359219635984665 Second feature: 0.18463137623011366
Feature Importance scores: First feature: 0.18560046986699064 Second feature: 0.18007598656498425
Feature Importance scores: First feature: 0.20017861945794174 Second feature: 0.18011376912135293
Feature Importance scores: First feature: 0.1936504129661147 Second feature: 0.18058825251887245
Feature Importance scores: First feature: 0.1959772082739676 Second feature: 0.17205537570159773
Feature Importance scores