In [2]:
import opendatasets as od
#This cell downloads the data needed for this jupyter notebook from kaggle and stores it in the the-movies-dataset folder
#if the files are already in that folder then this cell does nothing and requires no credentials

#Data Source Information:
#https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset?select=movies_metadata.csv

od.download("https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset")

Skipping, found downloaded files in ".\the-movies-dataset" (use force=True to force download)


In [1]:
import pandas as pd
import random



#seed for consistent results across runtimes
#NOTE(review): nothing visible in this cell draws from `random`; the seed is set again in later cells
seed_int = 2
random.seed(seed_int)


#This cell is for combining certain data from the necessary csv files into a single dataframe (complete)

#show full cell text (no truncation) when dataframes are printed
pd.set_option('display.max_colwidth', None)


#Load only the metadata columns used by this notebook; every column is read as
#pandas "string" dtype so that ids are compared as text when merging.
movies_full = pd.read_csv(
    './the-movies-dataset/movies_metadata.csv',
    usecols=["genres", "id", "title", "tagline", "overview", "production_companies"],
    dtype={"genres": "string", "id": "string", "title": "string", "tagline": "string",
           "overview": "string", "production_companies": "string"},
)
#drop rows with a missing value in any of the loaded columns
movies_full = movies_full.dropna()
#keeps the original row labels in an "index" column (same schema as before)
movies_full = movies_full.reset_index()


#filter rows of empty data from movies_full on the columns: "genres", "production_companies"
#a serialized empty list ends with "[]"; one vectorized mask replaces the per-row iloc loop
has_empty_list = (
    movies_full["genres"].str.endswith("[]")
    | movies_full["production_companies"].str.endswith("[]")
)
movies_full = movies_full.loc[~has_empty_list]
movies_full = movies_full.reset_index(names = "index_1")


#Load the user ratings; all columns are read as text and "movieId" is renamed to "id"
#so that it can be used as the merge key below.
ratings = pd.read_csv(
    './the-movies-dataset/ratings.csv',
    usecols=["userId", "movieId", "rating"],
    dtype={"userId": "string", "movieId": "string", "rating": "string"},
)
ratings = ratings.rename(columns={"movieId": "id"})
#dropna returns a new frame: the result has to be assigned or nothing is dropped
ratings = ratings.dropna()
ratings = ratings.reset_index(names = "index_2")


keywords = pd.read_csv(
    './the-movies-dataset/keywords.csv',
    usecols=["id", "keywords"],
    dtype={"id": "string", "keywords": "string"},
)
#dropna returns a new frame: the result has to be assigned or nothing is dropped
keywords = keywords.dropna()
#keeps the original row labels in an "index" column (same schema as before)
keywords = keywords.reset_index()

#filter rows of empty data from keywords on the keywords column
#a serialized empty list ends with "[]"; the vectorized check replaces the per-row iloc loop
keywords = keywords.loc[~keywords["keywords"].str.endswith("[]")]
keywords = keywords.reset_index(names = "index_3")


credits = pd.read_csv(
    "./the-movies-dataset/credits.csv",
    usecols=["cast", "id"],
    dtype={"cast": "string", "id": "string"},
)
#dropna returns a new frame: the result has to be assigned or nothing is dropped
credits = credits.dropna()
#keeps the original row labels in an "index" column (same schema as before)
credits = credits.reset_index()

#filter rows of empty data from credits on the cast column
#a serialized empty list ends with "[]"; the vectorized check replaces the per-row iloc loop
credits = credits.loc[~credits["cast"].str.endswith("[]")]
credits = credits.reset_index(names = "index_4")


#default merge is inner: only ids that exist in both dataframes survive each merge
#NOTE(review): the "id" of ratings comes from ratings.csv "movieId" while the other frames use the
#metadata "id"; presumably these are different id systems (the dataset ships a links.csv) - confirm
#that joining them directly pairs each rating with the intended movie
complete =  pd.merge(movies_full, ratings, on ="id")
complete =  pd.merge(complete,keywords, on ="id")
complete  = pd.merge(complete,credits, on ="id")


#groups each user's rows together; userId was read with dtype "string", so the ordering is
#on text values rather than numeric ones - later cells only rely on the rows being grouped
complete = complete.sort_values(by = 'userId')


#keep only the columns used by the later cells, in the positional order they index into
complete  = complete.loc[:,['userId','id','rating',"title", "genres","production_companies","keywords", "cast", "tagline", "overview" ]]

#for testing
print(complete.head())


        userId    id rating               title  \
6566765      1  1246    5.0        Rocky Balboa   
6880303      1  2959    4.0      License to Wed   
2083077      1  2762    4.5  Young and Innocent   
1492304      1  1968    4.0       Fools Rush In   
2638962      1   147    4.5       The 400 Blows   

                                                                                                genres  \
6566765                                                                  [{'id': 18, 'name': 'Drama'}]   
6880303                                                                 [{'id': 35, 'name': 'Comedy'}]   
2083077                                     [{'id': 18, 'name': 'Drama'}, {'id': 80, 'name': 'Crime'}]   
1492304  [{'id': 18, 'name': 'Drama'}, {'id': 35, 'name': 'Comedy'}, {'id': 10749, 'name': 'Romance'}]   
2638962                                                                  [{'id': 18, 'name': 'Drama'}]   

                                                        

In [2]:
import ast
import random
import time

# wall-clock start; used at the end of this cell to report the minutes taken
start_time = time.time()

# seed for consistent results across runtime
# (the user sampling loop further down draws one random.random() value per user)
seed_int = 2
random.seed(seed_int)


def condition(array):
    """
    Tell whether a row carries data in each of the columns at positions 4 to 7.

    A column counts as empty when its text ends with "[]". The function returns
    False as soon as one of the four columns is empty and True otherwise.

    It was originally used to filter out rows with empty entries; the data loading
    cell now applies a similar check up front, which reduces the number of checks.
    The columns at positions 8 and 9 are not inspected here because rows with
    missing values are already removed with dropna in the data loading cell.
    """
    for position in (4, 5, 6, 7):
        text = array[position]
        if text[len(text) - 2:] == "[]":
            return False
    return True


def populate_names(item):
    """
    Extract the "name" values from a serialized list of dictionaries.

    item: text of a Python list literal whose elements are dictionaries that each
          have a "name" key, e.g. "[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name': 'Crime'}]"

    Returns the names converted with str() and joined by single spaces, in list order.
    An empty list ("[]") gives an empty string.

    The whole literal is parsed in one step instead of being split on "}, ", so a
    value that itself contains "}, " no longer breaks the parsing.

    Raises whatever ast.literal_eval raises for malformed text, and KeyError when an
    element has no "name" key.
    """
    entries = ast.literal_eval(item)
    return " ".join(str(entry["name"]) for entry in entries)


def provide_data(array):
    """
    Build the transformed record for one row of the complete list.

    Positions 0 and 1 are converted with int(), position 2 with float(), position 3 is
    passed through unchanged, positions 4 to 7 are reduced to their names with
    populate_names, and positions 8 and 9 are converted with str().

    Returns a new list with the ten values in that order.
    """
    return [
        int(array[0]),
        int(array[1]),
        float(array[2]),
        array[3],
        *(populate_names(array[position]) for position in (4, 5, 6, 7)),
        str(array[8]),
        str(array[9]),
    ]
    

#LOOK: 
#https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas
#iterating over pandas objects is slow!!!
#should check for other datastructures below


#LOOK:
#there is an issue with this!!!
#complete.loc[complete['userId'] == user_id] is too slow since it observes the entire dataframe
#should try to make use of the fact that rows are in user order
#there can be an initial list that is the copy of the dataframe...
#another idea...
#need to populate gaps and list_of_user_ids...



#user ids in order of first appearance; `complete` was sorted by userId, so each user's rows are contiguous
list_of_user_ids = list(complete["userId"].unique())
#plain python rows; column positions follow the column order selected for `complete`
complete_list = complete.values.tolist()

print("number of users:", len(list_of_user_ids))

#the complete list of user rows without ratings of the same movie more than once
complete_list_no_dups = []
#user id of the previous row; used to detect where a new user's rows begin
last_id = -1
#the set of movies that a user has rated
#used to prevent later ratings of a movie that the user has already rated
movie_set = set()
#how many rows a single user takes up, for each user in the order of their occurrence
gaps = []
#added to gaps when all of a user's rows have been counted
gap_len = 0
#prevent adding gap_len to gaps until a real user has been iterated over
first_it = True

#populate the variables above while omitting movies that already have a rating...
#with respect to each user (only the first row of a user/movie pair is kept)
for row in complete_list:
    if last_id != row[0]:
        #first row of a new user: start a fresh movie set and close the previous user's count
        movie_set= set()
        complete_list_no_dups.append(row)
        movie_set.add(row[1])
        if not first_it:    
            gaps.append(gap_len)
        gap_len = 1

        first_it = False
    else:
        if row[1] not in movie_set:
            complete_list_no_dups.append(row)
            gap_len+=1
            movie_set.add(row[1])
    last_id = row[0]


#add the last gap_len (the loop only appends a count when the next user starts)
gaps.append(gap_len)

#LOOK: should there be trials with train users with a high number of ratings
#and test users with a low number of ratings???


#min_number_of_users = 100 is used for the extended training and testing
#(despite its name this is the minimum number of ratings a user must have)
#but users that have a minimum number of ratings of 100 are rare...
#Another scenario is that any number of ratings can be used for test and train users
#this was tested and the average number of ratings for a given user was around 30
min_number_of_users = 100

#Keep only the users with enough ratings.
#The kept entries are collected in a single pass and written back afterwards; deleting
#from the middle of the row list once per removed user shifts the whole list every time.
kept_gaps = []
kept_user_ids = []
kept_rows = []
row_start = 0
for kept_user_id, gap in zip(list_of_user_ids, gaps):
    row_end = row_start + gap
    if gap >= min_number_of_users:
        kept_gaps.append(gap)
        kept_user_ids.append(kept_user_id)
        kept_rows.extend(complete_list_no_dups[row_start:row_end])
    row_start = row_end

#slice assignment updates the existing list objects in place, as the deletions did
gaps[:] = kept_gaps
list_of_user_ids[:] = kept_user_ids
complete_list_no_dups[:] = kept_rows

#same final values the deletion loop left behind: number of kept users / kept rows
index = len(gaps)
complete_list_index = len(complete_list_no_dups)




#this is a list for all users to rows of transformed data for each movie they rated
user_to_data = []

#this is the total number of users that remain after the minimum number of ratings filter
total_nof_users = len(list_of_user_ids)
print("Total number of users:", total_nof_users)


#this is the number of desired users out of the total_nof_users:
#note: this number is not exact to the number of users because it is applied in a random operation...
#note: extra 250 is used to account for error with a target of 5000 users: 5250
nof_users_aproximation =  5250

#index of the first row of the current user in complete_list_no_dups
index  = 0

#these are collected for insight: total ratings and number of the selected users
avg = 0.0
cnt = 0.0

#chance for each user to be selected; constant, so it is computed once
#(guarded so that an empty user list does not divide by zero)
selection_probability = float(nof_users_aproximation/total_nof_users) if total_nof_users else 0.0

#populate user_to_data from complete_list_no_dups
#exactly one random float is drawn per user, in user order, so results stay tied to the seed
for i in range(0, total_nof_users):
    #the rows of user i are the next gaps[i] rows; slicing replaces the row-by-row id comparison
    next_index = index + gaps[i]
    if (random.random() < selection_probability):
        #the empty-column check is not needed here: the data loading cell already filters those rows
        user_rows = [provide_data(row) for row in complete_list_no_dups[index:next_index]]
        user_to_data.append(user_rows)
        #every selected user is counted, including the very last one
        avg += len(user_rows)
        cnt += 1
    index = next_index



#Go through user_to_data and re-index the users in list order since certain users were omitted
#this is for simplicity and readability
for new_user_id, user_rows in enumerate(user_to_data):
    for movie_row in user_rows:
        movie_row[0] = new_user_id


#How many users are in the final user_to_data
print("Number of users chosen:", len(user_to_data))

#average number of ratings per user
#nan is reported instead of failing with a division by zero when no user was counted
average_nof_ratings = float(avg/cnt) if cnt else float("nan")
print("Average number of ratings for the number of users chosen:", average_nof_ratings)


print("Minutes taken: ", float((time.time()-start_time)/60))




#MISC below: 

#time to complete Total number of users with 100 min user rating limit: ~50 minutes
#number of users with more than 100 ratings: 17378
#average number of ratings: 181.84876560971398
#is this higher than the other method

#average number of ratings:
#with 1000 users: 185.972
#with 5250 users: 184.63586233124167

#the average number of ratings of the complete list of users shuod be around 30: ~6 minutes
#30.25708986940731 average rating with (12941 out of 260788) users chosen

#the next block can be removing all movies that have already been previously rated
#problem: if the number of ratings dips below 2 then there are issues later
#solution: need to do this process before filtering min number of users

# Number of users chosen: 5260
# Average number of ratings for the number of users chosen: 181.55029473283895
# seconds taken:  1364.9479970932007
# minutes taken:  22.74913328488668


number of users: 260788
gaps [6, 8, 1, 4, 20]
Total number of users: 17360
Number of users chosen: 5260
Average number of ratings for the number of users chosen: 181.55029473283895
seconds taken:  1364.9479970932007
minutes taken:  22.74913328488668


In [3]:
#save in a constructed_data/constructed_data.csv file so that cells below can run without running this cell and above

import csv
import os

current_directory = os.getcwd()
final_directory = os.path.join(current_directory, 'constructed_data')
#exist_ok avoids the separate existence check (and the gap between checking and creating)
os.makedirs(final_directory, exist_ok=True)

#the file path is derived from final_directory so the directory created above is the one written to
#newline='' is what the csv module expects for files it writes
with open(os.path.join(final_directory, "constructed_data.csv"), "w", encoding="utf-8", newline='') as f:
    writer = csv.writer(f)
    #header row: same column order as the rows produced by provide_data
    writer.writerow(['userId','id','rating',"title", "genres","production_companies","keywords", "cast", "tagline", "overview"])
    #one block of rows per user, written in user order
    for user_rows in user_to_data:
        writer.writerows(user_rows)

In [4]:
#this is a starting point if the data is already saved to the constructed_data/constructed_data.csv
import csv

data_list =[]

#newline='' lets the csv module handle line endings itself, so newlines embedded inside
#quoted fields are read back exactly as they were written
with open("constructed_data/constructed_data.csv", 'r', encoding="utf-8", newline='') as read_obj:
    csv_reader = csv.reader(read_obj)
    #skip the header row (the default keeps an empty file from raising)
    next(csv_reader, None)
    #every remaining row is a list of strings
    data_list = list(csv_reader)


In [5]:
import random

#seed for consistent results across runtime
#used with every random function except for the last cell where a certain number of models are tested and accumulated with identical test and train data
seed_int = 2
random.seed(seed_int)

#rows grouped per user, plus the train and test selections drawn from them
user_to_data = []
user_to_data_train = []
user_to_data_test = []
user_id = -1

#group consecutive rows of the same user together
#note: works when row[0] is also the user's position in user_to_data
for row in data_list:
    if row[0] == user_id:
        user_to_data[int(row[0])].append(row)
        continue
    user_id = row[0]
    user_to_data.append([row])


#draw 5000 users at random without replacement
#both counts can be increased for consistency as long as there is enough data
#(random.randint fails once user_to_data runs out of users)
for i in range(5000):
    index = random.randint(0, len(user_to_data)-1)
    user_to_data_train.append(user_to_data.pop(index))


#move 1000 of the drawn users from the train selection to the test selection
for i in range(1000):
    index = random.randint(0, len(user_to_data_train)-1)
    user_to_data_test.append(user_to_data_train.pop(index))


#the users that were not drawn are no longer needed
del user_to_data


In [6]:
from gensim.parsing.preprocessing import remove_stopwords
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
import random
import json
from ordered_set import OrderedSet
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
#the linalg is used from numpy instea of scipy
import numpy as np
#the version from numpy is used instead
from scipy import linalg
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cluster import KMeans
from scipy.linalg import sqrtm
import math
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import linear_kernel


class user_type_vars:
    """
    Holds the collections that are kept once per user type (one instance for the
    train users and one for the test users). Every attribute starts out empty and
    is filled in by the feature loading functions.
    """

    def __init__(self):
        # --- per-user collections (one entry per user, in user order) ---

        #dictionary of movie_id to rating for each movie the user watched
        self.user_to_movie_id_to_rating = []

        #the randomly chosen movie_id that represents the user's target movie
        self.user_to_target_movie_id = []

        #index of the user's target movie in the order of movies_in_order
        #(train_users only)
        self.user_to_target_index_full = []

        #ratings for all the movies in the entire train set
        #missing ratings and target movie ratings are set to that movie's average rating
        #(train_users only)
        self.user_to_ratings_full = []

        #ratings for all the movies in the entire train set with the movie's mean rating subtracted
        #missing ratings and target movie ratings are set to zero
        #(train_users only)
        self.user_to_ratings_full_transform = []

        # --- per-movie collections ---

        #for every movie watched by the user type, a list of ratings
        self.movie_id_to_ratings = {}

        #every unique target movie for the user type
        self.target_movies = set()

        #all the movies, in the order they are met while going through each user's ratings
        self.movies_in_order = OrderedSet()

        # --- model data ---

        #model input features x
        self.feature_1 = []
        self.feature_2 = []
        self.feature_3 = []

        #model output feature y
        self.user_to_target_rating = []


#for most of the variables above a train and test version is used
train_users = user_type_vars()
test_users = user_type_vars()


#This is the user's average rating not including the chosen target movie
#load_feature_1_and_2 appends one value per user to this module-level list on every call,
#so it holds all train users in order followed by the test users in order
#(this is not being used currently)
user_to_average_rating = []



#shared lemmatizer used by load_feature_1_and_2 when cleaning the words of each movie
wnl = WordNetLemmatizer()



def _count_word_occurrences(words, vocabulary):
    """Return how often each word of vocabulary occurs in words, in vocabulary order."""
    occurrences = Counter(words)
    return [occurrences[word] for word in vocabulary]


def load_feature_1_and_2(target_movies, movies_in_order, user_to_data, movie_id_to_ratings, user_to_movie_id_to_rating, user_to_target_movie_id, user_to_target_rating, feature_1, feature_2):
    """
    Pick one random target movie per user and compute two rating predictions for it.

    For every user in user_to_data (a list of movie rows per user, where row[1] is the
    movie id, row[2] the rating and row[4] the genre text) the function:
      * draws the position of the target movie with random.randint
      * feature 1: the plain average of the user's ratings without the target movie
      * feature 2: the average of those ratings weighted by the similarity between each
        movie and the target movie (tf-idf of the genre word counts, linear kernel);
        falls back to feature 1 when all the weights are zero

    All text columns and a few combinations of them were tested as the word source, but
    they did not help model performance, so only the genre column is used.

    The following arguments are filled in place:
      target_movies               set of target movie ids
      movies_in_order             ordered set of every movie id that was seen
      movie_id_to_ratings         movie id -> list of ratings (target ratings excluded)
      user_to_movie_id_to_rating  per user: movie id -> rating (target included)
      user_to_target_movie_id     per user: id of the target movie
      user_to_target_rating       per user: rating of the target movie (model output y)
      feature_1, feature_2        per user: the two predictions
    The module-level list user_to_average_rating also gets one value per user.

    Returns the average of all ratings that are not target ratings.

    Every user needs at least 2 ratings: an empty user makes random.randint fail and a
    single rating leads to a division by zero. "duplicate movie id" is printed when a
    user has more than one row for the same movie.
    """
    #these are used to calculate the overall average rating (target ratings excluded)
    overall_rating_sum = 0
    overall_rating_count = 0

    for user_rows in user_to_data:
        movie_id_to_words = dict()
        movie_id_to_rating = dict()
        #sum of the user's ratings without the target movie
        non_target_total = 0
        target_position = random.randint(0, len(user_rows)-1)

        for position, movie_data in enumerate(user_rows):
            movie_id = movie_data[1]
            rating = float(movie_data[2])

            if position == target_position:
                target_movies.add(movie_id)
                user_to_target_movie_id.append(movie_id)
            else:
                #the target rating is the one to be predicted, so it is kept out of
                #every statistic that counts as a rating already on record
                overall_rating_sum += rating
                overall_rating_count += 1
                non_target_total += rating
                movie_id_to_ratings.setdefault(movie_id, []).append(rating)

            #lemmatization and conversion to a list of words (genre column only)
            cleaned = remove_stopwords(movie_data[4])
            cleaned = [wnl.lemmatize(word) for word in cleaned.split(" ")]
            #strip a trailing "." from the words that have one
            cleaned = [word[:-1] for word in cleaned if word.endswith(".")] + [word for word in cleaned if not word.endswith(".")]

            #the dictionaries below hold one entry per movie id, so a repeated id can only
            #be noticed here, before the earlier entry is overwritten
            if movie_id in movie_id_to_rating:
                print("duplicate movie id")

            movie_id_to_words[movie_id] = cleaned
            movie_id_to_rating[movie_id] = rating
            movies_in_order.add(movie_id)

        user_to_movie_id_to_rating.append(movie_id_to_rating)
        #when a user has only a single rating the division by zero occurs
        #this is a reason why every user must have at least 2 ratings
        user_to_average_rating.append(float(non_target_total/(len(user_rows)-1)))

        target_movie_id = user_to_target_movie_id[-1]

        #the current user's vocabulary: every word from all the movies they rated
        users_words_in_order = OrderedSet()
        for words in movie_id_to_words.values():
            for word in words:
                users_words_in_order.add(word)

        #word counts over the vocabulary: one row per movie, the target movie kept separately
        word_counts = []
        target_word_counts = []
        for movie_id, words in movie_id_to_words.items():
            counts = _count_word_occurrences(words, users_words_in_order)
            if movie_id == target_movie_id:
                target_word_counts = counts
            else:
                word_counts.append(counts)

        #the target movie is the last row of the tf-idf matrix
        complete_word_counts = word_counts.copy()
        complete_word_counts.append(target_word_counts)
        transformed_word_counts = TfidfTransformer().fit_transform(complete_word_counts).toarray()

        #ratings in the same movie order as the rows of word_counts
        #the target rating is recorded as the value to be predicted by the model
        ratings = []
        for movie_id, rating in movie_id_to_rating.items():
            if movie_id == target_movie_id:
                user_to_target_rating.append(rating)
            else:
                ratings.append(rating)

        #feature 1: unweighted average of the user's ratings
        #note: this is a higher performing feature than the weighted alternatives
        rating_sum = 0
        for rating in ratings:
            rating_sum += rating
        #this divides by zero when a user has only a single rating
        item_1 = float(rating_sum/len(ratings))

        #feature 2: average weighted by the similarity of each movie to the target movie
        cosine_sim = linear_kernel(X = transformed_word_counts[0:-1],Y = [transformed_word_counts[-1]])
        cosine_sim = np.reshape(cosine_sim,  (len(cosine_sim)))
        numerator = 0
        denominator = 0
        for similarity, rating in zip(cosine_sim, ratings):
            numerator += float(similarity*rating)
            denominator += similarity

        #without any similar movie the unweighted average is used instead
        item_2 = item_1
        if denominator != 0:
            item_2 = float(numerator/denominator)

        feature_1.append(item_1)
        feature_2.append(item_2)

    return float(overall_rating_sum/overall_rating_count)


#train users: fills the train_users collections; the return value is kept as the overall average train rating
overall_average_train = load_feature_1_and_2(train_users.target_movies, train_users.movies_in_order, user_to_data_train, train_users.movie_id_to_ratings, train_users.user_to_movie_id_to_rating, 
                                                         train_users.user_to_target_movie_id, train_users.user_to_target_rating, train_users.feature_1, train_users.feature_2)


#test users: fills the test_users collections; the returned overall average is not kept
#this call has to run after the train call because both append to user_to_average_rating
load_feature_1_and_2(test_users.target_movies, test_users.movies_in_order, user_to_data_test, test_users.movie_id_to_ratings, test_users.user_to_movie_id_to_rating, 
               test_users.user_to_target_movie_id,
               test_users.user_to_target_rating, test_users.feature_1, test_users.feature_2)


def pre_svd(movie_id_to_average_rating, movies_in_order, user_to_ratings_full_transform, user_to_ratings_full, user_to_target_index_full, 
               user_to_movie_id_to_rating, user_to_target_movie_id):
    """
    Build one dense row of ratings per user over all the movies in movies_in_order.

    For every user two rows are appended:
      user_to_ratings_full            the user's rating of each movie, where the target
                                      movie and the movies without a rating get the
                                      movie's average rating
      user_to_ratings_full_transform  the same row with the movie's average rating
                                      subtracted (per item, not per user), so the target
                                      movie and the movies without a rating become zero
    The position of the user's target movie in movies_in_order is appended to
    user_to_target_index_full.

    note: nothing is appended to user_to_target_index_full when the target movie is not
    part of movies_in_order (possible for a test user with movies from the train users).
    The target check comes before the rated check, so it does not matter that
    user_to_movie_id_to_rating also holds the rating of the target movie.

    Every movie in movies_in_order needs an entry in movie_id_to_average_rating.
    """
    for user_index, rated_movies in enumerate(user_to_movie_id_to_rating):
        full_row = []
        centered_row = []

        for column, movie_id in enumerate(movies_in_order):
            is_target = movie_id == user_to_target_movie_id[user_index]
            if is_target:
                user_to_target_index_full.append(column)

            #only a rating that is on record and not the target is used as it is
            if not is_target and movie_id in rated_movies:
                value = rated_movies[movie_id]
            else:
                value = movie_id_to_average_rating[movie_id]

            full_row.append(value)
            centered_row.append(value - movie_id_to_average_rating[movie_id])

        user_to_ratings_full.append(full_row)
        user_to_ratings_full_transform.append(centered_row)


#note: before passing to this function the data is normalized about the average movie ratings (not average user ratings)
#each user train and test users have a single rating that needs to be trained against in the train case
#and predicted in the test case

#the svd can be applied to the combined data of the train and test sets
#both movies that the user didn't watch and movies that should be guesses are...
#transformed to have a value of zero before svd

#the movie columns are taken from the train dataset...
#senario: suppose a test user has a rating of a movie not part of the train set and it is not the target movie (ignore it)
#senario: suppose a test user has a rating of a movie not part of the train set and it is the target movie (guess the rating instead of using svd)

#...Once the UsV is created...
#take the rating from the new UsV for the user row and movie column for the target movie
#other option: cossine similairty on the U ignoring other test users


def svd_full(user_to_ratings_full_transform, n, movie_id_to_average_rating):
    """Rank-n SVD reconstruction of the centered ratings, shifted back by the per-movie averages.

    Parameters:
        user_to_ratings_full_transform: 2-D array-like of ratings from which the
            per-movie average has already been subtracted (rows are presumably users,
            columns line up with the values of movie_id_to_average_rating).
        n: number of leading singular values/vectors kept for the reconstruction.
        movie_id_to_average_rating: dict whose values, read in insertion order,
            are added back to the matching columns of the reconstruction.

    Returns:
        list with one numpy row per input row, holding the reconstructed ratings
        after the per-movie averages have been added back.
    """
    #open question from the author: is this the source of the random variation???
    left, singular_values, right = np.linalg.svd(user_to_ratings_full_transform, full_matrices=False)

    #keep only the n leading components
    sigma_n = np.diag(singular_values[0:n])
    left_n = left[:, 0:n]
    right_n = right[0:n, :]

    #rebuild the low-rank approximation as (U s) V
    approximation = np.dot(np.dot(left_n, sigma_n), right_n)

    #the dict values are expected to be in the same order as the matrix columns;
    #broadcasting adds them to every row, which moves the values back to the original rating scale
    column_averages = np.array(list(movie_id_to_average_rating.values()))
    restored = approximation + column_averages

    #be consistent with data structures...
    return list(restored)






#Unlike the other feature loading functions, it only makes sense to run this once since...
#the processes for train and test data are significantly different
def load_feature_3(n_components=20):
    """Fill train_users.feature_3 and test_users.feature_3 with SVD-reconstructed target ratings.

    Steps:
      1. compute per-movie average ratings twice: from the train ratings only, and
         from the train and test ratings combined
      2. call pre_svd twice to build the centered rating matrices: once for the train
         users, once for the train users followed by the test users
      3. reconstruct both matrices with svd_full, keeping n_components singular values
      4. append, for every user, the reconstructed rating at that user's target index

    Reads the notebook globals train_users, test_users and overall_average_train.
    The lists on train_users handed to pre_svd and both feature_3 lists are appended
    to, so this is meant to be run once.

    Parameters:
        n_components: number of singular values kept by svd_full
            (20 proved to be close to the highest performing constant).
    """
    train_ratings = train_users.movie_id_to_ratings
    test_ratings = test_users.movie_id_to_ratings

    #union of the train and test movies; the full averages and the full matrix below are both
    #driven by this same object, so their column order agrees
    all_movies_in_order = train_users.movies_in_order|test_users.movies_in_order

    train_average_by_movie = dict()
    movie_id_to_average_rating_full = dict()

    for movie in all_movies_in_order:
        in_train = movie in train_ratings
        in_test = movie in test_ratings

        #train-only average: a movie with no train ratings that is a train target
        #falls back to the overall train average; any other movie gets no train entry
        if in_train:
            train_total = sum(train_ratings[movie])
            train_count = len(train_ratings[movie])
            train_average_by_movie[movie] = float(train_total/train_count)
        elif movie in train_users.target_movies:
            train_average_by_movie[movie] = overall_average_train

        #train + test average: movies with no rating on either side get the overall train average
        if in_train and in_test:
            combined_total = sum(test_ratings[movie], train_total)
            movie_id_to_average_rating_full[movie] = float(combined_total/(train_count + len(test_ratings[movie])))
        elif in_train:
            movie_id_to_average_rating_full[movie] = train_average_by_movie[movie]
        elif in_test:
            movie_id_to_average_rating_full[movie] = float(sum(test_ratings[movie])/len(test_ratings[movie]))
        else:
            movie_id_to_average_rating_full[movie] = overall_average_train

    #svd_full adds the averages back by position (dict value order), so list the train averages
    #in the order of train_users.movies_in_order (presumably the column order pre_svd uses for the
    #train matrix) instead of relying on the union above to preserve that order
    movie_id_to_average_rating_train = {
        movie: train_average_by_movie[movie]
        for movie in train_users.movies_in_order
        if movie in train_average_by_movie
    }
    #keep any remaining entries (none are expected) so that no average is silently dropped
    for movie, average in train_average_by_movie.items():
        movie_id_to_average_rating_train.setdefault(movie, average)

    #for all users in train and then test order
    full_user_to_ratings_full_transform = []
    full_user_to_ratings_full = []
    full_user_to_target_index_full = []

    #this makes a comprehensive list of the train data followed by the test users data
    full_user_to_movie_id_to_rating  = train_users.user_to_movie_id_to_rating + test_users.user_to_movie_id_to_rating
    full_user_to_target_movie_id = train_users.user_to_target_movie_id + test_users.user_to_target_movie_id

    #pre_svd centers the ratings on the per-movie averages and fills the lists it is given;
    #target movie ratings and unrated movies end up as zero
    #run once with only train data to train the model
    pre_svd(movie_id_to_average_rating_train, train_users.movies_in_order, train_users.user_to_ratings_full_transform, train_users.user_to_ratings_full, train_users.user_to_target_index_full,
                train_users.user_to_movie_id_to_rating, train_users.user_to_target_movie_id)

    #run again with train and test data to evaluate the model
    pre_svd(movie_id_to_average_rating_full, all_movies_in_order, full_user_to_ratings_full_transform, full_user_to_ratings_full, full_user_to_target_index_full,
                full_user_to_movie_id_to_rating, full_user_to_target_movie_id)

    #In practice, the train set is what the database has on record and the test data is
    #data that hasn't been seen before, which can include any number of test users
    #svd_out_train produces the predictions used to train the model
    #svd_out_full produces the predictions used to test the model
    svd_out_train = svd_full(train_users.user_to_ratings_full_transform, n_components, movie_id_to_average_rating_train)
    svd_out_full = svd_full(full_user_to_ratings_full_transform, n_components, movie_id_to_average_rating_full)

    n_train_users = len(train_users.user_to_ratings_full_transform)

    #here the smaller svd provides predictions used to train the mlp model
    for i in range(n_train_users):
        train_users.feature_3.append(svd_out_train[i][train_users.user_to_target_index_full[i]])

    #here the larger svd provides predictions used to test the mlp model
    #(the test users are the rows that follow the train users in the full matrix)
    for i in range(n_train_users, len(full_user_to_ratings_full_transform)):
        test_users.feature_3.append(svd_out_full[i][full_user_to_target_index_full[i]])

load_feature_3()


#this is just used to see how the features approximate the target ratings
#(first five values of each feature next to the first five target ratings)
feature_comparisons = (
    ("Feature_1 to target comparison (train):", train_users, "feature_1"),
    ("Feature_1 to target comparison (test):", test_users, "feature_1"),
    ("Feature_2 to target comparison (train):", train_users, "feature_2"),
    ("Feature_2 to target comparison (test):", test_users, "feature_2"),
    ("Feature_3 to target comparison (train):", train_users, "feature_3"),
    ("Feature_3 to target comparison (test):", test_users, "feature_3"),
)

for heading, split_users, feature_attribute in feature_comparisons:
    print(heading)
    print(getattr(split_users, feature_attribute)[0:5])
    print(split_users.user_to_target_rating[0:5])


#this might not be worth the deletion!!!
del user_to_data_train, user_to_data_test


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jackson\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Feature_1 to target comparison (train):
[3.5492957746478875, 3.436, 3.201388888888889, 3.0930232558139537, 3.547169811320755]
[4.0, 4.0, 4.0, 4.0, 4.0]
Feature_1 to target comparison (test):
[3.442553191489362, 3.8625, 4.030769230769231, 3.3138686131386863, 3.172043010752688]
[4.0, 4.5, 4.0, 4.5, 3.0]
Feature_2 to target comparison (train):
[3.3913145457695006, 3.421093924060176, 3.0992801105014403, 3.170779617769549, 3.5361874203425403]
[4.0, 4.0, 4.0, 4.0, 4.0]
Feature_2 to target comparison (test):
[3.4130044784167977, 3.8678220640284144, 4.040675013795052, 3.3714347353723984, 3.2305045403735138]
[4.0, 4.5, 4.0, 4.5, 3.0]
Feature_3 to target comparison (train):
[3.586664028637569, 3.464455663106878, 3.8962783724112273, 3.9887608040685083, 3.476283698917307]
[4.0, 4.0, 4.0, 4.0, 4.0]
Feature_3 to target comparison (test):
[3.794273965015983, 3.81797383375422, 3.1125607784767197, 3.8430984065841782, 2.734028543430622]
[4.0, 4.5, 4.0, 4.5, 3.0]


In [11]:
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression


#average the performance results for a number of models with identical inputs
def test_parameters(nof_runs, layers, train_input_features, test_input_features):
    """Convert the feature tuples into row lists and average the scores over nof_runs models.

    Parameters:
        nof_runs: number of models to fit and score.
        layers: hidden layer sizes forwarded to the analysis function.
        train_input_features: iterable of per-user feature tuples (e.g. a zip) for training.
        test_input_features: iterable of per-user feature tuples (e.g. a zip) for testing.

    Returns:
        whatever average_results returns for the converted inputs.
    """
    train_rows = list(map(list, train_input_features))
    test_rows = list(map(list, test_input_features))
    return average_results(nof_runs, layers, train_rows, test_rows)
    

def average_results(nof_runs, layers, train_inputs, text_inputs):
    """Run analysis_1 nof_runs times on the same inputs and average its two scores.

    Parameters:
        nof_runs: number of times analysis_1 is called.
        layers: hidden layer sizes forwarded to analysis_1.
        train_inputs: feature rows used for fitting.
        text_inputs: feature rows used for scoring (the test inputs).

    Returns:
        (mean score without rounding, mean score with rounded predictions) as floats.
    """
    #best performance analysis is analysis_1
    score_pairs = [analysis_1(layers, train_inputs, text_inputs) for _ in range(nof_runs)]
    plain_total = sum(pair[0] for pair in score_pairs)
    rounded_total = sum(pair[1] for pair in score_pairs)
    return float(plain_total/nof_runs), float(rounded_total/nof_runs)


#no scaling (best performance):
def analysis_1(layers, train_inputs, test_inputs):
    """Fit a linear regression on the unscaled features and score it on the test users.

    Parameters:
        layers: hidden layer sizes; unused while the MLPRegressor alternative below stays commented out.
        train_inputs: feature rows for the train users.
        test_inputs: feature rows for the test users.

    Returns:
        (r2 score of the raw predictions, r2 score of the predictions rounded to the nearest .5),
        both measured against test_users.user_to_target_rating.
    """
    # nn model alternative (worse performance):
    # model = MLPRegressor(hidden_layer_sizes = layers, solver = "adam",  max_iter = 1000)
    # linear regression (better performance)
    train_targets = train_users.user_to_target_rating
    model = LinearRegression()
    model.fit(train_inputs, train_targets)

    #show importance of the different input features to the model
    importance = permutation_importance(model, train_inputs, train_targets)
    print("Feature Importances: ", importance["importances_mean"])

    raw_predictions = model.predict(test_inputs)

    #score with and without rounding...
    #rounding is reasonable because the actual ratings a user makes must be divisible by .5
    rounded_predictions = [float(round(value*2)/2.0) for value in raw_predictions]

    #evaluation metric 1: r2 score
    #(evaluation metric 2 would be mean_squared_error with the same two argument pairs)
    test_targets = test_users.user_to_target_rating
    return(r2_score(test_targets, raw_predictions),
        r2_score(test_targets, rounded_predictions))

#scale inputs and targets:
def analysis_2(layers, train_inputs, test_inputs):
    """Fit an MLP on standardized inputs and standardized targets, then score it on the test users.

    Parameters:
        layers: hidden layer sizes for the MLPRegressor.
        train_inputs: feature rows for the train users.
        test_inputs: feature rows for the test users.

    Returns:
        (r2 score of the predictions, r2 score of the predictions rounded to the nearest .5),
        both measured against test_users.user_to_target_rating on the original rating scale.
    """
    #scale input features; the fitted scaler is kept so the test inputs get the same transform
    input_scaler = StandardScaler()
    train_inputs_scaled = input_scaler.fit_transform(train_inputs)

    #scale target values (StandardScaler expects a column, the model expects a flat array)
    target_scalar = StandardScaler()
    true_rating_train_scaled = target_scalar.fit_transform(np.reshape(train_users.user_to_target_rating, (-1, 1)))
    true_rating_train_scaled = np.reshape(true_rating_train_scaled, len(true_rating_train_scaled))

    #build and train model
    reg = MLPRegressor(hidden_layer_sizes = layers, solver = "adam",  max_iter = 1000)
    reg.fit(train_inputs_scaled, true_rating_train_scaled)

    #show importance of different inputs features...
    results = permutation_importance(reg, train_inputs_scaled,true_rating_train_scaled)
    print(results["importances_mean"])

    #scale the test inputs with the statistics learned from the train inputs;
    #fitting a second scaler on the test data would put train and test on different scales
    test_inputs_scaled = input_scaler.transform(test_inputs)

    #predict the scaled version of the outputs
    scaled_predictions = reg.predict(test_inputs_scaled)

    #get actual predictions from scaled predictions...
    predictions = target_scalar.inverse_transform(scaled_predictions.reshape(-1, 1))
    predictions = list(predictions.reshape(len(predictions)))

    #test with and without rounding...
    #rounding is reasonable because the actual ratings a user makes must be divisible by .5
    rounded_predictions = [float(round(item*2)/2.0) for item in predictions]

    #evaluation metric 1: r2 score
    #(evaluation metric 2 would be mean_squared_error with the same two argument pairs)
    return(r2_score(test_users.user_to_target_rating, predictions),
        r2_score(test_users.user_to_target_rating, rounded_predictions))

#only scale inputs:
def analysis_3(layers, train_inputs, test_inputs):
    """Fit an MLP on standardized inputs (targets left unscaled) and score it on the test users.

    Parameters:
        layers: hidden layer sizes for the MLPRegressor.
        train_inputs: feature rows for the train users.
        test_inputs: feature rows for the test users.

    Returns:
        (r2 score of the predictions, r2 score of the predictions rounded to the nearest .5),
        both measured against test_users.user_to_target_rating.
    """
    #scale input features; the fitted scaler is kept so the test inputs get the same transform
    input_scaler = StandardScaler()
    train_inputs_scaled = input_scaler.fit_transform(train_inputs)

    #build and train model
    reg = MLPRegressor(hidden_layer_sizes = layers, solver = "adam",  max_iter = 1000)
    reg.fit(train_inputs_scaled, train_users.user_to_target_rating)

    #show importance of different inputs features...
    results = permutation_importance(reg, train_inputs_scaled, train_users.user_to_target_rating)
    print(results["importances_mean"])

    #scale the test inputs with the statistics learned from the train inputs;
    #fitting a second scaler on the test data would put train and test on different scales
    test_inputs_scaled = input_scaler.transform(test_inputs)

    #the targets were not scaled, so these predictions are already on the rating scale
    predictions = reg.predict(test_inputs_scaled)

    #test with and without rounding...
    #rounding is reasonable because the actual ratings a user makes must be divisible by .5
    rounded_predictions = [float(round(item*2)/2.0) for item in predictions]

    #evaluation metric 1: r2 score
    #(evaluation metric 2 would be mean_squared_error with the same two argument pairs)
    return(r2_score(test_users.user_to_target_rating, predictions),
        r2_score(test_users.user_to_target_rating, rounded_predictions))



#the current test averages the r2 scores of 100 models trained on the same input
#the hidden layers are (10,10,10); note that only feature_2 is passed as an input feature here
#(zip several feature lists together, e.g. zip(train_users.feature_2, train_users.feature_3), to combine features)

avg_scores = test_parameters(
    nof_runs=100,
    layers=(10,10,10),
    train_input_features=zip(train_users.feature_2),
    test_input_features=zip(test_users.feature_2),
)


print("average r2_score without rounding: ",avg_scores[0])
print("average r2_score with rounded prediction to nearest .5 (note: actual users ratings must be divisibl by .5): ",avg_scores[1])




#don't remove this until the readme explains the results (they are important results for the conclusion):

#with linear regression:
#with cosine similarity:

#feature_2 and feature_3:
#(0.3749823647027071, 0.348993902575555)
#(0.3749823647027071, 0.348993902575555)

#feature_1 and feature_3: 
#(0.37665923268552526, 0.35436777366278627)
#(0.37665923268552526, 0.35436777366278627)


#with linear regression:
#with linear_kernel:

#feature_2 and feature_3:
#(0.3749823647027071, 0.348993902575555)...

#feature_1 and feature_3: 
#(0.37665923268552526, 0.35436777366278627)...
#(0.3692252282147528, 0.35771948109887597)



#with nn model:
#feature_2 and feature_3:
#(0.368986238493678, 0.34709385529828507)
#feature_1 and feature_3: 
#(0.3692192733203262, 0.34905915672447185)


#feature 1 and 3:
#average r2_score without rounding:  0.4393485303942193

#feature 2 and 3:
#average r2_score without rounding:  0.43574994324307864

#feature 1:
#average r2_score without rounding:  0.17208657031405555

#feature 2:
#average r2_score without rounding: 0.16069330580025995

Feature Importances:  [0.27801114]
Feature Importances:  [0.28174239]
Feature Importances:  [0.28963231]
Feature Importances:  [0.28858317]
Feature Importances:  [0.27693054]
Feature Importances:  [0.2743256]
Feature Importances:  [0.28069541]
Feature Importances:  [0.27421776]
Feature Importances:  [0.28900726]
Feature Importances:  [0.2925632]
Feature Importances:  [0.27277945]
Feature Importances:  [0.2807576]
Feature Importances:  [0.28400971]
Feature Importances:  [0.28442598]
Feature Importances:  [0.2842645]
Feature Importances:  [0.29246741]
Feature Importances:  [0.28656351]
Feature Importances:  [0.29703024]
Feature Importances:  [0.27748278]
Feature Importances:  [0.2832791]
Feature Importances:  [0.28734202]
Feature Importances:  [0.2753373]
Feature Importances:  [0.28668609]
Feature Importances:  [0.27931031]
Feature Importances:  [0.27887718]
Feature Importances:  [0.2812503]
Feature Importances:  [0.2824563]
Feature Importances:  [0.27727884]
Feature Importances:  [0.286