In [1]:
import pandas as pd

#This code is for combining certain data from the necessary csv files into a single dataframe (complete)
pd.set_option('display.max_colwidth', None)

movies_full = pd.read_csv('newdata/movies_metadata.csv',usecols=("genres","id" ,"title","tagline", "overview","production_companies"),
                          dtype={"tagline": "string", "id":"string", 'genres':"string", "title": "string", "tagline": "string","overview":"string", "production_companies" :"string"})
ratings = pd.read_csv('newdata/ratings.csv', usecols = ("userId", "movieId", "rating"), dtype={"userId": "string","movieId": "string","rating": "string"})
ratings = ratings.rename(columns={"movieId": "id"})

keywords = pd.read_csv('newdata/keywords.csv', usecols = ("id", "keywords"), dtype={"id": "string","keywords":"string"})
credits = pd.read_csv("newdata/credits.csv", usecols = ("cast", "id"), dtype={"cast": "string", "id": "string"})

complete =  pd.merge(movies_full, ratings, on ="id")
complete =  pd.merge(complete,keywords, on ="id")
complete  = pd.merge(complete,credits, on ="id")


complete = complete.sort_values(by = 'userId')

complete  = complete.dropna()

complete  = complete.loc[:,['userId','id','rating',"title", "genres","production_companies","keywords", "cast", "tagline", "overview" ]]


In [2]:
import ast


#used to filter out the rows of data with empty entries
def condition(array):
    length = len(array[4])
    if(array[4][length-2:] == "[]"):
        return False
    length = len(array[5])
    if(array[5][length-2:] == "[]"):
        return False
    length = len(array[6])
    if(array[6][length-2:] == "[]"):
        return False
    length = len(array[7])
    if(array[7][length-2:] == "[]"):
        return False   
    length = len(array[8])
    if(array[8][length-4:]=="<NA>"):
        return False
    length = len(array[9])
    if(array[9][length-4:]=="<NA>"):
        return False 
    return True


#used to extract names
def populate_names(item):
    string  = item[1:-1]
    jsons = string.split("}, ")   
    names = ""
    cnt = 0
    for item in jsons:
        if(cnt == len(jsons)-1):
            temp_dict = ast.literal_eval(item)
            names+=str(temp_dict["name"])
        else:
            temp_dict = ast.literal_eval(item+"}")
            names+=str(str(temp_dict["name"])+" ")
        cnt += 1
    return names

#extract data from row of complete_array
def provide_data(array):
    movie_data = []
    movie_data.append(int(array[0]))
    movie_data.append(int(array[1]))
    movie_data.append(float(array[2]))
    movie_data.append(array[3])  

    movie_data.append(populate_names(array[4]))
    movie_data.append(populate_names(array[5]))
    movie_data.append(populate_names(array[6]))
    movie_data.append(populate_names(array[7]))

    movie_data.append(str(array[8]))
    movie_data.append(str(array[9]))
    return movie_data
    


#convert the dataframe into an array and build a dictionary
user_to_data = dict()
complete_array = complete.to_numpy()


#get all unique user ids
list_of_user_ids = []
last_id  = -1
for item in complete_array:
    if(item[0]!= last_id):
        list_of_user_ids.append(item[0])
        last_id = item[0]


index  = 0
#this has been tested with 5000, 10000, 20000, 100000
nof_users = 20000
#populate user_to_data from complete_array
for i in range(0, nof_users):
    user_to_data[list_of_user_ids[i]] = []
    for j in range(index, len(complete_array)):
        if complete_array[j][0] == list_of_user_ids[i]:
            #condition is checked for complete_array[j]
            if(condition(complete_array[j])):
                #this is where data is tranformed
                transformed = provide_data(complete_array[j])
                user_to_data[list_of_user_ids[i]].append(transformed)         
        else:
            #ignore if the number of ratings for a user is too small
            if (len(user_to_data[list_of_user_ids[i]])<10):
                del user_to_data[list_of_user_ids[i]]
            index = j+1
            break
        

In [3]:
#save in a file so that cells below can run without running this cell and above
import csv

with open("constructedData/constructedData.csv", "w", encoding="utf-8", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['userId','id','rating',"title", "genres","production_companies","keywords", "cast", "tagline", "overview"])
    for key in user_to_data.keys():
        writer.writerows(user_to_data[key])

In [2]:
#this is a starting point if the data is already saved to the constructedData.csv file
import csv

data_list =[]

with open("constructedData/constructedData.csv", 'r', encoding="utf-8") as read_obj:
    csv_reader = csv.reader(read_obj)
    data_list = list(csv_reader)

data_list = data_list[1:]


In [3]:
#movie id to list of its ratings by all users
movie_to_ratings = dict()

#user id to the ratings of movies by the user
#actually includes whole data row from constucted data...
user_to_ratings = dict()

#The list created by the constructed data csv is in order by user id
#This code populates movie_to_ratings and user_to_ratings
user_id = -1
for row in data_list:
    if (row[0]!=user_id):
        user_id = row[0]
        user_to_ratings[row[0]] = [row]
    else:
        user_to_ratings[row[0]].append(row)

    if(row[1] in movie_to_ratings.keys()):
        movie_to_ratings[row[1]].append(row[2])
    else:
        movie_to_ratings[row[1]] = [row[2]]


In [4]:
from gensim.parsing.preprocessing import remove_stopwords
from nltk.stem import WordNetLemmatizer

#dictionary of user id to a list of strings of combined textual features for each movie rated by the user
#the strings do not include ratings or movie id
user_to_corpus_list = dict()


# WordNetLemmatizer().lemmatize(token.lower())
wnl = WordNetLemmatizer()



for key in user_to_ratings.keys():
    movie_strings = []
    for movie_data in user_to_ratings[key]:
        movie_string = ""
        #avoid the first three data points (user id, movieid, and rating)
        #use only the text data
        for index in range (3,len(movie_data)):
            if(index!= len(movie_data)-1):
                movie_string+= movie_data[index]+" "
            else:
                movie_string+= movie_data[index]
        cleaned = remove_stopwords(movie_string)
        cleaned = " ".join([wnl.lemmatize(word) for word in cleaned.split(" ")])
        movie_strings.append(cleaned)
    user_to_corpus_list[key] = movie_strings

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import random
import copy
from scipy.stats import kurtosis
from scipy.stats import skew
import statistics
import math


#seed for consistent results across runtime
seed_int = 1
random.seed(seed_int)

#note user_to_corpus_list is a dictionary of users to list of strings with all the text data for the coresponding movies
from sklearn.model_selection import train_test_split
from sklearn.cluster import AgglomerativeClustering


#convert text data into a list of word occurances
#seperate test and train data

user_to_word_count = dict()



#this is a list of all the words for each movie for each user
#note: keys are in insertion order for python 3.7+

list_of_words = []

for key in user_to_corpus_list:
    for item in user_to_corpus_list[key]:
        list_of_words.append(item)


#this logic is used for training and simalir logic is used to fill in the
#missing ratings for movies that the user hasn't seen
#is there a more accuracte method besides using a single rating from the user
#yes but this is a step towrds the more accurate user comaprison model

for key in user_to_corpus_list.keys():
    #make sure a copy happens here...
    cv = CountVectorizer().fit_transform(user_to_corpus_list[key])
    rand_index = random.randint(0, len(cv)-1)
    rand_test_item = cv[rand_index]
    del cv[rand_index]
    cosine_sim = cosine_similarity(X = cv ,Y = [rand_test_item])
    max = 0
    pred_rating = 0
    #find the rating of the most similair movie
    for sim, rat in zip(cosine_sim, ratings):
        if sim>max:
            max = sim
            pred_rating = rat


#now for the user comparison logic (need user to list of movie ratings)
#need an order for the movie ratings...
#fill in ratings that the user hasn't watched with the method above
#then cluster the users by their ratings



#there is a dictionary of usernames to list of rating placeholders for movie ratings for all movies initially set to -1
#go through the initial data list and keep a dictionary of movie names to an index which is the place
#of the movie in coresponding user dictionary value list
#while doing this change the rating from -1 into the coresponding place in the dictionary of usernames to list of rating placeholders











#get average rating for a single movie amoung all users who rated it
def get_avg_movie_rating(movie_id):
    ret =0 
    cnt = 0
    for item in movie_to_ratings[movie_id]:
        ret+= float(item)
        cnt+=1
    return float(ret/cnt)


#get all the movie ratings from a single user
def get_user_ratings(user_id):
    ret = []
    for item in user_to_ratings[user_id]:
        ret.append(float(item[2]))
    return ret


#user to model independent var X
user_to_features = dict()
#user to model dependent var y
user_to_rand_rating = dict()

#note: agglomerative clustering might make more sense here since k-means has random init for centroids...
#note: to guess a new users rating requires that none of that users ratings have been used to train the model
#The data needs to be split into test and train before modeling the algorithm on the train data

#Training process:
#split data into test and train data
#proceed with train data...
#cluster movies by the tokens with range for k
#cluster users by the ratings with range for k and (fill in ratings for movies a users hasn't watched with some guess)
#guess: this can be obtained by clustering the movies that the user has watched...
#for each movie the user hasn't watched find the cluster that it belongs to with the highest possible k value
#that the user has at least one movie belonging to one of the clusters and then take the average of those movies
#this is exactly like a later training step excpet it is applied to all the movies the user watched

#for a single randomly chosen movie from each user in the trainging data...

#find the cluster the movie belongs to 
#find the movies part of that same cluster that the user has scored at the highest possible k value
#take the average score of these movies
#find the cluster the user belongs to
#find the average rating of the movie for users in that cluster at the highest possible k value
#train an mlp model with both averages and perhaps some extra statistics as features...
#using the given movie ratings as actuals


#The process of predicting a rating:
#1. find the cluster the movie belongs to 
#2. find the movies part of that same cluster that the user has scored at the highest possible k value
#3. take the average score of these movies
#4. find the cluster the user belongs to
#5. find the average rating of the movie for users in that cluster at the highest possible k value
#6. input into the trained mlp model both averages and perhaps some extra statistics
#7. make predictions and test against the randomly chosen movies actual ratings


#summary:
#find cluster for movie -> find movies part of the same clusters that the users rated -> average
#question: are the clusters unique to the movies the user has watched or to all movies???
#what is the technical difference???
#is this the same as finding the most simimlair movie the user rated and copying the rating???

#find cluster for user -> find the ratings for the movie by people in the same cluster -> average



#other avenues considered:
#idea 1:
#for the first process, instead of averaging the movies that only the user rated, find other users that are...
#like the user in question and find the average for that movie cluster
#Problem: it is better to get the users raw opionion rather than generalizing it to some like minded users
#there is an extra costly step to this
#idea 2: 
#for the second process, instead of finding the average rating for the movie in the same cluster of users...
#also find the average rating of movies that are like the movie in question 
#Problem, it is better to get the movies rating itself as it would be the most accurate indicator
#there is an extra costly step to this



#populate user_to_features and user_to_rand_rating
for key in user_to_corpus_list.keys():

    count_matrix = CountVectorizer().fit_transform(user_to_corpus_list[key]).toarray().tolist()
    rand_index = random.randint(0, len(count_matrix)-1)
    rand_test_item = count_matrix[rand_index]
    del count_matrix[rand_index]

    #find similarity by the count of each word between the random selected movie and the other movies rated by the user
    cosine_sim = cosine_similarity(X = count_matrix ,Y = [rand_test_item])

    #technically this should not include the current users rating for the randomly selected movie...
    #that is what we want to find out...
    ratings = copy.deepcopy(get_user_ratings(key))
    similairities = np.reshape(cosine_sim,  (len(cosine_sim)))

    random_rating = ratings[rand_index]
    user_to_rand_rating[key] = random_rating
    del ratings[rand_index]

    #technically this should not include the current users rating for the randomly selected movie...
    #that is what we want to find out...
    movie_rating_avg = get_avg_movie_rating(user_to_ratings[key][rand_index][1])

    user_rating_avg =  float(np.sum(ratings)/(len(ratings)))
    user_rating_skew = skew(ratings)
    if(math.isnan(user_rating_skew)):
        user_rating_skew = 0
    user_rating_kurt = kurtosis(ratings)
    if(math.isnan(user_rating_kurt)):
        user_rating_kurt = 0
    user_rating_var = statistics.variance(ratings)


    sim_average = float(np.sum(similairities)/(len(similairities)))
    sim_skew = skew(similairities) 
    if(math.isnan(sim_skew)):
        sim_skew = 0
    sim_kurt = kurtosis(similairities)
    if(math.isnan(sim_kurt)):
        sim_kurt = 0
    sim_var = statistics.variance(similairities)


    # there are many curve defining features used here that may be impotent and can be cut or kept in the next cell...
    # there may stil be other distribution measures that improve the model...
    # might try inputing some function of sim and rating rather than incluing them on their own


    for sim, rating in zip(similairities, ratings):
        if key not in user_to_features:
            # user_to_features[key] = [[(rating - user_rating_avg)*sim, movie_rating_avg, user_rating_skew, user_rating_kurt, user_rating_var, sim_average, sim_skew, sim_kurt, sim_var]]
            user_to_features[key] = [[rating, sim, user_rating_avg, movie_rating_avg, user_rating_skew, user_rating_kurt, user_rating_var, sim_average, sim_skew, sim_kurt, sim_var]]
        else:
            # user_to_features[key].append([(rating - user_rating_avg)*sim, movie_rating_avg, user_rating_skew, user_rating_kurt, user_rating_var, sim_average, sim_skew, sim_kurt, sim_var])
            user_to_features[key].append([rating, sim, user_rating_avg, movie_rating_avg, user_rating_skew, user_rating_kurt, user_rating_var, sim_average, sim_skew, sim_kurt, sim_var])

  user_rating_skew = skew(ratings)
  user_rating_kurt = kurtosis(ratings)
  user_rating_skew = skew(ratings)
  user_rating_kurt = kurtosis(ratings)
  user_rating_skew = skew(ratings)
  user_rating_kurt = kurtosis(ratings)
  user_rating_skew = skew(ratings)
  user_rating_kurt = kurtosis(ratings)
  user_rating_skew = skew(ratings)
  user_rating_kurt = kurtosis(ratings)
  user_rating_skew = skew(ratings)
  user_rating_kurt = kurtosis(ratings)


In [8]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score
import random
import time
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance


#test the time taken to train and predict
start = time.time()

#this is where you may select certain features to be used to build the model
new_user_to_features = dict()

for key in user_to_features.keys():
    new_list = []
    for item in user_to_features[key]:
        # can try reducing the features like below:
        # old inputs...
        # item[0:4]+ item[6:7]+ item[10:11]
        # item[0:4]
        # item[2:8] + item[8:]
        #new inputs:
        
        new_list.append(item)
    new_user_to_features[key] = new_list

#seed
seed_int = 1
random.seed(seed_int)

#instead of using test train split...
user_to_X_train = dict()
user_to_y_train = dict()
user_to_X_test = dict()
user_to_y_test = dict()

#There is a problem with using the same users in training and testing and this code ensures that it doesn't happen
#The model should beable to be used effectively for new users and not just memorized for existing users
c1 = 0
c2 = 0
for key in new_user_to_features.keys():
    if(random.randint(0,10) == 0):
        user_to_X_test[key] = new_user_to_features[key]
        user_to_y_test[key] = user_to_rand_rating[key]
        c1+=1

    else:
        user_to_X_train[key] = new_user_to_features[key]
        user_to_y_train[key] = user_to_rand_rating[key]
        c2+=1

#used to train model
X_train = [] 
y_train = []

#populate X_train and y_train
for key in user_to_X_train.keys():
    for item in user_to_X_train[key]:
        X_train.append(item)
        y_train.append(user_to_y_train[key])


# scale training features...
scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)

#data transformation
#https://datascience.stackexchange.com/questions/45900/when-to-use-standard-scaler-and-when-normalizer


#train model
#orginal model layers
#layers = (2,2,2)
layers = (2,2,2)
# act = "tanh"
# solve = "adam"
# act = "relu"
# solve = "adam"
# act = "relu"
# solve = "sgd"
# act = "tanh"
# solve = "sgd"
act = "relu"
solve = "adam"


regr = MLPRegressor(hidden_layer_sizes=layers,activation =act, solver =solve,  max_iter=10000, random_state =seed_int)
fitted = regr.fit(X_train, y_train)

#this needs to run before the final model is determined so that the best features are used
#the results can also be displayed with a bar shart showing how each feature cotributes to a percentage of the models accuracy
result = permutation_importance(fitted, X_train, y_train,random_state=seed_int)

print(result["importances_mean"])

print(regr.n_iter_)

#dictionary of users to test features that have been scaled
new_user_to_X_test = dict()

# used to scale test features then the new scaled features are returned ...
# as the values of the approriate user key in new_user_to_X_test 
X_test = []

#populate X_test, key, and counts that are later used to build new_user_to_X_test, a verison of...
#user_to_X_test with scaled features 
#need to decompose then recompose
keys = []
counts = []
for key in user_to_X_test.keys():
    cnt = 0
    for item in user_to_X_test[key]:
        X_test.append(item)
        cnt+=1
    counts.append(cnt)
    keys.append(key)

#scale test features...
scalar = StandardScaler()
X_test = scalar.fit_transform(X_test)

#populate new_user_to_X_test with scaled test features
cnt = 0
for num, key in zip(counts, keys):
    new_user_to_X_test[key] = []
    for i in range(num):
        new_user_to_X_test[key].append(X_test[cnt])
        cnt+=1


# user id to the average predicted rating for the randomly chosen movie
user_to_avg_rating = dict()

# populate user_to_avg_rating by averaging the predictions from all the feature inputs of the...
# movies a user has watched that are not the randomly chosen movie itself
for key in new_user_to_X_test.keys():
    sum =0
    cnt =0 
    predicted = regr.predict(new_user_to_X_test[key])
    for item in predicted:
        sum+=item
        cnt+=1
    user_to_avg_rating[key] = float(sum/cnt)


#outputs
actuals_list = []
preds_list = []
for key in user_to_avg_rating.keys():
    print("Pred: "+str(user_to_avg_rating[key]) , "Actual: "+str(user_to_y_test[key]))
    actuals_list.append(user_to_y_test[key])
    preds_list.append(user_to_avg_rating[key])
print("overall score:", r2_score(actuals_list, preds_list))

#test the time taken to train and predict
end = time.time()
print("Minutes:", float((end - start)/60))


#feature importance scores:
#https://scikit-learn.org/stable/modules/permutation_importance.html
#https://scikit-learn.org/stable/modules/generated/sklearn.inspection.permutation_importance.html#sklearn.inspection.permutation_importance

#introduction:
#https://www.kaggle.com/code/dansbecker/permutation-importance

#types of feature importance:
#https://towardsdatascience.com/6-types-of-feature-importance-any-data-scientist-should-master-1bfd566f21c9


#perhaps there is a way to visualize this of the model outputs below in a systematic way???

# Tests:

# full features:
# with linear regression:
# overall score: 0.2657455495660592

# with mlp...:

# first fours features:
# layers: (2,2,2)
# act = "relu"
# solve = "adam"
# overall score: 0.2667063296881431

#all features: 
# layers: (2,2,2)
# act = "relu"
# solve = "adam"
# overall score: 0.26606490897808244

#all features: 
# layers: (2,2,2,2)
# act = "relu"
# solve = "adam"
# overall score: 0.260461734737799

#all features: 
# layers: (4,4,4)
# act = "relu"
# solve = "adam"
# overall score: 0.22932745175064528

# first fours features and variance:
# layers: (2,2,2)
# act = "relu"
# solve = "adam"
# overall score: 0.2482634902547255

# first fours features and variance:
# layers: (2,2,2,2)
# act = "relu"
# solve = "adam"
# overall score: 0.2616102684471122

# first fours features and variance:
# layers: (3,3,3)
# act = "relu"
# solve = "adam"
# overall score: 0.25487207187202243

#first two featurs:
# layers: (2,2,2)
# act = "relu"
# solve = "adam"
# overall score: -0.00430015574935827

#first two featurs:
# layers: (2,2)
# act = "relu"
# solve = "adam"
# overall score: 0.04468421358737418

#3rd and 4th features:
# layers: (2,2)
# act = "relu"
# solve = "adam"
# overall score: 0.2546480453878024

[-3.50910138e-05  1.47934257e-07  2.58227704e-01  2.98268535e-01
  2.47937052e-03  1.85234981e-03  4.94337078e-03  5.42048696e-03
  8.14588308e-03  1.08693110e-02  1.46282495e-03]
19
Pred: 3.3420909617280894 Actual: 3.0
Pred: 3.21471216837001 Actual: 1.0
Pred: 3.9162525796936274 Actual: 3.0
Pred: 4.150428622025074 Actual: 4.0
Pred: 2.9536225623200463 Actual: 3.0
Pred: 4.508382091092446 Actual: 4.0
Pred: 3.5021362700246383 Actual: 1.5
Pred: 4.544995408783995 Actual: 4.5
Pred: 3.0144937107000183 Actual: 4.0
Pred: 3.499556689402351 Actual: 4.0
Pred: 3.9248830282144156 Actual: 3.0
Pred: 2.9073696102993547 Actual: 1.0
Pred: 4.26207807410446 Actual: 5.0
Pred: 3.910856828826111 Actual: 3.0
Pred: 3.4744587208277657 Actual: 3.0
Pred: 3.233938480643781 Actual: 3.0
Pred: 3.6413446168055024 Actual: 4.0
Pred: 3.9905922784832084 Actual: 5.0
Pred: 3.6745961653232224 Actual: 4.0
Pred: 3.603680109800211 Actual: 3.0
Pred: 3.1907465665030346 Actual: 3.0
Pred: 3.8732037492567355 Actual: 3.0
Pred: 4.039558