In [1]:
import pandas as pd

#This code is for combining certain data from the necessary csv files into a single dataframe (complete)
pd.set_option('display.max_colwidth', None)

movies_full = pd.read_csv('newdata/movies_metadata.csv',usecols=("genres","id" ,"title","tagline", "overview","production_companies"),
                          dtype={"tagline": "string", "id":"string", 'genres':"string", "title": "string", "tagline": "string","overview":"string", "production_companies" :"string"})
ratings = pd.read_csv('newdata/ratings.csv', usecols = ("userId", "movieId", "rating"), dtype={"userId": "string","movieId": "string","rating": "string"})
ratings = ratings.rename(columns={"movieId": "id"})

keywords = pd.read_csv('newdata/keywords.csv', usecols = ("id", "keywords"), dtype={"id": "string","keywords":"string"})
credits = pd.read_csv("newdata/credits.csv", usecols = ("cast", "id"), dtype={"cast": "string", "id": "string"})

complete =  pd.merge(movies_full, ratings, on ="id")
complete =  pd.merge(complete,keywords, on ="id")
complete  = pd.merge(complete,credits, on ="id")


complete = complete.sort_values(by = 'userId')

complete  = complete.dropna()

complete  = complete.loc[:,['userId','id','rating',"title", "genres","production_companies","keywords", "cast", "tagline", "overview" ]]


In [2]:
import ast


#used to filter out the rows of data with empty entries
def condition(array):
    length = len(array[4])
    if(array[4][length-2:] == "[]"):
        return False
    length = len(array[5])
    if(array[5][length-2:] == "[]"):
        return False
    length = len(array[6])
    if(array[6][length-2:] == "[]"):
        return False
    length = len(array[7])
    if(array[7][length-2:] == "[]"):
        return False 
    #note: these can probably be omitted due to the dropna function above
    length = len(array[8])
    if(array[8][length-4:]=="<NA>"):
        return False
    length = len(array[9])
    if(array[9][length-4:]=="<NA>"):
        return False 
    return True


#used to extract names
def populate_names(item):
    string  = item[1:-1]
    jsons = string.split("}, ")   
    names = ""
    cnt = 0
    for item in jsons:
        if(cnt == len(jsons)-1):
            temp_dict = ast.literal_eval(item)
            names+=str(temp_dict["name"])
        else:
            temp_dict = ast.literal_eval(item+"}")
            names+=str(str(temp_dict["name"])+" ")
        cnt += 1
    return names

#extract data from row of complete_array
def provide_data(array):
    movie_data = []
    movie_data.append(int(array[0]))
    movie_data.append(int(array[1]))
    movie_data.append(float(array[2]))
    movie_data.append(array[3])  

    movie_data.append(populate_names(array[4]))
    movie_data.append(populate_names(array[5]))
    movie_data.append(populate_names(array[6]))
    movie_data.append(populate_names(array[7]))

    movie_data.append(str(array[8]))
    movie_data.append(str(array[9]))
    return movie_data
    


#convert the dataframe into an array and build a dictionary
user_to_data = dict()
complete_array = complete.to_numpy()


#get all unique user ids
list_of_user_ids = []
last_id  = -1
for item in complete_array:
    if(item[0]!= last_id):
        list_of_user_ids.append(item[0])
        last_id = item[0]


index  = 0
#this has been tested with 5000, 10000, 20000, 100000
nof_users = 20000
#populate user_to_data from complete_array
for i in range(0, nof_users):
    user_to_data[list_of_user_ids[i]] = []
    for j in range(index, len(complete_array)):
        if complete_array[j][0] == list_of_user_ids[i]:
            #condition is checked for complete_array[j]
            if(condition(complete_array[j])):
                #this is where data is tranformed
                transformed = provide_data(complete_array[j])
                user_to_data[list_of_user_ids[i]].append(transformed)         
        else:
            #ignore if the number of ratings for a user is too small
            #this can be a higher threshold
            if (len(user_to_data[list_of_user_ids[i]])<10):
                del user_to_data[list_of_user_ids[i]]
            index = j+1
            break
        

In [3]:
#save in a file so that cells below can run without running this cell and above
import csv

with open("constructedData/constructedData.csv", "w", encoding="utf-8", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['userId','id','rating',"title", "genres","production_companies","keywords", "cast", "tagline", "overview"])
    for key in user_to_data.keys():
        writer.writerows(user_to_data[key])

In [11]:
#this is a starting point if the data is already saved to the constructedData.csv file
import csv

data_list =[]

with open("constructedData/constructedData.csv", 'r', encoding="utf-8") as read_obj:
    csv_reader = csv.reader(read_obj)
    data_list = list(csv_reader)

data_list = data_list[1:]


In [12]:
#movie id to list of its ratings by all users
movie_to_ratings = dict()

#user id to the ratings of movies by the user
user_to_ratings = dict()

#The list created by the constructed data csv is in order by user id
#This code populates movie_to_ratings and user_to_ratings
user_id = -1
for row in data_list:
    if (row[0]!=user_id):
        user_id = row[0]
        user_to_ratings[row[0]] = [row]
    else:
        user_to_ratings[row[0]].append(row)

    if(row[1] in movie_to_ratings.keys()):
        movie_to_ratings[row[1]].append(row[2])
    else:
        movie_to_ratings[row[1]] = [row[2]]


In [13]:
from gensim.parsing.preprocessing import remove_stopwords

#dictionary of user id to a list of strings of combined textual features for each movie rated by the user
#the strings do not include ratings or movie id
user_to_corpus_list = dict()

for key in user_to_ratings.keys():
    movie_strings = []
    for movie_data in user_to_ratings[key]:
        movie_string = ""
        #avoid the first three data points (user id, movieid, and rating)
        #use only the text data
        for index in range (3,len(movie_data)):
            if(index!= len(movie_data)-1):
                movie_string+= movie_data[index]+" "
            else:
                movie_string+= movie_data[index]
        #note: cleaned is a string and no lemmatizaion occurs here...
        cleaned = remove_stopwords(movie_string)
        movie_strings.append(cleaned)
    user_to_corpus_list[key] = movie_strings

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import random
import copy
from scipy.stats import kurtosis
from scipy.stats import skew
import statistics
import math


#seed for consistent results across runtime
# seed_int = 1
# random.seed(seed_int)

#get average rating for a single movie amoung all users who rated it
#note: this should omit the rating for the user in question
def get_avg_movie_rating(movie_id):
    ret =0 
    cnt = 0
    for item in movie_to_ratings[movie_id]:
        ret+= float(item)
        cnt+=1
    return float(ret/cnt)


#get all the movie ratings from a single user
def get_user_ratings(user_id):
    ret = []
    for item in user_to_ratings[user_id]:
        ret.append(float(item[2]))
    return ret


#user to model independent var X
user_to_features = dict()
#user to model dependent var y
user_to_rand_rating = dict()


#start of k-means generation (or up one cell???):
#need to start at 1 cluster and work way up while noting the difference in inertia using the elbow method
#or the max number of clusters can be arbitrary
#each model with its numebr of clutsers needs to be saved so that the random test movie...
#can be grouped into a group that the user has a documented rating in (this means a user must have at least two ratings)

#problem: to guess a new users rating requires that none of the users ratings have been used to train the model
#but the cluster algorithm by default does take the entire dataset which is eventually grouped into test and train.
#in realiy the data needs to be split into test and train before clustering

#inputs feature 1: 
#predict the cluster of the movie and find the user in questions average rating for that cluster

#inputs feature 2: 
#the movies rating for all users 

#other features: 
#distribution statistics for the users cluster that the random movie is a part of
#distribution statistics for the movie itself amoung all other users



#populate user_to_features and user_to_rand_rating
for key in user_to_corpus_list.keys():
    #note: this is the advantage of keeping  user_to_corpus_list[key] as a list of strings...
    #ability to use CountVectorizor

    count_matrix = CountVectorizer().fit_transform(user_to_corpus_list[key]).toarray().tolist()
    rand_index = random.randint(0, len(count_matrix)-1)
    rand_test_item = count_matrix[rand_index]
    del count_matrix[rand_index]

    #find similarity by the count of each word between the random selected movie and the other movies rated by the user
    cosine_sim = cosine_similarity(X = count_matrix ,Y = [rand_test_item])

    #technically this should not include the current users rating for the randomly selected movie...
    #that is what we want to find out...
    ratings = copy.deepcopy(get_user_ratings(key))
    similairities = np.reshape(cosine_sim,  (len(cosine_sim)))

    random_rating = ratings[rand_index]
    user_to_rand_rating[key] = random_rating
    del ratings[rand_index]

    #technically this should not include the current users rating for the randomly selected movie...
    #that is what we want to find out...
    movie_rating_avg = get_avg_movie_rating(user_to_ratings[key][rand_index][1])

    user_rating_avg =  float(np.sum(ratings)/(len(ratings)))
    user_rating_skew = skew(ratings)
    if(math.isnan(user_rating_skew)):
        user_rating_skew = 0
    user_rating_kurt = kurtosis(ratings)
    if(math.isnan(user_rating_kurt)):
        user_rating_kurt = 0
    user_rating_var = statistics.variance(ratings)


    sim_average = float(np.sum(similairities)/(len(similairities)))
    sim_skew = skew(similairities) 
    if(math.isnan(sim_skew)):
        sim_skew = 0
    sim_kurt = kurtosis(similairities)
    if(math.isnan(sim_kurt)):
        sim_kurt = 0
    sim_var = statistics.variance(similairities)


    # there are many curve defining features used here that may be impotent and can be cut or kept in the next cell...
    # there may stil be other distribution measures that improve the model...
    # might try inputing some function of sim and rating rather than incluing them on their own


    #guthub note: this is the start to a complete model overall

    #(LOOK)
    #instead of a features having a (rating - user_rating_avg)*sim term which is a week indicator of rating (not sure why)
    #before this loop the k-mean algorithm is trained on the entire data set...
    #and the elbow method can be used to find the right number of clusters..
    #then, inside this loop the rand movie which rating is to be predicted is grouped in a cluster according to k means
    #ideally the user has an average rating for movies in the assigned cluster
    #if the user does not have a rating for the specified cluster than see if the
    #user has a rating for the k-means cluster of k-1 with the clusetr of the random movie reaccessed
    #repeat this process until the random movie is part of a cluster or k = 1 meaning it has to be part of that cluster
    #this process still assumes that the corpus is a good indicator of rating



    #what is the difference between using this and k-means//
    #model = AgglomerativeClustering(n_clusters=i, affinity="euclidean", linkage="single")    


 

    for sim, rating in zip(similairities, ratings):
        if key not in user_to_features:
            # user_to_features[key] = [[(rating - user_rating_avg)*sim, movie_rating_avg, user_rating_skew, user_rating_kurt, user_rating_var, sim_average, sim_skew, sim_kurt, sim_var]]
            user_to_features[key] = [[user_rating_avg, movie_rating_avg, user_rating_skew, user_rating_kurt, user_rating_var, sim_average, sim_skew, sim_kurt, sim_var]]
        else:
            # user_to_features[key].append([(rating - user_rating_avg)*sim, movie_rating_avg, user_rating_skew, user_rating_kurt, user_rating_var, sim_average, sim_skew, sim_kurt, sim_var])
            # user_to_features[key].append([rating, sim, user_rating_avg, movie_rating_avg, user_rating_skew, user_rating_kurt, user_rating_var, sim_average, sim_skew, sim_kurt, sim_var])
            user_to_features[key].append([user_rating_avg, movie_rating_avg, user_rating_skew, user_rating_kurt, user_rating_var, sim_average, sim_skew, sim_kurt, sim_var])

In [17]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score
import random
import time
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance


#test the time taken to train and predict
start = time.time()

#this is where you may select certain features to be used to build the model
new_user_to_features = dict()

for key in user_to_features.keys():
    new_list = []
    for item in user_to_features[key]:
        # can try reducing the features like below:
        # old inputs...
        # item[0:4]+ item[6:7]+ item[10:11]
        # item[0:4]
        # item[2:8] + item[8:]
        #new inputs:
        
        new_list.append(item)
    new_user_to_features[key] = new_list

#seed
# seed_int = 1
# random.seed(seed_int)

#instead of using test train split...
user_to_X_train = dict()
user_to_y_train = dict()
user_to_X_test = dict()
user_to_y_test = dict()

#There is a problem with using the same users in training and testing and this code ensures that it doesn't happen
#The model should beable to be used effectively for new users and not just memorized for existing users
c1 = 0
c2 = 0
for key in new_user_to_features.keys():
    if(random.randint(0,10) == 0):
        user_to_X_test[key] = new_user_to_features[key]
        user_to_y_test[key] = user_to_rand_rating[key]
        c1+=1

    else:
        user_to_X_train[key] = new_user_to_features[key]
        user_to_y_train[key] = user_to_rand_rating[key]
        c2+=1

#used to train model
X_train = [] 
y_train = []

#populate X_train and y_train
for key in user_to_X_train.keys():
    for item in user_to_X_train[key]:
        X_train.append(item)
        y_train.append(user_to_y_train[key])


# scale training features...
scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)

#data transformation
#https://datascience.stackexchange.com/questions/45900/when-to-use-standard-scaler-and-when-normalizer


#train model
#orginal model layers
#layers = (2,2,2)
layers = (2,2,2)
# act = "tanh"
# solve = "adam"
# act = "relu"
# solve = "adam"
# act = "relu"
# solve = "sgd"
# act = "tanh"
# solve = "sgd"
act = "relu"
solve = "adam"

#note: random_state=seed_int removed
regr = MLPRegressor(hidden_layer_sizes=layers,activation =act, solver =solve,  max_iter=10000)
fitted = regr.fit(X_train, y_train)

#this needs to run before the final model is determined so that the best features are used
#the results can also be displayed with a bar shart showing how each feature cotributes to a percentage of the models accuracy

#note: random_state=seed_int removed
result = permutation_importance(fitted, X_train, y_train)

print(result["importances_mean"])

print(regr.n_iter_)

#dictionary of users to test features that have been scaled
new_user_to_X_test = dict()

# used to scale test features then the new scaled features are returned ...
# as the values of the approriate user key in new_user_to_X_test 
X_test = []

#populate X_test, key, and counts that are later used to build new_user_to_X_test, a verison of...
#user_to_X_test with scaled features 
#need to decompose then recompose
keys = []
counts = []
for key in user_to_X_test.keys():
    cnt = 0
    for item in user_to_X_test[key]:
        X_test.append(item)
        cnt+=1
    counts.append(cnt)
    keys.append(key)

#scale test features...
scalar = StandardScaler()
X_test = scalar.fit_transform(X_test)

#populate new_user_to_X_test with scaled test features
cnt = 0
for num, key in zip(counts, keys):
    new_user_to_X_test[key] = []
    for i in range(num):
        new_user_to_X_test[key].append(X_test[cnt])
        cnt+=1


# user id to the average predicted rating for the randomly chosen movie
user_to_avg_rating = dict()

# populate user_to_avg_rating by averaging the predictions from all the feature inputs of the...
# movies a user has watched that are not the randomly chosen movie itself
for key in new_user_to_X_test.keys():
    sum =0
    cnt =0 
    predicted = regr.predict(new_user_to_X_test[key])
    for item in predicted:
        sum+=item
        cnt+=1
    user_to_avg_rating[key] = float(sum/cnt)


#outputs
actuals_list = []
preds_list = []
for key in user_to_avg_rating.keys():
    print("Pred: "+str(user_to_avg_rating[key]) , "Actual: "+str(user_to_y_test[key]))
    actuals_list.append(user_to_y_test[key])
    preds_list.append(user_to_avg_rating[key])
    
print("overall score:", r2_score(actuals_list, preds_list))

#test the time taken to train and predict
end = time.time()
print("Minutes:", float((end - start)/60))


#feature importance scores:
#https://scikit-learn.org/stable/modules/permutation_importance.html
#https://scikit-learn.org/stable/modules/generated/sklearn.inspection.permutation_importance.html#sklearn.inspection.permutation_importance

#introduction:
#https://www.kaggle.com/code/dansbecker/permutation-importance

#types of feature importance:
#https://towardsdatascience.com/6-types-of-feature-importance-any-data-scientist-should-master-1bfd566f21c9


#perhaps there is a way to visualize this of the model outputs below in a systematic way???

# Tests:

# full features:
# with linear regression:
# overall score: 0.2657455495660592

# with mlp...:

# first fours features:
# layers: (2,2,2)
# act = "relu"
# solve = "adam"
# overall score: 0.2667063296881431

#all features: 
# layers: (2,2,2)
# act = "relu"
# solve = "adam"
# overall score: 0.26606490897808244

#all features: 
# layers: (2,2,2,2)
# act = "relu"
# solve = "adam"
# overall score: 0.260461734737799

#all features: 
# layers: (4,4,4)
# act = "relu"
# solve = "adam"
# overall score: 0.22932745175064528

# first fours features and variance:
# layers: (2,2,2)
# act = "relu"
# solve = "adam"
# overall score: 0.2482634902547255

# first fours features and variance:
# layers: (2,2,2,2)
# act = "relu"
# solve = "adam"
# overall score: 0.2616102684471122

# first fours features and variance:
# layers: (3,3,3)
# act = "relu"
# solve = "adam"
# overall score: 0.25487207187202243

#first two featurs:
# layers: (2,2,2)
# act = "relu"
# solve = "adam"
# overall score: -0.00430015574935827

#first two featurs:
# layers: (2,2)
# act = "relu"
# solve = "adam"
# overall score: 0.04468421358737418

#3rd and 4th features:
# layers: (2,2)
# act = "relu"
# solve = "adam"
# overall score: 0.2546480453878024

[0.31077813 0.40534428 0.01181351 0.0096373  0.04163006 0.02121139
 0.25362815 0.17351571 0.02097077]
79
Pred: 2.133233757067547 Actual: 3.0
Pred: 2.3036635415358417 Actual: 3.0
Pred: 4.21654929365804 Actual: 3.5
Pred: 4.675950415353803 Actual: 3.0
Pred: 3.4097745860096516 Actual: 3.5
Pred: 3.57676844342984 Actual: 2.0
Pred: 3.6534256597567203 Actual: 4.0
Pred: 3.4924393768669875 Actual: 4.0
Pred: 3.040758547030578 Actual: 2.0
Pred: 4.277531782455525 Actual: 5.0
Pred: 2.8005161889304944 Actual: 3.5
Pred: 3.1056330987460203 Actual: 4.0
Pred: 3.5079222090070448 Actual: 3.0
Pred: 3.358482877027911 Actual: 5.0
Pred: 4.401036192733158 Actual: 4.0
Pred: 3.629638246502501 Actual: 3.5
Pred: 3.2357149625902966 Actual: 3.0
Pred: 4.415845930901873 Actual: 5.0
Pred: 4.670628232527948 Actual: 3.5
Pred: 3.9594427405137305 Actual: 4.0
Pred: 5.0762740693850175 Actual: 3.0
Pred: 4.270727213138118 Actual: 4.0
Pred: 4.245589532217444 Actual: 4.0
Pred: 4.002398532907798 Actual: 4.0
Pred: 3.806551454636284