In [45]:
import pandas as pd

pd.set_option('display.max_colwidth', None)

moviesFull = pd.read_csv('newdata/movies_metadata.csv',usecols=("genres","id" ,"title","tagline", "overview","production_companies"),
                          dtype={"tagline": "string", "id":"string", 'genres':"string", "title": "string", "tagline": "string","overview":"string", "production_companies" :"string"})
ratings = pd.read_csv('newdata/ratings.csv', usecols = ("userId", "movieId", "rating"), dtype={"userId": "string","movieId": "string","rating": "string"})
ratings = ratings.rename(columns={"movieId": "id"})

keywords = pd.read_csv('newdata/keywords.csv', usecols = ("id", "keywords"), dtype={"id": "string","keywords":"string"})
credits = pd.read_csv("newdata/credits.csv", usecols = ("cast", "id"), dtype={"cast": "string", "id": "string"})

complete =  pd.merge(moviesFull, ratings, on ="id")
complete =  pd.merge(complete,keywords, on ="id")
complete  = pd.merge(complete,credits, on ="id")


complete = complete.sort_values(by = 'userId')

complete  = complete.dropna()

complete  = complete.loc[:,['userId','id','rating',"title", "genres","production_companies","keywords", "cast", "tagline", "overview" ]]


In [46]:
import ast


#used to filter the rows of a data
def condition(array):
    length = len(array[4])
    if(array[4][length-2:] == "[]"):
        return False
    length = len(array[5])
    if(array[5][length-2:] == "[]"):
        return False
    length = len(array[6])
    if(array[6][length-2:] == "[]"):
        return False
    length = len(array[7])
    if(array[7][length-2:] == "[]"):
        return False   
    length = len(array[8])
    if(array[8][length-4:]=="<NA>"):
        return False
    length = len(array[9])
    if(array[9][length-4:]=="<NA>"):
        return False 
    return True


#used to extract names from string of list of json formats
def populateNames(item):
    string  = item[1:-1]
    jsons = string.split("}, ")   
    names = ""
    cnt = 0
    for item in jsons:
        if(cnt == len(jsons)-1):
            tempDict = ast.literal_eval(item)
            names+=str(tempDict["name"])
        else:
            tempDict = ast.literal_eval(item+"}")
            names+=str(str(tempDict["name"])+" ")
        cnt += 1
    return names


def provideData(array):
    movieData = []
    movieData.append(int(array[0]))
    movieData.append(int(array[1]))
    movieData.append(float(array[2]))
    movieData.append(array[3])  

    movieData.append(populateNames(array[4]))
    movieData.append(populateNames(array[5]))
    movieData.append(populateNames(array[6]))
    movieData.append(populateNames(array[7]))

    movieData.append(str(array[8]))
    movieData.append(str(array[9]))
    return movieData
    


#convert the dataframe into an array
#and then build a dictionary
completeDict = dict()
completeArray = complete.to_numpy()
arrayOfUserIds = []


#get all unique user ids
lastId  = -1
for item in completeArray:
    if(item[0]!= lastId):
        arrayOfUserIds.append(item[0])
        lastId = item[0]


index  = 0
#5000
#10000
#20000
#100000
nofUsers = 100000
#5000 users are tested and potentially added to the dict
for i in range(0, nofUsers):
    completeDict[arrayOfUserIds[i]] = []
    for j in range(index, len(completeArray)):
        if completeArray[j][0] == arrayOfUserIds[i]:
            #this is where conditions are checked in completeArray[j]
            if(condition(completeArray[j])):
                #this is where data is tranformed
                transformed = provideData(completeArray[j])
                completeDict[arrayOfUserIds[i]].append(transformed)         
        else:
            #ignore if the number of ratings is too small
            if (len(completeDict[arrayOfUserIds[i]])<10):
                del completeDict[arrayOfUserIds[i]]
            index = j+1
            break


In [47]:
#save in a file so that cells below can run without running the above
import csv

with open("constructedData/constructedData.csv", "w", encoding="utf-8", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['userId','id','rating',"title", "genres","production_companies","keywords", "cast", "tagline", "overview"])
    for item in completeDict.keys():
        writer.writerows(completeDict[item])


In [48]:
#this is a starting point if the data is already saved to the csv file
import csv
import pandas as pd

dataList =[]

with open("constructedData/constructedData.csv", 'r', encoding="utf-8") as read_obj:
    csv_reader = csv.reader(read_obj)
    dataList = list(csv_reader)

dataList = dataList[1:]


In [49]:
#movie id to list of ratings
movieDict = dict()

#user id to the rated movies by that user
userDict = dict()

#The list created by the constructed data is in order by user id
#this code makes a dictionary out of the data (user id to a list of text from all movies rated by the user)
#it also makes a dictionary of movies to their ratings
userId = -1
for row in dataList:
    if (row[0]!=userId):
        userId = row[0]
        userDict[row[0]] = [row]
    else:
        userDict[row[0]].append(row)

    if(row[1] in movieDict.keys()):
        movieDict[row[1]].append(row[2])
    else:
        movieDict[row[1]] = [row[2]]


In [50]:
from gensim.parsing.preprocessing import remove_stopwords

#dictionary of user id to a list of string of combined textual features for each movie 
#does not include ratings or movie id

combinedCorpus = dict()

i = 0
for key in userDict.keys():
    movieStrings = []
    for movieData in userDict[key]:
        movieString = ""
        #avoid the first three data points (user id, movieid, and rating)
        for index in range (3,len(movieData)):
            if(index!= len(movieData)-1):
                movieString+= movieData[index]+" "
            else:
                movieString+= movieData[index]
        cleaned = remove_stopwords(movieString)
        movieStrings.append(cleaned)
    combinedCorpus[key] = movieStrings

In [53]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import random
import copy


#seed
seed_int = 1
random.seed(seed_int)

#get average rating for a single movie amoung all users who rated it
def getAverageMovieRating(movieId):
    ret =0 
    cnt = 0
    for item in movieDict[movieId]:
        ret+= float(item)
        cnt+=1
    return float(ret/cnt)


#get all the user ratings 
def getUserRatings(userId):
    ret = []
    for item in userDict[userId]:
        ret.append(float(item[2]))
    return ret

user_to_inputs = dict()
user_to_rand_ratings = dict()

#need to normalized data features
#note: ratings and similairty scores for a given user need to be ordered the same:
for key in combinedCorpus.keys():

    count_matrix = CountVectorizer().fit_transform(combinedCorpus[key]).toarray().tolist()
    #note: len(count_matrix)-1 included
    randIndex = random.randint(0, len(count_matrix)-1)
    randTestItem = count_matrix[randIndex]
    del count_matrix[randIndex]

    #find similarity with the count of each word between the random selected movie and the other movies rated by the user
    cosine_sim = cosine_similarity(X = count_matrix ,Y = [randTestItem])

    ratings = copy.deepcopy(getUserRatings(key))

    randomRating = ratings[randIndex]
    user_to_rand_ratings[key] = randomRating
    del ratings[randIndex]

    similairities = np.reshape(cosine_sim,  (len(cosine_sim)))

    #float symbol removed...
    averageRatingForUser =  float(np.sum(ratings)/(len(ratings)))
    #the movie in question is the randomly selected movie
    avergaeRatingForMovie = getAverageMovieRating(userDict[key][randIndex][1])

    #need to normalize model inputs so one feature does not dominate the other
    #https://www.youtube.com/watch?v=Bc2dWI3vnE0&ab_channel=KrishNaik
    #try adding the curve defining features of the users scores besides mean...

    #testing removal of sim and rating...
    #testting removal of averages
    #sim, rating, averageRatingForUser, avergaeRatingForMovie
    for sim, rating in zip(similairities, ratings):
        if key not in user_to_inputs:
            user_to_inputs[key] = [[sim, rating, averageRatingForUser, avergaeRatingForMovie]]
        else:
            user_to_inputs[key].append([sim, rating, averageRatingForUser, avergaeRatingForMovie])

In [59]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score
import random
import time
from sklearn.preprocessing import StandardScaler

start = time.time()

#seed
seed_int = 1
random.seed(seed_int)

#instead of using test train split...
user_to_X_train = dict()
user_to_y_train = dict()
user_to_X_test = dict()
user_to_y_test = dict()


#LOOK!!!
#Is there a problem with using the same users in training and testing???
#instead users should be in either test or train data not both
c1 = 0
c2 = 0
for key in user_to_inputs.keys():
    if(random.randint(0,10) == 0):
        user_to_X_test[key] = user_to_inputs[key]
        user_to_y_test[key] = user_to_rand_ratings[key]
        c1+=1

    else:
        user_to_X_train[key] = user_to_inputs[key]
        user_to_y_train[key] = user_to_rand_ratings[key]
        c2+=1

X_train = [] 
y_train = []
print("Random test:", c1, c2)

#note: there is no user_to_X_train[key] of length 0
for key in user_to_X_train.keys():
    for item in user_to_X_train[key]:
        X_train.append(item)
        y_train.append(user_to_y_train[key])


#should you standardize outputs (y_test and y_train)?

#tranform training features...
scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)

#tranform test features...
X_test = []
y_test = []

keys = []
counts = []
for key in user_to_X_test.keys():
    cnt = 0
    #note: user_to_X_test[key] has at least one item...
    for item in user_to_X_test[key]:
        X_test.append(item)
        y_test.append(user_to_y_test[key])
        cnt+=1
    counts.append(cnt)
    keys.append(key)

scalar = StandardScaler()
X_test = scalar.fit_transform(X_test)

new_user_to_X_test = dict()
new_user_to_y_test = dict()

cnt = 0
for num, key in zip(counts, keys):
    new_user_to_X_test[key] = []
    for i in range(num):
        new_user_to_X_test[key].append(X_test[cnt])
        cnt+=1
    new_user_to_y_test[key] = user_to_y_test[key]


#need to use featurs scaling for X...
#also needs to apply a special function to feature scale for test data...
#user_to_X_test is not in list format it is a dictionary
#what should happend is creating a list out of the lists in the dictionary values
#Then returning the standardized values to a new dictionary with standardized inputs bu the same user mappin
#https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
#https://stackoverflow.com/questions/55296675/is-is-necessary-to-normalize-data-before-using-mlpregressor
#https://www.youtube.com/watch?v=uet8ZQpyJV8&ab_channel=NeuralNine
#https://stats.stackexchange.com/questions/278566/if-you-standardize-x-must-you-always-standardize-y

#LOOK!!!
#Is there a problem with using the same users in training and testing???


#try higher demensionality
#try more layers
#try adding the curve defining features of the users scores besides mean

layers = (10,10,10)
# act = "tanh"
# solve = "adam"
# act = "relu"
# solve = "adam"
# act = "relu"
# solve = "sgd"
# act = "tanh"
# solve = "sgd"
act = "tanh"
solve = "sgd"
regr = MLPRegressor(hidden_layer_sizes=layers,activation =act, solver =solve,  max_iter=10000, random_state =seed_int)
regr.fit(X_train, y_train)
print(regr.n_iter_)


# note: there could also be a weigthed average applied
# Assume there is no guaranteed order for the list of keys returned by the keys() function.

user_to_avg_rating = dict()

for key in new_user_to_X_test.keys():
    sum =0
    cnt =0 
    predicted = regr.predict(new_user_to_X_test[key])
    for item in predicted:
        sum+=item
        cnt+=1
    user_to_avg_rating[key] = float(sum/cnt)


actuals_list = []
preds_list = []
for key in user_to_avg_rating.keys():
    print("Pred: "+str(user_to_avg_rating[key]) , "Actual: "+str(new_user_to_y_test[key]))
    actuals_list.append(new_user_to_y_test[key])
    preds_list.append(user_to_avg_rating[key])


print("overall score:", r2_score(actuals_list, preds_list))


end = time.time()

print("Minutes:", float((end - start)/60))



#number of users: 5000

# act = "tanh"
# solve = "adam"

#layers: (2,2,2)
#avg only: 0.2009237326745862
#all features: 0.21083819922068037
#no averages: 

# act = "relu"
# solve = "adam"

#layers: (2,2,2)
#avg only: 
#all features: 0.20687562265172776
#no averages: 


# act = "relu"
# solve = "sgd"

#layers: (2,2,2)
#avg only: 
#all features: 0.21522532086530377
#no averages: 

# act = "tanh"
# solve = "sgd"

#layers: (2,2,2)
#avg only: 
#all features: 0.21604259995039543
#no averages: 


#number of users: 10000
# layers = (2,2,2)
# act = "tanh"
# solve = "sgd"
# all features: 0.25679408229755896
# avg only: 0.23732796424330116


#number of users: 20000
# layers = (2,2,2)
# act = "tanh"
# solve = "sgd"
# all features: 0.26741974236483346
# avg only: 0.2588574442908086

#number of users: 100000
# layers = (2,2,2)
# act = "tanh"
# solve = "sgd"
# all features: 0.2669699736303994
# avg only: 0.26639043683714336

#number of users: 100000
# layers = (3,3,3)
# act = "tanh"
# solve = "sgd"
# all features: 0.2677761084067273
# avg only: 

#number of users: 100000
# layers = (2,2,2,2)
# act = "tanh"
# solve = "sgd"
# all features: 0.26383177542831415
# avg only: 

#number of users: 100000
# layers = (3,3,3,3)
# act = "tanh"
# solve = "sgd"
# all features: 0.2688553209671489
# avg only: 

#number of users: 100000
# layers = (4,4,4,4)
# act = "tanh"
# solve = "sgd"
# all features: 0.2685755627668974
# avg only: 

#number of users: 100000
# layers = (10,10,10)
# act = "tanh"
# solve = "sgd"
# all features: 0.2668202528802991s
# avg only: 




Random test: 4808 47253
15
Pred: 3.3597419384042326 Actual: 3.0
Pred: 2.9875760058880823 Actual: 1.0
Pred: 3.9949207095556742 Actual: 3.0
Pred: 4.253914901394937 Actual: 4.0
Pred: 2.938999968242719 Actual: 3.0
Pred: 4.383396576054067 Actual: 4.0
Pred: 3.620599935157495 Actual: 1.5
Pred: 4.512290120093854 Actual: 4.5
Pred: 2.845760986912742 Actual: 4.0
Pred: 3.4123538766848482 Actual: 4.0
Pred: 3.9640261857278736 Actual: 3.0
Pred: 2.852541308331111 Actual: 1.0
Pred: 4.186809352916528 Actual: 5.0
Pred: 3.777928771736863 Actual: 3.0
Pred: 3.4481100331625605 Actual: 3.0
Pred: 3.23844705183212 Actual: 3.0
Pred: 3.6083630180164343 Actual: 4.0
Pred: 3.9977113692432953 Actual: 5.0
Pred: 3.796374774639727 Actual: 4.0
Pred: 3.5407481066258977 Actual: 3.0
Pred: 2.9193304952054646 Actual: 3.0
Pred: 3.7993639310322584 Actual: 3.0
Pred: 3.988908285108815 Actual: 4.5
Pred: 2.4815017119337117 Actual: 2.5
Pred: 4.358259774214897 Actual: 4.5
Pred: 4.235338910840102 Actual: 1.0
Pred: 4.185107240946285 Ac