In [1]:
import pandas as pd

pd.set_option('display.max_colwidth', None)

moviesFull = pd.read_csv('newdata/movies_metadata.csv',usecols=("genres","id" ,"title","tagline", "overview","production_companies"),
                          dtype={"tagline": "string", "id":"string", 'genres':"string", "title": "string", "tagline": "string","overview":"string", "production_companies" :"string"})
ratings = pd.read_csv('newdata/ratings.csv', usecols = ("userId", "movieId", "rating"), dtype={"userId": "string","movieId": "string","rating": "string"})
ratings = ratings.rename(columns={"movieId": "id"})

keywords = pd.read_csv('newdata/keywords.csv', usecols = ("id", "keywords"), dtype={"id": "string","keywords":"string"})
credits = pd.read_csv("newdata/credits.csv", usecols = ("cast", "id"), dtype={"cast": "string", "id": "string"})

complete =  pd.merge(moviesFull, ratings, on ="id")
complete =  pd.merge(complete,keywords, on ="id")
complete  = pd.merge(complete,credits, on ="id")


complete = complete.sort_values(by = 'userId')

complete  = complete.dropna()

complete  = complete.loc[:,['userId','id','rating',"title", "genres","production_companies","keywords", "cast", "tagline", "overview" ]]


In [2]:
import ast


#used to filter the rows of a data
def condition(array):
    length = len(array[4])
    if(array[4][length-2:] == "[]"):
        return False
    length = len(array[5])
    if(array[5][length-2:] == "[]"):
        return False
    length = len(array[6])
    if(array[6][length-2:] == "[]"):
        return False
    length = len(array[7])
    if(array[7][length-2:] == "[]"):
        return False   
    length = len(array[8])
    if(array[8][length-4:]=="<NA>"):
        return False
    length = len(array[9])
    if(array[9][length-4:]=="<NA>"):
        return False 
    return True


#used to extract names from string of list of json formats
def populateNames(item):
    string  = item[1:-1]
    jsons = string.split("}, ")   
    names = ""
    cnt = 0
    for item in jsons:
        if(cnt == len(jsons)-1):
            tempDict = ast.literal_eval(item)
            names+=str(tempDict["name"])
        else:
            tempDict = ast.literal_eval(item+"}")
            names+=str(str(tempDict["name"])+" ")
        cnt += 1
    return names


def provideData(array):
    movieData = []
    movieData.append(int(array[0]))
    movieData.append(int(array[1]))
    movieData.append(float(array[2]))
    movieData.append(array[3])  

    movieData.append(populateNames(array[4]))
    movieData.append(populateNames(array[5]))
    movieData.append(populateNames(array[6]))
    movieData.append(populateNames(array[7]))

    movieData.append(str(array[8]))
    movieData.append(str(array[9]))
    return movieData
    


#convert the dataframe into an array
#and then build a dictionary
completeDict = dict()
completeArray = complete.to_numpy()
arrayOfUserIds = []


#get all unique user ids
lastId  = -1
for item in completeArray:
    if(item[0]!= lastId):
        arrayOfUserIds.append(item[0])
        lastId = item[0]


index  = 0
nofUsers = 5000
#5000 users are tested and potentially added to the dict
for i in range(0, nofUsers):
    completeDict[arrayOfUserIds[i]] = []
    for j in range(index, len(completeArray)):
        if completeArray[j][0] == arrayOfUserIds[i]:
            #this is where conditions are checked in completeArray[j]
            if(condition(completeArray[j])):
                #this is where data is tranformed
                transformed = provideData(completeArray[j])
                completeDict[arrayOfUserIds[i]].append(transformed)         
        else:
            #ignore if the number of ratings is too small
            if (len(completeDict[arrayOfUserIds[i]])<10):
                del completeDict[arrayOfUserIds[i]]
            index = j+1
            break


In [3]:
#save in a file so that cells below can run without running the above
import csv

with open("constructedData/constructedData.csv", "w", encoding="utf-8", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['userId','id','rating',"title", "genres","production_companies","keywords", "cast", "tagline", "overview"])
    for item in completeDict.keys():
        writer.writerows(completeDict[item])


In [4]:
#this is a starting point if the data is already saved to the csv file
import csv
import pandas as pd

dataList =[]

with open("constructedData/constructedData.csv", 'r', encoding="utf-8") as read_obj:
    csv_reader = csv.reader(read_obj)
    dataList = list(csv_reader)

dataList = dataList[1:]


In [5]:
#movie id to list of ratings
movieDict = dict()

#user id to the rated movies by that user
userDict = dict()

#The list created by the constructed data is in order by user id
#this code makes a dictionary out of the data (user id to a list of text from all movies rated by the user)
#it also makes a dictionary of movies to their ratings
userId = -1
for row in dataList:
    if (row[0]!=userId):
        userId = row[0]
        userDict[row[0]] = [row]
    else:
        userDict[row[0]].append(row)

    if(row[1] in movieDict.keys()):
        movieDict[row[1]].append(row[2])
    else:
        movieDict[row[1]] = [row[2]]


In [6]:
from gensim.parsing.preprocessing import remove_stopwords

#dictionary of user id to a list of string of combined textual features for each movie 
#does not include ratings or movie id

combinedCorpus = dict()

i = 0
for key in userDict.keys():
    movieStrings = []
    for movieData in userDict[key]:
        movieString = ""
        #avoid the first three data points (user id, movieid, and rating)
        for index in range (3,len(movieData)):
            if(index!= len(movieData)-1):
                movieString+= movieData[index]+" "
            else:
                movieString+= movieData[index]
        cleaned = remove_stopwords(movieString)
        movieStrings.append(cleaned)
    combinedCorpus[key] = movieStrings

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import random
import copy


#seed
seed_int = 1
random.seed(seed_int)

#get average rating for a single movie amoung all users who rated it
def getAverageMovieRating(movieId):
    ret =0 
    cnt = 0
    for item in movieDict[movieId]:
        ret+= float(item)
        cnt+=1
    return float(ret/cnt)


#get all the user ratings 
def getUserRatings(userId):
    ret = []
    for item in userDict[userId]:
        ret.append(float(item[2]))
    return ret

user_to_inputs = dict()
user_to_rand_ratings = dict()

#need to normalized data features
#note: ratings and similairty scores for a given user need to be ordered the same:
for key in combinedCorpus.keys():

    count_matrix = CountVectorizer().fit_transform(combinedCorpus[key]).toarray().tolist()
    #note: len(count_matrix)-1 included
    randIndex = random.randint(0, len(count_matrix)-1)
    randTestItem = count_matrix[randIndex]
    del count_matrix[randIndex]

    #find similarity with the count of each word between the random selected movie and the other movies rated by the user
    cosine_sim = cosine_similarity(X = count_matrix ,Y = [randTestItem])

    ratings = copy.deepcopy(getUserRatings(key))

    randomRating = ratings[randIndex]
    user_to_rand_ratings[key] = randomRating
    del ratings[randIndex]

    similairities = np.reshape(cosine_sim,  (len(cosine_sim)))

    #float symbol removed...
    averageRatingForUser =  float(np.sum(ratings)/(len(ratings)))
    #the movie in question is the randomly selected movie
    avergaeRatingForMovie = getAverageMovieRating(userDict[key][randIndex][1])

    #need to normalize model inputs so one feature does not dominate the other
    #https://www.youtube.com/watch?v=Bc2dWI3vnE0&ab_channel=KrishNaik
    #try adding the curve defining features of the users scores besides mean...

    #testing removal of sim and rating...
    #testting removal of averages
    for sim, rating in zip(similairities, ratings):
        if key not in user_to_inputs:
            user_to_inputs[key] = [[sim, rating]]
        else:
            user_to_inputs[key].append([sim, rating])

In [29]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score
import random
import time
from sklearn.preprocessing import StandardScaler

start = time.time()

#seed
seed_int = 1
random.seed(seed_int)

#instead of using test train split...
user_to_X_train = dict()
user_to_y_train = dict()
user_to_X_test = dict()
user_to_y_test = dict()


#LOOK!!!
#Is there a problem with using the same users in training and testing???
#instead users should be in either test or train data not both
c1 = 0
c2 = 0
for key in user_to_inputs.keys():
    if(random.randint(0,3) == 0):
        user_to_X_test[key] = user_to_inputs[key]
        user_to_y_test[key] = user_to_rand_ratings[key]
        c1+=1

    else:
        user_to_X_train[key] = user_to_inputs[key]
        user_to_y_train[key] = user_to_rand_ratings[key]
        c2+=1

X_train = [] 
y_train = []
print("Random test:", c1, c2)

#note: there is no user_to_X_train[key] of length 0
for key in user_to_X_train.keys():
    for item in user_to_X_train[key]:
        X_train.append(item)
        y_train.append(user_to_y_train[key])


#should you standardize outputs (y_test and y_train)?

#tranform training features...
scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)

#tranform test features...
X_test = []
y_test = []

keys = []
counts = []
for key in user_to_X_test.keys():
    cnt = 0
    #note: user_to_X_test[key] has at least one item...
    for item in user_to_X_test[key]:
        X_test.append(item)
        y_test.append(user_to_y_test[key])
        cnt+=1
    counts.append(cnt)
    keys.append(key)

scalar = StandardScaler()
X_test = scalar.fit_transform(X_test)

new_user_to_X_test = dict()
new_user_to_y_test = dict()

cnt = 0
for num, key in zip(counts, keys):
    new_user_to_X_test[key] = []
    for i in range(num):
        new_user_to_X_test[key].append(X_test[cnt])
        cnt+=1
    new_user_to_y_test[key] = user_to_y_test[key]


#need to use featurs scaling for X...
#also needs to apply a special function to feature scale for test data...
#user_to_X_test is not in list format it is a dictionary
#what should happend is creating a list out of the lists in the dictionary values
#Then returning the standardized values to a new dictionary with standardized inputs bu the same user mappin
#https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
#https://stackoverflow.com/questions/55296675/is-is-necessary-to-normalize-data-before-using-mlpregressor
#https://www.youtube.com/watch?v=uet8ZQpyJV8&ab_channel=NeuralNine
#https://stats.stackexchange.com/questions/278566/if-you-standardize-x-must-you-always-standardize-y

#LOOK!!!
#Is there a problem with using the same users in training and testing???


#try higher demensionality
#try more layers
#try adding the curve defining features of the users scores besides mean
#(10,10,10,10)
#(10,10,10)
#(10,10)
#(5,5,5)
layers = (5, 5,5)
act = "tanh"
solve = "adam"
regr = MLPRegressor(hidden_layer_sizes=layers,activation =act, solver =solve,  max_iter=10000, random_state =seed_int)
regr.fit(X_train, y_train)
print(regr.n_iter_)


# note: there could also be a weigthed average applied
# Assume there is no guaranteed order for the list of keys returned by the keys() function.

user_to_avg_rating = dict()

for key in new_user_to_X_test.keys():
    sum =0
    cnt =0 
    predicted = regr.predict(new_user_to_X_test[key])
    for item in predicted:
        sum+=item
        cnt+=1
    user_to_avg_rating[key] = float(sum/cnt)


actuals_list = []
preds_list = []
for key in user_to_avg_rating.keys():
    print("Pred: "+str(user_to_avg_rating[key]) , "Actual: "+str(new_user_to_y_test[key]))
    actuals_list.append(new_user_to_y_test[key])
    preds_list.append(user_to_avg_rating[key])


print("overall score:", r2_score(actuals_list, preds_list))


end = time.time()

print("Minutes:", float((end - start)/60))

#layers: (10,10)
#avg only: 0.1344422405859914
#all features: 0.1263325462370427

#layers: (10, 10,10)
#avg only: 0.049823527336465334
#all features: -0.015054211264476702

#layers: (5,5,5)
#avg only: 0.14120225901404682
#all features: 0.16311449260270638
#no averages: 0.02526772169834124



Random test: 642 1945
23
Pred: 3.623149617527699 Actual: 3.0
Pred: 3.574126872348588 Actual: 4.0
Pred: 3.4572251497973134 Actual: 3.0
Pred: 3.5534356265062046 Actual: 4.0
Pred: 3.543368859104209 Actual: 3.5
Pred: 3.4880972108910546 Actual: 3.0
Pred: 3.2062417816366917 Actual: 3.0
Pred: 3.503919318877813 Actual: 3.0
Pred: 3.509058668191241 Actual: 4.0
Pred: 3.4447375682908286 Actual: 1.0
Pred: 3.4654617862683317 Actual: 3.0
Pred: 3.5413110422504706 Actual: 3.0
Pred: 3.33010867169398 Actual: 3.0
Pred: 3.4461318638797693 Actual: 4.5
Pred: 3.512116985403862 Actual: 1.5
Pred: 3.470952997692286 Actual: 4.0
Pred: 3.392301481495895 Actual: 0.5
Pred: 3.5916476347302217 Actual: 5.0
Pred: 3.5738985627189503 Actual: 5.0
Pred: 3.4418771629872196 Actual: 3.0
Pred: 3.6848257365415384 Actual: 5.0
Pred: 3.590617469301948 Actual: 5.0
Pred: 3.520222187364846 Actual: 4.0
Pred: 3.5905987495323335 Actual: 4.0
Pred: 3.5221707530479827 Actual: 4.0
Pred: 3.503023192506815 Actual: 5.0
Pred: 3.558724881108671 Ac