In [1]:
import pandas as pd

pd.set_option('display.max_colwidth', None)

moviesFull = pd.read_csv('newdata/movies_metadata.csv',usecols=("genres","id" ,"title","tagline", "overview","production_companies"),
                          dtype={"tagline": "string", "id":"string", 'genres':"string", "title": "string", "tagline": "string","overview":"string", "production_companies" :"string"})
ratings = pd.read_csv('newdata/ratings.csv', usecols = ("userId", "movieId", "rating"), dtype={"userId": "string","movieId": "string","rating": "string"})
ratings = ratings.rename(columns={"movieId": "id"})

keywords = pd.read_csv('newdata/keywords.csv', usecols = ("id", "keywords"), dtype={"id": "string","keywords":"string"})
credits = pd.read_csv("newdata/credits.csv", usecols = ("cast", "id"), dtype={"cast": "string", "id": "string"})

complete =  pd.merge(moviesFull, ratings, on ="id")
complete =  pd.merge(complete,keywords, on ="id")
complete  = pd.merge(complete,credits, on ="id")


complete = complete.sort_values(by = 'userId')

complete  = complete.dropna()

complete  = complete.loc[:,['userId','id','rating',"title", "genres","production_companies","keywords", "cast", "tagline", "overview" ]]


In [2]:
import ast


#used to filter the rows of a data
def condition(array):
    length = len(array[4])
    if(array[4][length-2:] == "[]"):
        return False
    length = len(array[5])
    if(array[5][length-2:] == "[]"):
        return False
    length = len(array[6])
    if(array[6][length-2:] == "[]"):
        return False
    length = len(array[7])
    if(array[7][length-2:] == "[]"):
        return False   
    length = len(array[8])
    if(array[8][length-4:]=="<NA>"):
        return False
    length = len(array[9])
    if(array[9][length-4:]=="<NA>"):
        return False 
    return True


#used to extract names from string of list of json formats
def populateNames(item):
    string  = item[1:-1]
    jsons = string.split("}, ")   
    names = ""
    cnt = 0
    for item in jsons:
        if(cnt == len(jsons)-1):
            tempDict = ast.literal_eval(item)
            names+=str(tempDict["name"])
        else:
            tempDict = ast.literal_eval(item+"}")
            names+=str(str(tempDict["name"])+" ")
        cnt += 1
    return names


def provideData(array):
    movieData = []
    movieData.append(int(array[0]))
    movieData.append(int(array[1]))
    movieData.append(float(array[2]))
    movieData.append(array[3])  

    movieData.append(populateNames(array[4]))
    movieData.append(populateNames(array[5]))
    movieData.append(populateNames(array[6]))
    movieData.append(populateNames(array[7]))

    movieData.append(str(array[8]))
    movieData.append(str(array[9]))
    return movieData
    


#convert the dataframe into an array
#and then build a dictionary
completeDict = dict()
completeArray = complete.to_numpy()
arrayOfUserIds = []


#get all unique user ids
lastId  = -1
for item in completeArray:
    if(item[0]!= lastId):
        arrayOfUserIds.append(item[0])
        lastId = item[0]


index  = 0
nofUsers = 5000
#5000 users are tested and potentially added to the dict
for i in range(0, nofUsers):
    completeDict[arrayOfUserIds[i]] = []
    for j in range(index, len(completeArray)):
        if completeArray[j][0] == arrayOfUserIds[i]:
            #this is where conditions are checked in completeArray[j]
            if(condition(completeArray[j])):
                #this is where data is tranformed
                transformed = provideData(completeArray[j])
                completeDict[arrayOfUserIds[i]].append(transformed)         
        else:
            #ignore if the number of ratings is too small
            if (len(completeDict[arrayOfUserIds[i]])<10):
                del completeDict[arrayOfUserIds[i]]
            index = j+1
            break


In [3]:
#save in a file so that cells below can run without running the above
import csv

with open("constructedData/constructedData.csv", "w", encoding="utf-8", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['userId','id','rating',"title", "genres","production_companies","keywords", "cast", "tagline", "overview"])
    for item in completeDict.keys():
        writer.writerows(completeDict[item])


In [19]:
#this is a starting point if the data is already saved to the csv file
import csv
import pandas as pd

dataList =[]

with open("constructedData/constructedData.csv", 'r', encoding="utf-8") as read_obj:
    csv_reader = csv.reader(read_obj)
    dataList = list(csv_reader)

dataList = dataList[1:]


In [20]:
#movie id to list of ratings
movieDict = dict()

#user id to the rated movies by that user
userDict = dict()

#The list created by the constructed data is in order by user id
#this code makes a dictionary out of the data (user id to a list of text from all movies rated by the user)
#it also makes a dictionary of movies to their ratings
userId = -1
for row in dataList:
    if (row[0]!=userId):
        userId = row[0]
        userDict[row[0]] = [row]
    else:
        userDict[row[0]].append(row)

    if(row[1] in movieDict.keys()):
        movieDict[row[1]].append(row[2])
    else:
        movieDict[row[1]] = [row[2]]


In [21]:
from gensim.parsing.preprocessing import remove_stopwords

#dictionary of user id to a list of string of combined textual features for each movie 
#does not include ratings or movie id

combinedCorpus = dict()

i = 0
for key in userDict.keys():
    movieStrings = []
    for movieData in userDict[key]:
        movieString = ""
        #avoid the first three data points (user id, movieid, and rating)
        for index in range (3,len(movieData)):
            if(index!= len(movieData)-1):
                movieString+= movieData[index]+" "
            else:
                movieString+= movieData[index]
        cleaned = remove_stopwords(movieString)
        movieStrings.append(cleaned)
    combinedCorpus[key] = movieStrings

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random
import copy
import numpy as np

#seed
random.seed(1)

#get average rating for a single movie amoung all users who rated it
def getAverageMovieRating(movieId):
    ret =0 
    cnt = 0
    for item in movieDict[movieId]:
        ret+= float(item)
        cnt+=1
    return (ret/cnt)


#get all the user ratings 
def getUserRatings(userId):
    ret = []
    for item in userDict[userId]:
        ret.append(float(item[2]))
    return ret

userToInputs = dict()
userToRandRating = dict()

once = False
#note: ratings and similairty scores for a given user need to be ordered the same:
for key in combinedCorpus.keys():

    count_matrix = CountVectorizer().fit_transform(combinedCorpus[key]).toarray().tolist()
    #note: len(count_matrix)-1 included
    randIndex = random.randint(0, len(count_matrix)-1)
    randTestItem = count_matrix[randIndex]
    del count_matrix[randIndex]

    #find similarity with the count of each word between the random selected movie and the other movies rated by the user
    cosine_sim = cosine_similarity(X = count_matrix ,Y = [randTestItem])

    ratings = copy.deepcopy(getUserRatings(key))
    randomRating = ratings[randIndex]
    userToRandRating[key] = randomRating
    del ratings[randIndex]

    #testing
    print(ratings)
    print(type(ratings))

    similairities = np.reshape(cosine_sim,  (len(cosine_sim)))

    averageRatingForUser =  sum(ratings)/(len(ratings))
    #the movie in question is the randomly selected movie
    avergaeRatingForMovie = getAverageMovieRating(userDict[key][randIndex][1])

    if(not once):
        print(averageRatingForUser)
        print(avergaeRatingForMovie)
        once = True

    #testing removal of sim and rating...
    for sim, rating in zip(similairities, ratings):
        if key not in userToInputs:
            userToInputs[key] = [[averageRatingForUser,avergaeRatingForMovie]]
        else:
            userToInputs[key].append([averageRatingForUser,avergaeRatingForMovie])

[5.0, 5.0, 3.0, 3.0, 4.0, 4.0, 5.0, 5.0, 4.0, 5.0, 5.0, 3.0, 4.0, 5.0, 4.0, 5.0, 5.0, 5.0]
<class 'list'>


TypeError: 'numpy.float64' object is not callable

In [8]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score
import random

#seed
random.seed(1)

#instead of using test train split...
user_to_X_train = dict()
user_to_y_train = dict()
user_to_X_test = dict()
user_to_y_test = dict()


for key in userToInputs.keys():
    if(int(len(userToInputs[key])*.25) != 0):
        random.shuffle(userToInputs[key])
        user_to_X_train[key] = userToInputs[key][int(len(userToInputs[key])*.25):]
        user_to_X_test[key] = userToInputs[key][:int(len(userToInputs[key])*.25)]
        user_to_y_train[key] = userToRandRating[key]
        user_to_y_test[key] = userToRandRating[key]
    else:
        random.shuffle(userToInputs[key])
        user_to_X_train[key] = userToInputs[key][1:]
        user_to_X_test[key] = userToInputs[key][:1]
        user_to_y_train[key] = userToRandRating[key]
        user_to_y_test[key] = userToRandRating[key]

X = [] 
y = []


#note: excludes user_to_X_train[key] of length 0
for key in user_to_X_train.keys():
    for item in user_to_X_train[key]:
        X.append(item)
        y.append(user_to_y_train[key])


#(10,10,10,10)
layers = (10, 10, 10, 10)
act = "tanh"
solve = "adam"
regr = MLPRegressor(hidden_layer_sizes=layers,activation =act, solver =solve,  max_iter=1000, random_state =1)
regr.fit(X, y)
print(regr.n_iter_)



# note: there could also be a weigthed average applied
# There is no guaranteed order for the list of keys returned by the keys() function.

userToAveRating = dict()

for key in user_to_X_test.keys():
    sum =0
    cnt =0 
    predicted = regr.predict(user_to_X_test[key])
    for item in predicted:
        sum+=item
        cnt+=1
    userToAveRating[key] = float(sum/cnt)


actualsList = []
predsList = []
for key in userToAveRating.keys():
    print("Pred: "+str(userToAveRating[key]) , "Actual: "+str(user_to_y_test[key]))
    actualsList.append(user_to_y_test[key])
    predsList.append(userToAveRating[key])


print("overall score:", r2_score(actualsList, predsList))

577
Pred: 4.240795525950667 Actual: 4.0
Pred: 4.298789645074739 Actual: 3.0
Pred: 3.843038690039627 Actual: 4.0
Pred: 4.022002531891602 Actual: 4.0
Pred: 3.9125107779216 Actual: 4.0
Pred: 3.401153141974309 Actual: 3.0
Pred: 4.373861133990317 Actual: 4.0
Pred: 3.811068125885285 Actual: 2.0
Pred: 1.6888752360321502 Actual: 2.0
Pred: 4.1807881754847145 Actual: 3.0
Pred: 2.5172405194593965 Actual: 2.5
Pred: 3.9659157685661666 Actual: 4.0
Pred: 4.041632652719805 Actual: 4.0
Pred: 4.042439899330937 Actual: 3.0
Pred: 4.259573550499343 Actual: 3.5
Pred: 3.4158682477549873 Actual: 4.0
Pred: 2.64274898202869 Actual: 3.5
Pred: 3.3025478352263677 Actual: 1.0
Pred: 3.2876242579769785 Actual: 3.0
Pred: 4.145227665452383 Actual: 3.5
Pred: 3.0486450484087944 Actual: 3.0
Pred: 3.9195224635949404 Actual: 3.0
Pred: 3.784403978484348 Actual: 4.0
Pred: 1.8557305264571091 Actual: 1.0
Pred: 3.6479599356528403 Actual: 3.0
Pred: 4.195017271141255 Actual: 4.0
Pred: 3.039943712021509 Actual: 3.0
Pred: 4.13205027