In [5]:
import pandas as pd

pd.set_option('display.max_colwidth', None)

moviesFull = pd.read_csv('newdata/movies_metadata.csv',usecols=("genres","id" ,"title","tagline", "overview","production_companies"),
                          dtype={"tagline": "string", "id":"string", 'genres':"string", "title": "string", "tagline": "string","overview":"string", "production_companies" :"string"})
ratings = pd.read_csv('newdata/ratings.csv', usecols = ("userId", "movieId", "rating"), dtype={"userId": "string","movieId": "string","rating": "string"})
ratings = ratings.rename(columns={"movieId": "id"})

keywords = pd.read_csv('newdata/keywords.csv', usecols = ("id", "keywords"), dtype={"id": "string","keywords":"string"})
credits = pd.read_csv("newdata/credits.csv", usecols = ("cast", "id"), dtype={"cast": "string", "id": "string"})

complete =  pd.merge(moviesFull, ratings, on ="id")
complete =  pd.merge(complete,keywords, on ="id")
complete  = pd.merge(complete,credits, on ="id")


complete = complete.sort_values(by = 'userId')

complete  = complete.dropna()

complete  = complete.loc[:,['userId','id','rating',"title", "genres","production_companies","keywords", "cast", "tagline", "overview" ]]


In [6]:
import numpy as np
import ast


#used to filter the rows of a data
def condition(array):
    length = len(array[4])
    if(array[4][length-2:] == "[]"):
        return False
    length = len(array[5])
    if(array[5][length-2:] == "[]"):
        return False
    length = len(array[6])
    if(array[6][length-2:] == "[]"):
        return False
    length = len(array[7])
    if(array[7][length-2:] == "[]"):
        return False   
    length = len(array[8])
    if(array[8][length-4:]=="<NA>"):
        return False
    length = len(array[9])
    if(array[9][length-4:]=="<NA>"):
        return False 
    return True


#used to extract names from string of list of json formats
def populateNames(item):
    string  = item[1:-1]
    jsons = string.split("}, ")   
    names = ""
    cnt = 0
    for item in jsons:
        if(cnt == len(jsons)-1):
            tempDict = ast.literal_eval(item)
            names+=str(tempDict["name"])
        else:
            tempDict = ast.literal_eval(item+"}")
            names+=str(str(tempDict["name"])+" ")
        cnt += 1
    return names


def provideData(array):
    movieData = []
    movieData.append(int(array[0]))
    movieData.append(int(array[1]))
    movieData.append(float(array[2]))
    movieData.append(array[3])  

    movieData.append(populateNames(array[4]))
    movieData.append(populateNames(array[5]))
    movieData.append(populateNames(array[6]))
    movieData.append(populateNames(array[7]))

    movieData.append(str(array[8]))
    movieData.append(str(array[9]))
    return movieData
    


#convert the dataframe into an array
#and then build a dictionary
completeDict = dict()
completeArray = complete.to_numpy()
arrayOfUserIds = []


#get all unique user ids
lastId  = -1
for item in completeArray:
    if(item[0]!= lastId):
        arrayOfUserIds.append(item[0])
        lastId = item[0]


index  = 0
nofUsers = 5000
#5000 users are tested and potentially added to the dict
for i in range(0, nofUsers):
    completeDict[arrayOfUserIds[i]] = []
    for j in range(index, len(completeArray)):
        if completeArray[j][0] == arrayOfUserIds[i]:
            #this is where conditions are checked in completeArray[j]
            if(condition(completeArray[j])):
                #this is where data is tranformed
                transformed = provideData(completeArray[j])
                completeDict[arrayOfUserIds[i]].append(transformed)         
        else:
            #ignore if the number of ratings is too small
            if (len(completeDict[arrayOfUserIds[i]])<10):
                del completeDict[arrayOfUserIds[i]]
            index = j+1
            break


In [7]:
#save in a file so that cells below can run without running the above
import csv

with open("constructedData/constructedData.csv", "w", encoding="utf-8", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['userId','id','rating',"title", "genres","production_companies","keywords", "cast", "tagline", "overview"])
    for item in completeDict.keys():
        writer.writerows(completeDict[item])


In [8]:
#this is a starting point if the data is already saved to the csv file
import csv
import pandas as pd

dataList =[]

with open("constructedData/constructedData.csv", 'r', encoding="utf-8") as read_obj:
    csv_reader = csv.reader(read_obj)
    dataList = list(csv_reader)

dataList = dataList[1:]


In [9]:
#movie id to list of ratings
movieDict = dict()

#user id to the rated movies by that user
userDict = dict()

#The list created by the constructed data is in order by user id
#this code makes a dictionary out of the data (user id to a list of text from all movies rated by the user)
#it also makes a dictionary of movies to their ratings
userId = -1
for row in dataList:
    if (row[0]!=userId):
        userId = row[0]
        userDict[row[0]] = [row]
    else:
        userDict[row[0]].append(row)

    if(row[1] in movieDict.keys()):
        movieDict[row[1]].append(row[2])
    else:
        movieDict[row[1]] = [row[2]]


In [10]:
from gensim.parsing.preprocessing import remove_stopwords

#dictionary of user id to a list of string of combined textual features for each movie 
#does not include ratings or movie id

combinedCorpus = dict()

i = 0
for key in userDict.keys():
    movieStrings = []
    for movieData in userDict[key]:
        movieString = ""
        #avoid the first three data points (user id, movieid, and rating)
        for index in range (3,len(movieData)):
            if(index!= len(movieData)-1):
                movieString+= movieData[index]+" "
            else:
                movieString+= movieData[index]
        cleaned = remove_stopwords(movieString)
        movieStrings.append(cleaned)
    combinedCorpus[key] = movieStrings

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random
import statistics
import math
import random
import copy
from scipy.stats import kurtosis
from scipy.stats import skew
import numpy as np

#analysis 
random.seed(1)

#get average rating for a single movie amoung all users who rated it
def getAverageMovieRatings(movieId):
    ret =0 
    cnt = 0
    for item in movieDict[movieId]:
        ret+= float(item)
        cnt+=1
    return (ret/cnt)


#get all the user ratings
def getUserRatings(userId):
    ret = []
    for item in userDict[userId]:
        ret.append(float(item[2]))
    return ret


X = []
y = []
userToPair = dict()
userToRandRating = dict()




#note: ratings and similairty scores for a given user need to be ordered the same:

for key in combinedCorpus.keys():

    count_matrix = CountVectorizer().fit_transform(combinedCorpus[key]).toarray().tolist()
    randIndex = random.randint(0, len(count_matrix)-1)
    randTestItem = count_matrix[randIndex]
    del count_matrix[randIndex]

    #find similarity with the count of each word between the random movie and the other movies
    cosine_sim = cosine_similarity(X = count_matrix ,Y = [randTestItem])

    ratings = copy.deepcopy(getUserRatings(key))
    randomRating = ratings[randIndex]
    userToRandRating[key] = randomRating
    del ratings[randIndex]


    similairities = np.reshape(cosine_sim,  (len(cosine_sim)))
    #can be saved for later:
    # averageRating =  sum(ratings)/(len(ratings))

    #populate features (more detail in paper)
    # features = []


    #need a data structure that keeps track of all the movies a user rated
    #dictionary of user to [sim, rating] pair


    #should also include average user rating for movies and avergae rating for the movie amoung all users
    for sim, rating in zip(similairities, ratings):
        X.append([sim, rating])
        y.append(randomRating)
        if key not in userToPair:
            userToPair[key] = [[sim, rating]]
        else:
            userToPair[key].append([sim, rating])




    #can be saved for later:
    # #average rating for the test movie for all users
    # #note: userDict[key][randIndex][1] is a movie id
    # features.append(getAverageMovieRatings(userDict[key][randIndex][1]))
    # #average rating for all movies rated by the given user given by "key"
    # features.append(averageRating)
    # #average cosine similairty score between the randomly chosen movie rated by the user given by "key"
    # #and the other movies rated by the same user
    # #what does this do???
    # features.append(sum(similairities)/(len(similairities)))


    # old features
    # X.append(features)
    # y.append(randomRating)



In [24]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

#(20,10,10,5)
#(10, 10, 10)
#(100, 100, 100)
layers = (5, 5, 5)
act = "tanh"
solve = "adam"


#note: used to be 1000 iterations
regr = MLPRegressor(hidden_layer_sizes=layers,activation =act, solver =solve,  max_iter=1000, random_state =1)



X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = .75)

regr.fit(X_train, y_train)
print(regr.n_iter_)


#note: this does not split training and test data
#there could also be a weigthed average applied

# there is no guaranteed order for the list of keys returned by the keys() function.
# In most cases, the key list is returned in the same order as the insertion,
# however, that behavior is NOT guaranteed and should not be depended on by your program.


userToAveRating = dict()

for key in userToPair.keys():
    predicted  = regr.predict(userToPair[key])
    sum = 0
    cnt = 0
    for item in predicted:
        sum+=item
        cnt+=1
    userToAveRating[key] = float(sum/cnt)



actualsList = []
predsList = []
for key in userToAveRating.keys():
    print("Pred: "+str(userToAveRating[key]) , "Actual: "+str(userToRandRating[key]))
    actualsList.append(userToRandRating[key])
    predsList.append(userToAveRating[key])


print("overall score:", r2_score(actualsList, predsList))
    
    

19
Pred: 3.6827740761038936 Actual: 4.0
Pred: 3.6239929295414837 Actual: 3.0
Pred: 3.5563527095792753 Actual: 4.0
Pred: 3.589677188631906 Actual: 4.0
Pred: 3.61008532940486 Actual: 4.0
Pred: 3.5721844045998115 Actual: 3.0
Pred: 3.4908460264477577 Actual: 4.0
Pred: 3.5365757987225988 Actual: 2.0
Pred: 3.3718175739944813 Actual: 2.0
Pred: 3.4922072121924734 Actual: 3.0
Pred: 3.4906773062061522 Actual: 2.5
Pred: 3.5749440228080362 Actual: 4.0
Pred: 3.565569675085462 Actual: 4.0
Pred: 3.503810183982435 Actual: 3.0
Pred: 3.555170466877438 Actual: 3.5
Pred: 3.4960990420444626 Actual: 4.0
Pred: 3.3641717415760772 Actual: 3.5
Pred: 3.554929766779995 Actual: 1.0
Pred: 3.497757245404082 Actual: 3.0
Pred: 3.6061084548847533 Actual: 3.5
Pred: 3.253770233649227 Actual: 3.0
Pred: 3.526636378796539 Actual: 3.0
Pred: 3.5297953394135573 Actual: 4.0
Pred: 3.4680313634701307 Actual: 1.0
Pred: 3.4911833078988495 Actual: 3.0
Pred: 3.692793349829357 Actual: 4.0
Pred: 3.3789048066662857 Actual: 3.0
Pred: 3.4