In [1]:
import pandas as pd

pd.set_option('display.max_colwidth', None)

moviesFull = pd.read_csv('large_source_data/movies_metadata.csv',usecols=("genres","id" ,"title","tagline", "overview","production_companies"),
                          dtype={"tagline": "string", "id":"string", 'genres':"string", "title": "string", "tagline": "string","overview":"string", "production_companies" :"string"})
ratings = pd.read_csv('large_source_data/ratings.csv', usecols = ("userId", "movieId", "rating"), dtype={"userId": "string","movieId": "string","rating": "string"})
ratings = ratings.rename(columns={"movieId": "id"})

keywords = pd.read_csv('large_source_data/keywords.csv', usecols = ("id", "keywords"), dtype={"id": "string","keywords":"string"})
credits = pd.read_csv("large_source_data/credits.csv", usecols = ("cast", "id"), dtype={"cast": "string", "id": "string"})

complete =  pd.merge(moviesFull, ratings, on ="id")
complete =  pd.merge(complete,keywords, on ="id")
complete  = pd.merge(complete,credits, on ="id")


complete = complete.sort_values(by = 'userId')

complete  = complete.dropna()

complete  = complete.loc[:,['userId','id','rating',"title", "genres","production_companies","keywords", "cast", "tagline", "overview" ]]


In [2]:
import numpy as np
import ast


#used to filter the rows of a data
def condition(array):
    length = len(array[4])
    if(array[4][length-2:] == "[]"):
        return False
    length = len(array[5])
    if(array[5][length-2:] == "[]"):
        return False
    length = len(array[6])
    if(array[6][length-2:] == "[]"):
        return False
    length = len(array[7])
    if(array[7][length-2:] == "[]"):
        return False   
    length = len(array[8])
    if(array[8][length-4:]=="<NA>"):
        return False
    length = len(array[9])
    if(array[9][length-4:]=="<NA>"):
        return False 
    return True


#used to extract names from string of list of json formats
def populateNames(item):
    string  = item[1:-1]
    jsons = string.split("}, ")   
    names = ""
    cnt = 0
    for item in jsons:
        if(cnt == len(jsons)-1):
            tempDict = ast.literal_eval(item)
            names+=str(tempDict["name"])
        else:
            tempDict = ast.literal_eval(item+"}")
            names+=str(str(tempDict["name"])+" ")
        cnt += 1
    return names


def provideData(array):
    movieData = []
    movieData.append(int(array[0]))
    movieData.append(int(array[1]))
    movieData.append(float(array[2]))
    movieData.append(array[3])  

    movieData.append(populateNames(array[4]))
    movieData.append(populateNames(array[5]))
    movieData.append(populateNames(array[6]))
    movieData.append(populateNames(array[7]))

    movieData.append(str(array[8]))
    movieData.append(str(array[9]))
    return movieData
    


#convert the dataframe into an array
#and then build a dictionary
completeDict = dict()
completeArray = complete.to_numpy()
arrayOfUserIds = []


#get all unique user ids
lastId  = -1
for item in completeArray:
    if(item[0]!= lastId):
        arrayOfUserIds.append(item[0])
        lastId = item[0]


index  = 0
nofUsers = 5000
#5000 users are tested and potentially added to the dict
for i in range(0, nofUsers):
    completeDict[arrayOfUserIds[i]] = []
    for j in range(index, len(completeArray)):
        if completeArray[j][0] == arrayOfUserIds[i]:
            #this is where conditions are checked in completeArray[j]
            if(condition(completeArray[j])):
                #this is where data is tranformed
                transformed = provideData(completeArray[j])
                completeDict[arrayOfUserIds[i]].append(transformed)         
        else:
            #ignore if the number o ratings is too small
            if (len(completeDict[arrayOfUserIds[i]])<10):
                del completeDict[arrayOfUserIds[i]]
            index = j+1
            break


In [3]:
#save in a file so that cells below can run without running the above
import csv

with open("constructed_data/constructed_data.csv", "w", encoding="utf-8", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['userId','id','rating',"title", "genres","production_companies","keywords", "cast", "tagline", "overview"])
    for item in completeDict.keys():
        writer.writerows(completeDict[item])


In [4]:
#this is a starting point if the data is already saved to the csv file
import csv
import pandas as pd

dataList =[]

with open("constructed_data/constructed_data.csv", 'r', encoding="utf-8") as read_obj:
    csv_reader = csv.reader(read_obj)
    dataList = list(csv_reader)

dataList = dataList[1:]


In [5]:
#movie id to list of ratings
movieDict = dict()

#user id to the rated movies by that user
userDict = dict()

userId = -1
for row in dataList:
    if (row[0]!=userId):
        userId = row[0]
        userDict[row[0]] = [row]
    else:
        userDict[row[0]].append(row)

    if(row[1] in movieDict.keys()):
        movieDict[row[1]].append(row[2])
    else:
        movieDict[row[1]] = [row[2]]


In [6]:
from gensim.parsing.preprocessing import remove_stopwords

#dictionary of user id to a list of string of combined textual features for each movie 
#does not include ratings or movie id

combinedCorpus = dict()

i = 0
for key in userDict.keys():
    movieStrings = []
    for movieData in userDict[key]:
        movieString = ""
        #avoid the first three data points (user id, movieid, and rating)
        for index in range (3,len(movieData)):
            if(index!= len(movieData)-1):
                movieString+= movieData[index]+" "
            else:
                movieString+= movieData[index]
        cleaned = remove_stopwords(movieString)
        movieStrings.append(cleaned)
    combinedCorpus[key] = movieStrings

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random
import statistics
import math
import random
import copy
from scipy.stats import kurtosis
from scipy.stats import skew
import numpy as np

#analysis 
random.seed(1)


def getAverageMovieRatings(movieId):
    ret =0 
    cnt = 0
    for item in movieDict[movieId]:
        ret+= float(item)
        cnt+=1
    return (ret/cnt)


def getUserRatings(userId):
    ret = []
    for item in userDict[userId]:
        ret.append(float(item[2]))
    return ret


X = []
y = []


for key in combinedCorpus.keys():

    count_matrix = CountVectorizer().fit_transform(combinedCorpus[key]).toarray().tolist()
    randIndex = random.randint(0, len(count_matrix)-1)
    randTestItem = count_matrix[randIndex]
    del count_matrix[randIndex]

    #find similarity with the count of each word between the random movie and the other movies
    cosine_sim = cosine_similarity(X = count_matrix ,Y = [randTestItem])

    ratings = copy.deepcopy(getUserRatings(key))
    randomRating = ratings[randIndex]
    del ratings[randIndex]


    similairities = np.reshape(cosine_sim,  (len(cosine_sim)))
    averageRating =  sum(ratings)/(len(ratings))

    #populate features (more detail in paper)
    features = []

    features.append(getAverageMovieRatings(userDict[key][randIndex][1]))
    features.append(averageRating)
    features.append(sum(similairities)/(len(similairities)))
    features.append(statistics.variance(ratings))
    features.append(statistics.variance(similairities))

    skew1= skew(ratings)
    skew2 = skew(similairities)
    kurt1 = kurtosis(ratings)
    kurt2 = kurtosis(similairities)


    lst = []
    for item in [skew1,skew2, kurt1, kurt2]:
        if(math.isnan(item)):
            lst.append(0)
        else:   
            lst.append(item)

    features.append(lst[0])
    features.append(lst[1])
    features.append(lst[2])
    features.append(lst[3])

    totalRating = 0

    for sim, rating in zip(similairities, ratings):
        totalRating += sim*(rating-averageRating)
    

    features.append(totalRating)
    X.append(features)
    y.append(randomRating)



  skew1= skew(ratings)
  kurt1 = kurtosis(ratings)
  skew1= skew(ratings)
  kurt1 = kurtosis(ratings)


In [8]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neural_network import MLPRegressor


layers = (50, 50, 50)
act = "tanh"
solve = "adam"

regr = MLPRegressor(hidden_layer_sizes=layers,activation =act, solver =solve,  max_iter=1000, random_state =1)


k_folds = KFold(n_splits = 5)
scores = cross_val_score(regr, X, y, cv = k_folds)

print("cross validation scores:")
print(scores)
print("mean score:")
print(scores.mean())
print("layers:")
print(layers)
print("activation function:")
print(act)
print("solver:")
print(solve)


cross validation scores:
[0.23190825 0.23378539 0.2654336  0.13857587 0.19720419]
mean score:
0.21338145819859541
layers:
(50, 50, 50)
activation function:
tanh
solver:
adam
