In [15]:
import pandas as pd
import random as random

#seed for consistent results across runtime
seed_int = 1
random.seed(seed_int)

#This code is for combining certain data from the necessary csv files into a single dataframe (complete)
pd.set_option('display.max_colwidth', None)

movies_full = pd.read_csv('newdata/movies_metadata.csv',usecols=("genres","id" ,"title","tagline", "overview","production_companies"),
                          dtype={"tagline": "string", "id":"string", 'genres':"string", "title": "string", "tagline": "string","overview":"string", "production_companies" :"string"})
ratings = pd.read_csv('newdata/ratings.csv', usecols = ("userId", "movieId", "rating"), dtype={"userId": "string","movieId": "string","rating": "string"})
ratings = ratings.rename(columns={"movieId": "id"})

keywords = pd.read_csv('newdata/keywords.csv', usecols = ("id", "keywords"), dtype={"id": "string","keywords":"string"})
credits = pd.read_csv("newdata/credits.csv", usecols = ("cast", "id"), dtype={"cast": "string", "id": "string"})

complete =  pd.merge(movies_full, ratings, on ="id")
complete =  pd.merge(complete,keywords, on ="id")
complete  = pd.merge(complete,credits, on ="id")


#this can be used to group by a column yet does not sort
#this should be used after shuffling
#df.groupby("Animal", group_keys=False).apply(lambda x: x)

#new:
complete = complete.sample(frac=1, random_state = seed_int)
# print(complete.head())
complete = complete.groupby(by = "userId", sort = False, group_keys = False).apply(lambda x: x)
# print(complete.head())

#this is omitted since the values should not be sorted by userId just grouped by userId
#complete = complete.sort_values(by = 'userId')

complete  = complete.dropna()

complete  = complete.loc[:,['userId','id','rating',"title", "genres","production_companies","keywords", "cast", "tagline", "overview" ]]

#new: 
print(complete.head())

         userId    id rating                           title  \
8190632   58028   920    4.0                            Cars   
4570359   72052  2294    0.5  Jay and Silent Bob Strike Back   
4899847  240451   480    1.0                 Monsoon Wedding   
4312331  145345   849    0.5                           Krull   
2745889  209846  8970    4.0              The Out-of-Towners   

                                                                                                                                    genres  \
8190632  [{'id': 16, 'name': 'Animation'}, {'id': 12, 'name': 'Adventure'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]   
4570359                                                                                                     [{'id': 35, 'name': 'Comedy'}]   
4899847                                      [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]   
4312331                                     [{'

In [50]:
import ast
import random

#seed for consistent results across runtime
seed_int = 1
random.seed(seed_int)

#used to filter out the rows of data with empty entries
def condition(array):
    length = len(array[4])
    if(array[4][length-2:] == "[]"):
        return False
    length = len(array[5])
    if(array[5][length-2:] == "[]"):
        return False
    length = len(array[6])
    if(array[6][length-2:] == "[]"):
        return False
    length = len(array[7])
    if(array[7][length-2:] == "[]"):
        return False   
    length = len(array[8])
    if(array[8][length-4:]=="<NA>"):
        return False
    length = len(array[9])
    if(array[9][length-4:]=="<NA>"):
        return False 
    return True


#used to extract names
def populate_names(item):
    string  = item[1:-1]
    jsons = string.split("}, ")   
    names = ""
    cnt = 0
    for item in jsons:
        if(cnt == len(jsons)-1):
            temp_dict = ast.literal_eval(item)
            names+=str(temp_dict["name"])
        else:
            temp_dict = ast.literal_eval(item+"}")
            names+=str(str(temp_dict["name"])+" ")
        cnt += 1
    return names

#extract data from row of complete_array
def provide_data(array):
    movie_data = []
    movie_data.append(int(array[0]))
    movie_data.append(int(array[1]))
    movie_data.append(float(array[2]))
    movie_data.append(array[3])  

    movie_data.append(populate_names(array[4]))
    movie_data.append(populate_names(array[5]))
    movie_data.append(populate_names(array[6]))
    movie_data.append(populate_names(array[7]))

    movie_data.append(str(array[8]))
    movie_data.append(str(array[9]))
    return movie_data
    


#convert the dataframe into an array and build a dictionary
user_to_data = dict()
complete_array = complete.to_numpy()


#get all unique user ids
list_of_user_ids = []
last_id  = -1
for item in complete_array:
    if(item[0]!= last_id):
        list_of_user_ids.append(item[0])
        last_id = item[0]


index  = 0
#this has been tested with 5000, 10000, 20000, 100000
nof_users = 20000
#the users are taken in order...
#while they should be randomly chosen...

#populate user_to_data from complete_array
for i in range(0, nof_users):
    #a random integer can be generated that cannot be generated again...
    #that can be used as the place holder i 
    #or the data can be randomly shuffle before hand, user rating need to be together

    user_to_data[list_of_user_ids[i]] = []
    for j in range(index, len(complete_array)):
        if complete_array[j][0] == list_of_user_ids[i]:
            #condition is checked for complete_array[j]
            if(condition(complete_array[j])):
                #this is where data is tranformed
                transformed = provide_data(complete_array[j])
                user_to_data[list_of_user_ids[i]].append(transformed)         
        else:
            #should have a min and a max number of data points for consistency
            #success of this condition can also be what increments i instead of it auto-incrementing
            if (len(user_to_data[list_of_user_ids[i]])<50):
                del user_to_data[list_of_user_ids[i]]
            index = j+1
            break
        

In [51]:
#save in a file so that cells below can run without running this cell and above
import csv

with open("constructedData/constructedData.csv", "w", encoding="utf-8", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['userId','id','rating',"title", "genres","production_companies","keywords", "cast", "tagline", "overview"])
    for key in user_to_data.keys():
        writer.writerows(user_to_data[key])

In [5]:
#this is a starting point if the data is already saved to the constructedData.csv file
import csv

data_list =[]

with open("constructedData/constructedData.csv", 'r', encoding="utf-8") as read_obj:
    csv_reader = csv.reader(read_obj)
    data_list = list(csv_reader)

data_list = data_list[1:]


In [9]:
from ordered_set import OrderedSet
import random

#seed for consistent results across runtime
seed_int = 1
random.seed(seed_int)

#user to data rows 
user_to_data = dict()
user_to_data_train = dict()
user_to_data_test = dict()

user_id = -1
for row in data_list:
    if (row[0]!=user_id):
        user_id = row[0]
        user_to_data[row[0]] = [row]
    else:
        user_to_data[row[0]].append(row)


for i in range(150):
    user = random.choice(list(user_to_data.keys()))
    user_to_data_train[user] = user_to_data[user]
    user_to_data.pop(user)

for i in range(50):
    user = random.choice(list(user_to_data_train.keys()))
    user_to_data_test[user] = user_to_data_train[user]
    user_to_data_train.pop(user)


user_to_data.clear()


In [10]:
from gensim.parsing.preprocessing import remove_stopwords
from nltk.stem import WordNetLemmatizer
import random
import json

#includes omitted movies
user_to_movie_id_to_corpus_train = dict()
movies_in_order = OrderedSet()
user_to_movie_id_to_rating = dict()
movie_id_to_ratings = dict()
movie_id_to_average_rating = dict()

#this is what determines ommited movies
user_to_rand_movie_id = dict()

#used to create overall_average
#omits movies with user_to_rand_movie_id
overall_sum = 0
overall_counts = 0 

# WordNetLemmatizer().lemmatize(token.lower())
wnl = WordNetLemmatizer()


for user in user_to_data_train.keys():
    movie_strings = dict()
    movie_id_to_rating_temp = dict()
    cnt = 0
    #does this cover all of 0-len(user_to_data_train[user])-1 ???
    rand_int = random.randint(0, len(user_to_data_train[user])-1)
    for movie_data in user_to_data_train[user]:
        if cnt == rand_int:    
            user_to_rand_movie_id[user] = movie_data[1]
        else:
            overall_sum += float(movie_data[2])
            overall_counts += 1

        #this also includes omitted ratings (the rating to be predicted by the model)
        if movie_data[1] in movie_id_to_ratings.keys():
            movie_id_to_ratings[movie_data[1]].append(float(movie_data[2]))
        else:
            movie_id_to_ratings[movie_data[1]] = [float(movie_data[2])]

        movies_in_order.add(movie_data[1])
        movie_string = ""
        #avoid the first three data points (user id, movieid, and rating)
        #use only the text data
        for index in range (3,len(movie_data)):
            if(index!= len(movie_data)-1):
                movie_string+= movie_data[index]+" "
            else:
                movie_string+= movie_data[index]
        cleaned = remove_stopwords(movie_string)
        #why recreate the string when you could accept it as a list???
        # cleaned = " ".join([wnl.lemmatize(word) for word in cleaned.split(" ")])
        cleaned = [wnl.lemmatize(word) for word in cleaned.split(" ")]
        # cleaned.sort()
        movie_strings[movie_data[1]] = cleaned
        movie_id_to_rating_temp[movie_data[1]] = float(movie_data[2])
        cnt+=1
    user_to_movie_id_to_corpus_train[user] = movie_strings
    user_to_movie_id_to_rating[user] = movie_id_to_rating_temp

overall_average = float(overall_sum/overall_counts)

#there needs to be a replacement when there is only a single rating in the dataset for a movie
#there is no problem if there are 2 or more ratings for a single movie
#this replacement can be the overall average of all ratings or perhaps some guess based on the users
#other ratings 

for movie in movie_id_to_ratings.keys():
    temp = 0
    for rating in movie_id_to_ratings[movie]:
        temp +=rating
    movie_id_to_average_rating[movie] = float(temp/len(movie_id_to_ratings[movie]))



file = open("test_dicts.txt", 'w')
string = json.dumps(user_to_movie_id_to_rating)
file.write(string)
file.close()



In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import random
import copy
from scipy.stats import kurtosis
from scipy.stats import skew
import statistics
import math
from ordered_set import OrderedSet
from sklearn.model_selection import train_test_split
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import r2_score

#these 2 data strctures all have omited values from user_to_rand_movie_id[user]
user_to_word_counts = dict()
words_in_order = OrderedSet()

#the ommited movie rating is marked with -2 and the unrated movies to be filled in are marked with -1 
user_to_ratings = dict()

#this is the user to word counts of the omitted movies with user_to_rand_movie_id[user]
user_to_rand_word_counts = dict()

#this store the actual ratings of movies to be predicted by the model
user_to_rating_to_predict = dict()


for user in user_to_movie_id_to_corpus_train.keys():
    temp = []
    #note: length of user_to_ratings[user] is always equal to len(movies_in_order)
    for movie_id in movies_in_order:
        if movie_id != user_to_rand_movie_id[user]:
            if movie_id in user_to_movie_id_to_rating[user].keys():
                temp.append(user_to_movie_id_to_rating[user][movie_id])
            else:
                #this signifies the ratings to be filled in before predictions from the model take place
                temp.append(-1)
        else:
            #this signifies the ratings to be predicted by the model
            temp.append(-2)
            user_to_rating_to_predict[user] = user_to_movie_id_to_rating[user][movie_id]
    user_to_ratings[user] = temp


#note: words_in_order has ommited words from a single movie (user_to_rand_movie_id[user])
for user in user_to_movie_id_to_corpus_train.keys():
    for movie_id in user_to_movie_id_to_corpus_train[user].keys():
        if movie_id != user_to_rand_movie_id[user]:
            #this is the problem line!!!
            for word in user_to_movie_id_to_corpus_train[user][movie_id]:
                words_in_order.add(word)

            

#note: user_to_word_counts and movie_id_to_word_counts have an omitted movies...
#by user_to_rand_movie_id[user]

#note: user to word counts needs to omit movies with no rating
for user in user_to_movie_id_to_corpus_train.keys():
    user_to_word_counts[user] = []
    for movie_id in user_to_movie_id_to_corpus_train[user].keys():
        if movie_id != user_to_rand_movie_id[user]:
            temp = []
            for word in words_in_order:
                temp.append(user_to_movie_id_to_corpus_train[user][movie_id].count(word))
            user_to_word_counts[user].append(temp)
        else:
            temp = []
            for word in words_in_order:
                temp.append(user_to_movie_id_to_corpus_train[user][movie_id].count(word))
            user_to_rand_word_counts[user] = temp



#what if a new user to be tested has new words that are new to words_in_order???
#what if a user simply has more words than normal, need to normalize(not this is doen with cossine similairity)
#what if a user rated a movie that has not been rated by any users??? (is it omited ???)
#do the randomly selected users meed to be omited from the words_in_order and user_to_word_counts
#to emulate completely new users???


def predict(user, word_counts):
    values = []
    #note: user_to_word_counts[user] length is equal to the number of movies rated by the user...
    #minus one for the user_to_rand_movie_id[user]

    #note: user to word counts needs to omit movies with no rating
    for counts in user_to_word_counts[user]:
        values.append(counts)
    
    cosine_sim = cosine_similarity(X = values ,Y = [word_counts])
    #not sure if reshape is needed???
    cosine_sim = np.reshape(cosine_sim,  (len(cosine_sim)))

    ratings = []
    #note: user_to_ratings[user] length is equal to the number of movies
    #the length of ratings is the number of rated movies -1 for user_to_rand_movie_id[user]
    for rating in user_to_ratings[user]:
        if(rating != -1 and rating != -2):
            ratings.append(rating)


    #note: for accuracy purposes, the user should have a certain number of rated movies
    #instead of selecting the most similair movie should clusters be used???
    #for now this simple method will suffice
    #are the ratings lined up with the cossine similarities???
    #yes the order of user_to_ratings is used to make user_to_corpus_list...
    #which is used to make lst_of_word_to_nof_occurances

    combined = zip(cosine_sim, ratings)
    combined = sorted(combined, key=lambda x: x[0], reverse=True)
    avg = 0
    for i in range(0, 10):
        avg += combined[i][1]


    return float(avg/10.0)


#fill in the missing ratings for a user
def fill_in_ratings(user):
    cnt = 0
    pred = 0
    #note: it is possible that movies in order includes some movies that are not part of user_to_ratings[user]
    #due to the randomly omitted movie for each user in user_to_ratings
    
    for rating in user_to_ratings[user]:
        if rating == -2:
            user_to_ratings[user][cnt] = predict(user, user_to_rand_word_counts[user])
            pred = user_to_ratings[user][cnt]
            break
        cnt+=1

    return pred


predicted = []
true = []
for user in user_to_movie_id_to_corpus_train.keys():
    # commented out for testing
    # predicted.append(fill_in_ratings(user))
    true.append(user_to_rating_to_predict[user])

# commented out for testing
# print(r2_score(true, predicted))


predicted = []
#if there is a movie unrated by any other user this will throw an error
for user in user_to_movie_id_to_corpus_train.keys(): 
    if(len(movie_id_to_ratings[user_to_rand_movie_id[user]])==1):
        predicted.append(overall_average)
    else:
        predicted.append(float(((movie_id_to_average_rating[user_to_rand_movie_id[user]]
                        *len(movie_id_to_ratings[user_to_rand_movie_id[user]]))
                        -user_to_movie_id_to_rating[user][user_to_rand_movie_id[user]])
                        /(len(movie_id_to_ratings[user_to_rand_movie_id[user]])-1)))


#note: can also use count here...
nof_overall_avg = 0

for item in predicted:
    if item == overall_average:
        nof_overall_avg += 1

print(nof_overall_avg)
print(r2_score(true, predicted))


#now need to try r2score for average rating amoung all users vs real ratings
#what happens if the movies does not have addiitonal ratings???





#now for the user comparison logic (need user to list of movie ratings)
#fill in ratings that the user hasn't watched with the method above
#then cluster the users by their ratings

#note: agglomerative clustering might make more sense here since k-means has random init for centroids...
#note: to guess a new users rating requires that none of that users ratings have been used to train the model
#The data needs to be split into test and train before modeling the algorithm on the train data

#Training process:
#split data into test and train data
#proceed with train data...
#cluster movies by the tokens with range for k
#cluster users by the ratings with range for k and (fill in ratings for movies a users hasn't watched with some guess)
#guess: this can be obtained by clustering the movies that the user has watched...
#for each movie the user hasn't watched find the cluster that it belongs to with the highest possible k value
#that the user has at least one movie belonging to one of the clusters and then take the average of those movies
#this is exactly like a later training step excpet it is applied to all the movies the user watched

#for a single randomly chosen movie from each user in the trainging data...

#find the cluster the movie belongs to 
#find the movies part of that same cluster that the user has scored at the highest possible k value
#take the average score of these movies
#find the cluster the user belongs to
#find the average rating of the movie for users in that cluster at the highest possible k value
#train an mlp model with both averages and perhaps some extra statistics as features...
#using the given movie ratings as actuals


#The process of predicting a rating:
#1. find the cluster the movie belongs to 
#2. find the movies part of that same cluster that the user has scored at the highest possible k value
#3. take the average score of these movies
#4. find the cluster the user belongs to
#5. find the average rating of the movie for users in that cluster at the highest possible k value
#6. input into the trained mlp model both averages and perhaps some extra statistics
#7. make predictions and test against the randomly chosen movies actual ratings


#summary:
#find cluster for movie -> find movies part of the same clusters that the users rated -> average
#question: are the clusters unique to the movies the user has watched or to all movies???
#what is the technical difference???
#is this the same as finding the most simimlair movie the user rated and copying the rating???

#find cluster for user -> find the ratings for the movie by people in the same cluster -> average

#other avenues considered:
#idea 1:
#for the first process, instead of averaging the movies that only the user rated, find other users that are...
#like the user in question and find the average for that movie cluster
#Problem: it is better to get the users raw opionion rather than generalizing it to some like minded users
#there is an extra costly step to this
#idea 2: 
#for the second process, instead of finding the average rating for the movie in the same cluster of users...
#also find the average rating of movies that are like the movie in question 
#Problem, it is better to get the movies rating itself as it would be the most accurate indicator
#there is an extra costly step to this



#next steps
#This took a little under an hour to compute: 57m 20.7 seconds

#test with overall averges:
#-0.07017692724590896 seed: 3
#0.15101902163346914 seed: 0
#seed: 1, 0.16889559289764922 (equal to average: 3)



3
0.16889559289764922
