In [52]:
import os 
import numpy as np 
import pandas as pd
from collections import defaultdict
import itertools
import ast

In [53]:
# Titles that define the small subset; they are matched against the
# `original_title` column of the metadata table.
# NOTE(review): "Wonder Women" and "Guardians of the Galaxy Vol.2" never appear in the
# per-movie summary printed later in this notebook -- possibly misspelled; confirm against the data.
small_title_set = {
    "Kung Fu Panda 2",
    "Forrest Gump",
    "The Shawshank Redemption",
    "Pulp Fiction",
    "Wonder Women",
    "Beauty and the Beast",
    "Baby Driver",
    "Big Hero 6",
    "Deadpool",
    "Guardians of the Galaxy Vol.2",
    "The Avengers",
    "Dawn of the Planet of the Apes",
    "The Chain Reaction",
}

# A user rating is only counted when it is at least this value.
rating_threshold = 3.5

# A movie is only kept when it has at least this many counted ratings.
user_count_threshold = 10

Beauty and the Beast
The Shawshank Redemption
The Avengers
The Chain Reaction
Guardians of the Galaxy Vol.2
Big Hero 6
Wonder Women
Deadpool
Forrest Gump
Pulp Fiction
Baby Driver
Dawn of the Planet of the Apes
Kung Fu Panda 2
13


In [None]:
# List every selected title on its own line, followed by the number of titles.
for title in small_title_set:
    print(title)
print(len(small_title_set))

In [54]:
#hyper parameters
#train, dev, test split fractions (not referenced by any of the cells visible here)
split = [0.8, 0.1, 0.1] #train, dev, test split
#alternate (disabled) input location:
#INPUT_DIR = '../../../data/the-movies-dataset/'
#directory the csv files are read from; file names are appended by plain string concatenation, so keep the trailing '/'
INPUT_DIR ='../../data/the-movies-dataset/'
#directory the generated .txt files are written to; same trailing '/' requirement
OUTPUT_DIR = '../box-code/data/movie_data/movie_big/small/'

In [55]:
# Read the raw csv files of the complete dataset: user ratings and movie metadata.
ratings_file = INPUT_DIR + 'ratings.csv'
metadata_file = INPUT_DIR + 'movies_metadata.csv'

df = pd.read_csv(ratings_file, sep=',')
df.dataframeName = 'ratings.csv'  # plain attribute used as a label for the frame

df_metadata = pd.read_csv(metadata_file, sep=',')
df_metadata.dataframeName = 'metadata.csv'

# Preview the first five rating rows.
print(df.head())

   userId  movieId  rating   timestamp
0       1      110     1.0  1425941529
1       1      147     4.5  1425942435
2       1      858     5.0  1425941523
3       1     1221     5.0  1425941546
4       1     1246     5.0  1425941556


In [56]:
# Read the id-link table (movieId / imdbId / tmdbId columns) and preview its first five rows.
links_file = INPUT_DIR + 'links.csv'
df_link = pd.read_csv(links_file, sep=',')
df_link.dataframeName = 'links.csv'  # plain attribute used as a label for the frame
print(df_link.head())

   movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0
3        4  114885  31357.0
4        5  113041  11862.0


In [57]:
# Lookup tables between the two id systems found in the links table:
#   mv_id_convert:      imdb_id  -> movie_id
#   mv_id_back_convert: movie_id -> imdb_id (for presentation)
link_pairs = [
    (int(imdb_value), int(movie_value))
    for imdb_value, movie_value in zip(df_link['imdbId'], df_link['movieId'])
]
mv_id_convert = {imdb_id: movie_id for imdb_id, movie_id in link_pairs}
mv_id_back_convert = {movie_id: imdb_id for imdb_id, movie_id in link_pairs}

# Both sizes are printed so a mismatch (non-unique ids) would be visible.
print(len(mv_id_convert))
print(len(mv_id_back_convert))

45843
45843


In [58]:
# Keep only the metadata rows whose original_title is one of the selected titles;
# the row count is printed before and after the filter.
print(len(df_metadata))
title_mask = df_metadata['original_title'].isin(small_title_set)
df_metadata = df_metadata.loc[title_mask]
print(len(df_metadata))

45466
17


In [63]:
# Show every field of the metadata row whose index label is 292.
# The label is hard-coded, so this lookup raises KeyError if that row is not in df_metadata.
print(df_metadata.loc[292])

adult                                                                False
belongs_to_collection                                                  NaN
budget                                                             8000000
genres                   [{'id': 53, 'name': 'Thriller'}, {'id': 80, 'n...
homepage                                                               NaN
id                                                                     680
imdb_id                                                          tt0110912
original_language                                                       en
original_title                                                Pulp Fiction
overview                 A burger-loving hit man, his philosophical par...
popularity                                                          140.95
poster_path                                /dM2w364MScsjFf8pfMbaWUcWrR.jpg
production_companies     [{'name': 'Miramax Films', 'id': 14}, {'name':...
production_countries     

In [69]:
#extract the genre names from the string stored in a metadata 'genres' cell
def extractGenreID(text):
    """Return the list of genre names encoded in `text`.

    `text` is the string form of a list of dicts, each having a 'name' key;
    it is parsed with ast.literal_eval. Despite the function's name, the
    result holds genre names (spaces replaced by hyphens), not numeric ids.
    """
    return [entry['name'].replace(" ", "-") for entry in ast.literal_eval(text)]

# mv_genre:            movie id -> list of genre names attached to that movie
# genres:              left empty by this cell
# small_movie_id_set:  movie ids of every selected title found in the metadata
# movie_title_id_dict: movie id -> original title
mv_genre = {}
genres = {}
small_movie_id_set = set()
movie_title_id_dict = {}

for index, row in df_metadata.iterrows():
    title = row['original_title']
    try:
        imdb_raw = row['imdb_id']
        # Corrupted rows do not carry a "tt..." string in imdb_id; skip them.
        if not (isinstance(imdb_raw, str) and imdb_raw.startswith("tt")):
            continue
        imdb_number = int(imdb_raw[2:])
        # Skip imdb ids that the id conversion dict does not know.
        if imdb_number not in mv_id_convert:
            continue
        this_mv_id = mv_id_convert[imdb_number]  # imdb id -> movie id
        # A movie id that occurs more than once would simply be overwritten here.
        mv_genre[this_mv_id] = extractGenreID(row['genres'])
        small_movie_id_set.add(this_mv_id)
        movie_title_id_dict[this_mv_id] = title
    except (ValueError, TypeError):
        # Report the offending imdb_id value and carry on with the next row.
        print("error in data")
        print(imdb_raw, type(imdb_raw))


In [70]:
# Both sizes should agree: one genre entry per collected movie id.
print(len(mv_genre), len(small_movie_id_set), sep="\n")

17
17


In [71]:
# Keep only the ratings that belong to one of the selected movies;
# the row count is printed before and after the filter.
print(len(df))
wanted_rows = df['movieId'].isin(small_movie_id_set)
df = df.loc[wanted_rows]
print(len(df))

26024289
355784


In [72]:
#preprocess data
# DataFrame.drop returns a new frame, so its result has to be assigned for the
# column to actually go away; errors='ignore' keeps the cell re-runnable once the
# column is already gone.
df = df.drop(columns=['timestamp'], errors='ignore')
df = df[df.rating >= rating_threshold]#Drop all entries rated below the threshold

# movie_dict: key: movie_id, val: list of user_ids whose rating of the movie
# is at least rating_threshold. Iterating the two columns directly visits the
# rows in the same order as row-wise iteration without building a Series per row.
movie_dict = defaultdict(list)
for movieId, usrId in zip(df['movieId'], df['userId']):
    movieId = int(movieId)
    if movieId not in small_movie_id_set:
        continue
    movie_dict[movieId].append(int(usrId))

# Drop every movie with fewer than user_count_threshold counted ratings
# (iterate over a snapshot of the keys because entries are deleted in the loop).
for key in list(movie_dict.keys()):
    if len(movie_dict[key]) < user_count_threshold:
        del movie_dict[key]

In [73]:
# Number of surviving movies, then for each one: its id, its count of qualifying ratings, and its title.
print(len(movie_dict))
for movie_id, user_ids in movie_dict.items():
    print(movie_id, len(user_ids))
    print(movie_title_id_dict[movie_id])

12
296 72361
Pulp Fiction
318 83240
The Shawshank Redemption
595 25407
Beauty and the Beast
356 72973
Forrest Gump
89745 11447
The Avengers
112623 2683
Dawn of the Planet of the Apes
122904 7354
Deadpool
115617 7179
Big Hero 6
2153 383
The Avengers
168366 367
Beauty and the Beast
87222 1714
Kung Fu Panda 2
171763 255
Baby Driver


In [74]:
def marginal_prob(movie_id, count_matrix, num_users):
    """Return the marginal probability P(movie_id).

    The value is the occurrence count stored for `movie_id` in `count_matrix`
    divided by `num_users`.
    """
    return count_matrix[movie_id] / num_users

def joint_prob(movie_id1, movie_id2, count_matrix, num_users):
    """Return the joint probability P(movie_id1, movie_id2).

    `count_matrix` is indexed by the pair with the smaller id first, so the
    two ids are put in that order before the lookup; the stored count is
    then divided by `num_users`.
    """
    if movie_id1 <= movie_id2:
        pair = (movie_id1, movie_id2)
    else:
        pair = (movie_id2, movie_id1)
    return count_matrix[pair] / num_users

def conditional_prob(movie_id1, movie_id2, count_matrix, marginal_matrix):
    """Return the conditional probability P(movie_id1 | movie_id2).

    The co-occurrence count is looked up in `count_matrix` under the pair with
    the smaller id first and divided by the count stored for `movie_id2` in
    `marginal_matrix`. When the co-occurrence count is zero the integer 0 is
    returned and `marginal_matrix` is not consulted.
    """
    if movie_id1 <= movie_id2:
        pair = (movie_id1, movie_id2)
    else:
        pair = (movie_id2, movie_id1)

    co_count = count_matrix[pair]
    if co_count == 0:
        return 0
    return co_count / marginal_matrix[movie_id2]

#given the ordered key pair (a, b), produce one record per direction
def data_generation(a,b, count_matrix, marginal_matrix):
    """Return two (pair, probability) records for the ids `a` and `b`.

    The first record is ((a, b), P(b|a)) and the second is ((b, a), P(a|b)),
    both probabilities coming from conditional_prob.
    """
    a_given_b = conditional_prob(a, b, count_matrix, marginal_matrix)
    b_given_a = conditional_prob(b, a, count_matrix, marginal_matrix)
    forward = ((a, b), b_given_a)
    backward = ((b, a), a_given_b)
    return (forward, backward)

In [75]:
# Ids of the movies that are still present in movie_dict after filtering.
movie_id_set = set(movie_dict)
print(len(movie_id_set))


12


In [76]:
# Remove from mv_genre (in place) every movie that is not in movie_id_set;
# the size is printed before and after.
print(len(mv_genre))
stale_ids = [mv_id for mv_id in mv_genre if mv_id not in movie_id_set]
for mv_id in stale_ids:
    mv_genre.pop(mv_id)
print(len(mv_genre))

17
12


In [77]:
#create the movie_genre_master file
# One tab-separated line per (movie, genre) pair: IsA <movie id> <genre name> 1
# The `with` block closes the file even when a write raises part-way through.
with open(OUTPUT_DIR+"movie_genre_master.txt", "w") as file:
    for movie_id, movie_genres in mv_genre.items():
        for genre_id in movie_genres:
            file.write("IsA\t"+str(movie_id)+"\t" + str(genre_id) + "\t" + str(1)+"\n")

In [78]:
# user_dict: key: user id, val: movie ids that user rated at or above the threshold
user_dict = defaultdict(list)
for movie_id, user_ids in movie_dict.items():
    for user_id in user_ids:
        user_dict[user_id].append(movie_id)

# Collapse repeated ratings so each movie appears once per user.
for user_id in user_dict:
    user_dict[user_id] = list(set(user_dict[user_id]))

# movie_movie_count:          key: (id_i, id_j) with id_i <= id_j, val: number of users having both movies
# movie_movie_count_marginal: key: movie id, val: number of users having that movie
# (single-movie counts are kept only in the marginal table, never in the pair table)
movie_movie_count = defaultdict(int)
movie_movie_count_marginal = defaultdict(int)
for rated_movies in user_dict.values():
    for movie_id in rated_movies:
        movie_movie_count_marginal[movie_id] += 1
    # combinations() yields the (i, j) pairs with i < j in the same order as a nested index loop
    for first, second in itertools.combinations(rated_movies, 2):
        pair = (first, second) if first <= second else (second, first)
        movie_movie_count[pair] += 1
            

In [79]:
# gen_gen_count:          key: (genre_i, genre_j) ordered so genre_i <= genre_j, val: number of movies listing both
# gen_gen_count_marginal: key: genre, val: number of times the genre is listed across the movies
gen_gen_count = defaultdict(int)
gen_gen_count_marginal = defaultdict(int)

for movie_genres in mv_genre.values():
    for genre in movie_genres:
        gen_gen_count_marginal[genre] += 1
    # combinations() yields the (i, j) pairs with i < j in the same order as a nested index loop
    for first, second in itertools.combinations(movie_genres, 2):
        pair = (first, second) if first <= second else (second, first)
        gen_gen_count[pair] += 1

In [80]:
#create the movie vocab file: one movie id per line
# Each file is written inside a `with` block so the handle is closed even if a write raises.
marginal_keys = list(movie_movie_count_marginal.keys())
with open(OUTPUT_DIR+"movie_vocabulary.txt", "w") as file:
    for i in marginal_keys:
        file.write(str(i)+"\n")

#create the movie marginal prob file; line order matches the order of the movie ids in the vocab file
N = len(user_dict)#number of users that have at least one counted rating
with open(OUTPUT_DIR+"movie_marginal_prob.txt", "w") as file:
    for i in marginal_keys:
        file.write(str(marginal_prob(i, movie_movie_count_marginal, N))+"\n")

In [81]:
#create the genre vocab file: one genre name per line
# Each file is written inside a `with` block so the handle is closed even if a write raises.
marginal_keys = list(gen_gen_count_marginal.keys())
with open(OUTPUT_DIR+"genre_vocabulary.txt", "w") as file:
    for i in marginal_keys:
        file.write(str(i)+"\n")

#create the genre marginal prob file; line order matches the order of the genres in the vocab file
N = len(mv_genre)#number of total movies
with open(OUTPUT_DIR+"genre_marginal_prob.txt", "w") as file:
    for i in marginal_keys:
        file.write(str(marginal_prob(i, gen_gen_count_marginal, N))+"\n")

In [82]:
# Turn every co-occurring movie pair into its two directed records
# ((a, b), P(b|a)) and ((b, a), P(a|b)). A snapshot of the items is iterated
# because the count table is a defaultdict that is read again during generation.
movie_movie_master_data=[]
for (first_id, second_id), _pair_count in list(movie_movie_count.items()):
    movie_movie_master_data.extend(
        data_generation(first_id, second_id, movie_movie_count, movie_movie_count_marginal)
    )

#create the train data file: IsA <id> <id> <probability>, tab separated
# The `with` block closes the file even when a write raises part-way through.
with open(OUTPUT_DIR + "movie_movie_master.txt", "w") as file:
    for (head_id, tail_id), prob in movie_movie_master_data:
        # conditional_prob returns the integer 0 for pairs without co-occurrence; those are skipped
        if str(prob)!="0":
            file.write("IsA\t"+str(head_id)+"\t" + str(tail_id) + "\t" + str(prob)+"\n")

In [83]:
# Turn every co-occurring genre pair into its two directed records
# ((a, b), P(b|a)) and ((b, a), P(a|b)). A snapshot of the items is iterated
# because the count table is a defaultdict that is read again during generation.
gen_gen_master_data=[]
for (first_genre, second_genre), _pair_count in list(gen_gen_count.items()):
    gen_gen_master_data.extend(
        data_generation(first_genre, second_genre, gen_gen_count, gen_gen_count_marginal)
    )

#create the train data file: IsA <genre> <genre> <probability>, tab separated
# The `with` block closes the file even when a write raises part-way through.
with open(OUTPUT_DIR + "genre_genre_master.txt", "w") as file:
    for (head_genre, tail_genre), prob in gen_gen_master_data:
        # conditional_prob returns the integer 0 for pairs without co-occurrence; those are skipped
        if str(prob)!="0":
            file.write("IsA\t"+str(head_genre)+"\t" + str(tail_genre) + "\t" + str(prob)+"\n")