In [1]:
import os 
import numpy as np 
import pandas as pd
from collections import defaultdict
import itertools
import ast

In [2]:
#hyper parameters
rating_threshold = 4.5 #only counted the usr rating if it is rated above the rating threshold
user_count_threshold = 500 #only keep the bk_id if the bk is rated by user for more than user_count_threshold times
split = [0.8, 0.1, 0.1] #train, dev, test split
#INPUT_DIR = '../../../data/the-movies-dataset/'
INPUT_DIR ='../../data/the-movies-dataset/'
OUTPUT_DIR = '../box-code/data/movie_data/movie_big/taxonomy/'

In [3]:
#read in raw data
#prepare data for the complete dataset
ratings_file = INPUT_DIR + 'ratings.csv'
df = pd.read_csv(ratings_file, delimiter=',')
df.dataframeName = 'ratings.csv'
metadata_file = INPUT_DIR + 'movies_metadata.csv'
df_metadata = pd.read_csv(metadata_file, delimiter=',')
df_metadata.dataframeName = 'metadata.csv'

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
#preprocess data
df.drop(columns=['timestamp'])
df = df[df.rating >= rating_threshold]#Drop all entry with rating lower than the threshold

movie_dict = defaultdict(list)#dict with key: movie_id, val: the list of user_ids rated the movie above the rating threshold
for index, row in df.iterrows():    
    movieId = int(row['movieId'])
    usrId = int(row['userId'])
    movie_dict[movieId].append(usrId)

for key in list(movie_dict.keys()):#Drop all movie entry if number of rates below the user count threshold 
    if len(movie_dict[key]) < user_count_threshold:
        del movie_dict[key]

In [5]:
def marginal_prob(movie_id, count_matrix, num_users):
    '''function to get the marginal prob:
       P(movie_id1)       
    '''
    margn_count = count_matrix[ movie_id]
    return margn_count/num_users

def joint_prob(movie_id1, movie_id2, count_matrix, num_users):
    '''function to get the joint prob:
        P(movie_id1, movie_id2)
    '''
    key = (movie_id1, movie_id2) if movie_id1<= movie_id2 else (movie_id2, movie_id1) 
    joint_count = count_matrix[key] 
    return joint_count/num_users

def conditional_prob(movie_id1, movie_id2, count_matrix, marginal_matrix):
    '''function to get the conditional prob:
        P(movie_id1 | movie_id2)       
    '''
    key = (movie_id1, movie_id2) if movie_id1<= movie_id2 else (movie_id2, movie_id1)
    joint_count = count_matrix[key]
    if joint_count == 0:
        return 0
    margn_count = marginal_matrix[movie_id2]
    return joint_count/margn_count

#given ordered pair of key(a,b), generate both p(b|a) and p(a|b)
def data_generation(a,b, count_matrix, marginal_matrix):
    p_a_b = conditional_prob(a, b, count_matrix, marginal_matrix)#p(a|b)
    p_b_a = conditional_prob(b, a, count_matrix, marginal_matrix)#p(b|a)
    return(((a,b),p_b_a), ((b,a),p_a_b))

In [6]:
movie_id_set = set(movie_dict.keys())#make a set of valid movie ids used in our data after filtering
print(len(movie_id_set))


1817


In [7]:
#extract genre ID as integers from data
def extractGenreID(text, MAX_MV_ID ):
    tmp = ast.literal_eval(text)
    #ids = []
    gen = []
    for d in tmp:
       # ids.append(int(d['id'])+MAX_MV_ID)
        gen.append(d['name'])
   # return [ids, gen]
    return gen

In [8]:
MAX_MV_ID = 163949+1
mv_genre = {}#dict of movie and labeled genre pairs, key: movie id, value: list of genre id
genres = {} #dict for genres, key: genre id, value: genre name in English
for i in range(len(df_metadata)):
    try:
        this_mv_id = int(df_metadata.loc[i].id)
        this_mv_gen = extractGenreID(df_metadata.loc[i].genres, MAX_MV_ID)
        '''
        this_mv_genre_list = ids  
        for i in range(len(ids)):
            genres[ids[i]] = gen[i] #have tested every genre and genre_id pair are unique
        '''
        mv_genre[this_mv_id ] = this_mv_gen #have tested every mv_id only appear once in the dataframe        
    except ValueError:
        #print(i, df_metadata.loc[i].id,df_metadata.loc[i].genres )
        print("error in data")

error in data
error in data
error in data


In [9]:
print(len(mv_genre))
for k in list(mv_genre.keys()):#double check if all movie_genre_pairs are the valid movies after filtering
    if k not in movie_id_set:
        del mv_genre[k]
print(len(mv_genre))

45433
736


In [10]:
#create the movie_genre_master file
file = open(OUTPUT_DIR+"movie_genre_master.txt", "w") 
for movie_id in list(mv_genre.keys()):
    for genre_id in mv_genre[movie_id]:
        file.write("IsA\t"+str(movie_id)+"\t" + str(genre_id) + "\t" + str(1)+"\n")
file.close() 

In [11]:
user_dict = defaultdict(list)#dict with key: user, val: the movie_id this usr rated above given threshold
for key, val in movie_dict.items():
    for i in range(len(val)):
        user_dict[i].append(key)
        
#to make sure if a user rate a movie, the user only rate the movie once
for key, val in user_dict.items():
    user_dict[key] = list(set(val))

#movie_movie_count: key: (bk_id_i, bk_id_j) (bk_id_i != bk_id_j and bk_id_i < bk_id_j) 
#                 val: co-ocurrence count of bk_id_i and bk_id_j
#movie_movie_count_marginal: key: (bk_id_i) val: ocurrence count of bk_id_i in the dataset
movie_movie_count =  defaultdict(lambda: 0)
movie_movie_count_marginal = defaultdict(lambda:0)
for key, val in user_dict.items():
    for i in range(len(val)):
        movie_movie_count_marginal[val[i]] += 1 #marginal count
        for j in range(i+1, len(val)):#marginal count not included in book_book_count
            movie_movie_count[(val[i], val[j]) if val[i] <= val[j] else (val[j], val[i])] += 1
            

In [12]:
gen_gen_count =  defaultdict(lambda: 0)
gen_gen_count_marginal = defaultdict(lambda:0)

for key, val in mv_genre.items():
     for i in range(len(val)):
        gen_gen_count_marginal[val[i]] += 1 #marginal count
        for j in range(i+1, len(val)):#marginal count not included in book_book_count
            gen_gen_count[(val[i], val[j]) if val[i] <= val[j] else (val[j], val[i])] += 1

In [13]:
#create the movie vocab file
file = open(OUTPUT_DIR+"movie_vocabulary.txt", "w") 
marginal_keys = list(movie_movie_count_marginal.keys())
for i in marginal_keys:
    file.write(str(i)+"\n") 
file.close() 

#create the movie marginal prob file, order of value match order of books id in vocab file
file = open(OUTPUT_DIR+"movie_marginal_prob.txt", "w") 
N = len(user_dict)#number of user
for i in marginal_keys:
    file.write(str(marginal_prob(i, movie_movie_count_marginal, N))+"\n")
file.close() 

In [14]:
#create the genre vocab file
file = open(OUTPUT_DIR+"genre_vocabulary.txt", "w") 
marginal_keys = list(gen_gen_count_marginal.keys())
for i in marginal_keys:
    file.write(str(i)+"\n") 
file.close() 

#create the genre marginal prob file, order of value match order of books id in vocab file
file = open(OUTPUT_DIR+"genre_marginal_prob.txt", "w") 
N = len(mv_genre)#number of total movies
for i in marginal_keys:
    file.write(str(marginal_prob(i, gen_gen_count_marginal, N))+"\n")
file.close() 

In [15]:
temp = list(movie_movie_count.items())
movie_movie_master_data=[]
for data in temp:
    a,b = data_generation(data[0][0],data[0][1], movie_movie_count, movie_movie_count_marginal)
    movie_movie_master_data.append(a)
    movie_movie_master_data.append(b)
#create the train data file
file = open(OUTPUT_DIR + "movie_movie_master.txt", "w") 
for i in movie_movie_master_data:
    if str(i[1])!="0":
        file.write("IsA\t"+str(i[0][0])+"\t" + str(i[0][1]) + "\t" + str(i[1])+"\n")
file.close() 

In [16]:
temp = list(gen_gen_count.items())
gen_gen_master_data=[]
for data in temp:
    a,b = data_generation(data[0][0],data[0][1], gen_gen_count, gen_gen_count_marginal)
    gen_gen_master_data.append(a)
    gen_gen_master_data.append(b)
#create the train data file
file = open(OUTPUT_DIR + "genre_genre_master.txt", "w") 
for i in gen_gen_master_data:
    if str(i[1])!="0":
        file.write("IsA\t"+str(i[0][0])+"\t" + str(i[0][1]) + "\t" + str(i[1])+"\n")
file.close() 