In [1]:
import os 
import numpy as np 
import pandas as pd
from collections import defaultdict
import itertools
import ast

In [2]:
# Hyper-parameters and I/O locations used by the preprocessing cells below.
# A rating is counted only when it is >= rating_threshold.
rating_threshold = 4
# A book is kept only when at least user_count_threshold users gave it a
# rating that passes rating_threshold (books with fewer are dropped).
user_count_threshold = 500
# Train / dev / test proportions.
# NOTE(review): not referenced by any cell visible in this notebook - confirm it is still needed.
split = [0.8, 0.1, 0.1]
# Earlier input location, kept for reference: '../../../data/the-movies-dataset/'
# Both directories are combined with file names by plain string concatenation,
# so the trailing '/' is required.
INPUT_DIR ='../../data/goodbooks-10k-master/'
OUTPUT_DIR = '../box-code/data/book_data/big/taxonomy/'

In [3]:
# Load the ratings table (one row per user/book rating).
ratings_file = os.path.join(INPUT_DIR, 'ratings.csv')
df = pd.read_csv(ratings_file, sep=',')
df.dataframeName = 'ratings.csv'

In [4]:
# Load the table that pairs every tag id with its tag name.
tag_name_file = os.path.join(INPUT_DIR, 'tags.csv')
df_tag_name = pd.read_csv(tag_name_file, sep=',')
df_tag_name.dataframeName = 'tags.csv'

In [5]:
# Load the book/tag table: each row links a goodreads_book_id to a tag_id.
bk_tag_file = os.path.join(INPUT_DIR, 'book_tags.csv')
df_bk_tag = pd.read_csv(bk_tag_file, sep=',')
df_bk_tag.dataframeName = 'book_tags.csv'

In [6]:
# Read the list of valid tag (genre) names.
# The file is treated as a one-column listing: the first non-blank line is the
# column header (the filter below addresses the column as `genres`) and every
# following non-blank line is one entry.
# pd.read_csv(..., sep="\n") is rejected by current pandas releases (a line
# terminator may not be used as the separator), so the lines are read directly.
genre_name_file = INPUT_DIR + "book-genres.txt"
with open(genre_name_file, encoding="utf-8") as genre_fh:
    genre_lines = [line.rstrip("\r\n") for line in genre_fh]
genre_lines = [line for line in genre_lines if line]  # blank lines carry no entry
if not genre_lines:
    raise ValueError("no header/entries found in " + genre_name_file)
df_valid_tag = pd.DataFrame({genre_lines[0]: genre_lines[1:]})
# Delete the lines that count how many books carry the tag (they contain a space).
df_valid_tag = df_valid_tag[~df_valid_tag.genres.str.contains(" ")]
# Label the frame after filtering; the filter above creates a new object.
df_valid_tag.dataframeName = 'book-genres.txt'
print(df_valid_tag.size)

1232


In [7]:
#number of tags obtained from website: 1232
#number of tags in the tag id vs name datafile:  34252
#overlap of the two: 833

In [8]:
# Load books.csv and keep only the two identifier columns; goodreads book ids
# are translated to book ids further down.
bk_id_file = os.path.join(INPUT_DIR, 'books.csv')
df_bk_id = pd.read_csv(bk_id_file, sep=',')
df_bk_id.dataframeName = 'books.csv'
df_bk_id = df_bk_id.loc[:, ['book_id', 'goodreads_book_id']]

In [9]:
# Lookup table from goodreads book id to book id (the id used for training).
# key: goodreads_book_id, val: book_id
id_name_dict = {
    goodreads_id: book_id
    for goodreads_id, book_id in df_bk_id[['goodreads_book_id', 'book_id']].values
}

In [10]:
# Set of valid tag names taken from the `genres` column of df_valid_tag.
valid_gen = set(df_valid_tag['genres'])
print(len(valid_gen))

1232


In [11]:
# Dictionary of valid tags only. key: tag id, value: tag name.
# A tag is valid when its name appears in valid_gen.
valid_tag_dict = {
    tag_id: tag_name
    for tag_id, tag_name in zip(df_tag_name['tag_id'], df_tag_name['tag_name'])
    if tag_name in valid_gen
}
print(len(valid_tag_dict))

833


In [12]:
# Pre-process the user ratings ahead of building book-book pairs.
# Keep only the ratings that are at or above rating_threshold.
df = df[df.rating >= rating_threshold]

# bk_usr_dict: key: book_id, val: list of user_ids that rated the book at or
# above the threshold.
# The two id columns are walked directly instead of using DataFrame.iterrows():
# iterrows builds a Series object for every row, which is needlessly slow on a
# table of this size and yields the same (book_id, user_id) values.
bk_usr_dict = defaultdict(list)
for book_id, user_id in zip(df['book_id'], df['user_id']):
    bk_usr_dict[int(book_id)].append(int(user_id))

# Drop every book whose number of qualifying ratings is below
# user_count_threshold. The keys to delete are collected first so the dict is
# not resized while it is being iterated.
sparse_books = [
    book_id for book_id, user_ids in bk_usr_dict.items()
    if len(user_ids) < user_count_threshold
]
for book_id in sparse_books:
    del bk_usr_dict[book_id]
print(len(bk_usr_dict))  # original author's note: 736 for 4.5 500

1723


In [14]:
# Number of books that survived the rating / user-count filtering.
print(len(bk_usr_dict))

1723


In [15]:
# bk_tag_dict: key: bk_id, val: list of valid tag names the book is labeled with.
# The id columns are walked directly instead of using DataFrame.iterrows(),
# which builds a Series per row and is needlessly slow on the full table.
bk_tag_dict = defaultdict(list)
valid_data_count = 0  # number of book-tag rows that pass both checks below
for gd_read_id, tag_id in zip(df_bk_tag['goodreads_book_id'], df_bk_tag['tag_id']):
    bk_id = id_name_dict[gd_read_id]  # raises KeyError for an id missing from books.csv
    # Keep the row only if the tag is a valid tag and the book survived the
    # rating / user-count filtering.
    if tag_id in valid_tag_dict and bk_id in bk_usr_dict:
        bk_tag_dict[bk_id].append(valid_tag_dict[tag_id])
        valid_data_count += 1

# Original author's note: there are totally 99912 rows in the bk-tag datafile;
# valid_data_count is the number of rows we use.
print(valid_data_count)

38212


In [16]:
# Book counts of the tag dictionary and of the user dictionary, one per line.
print(len(bk_tag_dict), len(bk_usr_dict), sep="\n")

1723
1723


In [30]:
def marginal_prob(movie_id, count_matrix, num_users):
    """Marginal probability P(movie_id).

    Looks up the count stored under ``movie_id`` in ``count_matrix`` and
    divides it by ``num_users``.
    """
    return count_matrix[movie_id] / num_users

def joint_prob(movie_id1, movie_id2, count_matrix, num_users):
    """Joint probability P(movie_id1, movie_id2).

    ``count_matrix`` is indexed by an ordered pair with the smaller id first,
    so the two ids are put in that order before the lookup. The stored count
    is divided by ``num_users``.
    """
    if movie_id1 <= movie_id2:
        pair = (movie_id1, movie_id2)
    else:
        pair = (movie_id2, movie_id1)
    return count_matrix[pair] / num_users

def conditional_prob(movie_id1, movie_id2, count_matrix, marginal_matrix):
    """Conditional probability P(movie_id1 | movie_id2).

    The co-occurrence count is read from ``count_matrix`` under the pair
    ordered with the smaller id first, then divided by the count stored for
    ``movie_id2`` in ``marginal_matrix``. When the co-occurrence count is 0
    the integer 0 is returned and ``marginal_matrix`` is not consulted.
    """
    if movie_id1 <= movie_id2:
        pair = (movie_id1, movie_id2)
    else:
        pair = (movie_id2, movie_id1)
    together = count_matrix[pair]
    if together == 0:
        return 0
    # Condition on the second argument.
    return together / marginal_matrix[movie_id2]

# For the ordered key (a, b), produce both directed records:
# ((a, b), p(b|a)) and ((b, a), p(a|b)).
def data_generation(a,b, count_matrix, marginal_matrix):
    """Return the two directed (pair, conditional probability) records for a and b."""
    prob_a_given_b = conditional_prob(a, b, count_matrix, marginal_matrix)
    prob_b_given_a = conditional_prob(b, a, count_matrix, marginal_matrix)
    forward = ((a, b), prob_b_given_a)
    backward = ((b, a), prob_a_given_b)
    return (forward, backward)

In [18]:
#create the movie_genre_master file
file = open(OUTPUT_DIR+"book_genre_master.txt", "w") 
for movie_id in list(bk_tag_dict.keys()):
    for genre_id in bk_tag_dict[movie_id]:
        file.write("IsA\t"+str(movie_id)+"\t" + str(genre_id) + "\t" + str(1)+"\n")
file.close() 

In [19]:
# usr_bk_dict: key: user id, val: the book ids this user rated at or above the
# threshold (inverse of bk_usr_dict).
usr_bk_dict = defaultdict(list)
for book_id, user_ids in bk_usr_dict.items():
    for user_id in user_ids:
        usr_bk_dict[user_id].append(book_id)

# Make sure a book appears at most once per user.
for user_id in usr_bk_dict:
    usr_bk_dict[user_id] = list(set(usr_bk_dict[user_id]))

# movie_movie_count: key: (bk_id_i, bk_id_j) with bk_id_i <= bk_id_j,
#                    val: number of users whose list contains both books
# movie_movie_count_marginal: key: bk_id, val: number of users whose list
#                    contains the book
movie_movie_count = defaultdict(int)
movie_movie_count_marginal = defaultdict(int)
for rated_books in usr_bk_dict.values():
    for book_id in rated_books:
        movie_movie_count_marginal[book_id] += 1
    # Every unordered pair once; single books are not counted here.
    for first, second in itertools.combinations(rated_books, 2):
        pair = (first, second) if first <= second else (second, first)
        movie_movie_count[pair] += 1
            

In [20]:
# gen_gen_count: key: (tag_i, tag_j) with tag_i <= tag_j,
#                val: number of books whose tag list contains both tags
# gen_gen_count_marginal: key: tag, val: number of occurrences of the tag
#                across all book tag lists
# NOTE(review): unlike the per-user book lists, the per-book tag lists are not
# de-duplicated before counting - confirm book_tags.csv has no repeated
# (book, tag) rows.
gen_gen_count = defaultdict(int)
gen_gen_count_marginal = defaultdict(int)

for book_tags in bk_tag_dict.values():
    for tag in book_tags:
        gen_gen_count_marginal[tag] += 1
    # Every pair of list positions once; single tags are not counted here.
    for first, second in itertools.combinations(book_tags, 2):
        pair = (first, second) if first <= second else (second, first)
        gen_gen_count[pair] += 1

In [21]:
# Create the book vocab file: one book id per line.
# `with` blocks close each file even if a write raises.
marginal_keys = list(movie_movie_count_marginal.keys())
with open(OUTPUT_DIR+"book_vocabulary.txt", "w") as vocab_fh:
    for book_id in marginal_keys:
        vocab_fh.write(str(book_id)+"\n")

# Create the book marginal prob file; line order matches the order of book
# ids in the vocab file because both loops walk marginal_keys.
N = len(usr_bk_dict)  # number of users
with open(OUTPUT_DIR+"book_marginal_prob.txt", "w") as prob_fh:
    for book_id in marginal_keys:
        prob_fh.write(str(marginal_prob(book_id, movie_movie_count_marginal, N))+"\n")

In [22]:
# Create the genre vocab file: one tag name per line.
# `with` blocks close each file even if a write raises.
marginal_keys = list(gen_gen_count_marginal.keys())
with open(OUTPUT_DIR+"genre_vocabulary.txt", "w") as vocab_fh:
    for genre in marginal_keys:
        vocab_fh.write(str(genre)+"\n")

# Create the genre marginal prob file; line order matches the order of tags
# in the vocab file because both loops walk marginal_keys.
N = len(bk_tag_dict)  # number of books that have at least one valid tag
with open(OUTPUT_DIR+"genre_marginal_prob.txt", "w") as prob_fh:
    for genre in marginal_keys:
        prob_fh.write(str(marginal_prob(genre, gen_gen_count_marginal, N))+"\n")

In [23]:
# Build both directed conditional-probability records for every counted
# book pair. The items are snapshotted into a list before iterating.
temp = list(movie_movie_count.items())
movie_movie_master_data = []
for (book_a, book_b), _pair_count in temp:
    movie_movie_master_data.extend(
        data_generation(book_a, book_b, movie_movie_count, movie_movie_count_marginal)
    )

# Create the train data file: "IsA <id> <id> <probability>" (tab separated).
# Records whose probability prints as "0" are skipped.
# The `with` block closes the file even if a write raises.
with open(OUTPUT_DIR + "book_book_master.txt", "w") as master_fh:
    for (first_id, second_id), prob in movie_movie_master_data:
        if str(prob) != "0":
            master_fh.write("\t".join(("IsA", str(first_id), str(second_id), str(prob))) + "\n")

In [31]:
# Build both directed conditional-probability records for every counted
# tag pair. The items are snapshotted into a list before iterating.
temp = list(gen_gen_count.items())
gen_gen_master_data = []
for (tag_a, tag_b), _pair_count in temp:
    gen_gen_master_data.extend(
        data_generation(tag_a, tag_b, gen_gen_count, gen_gen_count_marginal)
    )

# Create the train data file: "IsA <tag> <tag> <probability>" (tab separated).
# Records whose probability prints as "0" are skipped.
# The `with` block closes the file even if a write raises.
with open(OUTPUT_DIR + "genre_genre_master.txt", "w") as master_fh:
    for (first_tag, second_tag), prob in gen_gen_master_data:
        if str(prob) != "0":
            master_fh.write("\t".join(("IsA", str(first_tag), str(second_tag), str(prob))) + "\n")

------
id1:  fantasy id2:  young-adult
545
905 814
545 814 0.6695331695331695
------
------
id1:  young-adult id2:  fantasy
0
814 905
545 905 0.6022099447513812
------
------
id1:  fantasy id2:  fiction
897
905 1664
897 1664 0.5390625
------
------
id1:  fiction id2:  fantasy
0
1664 905
897 905 0.9911602209944751
------
------
id1:  fantasy id2:  magic
408
905 411
408 411 0.9927007299270073
------
------
id1:  magic id2:  fantasy
0
411 905
408 905 0.450828729281768
------
------
id1:  childrens id2:  fantasy
234
304 905
234 905 0.2585635359116022
------
------
id1:  fantasy id2:  childrens
0
905 304
234 304 0.7697368421052632
------
------
id1:  adventure id2:  fantasy
613
781 905
613 905 0.6773480662983425
------
------
id1:  fantasy id2:  adventure
0
905 781
613 781 0.7848911651728553
------
------
id1:  classics id2:  fantasy
391
769 905
391 905 0.4320441988950276
------
------
id1:  fantasy id2:  classics
0
905 769
391 769 0.5084525357607282
------
------
id1:  fantasy id2:  novels

------
id1:  economics id2:  fantasy
2
25 905
2 905 0.0022099447513812156
------
------
id1:  fantasy id2:  economics
0
905 25
2 25 0.08
------
------
id1:  business id2:  fantasy
2
46 905
2 905 0.0022099447513812156
------
------
id1:  fantasy id2:  business
0
905 46
2 46 0.043478260869565216
------
------
id1:  fantasy id2:  non-fiction
32
905 265
32 265 0.12075471698113208
------
------
id1:  non-fiction id2:  fantasy
0
265 905
32 905 0.03535911602209945
------
------
id1:  fantasy id2:  modern
40
905 109
40 109 0.3669724770642202
------
------
id1:  modern id2:  fantasy
0
109 905
40 905 0.04419889502762431
------
------
id1:  fantasy id2:  high-school
126
905 308
126 308 0.4090909090909091
------
------
id1:  high-school id2:  fantasy
0
308 905
126 905 0.13922651933701657
------
------
id1:  fantasy id2:  novella
21
905 36
21 36 0.5833333333333334
------
------
id1:  novella id2:  fantasy
0
36 905
21 905 0.023204419889502764
------
------
id1:  fantasy id2:  short-stories
74
905 12

------
id1:  fantasy id2:  ireland
12
905 27
12 27 0.4444444444444444
------
------
id1:  ireland id2:  fantasy
0
27 905
12 905 0.013259668508287293
------
------
id1:  fantasy id2:  irish-literature
2
905 11
2 11 0.18181818181818182
------
------
id1:  irish-literature id2:  fantasy
0
11 905
2 905 0.0022099447513812156
------
------
id1:  christmas id2:  fantasy
11
13 905
11 905 0.012154696132596685
------
------
id1:  fantasy id2:  christmas
0
905 13
11 13 0.8461538461538461
------
------
id1:  fantasy id2:  holiday
8
905 10
8 10 0.8
------
------
id1:  holiday id2:  fantasy
0
10 905
8 905 0.008839779005524863
------
------
id1:  fantasy id2:  scotland
10
905 13
10 13 0.7692307692307693
------
------
id1:  scotland id2:  fantasy
0
13 905
10 905 0.011049723756906077
------
------
id1:  fantasy id2:  time-travel-romance
6
905 6
6 6 1.0
------
------
id1:  time-travel-romance id2:  fantasy
0
6 905
6 905 0.0066298342541436465
------
------
id1:  fantasy id2:  vampires
129
905 130
129 130

2 905
1 905 0.0011049723756906078
------
------
id1:  fantasy id2:  shonen
1
905 1
1 1 1.0
------
------
id1:  shonen id2:  fantasy
0
1 905
1 905 0.0011049723756906078
------
------
id1:  fantasy id2:  shinigami
1
905 1
1 1 1.0
------
------
id1:  shinigami id2:  fantasy
0
1 905
1 905 0.0011049723756906078
------
------
id1:  fantasy id2:  graphic-novels-comics-manga
4
905 4
4 4 1.0
------
------
id1:  graphic-novels-comics-manga id2:  fantasy
0
4 905
4 905 0.004419889502762431
------
------
id1:  fantasy id2:  heroic-fantasy
22
905 22
22 22 1.0
------
------
id1:  heroic-fantasy id2:  fantasy
0
22 905
22 905 0.02430939226519337
------
------
id1:  fantasy id2:  sword-and-sorcery
21
905 21
21 21 1.0
------
------
id1:  sword-and-sorcery id2:  fantasy
0
21 905
21 905 0.023204419889502764
------
------
id1:  contemporary-romance id2:  fantasy
20
134 905
20 905 0.022099447513812154
------
------
id1:  fantasy id2:  contemporary-romance
0
905 134
20 134 0.14925373134328357
------
------
id

id1:  dungeons-and-dragons id2:  fantasy
1
1 905
1 905 0.0011049723756906078
------
------
id1:  fantasy id2:  dungeons-and-dragons
0
905 1
1 1 1.0
------
------
id1:  fantasy id2:  romantic-suspense
2
905 14
2 14 0.14285714285714285
------
------
id1:  romantic-suspense id2:  fantasy
0
14 905
2 905 0.0022099447513812156
------
------
id1:  fantasy id2:  shojo
1
905 2
1 2 0.5
------
------
id1:  shojo id2:  fantasy
0
2 905
1 905 0.0011049723756906078
------
------
id1:  fantasy id2:  manga-romance
1
905 2
1 2 0.5
------
------
id1:  manga-romance id2:  fantasy
0
2 905
1 905 0.0011049723756906078
------
------
id1:  fantasy id2:  true-crime
1
905 14
1 14 0.07142857142857142
------
------
id1:  true-crime id2:  fantasy
0
14 905
1 905 0.0011049723756906078
------
------
id1:  fantasy id2:  true-story
1
905 24
1 24 0.041666666666666664
------
------
id1:  true-story id2:  fantasy
0
24 905
1 905 0.0011049723756906078
------
------
id1:  bdsm id2:  fantasy
3
8 905
3 905 0.0033149171270718232

50 50 1.0
------
------
id1:  aliens id2:  fantasy
50
50 905
50 905 0.055248618784530384
------
------
id1:  literature id2:  fantasy
0
772 905
321 905 0.3546961325966851
------
------
id1:  fantasy id2:  literature
321
905 772
321 772 0.41580310880829013
------
------
id1:  science id2:  fantasy
0
222 905
155 905 0.1712707182320442
------
------
id1:  fantasy id2:  science
155
905 222
155 222 0.6981981981981982
------
------
id1:  speculative-fiction id2:  fantasy
0
293 905
291 905 0.32154696132596683
------
------
id1:  fantasy id2:  speculative-fiction
291
905 293
291 293 0.9931740614334471
------
------
id1:  fantasy id2:  20th-century
0
905 486
245 486 0.5041152263374485
------
------
id1:  20th-century id2:  fantasy
245
486 905
245 905 0.27071823204419887
------
------
id1:  modern-classics id2:  fantasy
0
168 905
46 905 0.05082872928176796
------
------
id1:  fantasy id2:  modern-classics
46
905 168
46 168 0.27380952380952384
------
------
id1:  unfinished id2:  fantasy
0
527 90

id1:  fantasy id2:  greek-mythology
16
905 16
16 16 1.0
------
------
id1:  fantasy id2:  ancient-history
0
905 6
5 6 0.8333333333333334
------
------
id1:  ancient-history id2:  fantasy
5
6 905
5 905 0.0055248618784530384
------
------
id1:  fantasy id2:  classical-studies
0
905 6
5 6 0.8333333333333334
------
------
id1:  classical-studies id2:  fantasy
5
6 905
5 905 0.0055248618784530384
------
------
id1:  plays id2:  fantasy
0
37 905
13 905 0.014364640883977901
------
------
id1:  fantasy id2:  plays
13
905 37
13 37 0.35135135135135137
------
------
id1:  theatre id2:  fantasy
0
29 905
7 905 0.0077348066298342545
------
------
id1:  fantasy id2:  theatre
7
905 29
7 29 0.2413793103448276
------
------
id1:  tragedy id2:  fantasy
0
37 905
7 905 0.0077348066298342545
------
------
id1:  fantasy id2:  tragedy
7
905 37
7 37 0.1891891891891892
------
------
id1:  fantasy id2:  17th-century
0
905 14
6 14 0.42857142857142855
------
------
id1:  17th-century id2:  fantasy
6
14 905
6 905 0.

21
905 30
21 30 0.7
------
------
id1:  french-literature id2:  fantasy
0
16 905
9 905 0.009944751381215469
------
------
id1:  fantasy id2:  french-literature
9
905 16
9 16 0.5625
------
------
id1:  swashbuckling id2:  fantasy
0
2 905
1 905 0.0011049723756906078
------
------
id1:  fantasy id2:  swashbuckling
1
905 2
1 2 0.5
------
------
id1:  theory id2:  fantasy
0
8 905
1 905 0.0011049723756906078
------
------
id1:  fantasy id2:  theory
1
905 8
1 8 0.125
------
------
id1:  medical id2:  fantasy
0
31 905
1 905 0.0011049723756906078
------
------
id1:  fantasy id2:  medical
1
905 31
1 31 0.03225806451612903
------
------
id1:  fantasy id2:  disease
0
905 5
1 5 0.2
------
------
id1:  disease id2:  fantasy
1
5 905
1 905 0.0011049723756906078
------
------
id1:  fantasy id2:  biology
0
905 10
1 10 0.1
------
------
id1:  biology id2:  fantasy
1
10 905
1 905 0.0011049723756906078
------
------
id1:  fantasy id2:  dinosaurs
0
905 2
2 2 1.0
------
------
id1:  dinosaurs id2:  fantasy
2

3 905 0.0033149171270718232
------
------
id1:  new-age id2:  fantasy
0
5 905
1 905 0.0011049723756906078
------
------
id1:  fantasy id2:  new-age
1
905 5
1 5 0.2
------
------
id1:  fantasy id2:  birds
0
905 4
3 4 0.75
------
------
id1:  birds id2:  fantasy
3
4 905
3 905 0.0033149171270718232
------
------
id1:  puzzles id2:  fantasy
0
1 905
1 905 0.0011049723756906078
------
------
id1:  fantasy id2:  puzzles
1
905 1
1 1 1.0
------
------
id1:  genetics id2:  fantasy
0
4 905
1 905 0.0011049723756906078
------
------
id1:  fantasy id2:  genetics
1
905 4
1 4 0.25
------
------
id1:  low-fantasy id2:  fantasy
0
5 905
5 905 0.0055248618784530384
------
------
id1:  fantasy id2:  low-fantasy
5
905 5
5 5 1.0
------
------
id1:  islam id2:  fantasy
0
12 905
1 905 0.0011049723756906078
------
------
id1:  fantasy id2:  islam
1
905 12
1 12 0.08333333333333333
------
------
id1:  folk-tales id2:  fantasy
0
6 905
4 905 0.004419889502762431
------
------
id1:  fantasy id2:  folk-tales
4
905 6
