In [None]:
import os 
import numpy as np 
import pandas as pd
from collections import defaultdict
import itertools
import random

In [None]:
# Directory holding the dataset CSV files. File names are appended to this string
# directly in later cells, so the trailing slash is required.
datadir = '../../data/goodbooks-10k-master/'

In [None]:
# Load the ratings table (read below through its book_id, user_id and rating columns).
ratings_file = '{}{}'.format(datadir, 'ratings.csv')
df = pd.read_csv(ratings_file, sep=',')

In [None]:
# Tag the frame with the name of the file it was loaded from, then list its columns.
# NOTE(review): this is a plain attribute set on the DataFrame object; frames derived from
# df later (e.g. by filtering) presumably do not carry it — confirm it is still needed.
df.dataframeName = 'ratings.csv'
print(df.columns)

In [None]:
# Show the first five rows of the ratings table.
print(df.head(5))

In [None]:
# Sanity check: print (max, min) for each column, then the number of rating rows.
# Author's note: the set of user_id values has 53424 entries here.
for column in ('book_id', 'user_id', 'rating'):
    values = df[column]
    print (max(values), min(values))
print(len(df))

In [None]:
# Keep only the ratings at or above rating_threshold; every lower-rated entry is dropped.
rating_threshold = 3.5
df = df.loc[df['rating'] >= rating_threshold]
print(len(df))
# Distinct users and distinct books that survive the rating filter.
user_id_set = set(df['user_id'])
movie_id_set = set(df['book_id'])
print(len(user_id_set))
print(len(movie_id_set))

In [None]:
#dict with key: book_id, val: the list of user_ids that rated the book at or above the rating threshold
book_dict = defaultdict(list)
# Walk the two id columns directly: iterrows() builds a Series object for every row,
# which is very slow on a table of this size, while zip yields the same values in the same order.
for bookId, usrId in zip(df['book_id'], df['user_id']):
    book_dict[int(bookId)].append(int(usrId))
print(len(book_dict))

In [None]:
# User-count threshold: remove every book liked by fewer than user_count_threshold users
# (a book with exactly user_count_threshold users is kept).
user_count_threshold = 100
rarely_rated = [bk for bk, raters in book_dict.items() if len(raters) < user_count_threshold]
for bk in rarely_rated:
    del book_dict[bk]
print(len(book_dict))

In [None]:
# Invert book_dict: key is a user id, value is the list of book ids that user rated at or
# above the rating threshold. The rating value itself is not used: a 3.5 and a 5 count the
# same once they pass the threshold.
user_dict = defaultdict(list)
for bk, raters in book_dict.items():
    for usr in raters:
        user_dict[usr].append(bk)
# Author's note: 53304 if user_count_threshold = 1000
print(len(user_dict))

In [None]:
### Question: can we use ratings below some cutoff (for example 2.5) as a negative association?
# Answer: a negative association is simply no association. Low ratings could instead be used to calculate how strongly people dislike a pair of books together.

In [None]:
# Make sure each book appears at most once in a user's list, even if the user rated it more than once.
for usr in user_dict:
    user_dict[usr] = list(set(user_dict[usr]))

In [None]:
# Co-occurrence statistics over each user's list of liked books.
#   book_book_count          key: (bk_id_i, bk_id_j) with the smaller id first; val: number of users whose list holds both
#   book_book_count_marginal key: bk_id_i; val: number of user lists the book appears in
# Each book is paired only with the books that follow it in the list, so the marginal
# (book, book) counts live in the separate marginal dict rather than in book_book_count.
# NOTE(review): the author's earlier notes record a book_book_count length of 20537173 without
# the marginal entries and 20547173 with them — confirm against a fresh run.
book_book_count =  defaultdict(lambda: 0)
book_book_count_marginal = defaultdict(lambda:0)
for liked_books in user_dict.values():
    for book in liked_books:
        book_book_count_marginal[book] += 1
    for first, second in itertools.combinations(liked_books, 2):
        pair = (first, second) if first <= second else (second, first)
        book_book_count[pair] += 1

In [None]:
# Number of distinct book pairs counted, then number of distinct books with a marginal count.
print(len(book_book_count))
print(len(book_book_count_marginal))

In [None]:
# Spot-check a few counts. Use .get() rather than [] here: indexing a defaultdict with a
# missing key inserts that key, and a (10, 10) entry with count 0 would then be carried
# into the pair list that is split into train/dev/test further down.
print(book_book_count.get((10, 11), 0))
print(book_book_count.get((10.0, 11.0), 0))  # float ids hash and compare equal to the int ids, so this is the same key
print(book_book_count.get((10, 10), 0))  # author's note: when marginal counts were stored in book_book_count this value was 10562
print(book_book_count_marginal.get(10, 0))

In [None]:
def marginal_prob(movie_id, count_matrix, num_users):
    '''Return the marginal probability P(movie_id).

    Args:
        movie_id: id whose occurrence count is looked up.
        count_matrix: container of occurrence counts indexed by id
            (a dict/defaultdict in this notebook).
        num_users: total number of users, used as the denominator.

    Returns:
        count / num_users. An id that is missing from a dict counts as 0.

    Raises:
        ZeroDivisionError: if num_users is 0.
    '''
    if isinstance(count_matrix, dict):
        # .get() instead of []: a miss must not insert a zero entry into a defaultdict.
        margn_count = count_matrix.get(movie_id, 0)
    else:
        margn_count = count_matrix[movie_id]
    return margn_count/num_users


def joint_prob(movie_id1, movie_id2, count_matrix, num_users):
    '''Return the joint probability P(movie_id1, movie_id2).

    The pair is looked up with the smaller id first, so the argument order does not matter.

    Args:
        movie_id1, movie_id2: the two ids of the pair.
        count_matrix: container of co-occurrence counts keyed by (smaller_id, larger_id)
            (a dict/defaultdict in this notebook).
        num_users: total number of users, used as the denominator.

    Returns:
        pair count / num_users. A pair that is missing from a dict counts as 0.

    Raises:
        ZeroDivisionError: if num_users is 0.
    '''
    key = (movie_id1, movie_id2) if movie_id1 <= movie_id2 else (movie_id2, movie_id1)
    if isinstance(count_matrix, dict):
        # .get() instead of []: a miss must not insert a zero entry into a defaultdict.
        joint_count = count_matrix.get(key, 0)
    else:
        joint_count = count_matrix[key]
    return joint_count/num_users


def conditional_prob(movie_id1, movie_id2, count_matrix, marginal_matrix):
    '''Return the conditional probability P(movie_id1 | movie_id2).

    Args:
        movie_id1: id of the item whose probability is computed.
        movie_id2: id of the item being conditioned on.
        count_matrix: container of co-occurrence counts keyed by (smaller_id, larger_id)
            (a dict/defaultdict in this notebook).
        marginal_matrix: container of per-id occurrence counts.

    Returns:
        The integer 0 when the pair has no recorded co-occurrence, otherwise
        pair count / count of movie_id2.

    Raises:
        ZeroDivisionError: if the pair has a non-zero count but movie_id2 has a
            marginal count of 0.
    '''
    key = (movie_id1, movie_id2) if movie_id1 <= movie_id2 else (movie_id2, movie_id1)
    if isinstance(count_matrix, dict):
        # .get() instead of []: a miss must not insert a zero entry into a defaultdict.
        joint_count = count_matrix.get(key, 0)
    else:
        joint_count = count_matrix[key]
    if joint_count == 0:
        # Deliberately the int 0, not 0.0: the file-writing cells in this notebook skip
        # entries whose str() is exactly "0".
        return 0
    if isinstance(marginal_matrix, dict):
        margn_count = marginal_matrix.get(movie_id2, 0)
    else:
        margn_count = marginal_matrix[movie_id2]
    return joint_count/margn_count

In [None]:
BASE_DIR = "../box-code/data/book_data/"
marginal_keys = list(book_book_count_marginal.keys())

#create the vocab file: one book id per line
# `with` closes the handle even if a write fails part-way through.
with open(BASE_DIR+"vocabulary.txt", "w") as file:
    for i in marginal_keys:
        file.write(str(i)+"\n")

#create the marginal prob file, order of values matches the order of book ids in the vocab file
N = len(user_dict)#number of users
with open(BASE_DIR+"book_marginal_prob.txt", "w") as file:
    for i in marginal_keys:
        file.write(str(marginal_prob(i, book_book_count_marginal, N))+"\n")

In [None]:
# Disabled snippet, kept inside a bare string literal so it does not run: it would print the
# largest co-occurrence count in book_book_count (11992 according to the author's note).
'''
maxval = max([v for k,v in book_book_count.items()])
print(maxval)#11992
'''

In [None]:
def data_generation(a,b, count_matrix, marginal_matrix):
    '''Build both directed examples for the pair (a, b).

    Returns a 2-tuple: ((a, b), p(b|a)) followed by ((b, a), p(a|b)), where each
    probability comes from conditional_prob on the given count and marginal containers.
    '''
    prob_a_given_b = conditional_prob(a, b, count_matrix, marginal_matrix)
    prob_b_given_a = conditional_prob(b, a, count_matrix, marginal_matrix)
    forward = ((a, b), prob_b_given_a)
    backward = ((b, a), prob_a_given_b)
    return (forward, backward)

In [None]:
#train, dev, test split
split = [0.8, 0.1, 0.1]
# Dedicated, seeded generator so the shuffle (and therefore the split) is identical on every run
# without touching the global random state.
SPLIT_SEED = 0
split_rng = random.Random(SPLIT_SEED)

# Keep only pairs that actually co-occurred. Indexing the defaultdict with a missing key
# leaves a zero-count placeholder behind; those carry no information (the writers skip
# zero probabilities anyway) and would otherwise take up slots in the split.
items = [(pair, count) for pair, count in book_book_count.items() if count > 0]
#shuffle the data
split_rng.shuffle(items)

#make the split over the unordered pairs, so both directions of a pair land in the same subset
n = len(items)
train_split = int(split[0]*n)
dev_split = int(train_split+ split[1]*n)
train_data_half = items[:train_split]
dev_data_half = items[train_split : dev_split]
test_data_half= items[dev_split :]


def _augment(pair_counts):
    '''Expand each pair (a, b) into ((a, b), p(b|a)) and ((b, a), p(a|b)), then shuffle the result.'''
    augmented = []
    for (a, b), _count in pair_counts:
        augmented.extend(data_generation(a, b, book_book_count, book_book_count_marginal))
    split_rng.shuffle(augmented)
    return augmented


#augment and reshuffle the train, dev and test datasets
train_data = _augment(train_data_half)
dev_data = _augment(dev_data_half)
test_data = _augment(test_data_half)
#print(train_data[0])

In [None]:
#create the train data file: one tab-separated "IsA <id> <id> <probability>" line per example
# `with` closes the handle even if a write fails part-way through.
with open(BASE_DIR + "book_train.txt", "w") as file:
    for (head, tail), prob in train_data:
        # conditional_prob returns the int 0 for pairs with no co-occurrence; skip those
        if str(prob) != "0":
            file.write("\t".join(["IsA", str(head), str(tail), str(prob)]) + "\n")

In [None]:
#create the dev data file: one tab-separated "IsA <id> <id> <probability>" line per example
# `with` closes the handle even if a write fails part-way through.
with open(BASE_DIR + "book_dev.txt", "w") as file:
    for (head, tail), prob in dev_data:
        # conditional_prob returns the int 0 for pairs with no co-occurrence; skip those
        if str(prob) != "0":
            file.write("\t".join(["IsA", str(head), str(tail), str(prob)]) + "\n")

In [None]:
#create the test data file: one tab-separated "IsA <id> <id> <probability>" line per example
# `with` closes the handle even if a write fails part-way through.
with open(BASE_DIR + "book_test.txt", "w") as file:
    for (head, tail), prob in test_data:
        # conditional_prob returns the int 0 for pairs with no co-occurrence; skip those
        if str(prob) != "0":
            file.write("\t".join(["IsA", str(head), str(tail), str(prob)]) + "\n")

In [None]:
#create the combined file: all train examples followed by all dev examples
# (written to book_train_test.txt; the test examples are not included)
# `with` closes the handle even if a write fails part-way through.
with open(BASE_DIR + "book_train_test.txt", "w") as file:
    for dataset in (train_data, dev_data):
        for (head, tail), prob in dataset:
            # conditional_prob returns the int 0 for pairs with no co-occurrence; skip those
            if str(prob) != "0":
                file.write("\t".join(["IsA", str(head), str(tail), str(prob)]) + "\n")

In [None]:
# Sanity-check the probability helpers on books 10 and 11.
# Note: num_users here is the number of users left after the rating filter (user_id_set),
# whereas the marginal-probability file was written with N = len(user_dict).
num_users = len(user_id_set)
print(num_users)
print(min(user_id_set), max(user_id_set))
print("marginal")
# Marginals must be read from the per-book dict; book_book_count is keyed by (id, id) pairs,
# so looking up a single id there always gives 0.
print(marginal_prob(10, book_book_count_marginal, num_users))
print(marginal_prob(11, book_book_count_marginal, num_users))
print("joint")
print(joint_prob(10, 11, book_book_count, num_users))
print(joint_prob(11, 10, book_book_count, num_users))
print("conditional")
# conditional_prob takes the marginal counts as its fourth argument
print(conditional_prob(10, 11, book_book_count, book_book_count_marginal))
print(conditional_prob(11, 10, book_book_count, book_book_count_marginal))
print()

In [None]:
#Now we have the book-book pair counts and the methods to calculate any pair's joint/conditional
#probability; if needed, we can also generate the entire joint/conditional probability matrix.

In [None]:
# Disabled example, kept inside a bare string literal so it does not run.
# NOTE(review): it refers to final_dict and count_matrix, which are not defined in the visible
# cells, and calls conditional_prob with three arguments although the definition in this
# notebook takes four (count_matrix and marginal_matrix) — update before re-enabling.
'''
# Sample example
# Conditioning CAN be less than the marginal! 
# ref: 2018-box-paper table-1

for k in final_dict.keys():
    p1 = conditional_prob(10, k, count_matrix)
    p2 = marginal_prob(10, count_matrix, N)
    if p1 <= p2:
        print(k, p1)
'''

In [None]:
#ToDo
#Filter out tags that do not make sense
#Right now I only delete an entry in book_tags.csv if its count is less than a threshold
#We can also manually remove tags from tags.csv if we think a tag does not make sense

In [None]:
# Load books.csv; it is used below to map each goodreads_book_id to the book_id used in this dataset.
books = '{}{}'.format(datadir, 'books.csv')
df_books = pd.read_csv(books, sep=',')

In [None]:
# Tag the frame with its source file name, keep only the two id columns, and list them.
df_books.dataframeName = 'books.csv'
id_columns = ['book_id', 'goodreads_book_id']
df_books = df_books[id_columns]
print(df_books.columns)

In [None]:
# Lookup table: goodreads_book_id -> book_id. Unknown goodreads ids resolve to -1.
book_id_dict = defaultdict(lambda:-1)
book_id_dict.update(
    (int(goodreads_id), int(book_id))
    for book_id, goodreads_id in zip(df_books['book_id'], df_books['goodreads_book_id'])
)
print(len(book_id_dict))
print(book_id_dict[1])

In [None]:
# Load the book/tag table (read below through its goodreads_book_id, tag_id and count columns).
book_tags = '{}{}'.format(datadir, 'book_tags.csv')
df_book_tags = pd.read_csv(book_tags, sep=',')

In [None]:
# Tag the frame with the name of the file it was loaded from, then list its columns.
df_book_tags.dataframeName = 'book_tags.csv'
print(df_book_tags.columns)

In [None]:
# Show the first five rows of the book/tag table.
print(df_book_tags.head(5))

In [None]:
# Sanity check: print (max, min) for each column, then the number of rows.
for column in ('goodreads_book_id', 'tag_id', 'count'):
    values = df_book_tags[column]
    print (max(values), min(values))
print(len(df_book_tags))
# Open question from the author: are there negative counts?

In [None]:
# Drop every (book, tag) entry whose count is below tag_count_threshold: with so few
# taggings the association between the tag and the book is not considered strong.
tag_count_threshold = 500
strong_enough = df_book_tags['count'] >= tag_count_threshold
df_book_tags = df_book_tags[strong_enough]
print(len(df_book_tags))

In [None]:
#convert the goodreads_book_id values to book_id in this dataframe
# DataFrame.set_value is no longer available in current pandas, and writing row by row into
# a filtered frame is slow and fragile; build the whole converted column once and attach it
# with assign(), which returns a new frame.
# Ids missing from book_id_dict fall back to that dict's own default.
# Caution: the column is overwritten, so re-running this cell would try to convert the
# already-converted ids a second time.
df_book_tags = df_book_tags.assign(
    goodreads_book_id=[book_id_dict[int(goodreads_id)]
                       for goodreads_id in df_book_tags['goodreads_book_id']]
)
print(df_book_tags.head(5))

In [None]:
# The goodreads_book_id column now holds dataset book ids, so rename it to book_id.
column_renames = {'goodreads_book_id': 'book_id'}
df_book_tags = df_book_tags.rename(columns=column_renames)
print(df_book_tags.head(5))

In [None]:
# key: book_id, val: list of the tag_ids attached to that book (one entry per table row)
book_tag_dict = defaultdict(list)
for book, tag in zip(df_book_tags['book_id'], df_book_tags['tag_id']):
    book_tag_dict[int(book)].append(int(tag))
print(len(book_tag_dict))

In [None]:
# Tag co-occurrence counts, marginal counts included: every tag is also paired with itself.
# key: (tag_i, tag_j) with the smaller id first; the count goes up by one for each book
# whose tag list contains both tags.
tag_tag_count =  defaultdict(lambda: 0)
for tags in book_tag_dict.values():
    for first, second in itertools.combinations_with_replacement(tags, 2):
        pair = (first, second) if first <= second else (second, first)
        tag_tag_count[pair] += 1

In [None]:
# Number of distinct tag pairs counted (pairs of a tag with itself included).
print(len(tag_tag_count))