In [23]:
import os 
import random
import numpy as np 
import pandas as pd
from collections import defaultdict
import itertools
import ast
import csv

In [2]:
# Hyperparameters and data locations used by the cells below.
rating_threshold = 4.5 # only count a user rating if it is above this threshold
user_count_threshold = 500 # only keep a book id if it was rated by users more than this many times
split = [0.8, 0.1, 0.1] # train, dev, test split
# Alternative input location (movies dataset), currently disabled:
#INPUT_DIR = '../../../data/the-movies-dataset/'
# The trailing '/' matters: file paths are built from INPUT_DIR by plain string concatenation.
INPUT_DIR ='../../datasets/goodbooks-10k-master/'
OUTPUT_DIR = '../box-code/data/book_data/big/taxonomy/'

In [3]:
# Load the external genre taxonomy (columns: Parent, Child) and preview it.
tax_file = f"{INPUT_DIR}extras/ext_genres_taxo.csv"
df_ex_tax = pd.read_csv(tax_file, sep=',')
# Tag the frame with a label; stored as a plain Python attribute on the object.
setattr(df_ex_tax, 'dataframeName', 'ex_tax.csv')
taxonomy_preview = df_ex_tax.head(10)
print(taxonomy_preview)

                     Parent                                            Child
0              10th-century                        ['Fiction', 'Literature']
1              11th-century                        ['Fiction', 'Literature']
2              12th-century                        ['Fiction', 'Literature']
3              13th-century                        ['Fiction', 'Literature']
4              14th-century                        ['Fiction', 'Literature']
5              15th-century  ['Fiction', 'Historical Fiction', 'Literature']
6              16th-century                        ['Fiction', 'Literature']
7              17th-century  ['Fiction', 'Historical Fiction', 'Literature']
8  1864-shenandoah-campaign       ['American History', 'American Civil War']
9              18th-century  ['Fiction', 'Historical Fiction', 'Literature']


In [4]:
# Read the genre vocabulary: one term per line, trailing whitespace removed.
vocab_file = INPUT_DIR + 'extras/genre_vocabulary.txt'
vocab_list = []
with open(vocab_file, 'r') as rfile:
    for raw_line in rfile:
        vocab_list.append(raw_line.rstrip())

# Set view of the same terms for fast membership tests.
vocab_set = set(vocab_list)

In [5]:
# Peek at the first five vocabulary entries to sanity-check the load.
print(vocab_list[:5])

['fantasy', 'young-adult', 'fiction', 'magic', 'childrens']


In [6]:
# Preprocess external taxonomy: the CSV stores the related genres as stringified
# Python lists, so strip the quote and bracket characters from every column.
# regex=False makes each pattern a literal on every pandas version; without it,
# '[' and ']' depend on the version-specific default for `regex`.
for col in df_ex_tax.columns:
    cleaned = df_ex_tax[col]
    for unwanted in ('"', "'", '[', ']'):
        cleaned = cleaned.str.replace(unwanted, '', regex=False)
    df_ex_tax[col] = cleaned

In [7]:
# Show the first ten rows again to check that quotes and brackets were stripped.
print(df_ex_tax.head(10))

                     Parent                                    Child
0              10th-century                      Fiction, Literature
1              11th-century                      Fiction, Literature
2              12th-century                      Fiction, Literature
3              13th-century                      Fiction, Literature
4              14th-century                      Fiction, Literature
5              15th-century  Fiction, Historical Fiction, Literature
6              16th-century                      Fiction, Literature
7              17th-century  Fiction, Historical Fiction, Literature
8  1864-shenandoah-campaign     American History, American Civil War
9              18th-century  Fiction, Historical Fiction, Literature


In [8]:
import string
def strProcess(s):
    """Normalise a raw genre label into the hyphenated, lower-case vocabulary form.

    Parameters
    ----------
    s : str or None
        Raw label, e.g. ``' Historical Fiction'``.

    Returns
    -------
    str or None
        The label with surrounding whitespace removed, lower-cased, and inner
        spaces replaced by hyphens. ``None`` when the input is empty or when
        the result contains anything other than letters, digits and hyphens.
    """
    if not s:
        return None
    # Drops the literal text backslash-u200; presumably the remnant of an
    # escaped zero-width character in the source CSV -- TODO confirm.
    s = s.replace('\\u200', '')
    # strip() rather than lstrip(): trailing whitespace would otherwise be
    # turned into a trailing '-' and the term would never match the vocabulary.
    s = s.strip().lower().replace(' ', '-')
    if not s.replace('-', '').isalnum():
        return None
    return s

In [9]:
# Build the positive-pair dictionary from the external taxonomy.
# key: term from the 'Parent' column (spaces replaced by hyphens)
# val: list of normalised terms from the comma-separated 'Child' column
# Only terms present in the genre vocabulary are kept; a key is created only
# once it has at least one valid related term.
ex_tax_dict = {}
for parent_raw, child_raw in zip(df_ex_tax['Parent'], df_ex_tax['Child']):
    parent = parent_raw.replace(' ', '-')
    if parent not in vocab_set:
        continue
    for piece in child_raw.split(","):
        term = strProcess(piece)
        if term and term in vocab_set:
            ex_tax_dict.setdefault(parent, []).append(term)

In [15]:
# Number of terms that ended up with at least one positive pair.
len(ex_tax_dict)

380

In [28]:
# Negative sampling: for every term with positive pairs, draw up to `num_neg`
# distinct vocabulary terms that are unrelated to it in either direction.
# Compared with rejection sampling via random.choice this
#   * never pairs a term with itself,
#   * never repeats a negative for the same term,
#   * always terminates (a term with fewer than `num_neg` valid candidates
#     simply receives all of them),
#   * is reproducible thanks to a dedicated, seeded generator.
num_neg = 5
neg_sampling_seed = 0  # fixed so a fresh kernel regenerates the same negatives
neg_rng = random.Random(neg_sampling_seed)
negative_dict = defaultdict(list)

# Reverse index: related term -> set of keys whose positive list contains it.
parents_of = defaultdict(set)
for parent_term, related_terms in ex_tax_dict.items():
    for related in related_terms:
        parents_of[related].add(parent_term)

# De-duplicate while keeping file order, so sampling is deterministic
# (iterating a set of strings is not stable across interpreter runs).
unique_vocab = list(dict.fromkeys(vocab_list))

for key, positives in ex_tax_dict.items():
    # Exclude the term itself, its positives, and terms that list it as a positive.
    excluded = {key} | set(positives) | parents_of.get(key, set())
    candidates = [gen for gen in unique_vocab if gen not in excluded]
    picked = neg_rng.sample(candidates, min(num_neg, len(candidates)))
    negative_dict[key].extend(picked)



In [29]:
# Number of terms that received negative samples.
len(negative_dict)

380

In [33]:
# Spot-check the sampled negatives for one term. negative_dict is a
# defaultdict(list), so looking up a missing term would insert an empty list
# instead of raising KeyError.
negative_dict['government']

['shapeshifters', 'epic', 'adoption', 'self-help', 'retellings']

In [35]:
# Persist both dictionaries as pickles next to the input data.
import pickle

dict_file = INPUT_DIR + 'extras/extax_dict_Pos.pkl'      # ex_tax_dict (positive trn egs)
neg_dict_file = INPUT_DIR + 'extras/extax_dict_Neg.pkl'  # negative_dict (negative trn egs)

for pkl_path, payload in ((dict_file, ex_tax_dict), (neg_dict_file, negative_dict)):
    with open(pkl_path, 'wb') as f:
        pickle.dump(payload, f)

        

In [37]:
# Write the genre-genre training pairs as "IsA <term1> <term2> <probability>" lines.
pos_gn_gn_pairs = INPUT_DIR + 'extras/genre_genre_4.5_Pos.txt'
neg_gn_gn_pairs = INPUT_DIR + 'extras/genre_genre_4.5_Neg.txt'
all_gn_gn_pairs = INPUT_DIR + 'extras/genre_genre_4.5_All.txt'
eps = 0.00001 # non-zero low prob for  neg pairs. 1e-5

# Render eps in plain decimal notation ('0.00001', never '1e-05') so the
# negative lines actually follow `eps` instead of a second hard-coded copy.
eps_text = np.format_float_positional(eps, trim='-')

pos_lines = [
    "IsA %s %s 1\n" % (key, v)
    for key, values in ex_tax_dict.items()
    for v in values
]
neg_lines = [
    "IsA %s %s %s\n" % (key, v, eps_text)
    for key, values in negative_dict.items()
    for v in values
]

# The All file is the positive pairs followed by the negative pairs; writing it
# here replaces the manual `cat` step so the notebook runs end to end.
for pairs_path, pair_lines in (
    (pos_gn_gn_pairs, pos_lines),
    (neg_gn_gn_pairs, neg_lines),
    (all_gn_gn_pairs, pos_lines + neg_lines),
):
    with open(pairs_path, 'w') as outfile:
        outfile.writelines(pair_lines)