In [1]:
import json
import matplotlib.pyplot as plt
import numpy as np
import random

In [2]:
from collections import Counter, defaultdict #defaultdict provides value of nonexist key

In [3]:
REVIEW_DIR = ".\data\goodreads_reviews_comics_graphic.json"
BOOK_DIR = ".\data\goodreads_books_comics_graphic.json"
INTER_DIR = ".\data\goodreads_interactions_comics_graphic.json"

# We need a user table which contains all the books he rated that has >=3
# We need a list of all books

In [4]:
def process_review( record , table ):
    
    if record['rating']==0:
        #do not perform any operation
        return table
    
    
    user_id = record['user_id']
    book_id = record['book_id']
    
    
    
    if not book_id in table['books']:
        table['books'].add(book_id)
        #let table['books'] be a set
    if not user_id in table.keys():  #check if this user has registered in our dataset
        table[user_id] = set()

    table[user_id].add(book_id) #register this book
    return table

In [5]:
num_records = 529532  #there are 529532 record in total
index = 0 

data = {'books':set()}

#----------------------------
#     run main
#----------------------------

with open(REVIEW_DIR) as fie:
    for review in fie:
        
        if index > num_records:
            fie.close()
            break
            
        
            
        record = json.loads(review)  #load json as a dictionary
        data = process_review(record, data)
        
        #print(i)
        index+=1
    
    
    
    fie.close()
    

# we then need to build a look up dictionary for our books
# we would first want to remove users with only one comment

In [6]:
#we then remove those user who only rates one book.
data = [v for k,v in data.items() if len(v) > 1 ]
data.pop(0) #also remove the first element, which is a set of all books
# then compute how many books are in the dataset
books = set()
for v in data:
    books = books.union(v)
books = list(books)

In [7]:
look_up = {books[i]:i for i in range(len(books))  }  
#this is a mapping dictionary that map book id to a unique id
book_code = {i:books[i] for i in range(len(books))}
#this is a book code dictionary that map id to book id back

# Now we made up our training samples

    Note here because the data is fairly large, we need a datagenerator to feed neural network

In [8]:
data = [list(i) for i in data] #change set to list
data = [  [look_up[j] for j in i]  for i in data] #change all raw bookid to the id in look up table

In [9]:
corpus = [item for sublist in data for item in sublist] 
#flatten the data to a corpus, and we will use this corpus to do a counter job

In [10]:
corpus[0:5]

[1088, 69578, 30750, 63458, 20579]

In [11]:
bookcounts = Counter(corpus)
uniquebooks = np.unique(corpus)

# Perform word2vec

In [12]:
def negativeSampleTable(uniqueWords, wordcounts, exp_power=0.75): #exp_power is the default value
    #global wordcounts
    #... stores the normalizing denominator (count of all tokens, each count raised to exp_power)
    max_exp_count = 0  #this is the sum of total weights
    
    print ("Generating exponentiated count vectors")
    #... (TASK) for each uniqueWord, compute the frequency of that word to the power of exp_power
    #... store results in exp_count_array.
    exp_count_array = [wordcounts[i]**exp_power for i in wordcounts]
    max_exp_count = sum(exp_count_array)

    print ("Generating distribution")

    #... (TASK) compute the normalized probabilities of each term.
    #... using exp_count_array, normalize each value by the total value max_exp_count so that
    #... they all add up to 1. Store this corresponding array in prob_dist
    prob_dist = [np.float(i/max_exp_count) for i in exp_count_array]
    #print(sum(prob_dist))


    print ("Filling up sampling table")
    #... (TASK) create a dict of size table_size where each key is a sequential number and its value is a one-hot index
    #... the number of sequential keys containing the same one-hot index should be proportional to its prob_dist value
    #... multiplied by table_size. This table should be stored in cumulative_dict.
    #... we do this for much faster lookup later on when sampling from this table.

    table_size = 1e7
    counter=0 #this is to specify the index of array
    #note if we do a for loop to print keys in a dict, it will preserve the same order for the same dict
    #hence, we the order of above array prob_dist is the same as we print the wordcount dict
    table_place = 0  #this is the sequential number we are current in among the dictionary cumulative_dict
    cumulative_dict=dict()
    
    for key in wordcounts:
        prob = prob_dist[counter]  #this will be the probability that it get sampled
        
        sub_table = round(prob*table_size) #round it to an int, which is how many keys in the table will have this index
        
        onehot_index = key
        
        for i in range(table_place,table_place+sub_table):  #we will make sub_table number of keys, they all have the same index.
            cumulative_dict[i] = onehot_index
        table_place = table_place+sub_table
        counter+=1
    

    return cumulative_dict

In [13]:
samplingTable = negativeSampleTable(uniquebooks, bookcounts)
#samplingtable is a table that given a number between 0,10000 to will output the corresponding word

Generating exponentiated count vectors
Generating distribution
Filling up sampling table


In [14]:
def generateSamples(context_idx, num_samples):
    #context_id is a list of ids which should not be negative sampled,
    #num_samples is the total negative sample we want to do
    global samplingTable
    results = []

    n=len(samplingTable)-1  #hence, we choose an int from 0 to n, which is the key of the sampling table
    #... (TASK) randomly sample num_samples token indices from samplingTable.
    #... don't allow the chosen token to be context_idx.
    #... append the chosen indices to results
    for i in range(num_samples):
        index = context_idx #first make an index that will go the while loop       
        while index in [context_idx]:  #this while loop will stop untile a sampled index is not in the context_idx
            quantile = random.randint(0,n)
            index = samplingTable[quantile]
        #to get multiple negative samples, we will then add the selected samples into the context_idx
        #this is like a sampling without replacement
        results.append(index)

    return results

In [15]:
def sigmoid(x):
    return 1.0/(1+np.exp(-x))

In [16]:
def performDescent(num_samples, learning_rate, center_token, context_words,W1,W2,negative_indices):
    # sequence chars was generated from the mapped sequence in the core code
    nll_new = 0
    #... (TASK) implement gradient descent. Find the current context token from context_words
    #... and the associated negative samples from negative_indices. Run gradient descent on both
    #... weight matrices W1 and W2.
    #... compute the total negative log-likelihood and store this in nll_new.
    #... You don't have to use all the input list above, feel free to change them
    
    voca = W1.shape[0]
    hidden_size = W1.shape[1]
    
    #let's first train context_words

    h = W1[center_token]
    new_h = np.copy(h)
    
    #first let's solve for context_word
    v_j = np.copy(W2[context_words])
    sig = sigmoid( np.dot(v_j,np.transpose(h)) )
    #minuse 1 because we are daling with context_words
    #update log likelihood
    nll_new-=np.log(sig)
    #then update W1 
    new_h -= learning_rate*(sig-1)*v_j
    #then update the projection matrix
    v_j = v_j - learning_rate*(sig-1)*h  
    
    W2[context_words]=v_j
    
    
    #then we slove for negative words
    for j in negative_indices:
        v_j = W2[j]
        sig = sigmoid( np.dot(v_j,np.transpose(h)) )
        #update nll        
        nsig = sigmoid(-np.dot(v_j,np.transpose(h)))
        nll_new -= np.log(nsig)
        #update W1
        new_h -= learning_rate*sig*v_j
        v_j = v_j - learning_rate*sig*h
        W2[j] = v_j
    #finally update h
    W1[center_token] = new_h
    

    return nll_new

In [17]:
def trainer(curW1 = None, curW2 = None, hidden_size=100):
    global data #data is a list of list, in which each element is the id of that book.
    
    #... set the training parameters
    epochs = 3
    num_samples = 2
    learning_rate = 0.05
    nll = 0
    iternum = 0
    
    book_size = len(look_up)
    nll_results = []
    
    if curW1==None:
        #np_randcounter += 1
        W1 = np.random.uniform(-.5, .5, size=(book_size, hidden_size))
        #print(W1)
        W2 = np.random.uniform(-.5, .5, size=(book_size, hidden_size))
        #print(W2)
    else:
        #... initialized from pre-loaded file
        W1 = curW1
        W2 = curW2  
    
    #now we start training:
    for epc in range(epochs):
        #how many epochs we need to run
        print("epoch {epc}".format(epc = str(epc)))
        iternum=0
        for user in data:
            
            if iternum%1000==0:
                print ("Negative likelihood: ", nll)
                nll_results.append(nll)
                nll = 0
            if iternum%10000==0:
                save_model(W1,W2)
            
            iternum += 1
            #user is a list of all books this user rated
            temp_books=user
            #then we need to run through every book in this books set
            for book_index in range(len(user)):
                
                center_book = temp_books[book_index]
                context_books = temp_books[:book_index]+temp_books[(book_index+1):] 
                
                #this is all contexts books, then we need to perform descent for each of this context books
                for context_book in context_books:
                    
                    negative_indices = generateSamples(context_book, num_samples) #create negative samplings
                    nll=performDescent(num_samples, learning_rate, center_book, context_book, W1, W2, negative_indices)
                    #create negative log likelihood
                    #nll_results.append(nll)
                    
                
                    
    return [W1,W2]
                    

In [18]:
def save_model(W1,W2):
	handle = open("saved_W1.data","wb+")
	np.save(handle, W1, allow_pickle=False)
	handle.close()

	handle = open("saved_W2.data","wb+")
	np.save(handle, W2, allow_pickle=False)
	handle.close()

In [19]:
def load_model():
	handle = open("saved_W1.data","rb")
	W1 = np.load(handle)
	handle.close()
	handle = open("saved_W2.data","rb")
	W2 = np.load(handle)
	handle.close()
	return [W1,W2]

In [None]:
#1k iteration usually takes more than 10 minutes
[W1,W2] = trainer()
save_model(W1,W2)

epoch 0
Negative likelihood:  0
Negative likelihood:  8.943419470696215
Negative likelihood:  2.8318675601876633
Negative likelihood:  2.324756304433464
Negative likelihood:  0.7061995172715936
Negative likelihood:  2.7092771246062393
Negative likelihood:  1.4374306415239202
Negative likelihood:  7.427740239383592
Negative likelihood:  0.8563482901912824
Negative likelihood:  2.772605326858002
Negative likelihood:  2.478340140222707
Negative likelihood:  2.616199370533028
Negative likelihood:  1.2374100276670355
Negative likelihood:  0.9475555930195149
Negative likelihood:  3.219496509711116
Negative likelihood:  5.8559455527996365
Negative likelihood:  1.1658120608369935
Negative likelihood:  1.7359210849166278
Negative likelihood:  0.3904922835614683
Negative likelihood:  0.4651917465619934
Negative likelihood:  1.469536916919807
Negative likelihood:  4.046177808516103
Negative likelihood:  2.552448411882127
Negative likelihood:  8.043885002928004
Negative likelihood:  2.140845539721