# Process Data

In [1]:
import json
import matplotlib.pyplot as plt
import numpy as np
import random

In [2]:
from collections import Counter, defaultdict #defaultdict provides value of nonexist key

In [3]:
# Locations of the Goodreads comics/graphic-novel dumps, relative to the
# notebook's working directory.
# Forward slashes are used on purpose: they work on Windows as well as on
# POSIX systems, and they avoid the invalid "\d" / "\g" escape sequences that
# the previous backslash-separated literals contained.
REVIEW_DIR = "./data/goodreads_reviews_comics_graphic.json"
BOOK_DIR = "./data/goodreads_books_comics_graphic.json"
INTER_DIR = "./data/goodreads_interactions_comics_graphic.json"

In [4]:
def process_review(record, table):
    """Fold one review record into the book -> users table.

    Parameters
    ----------
    record : dict
        A single review; the keys 'rating', 'user_id' and 'book_id' are read.
    table : dict
        Accumulator, updated in place. ``table['users']`` is a set holding
        every user id seen so far; every other key is a book id mapped to the
        set of user ids that reviewed that book.

    Returns
    -------
    dict
        The same ``table`` object, so the call can be chained or reassigned.
    """
    # Reviews with a rating of 0 are skipped entirely
    # (presumably 0 means "no rating given" -- confirm against the dataset docs).
    if record['rating'] == 0:
        return table

    reviewer = record['user_id']
    book = record['book_id']

    # Sets ignore duplicates, so no membership test is needed before adding.
    table['users'].add(reviewer)

    # Create the book's reviewer set on first sight, then register the user.
    table.setdefault(book, set()).add(reviewer)
    return table

In [5]:
num_records = 529532  # maximum number of review records to process (noted as the file's total record count)
index = 0  # number of records processed so far

# Maps each book id to the set of user ids that reviewed it; the special key
# 'users' collects every user id seen (see process_review).
data = {'users': set()}

#----------------------------
#     run main
#----------------------------

# The file is read line by line, one JSON object per line.
# The `with` block closes the file on exit (including on `break`), so no
# explicit close() call is needed.
with open(REVIEW_DIR, encoding="utf-8") as fie:
    for review in fie:

        # `>=` so that exactly `num_records` records are processed at most
        # (the previous `>` let one extra record through).
        if index >= num_records:
            break

        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        if not review.strip():
            continue

        record = json.loads(review)  # load json as a dictionary
        data = process_review(record, data)

        index += 1

In [6]:
# Drop the books that are not popular: keep only the per-book user sets that
# contain more than one user.
# The bookkeeping entry 'users' (the set of *all* users) is excluded by key
# rather than by position, so the filter stays correct even when that set is
# empty/single-element or is not the first entry.
data = [members for key, members in data.items() if key != 'users' and len(members) > 1]

# Then collect every user that still appears in at least one remaining book.
# update() grows one set in place instead of rebuilding it on each iteration.
users = set()
for members in data:
    users.update(members)
users = list(users)  # NOTE: list order follows set iteration order

In [7]:
# Forward mapping: raw user id -> dense integer index (its position in `users`).
look_up = {raw_id: position for position, raw_id in enumerate(users)}
# Reverse mapping: dense integer index -> raw user id.
user_code = dict(enumerate(users))

In [8]:
# Turn every per-book set of raw user ids into a list of dense integer
# indices, using the look_up table.
data = [[look_up[raw_id] for raw_id in members] for members in data]

In [9]:
# Number of distinct users left after filtering; the same value is used as
# num_classes (embedding vocabulary / output size) when the model is built.
len(users)

51184

# Generate training samples

In [10]:
# For every book, each user index becomes one input sample ...
train_x = [member for members in data for member in members]
# ... and its target is the list of all *other* user indices of the same book
# (same ordering as train_x: book by book, position by position).
train_y = [
    members[:pos] + members[pos + 1:]
    for members in data
    for pos in range(len(members))
]

In [11]:
# Same check as before the training samples were built: `users` is not
# modified by that step, so the count is unchanged.
len(users)

51184

# Now we prepare a keras data generator

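This is a key part: if we materialised every training target as a dense 0/1 vector up front, we would have to hold on the order of num_samples × num_classes values in memory. Instead, we use a generator that builds each batch of inputs and targets on demand, so only one batch needs to exist at a time.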

In [12]:
import numpy as np
import tensorflow.keras as keras

In [13]:
class DataGenerator(keras.utils.Sequence):
    """Keras Sequence that builds (input, multi-hot target) batches on demand.

    Each sample ``k`` consists of the input ``train_x[k]`` and a target row of
    length ``n_classes`` that is 1 at every index listed in ``train_y[k]`` and
    0 elsewhere. Targets are created per batch, so the dense
    ``num_samples x n_classes`` matrix is never held in memory.

    Parameters
    ----------
    num_samples : int
        Number of samples to draw from (indices ``0 .. num_samples - 1``).
    n_classes : int
        Width of each multi-hot target row.
    train_x : sequence
        Per-sample inputs, indexable by sample index.
    train_y : sequence of sequences of int
        Per-sample lists of target indices (each index must be < n_classes).
    batch_size : int, default 32
        Number of samples per batch.
    dim : int, default 1
        Stored on the instance; not otherwise used by this class.
    shuffle : bool, default True
        Whether to reshuffle the sample order at the start and after each epoch.
    """

    def __init__(self, num_samples, n_classes, train_x, train_y, batch_size=32, dim=1, shuffle=True):
        'Initialization'
        # Let the Keras base class set up its own state (newer Keras versions
        # expect this; on older versions it is a harmless no-op).
        super().__init__()

        self.dim = dim  # specify the input dimension
        self.batch_size = batch_size  # the batch size
        self.num_samples = num_samples  # how many samples there are in total

        self.n_classes = n_classes
        self.shuffle = shuffle

        self.train_x = train_x
        self.train_y = train_y

        # Build the initial sample order only once every attribute is set.
        self.on_epoch_end()

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(self.num_samples)
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __len__(self):
        'Denotes the number of batches per epoch'
        # Floor division: a trailing partial batch is dropped.
        return int(self.num_samples // self.batch_size)

    #---------------------------------------
    #    now generate data samples
    #---------------------------------------

    def __getitem__(self, index):
        'Generate one batch of data'
        # Sample indices belonging to batch number `index`.
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]

        # Generate data
        X, y = self.__data_generation(indexes)

        return X, y

    def __data_generation(self, list_IDs):
        'Generates the inputs and multi-hot targets for the given sample indices'
        X = np.array([self.train_x[i] for i in list_IDs])

        # One row per sample actually present in this batch, so X and y always
        # have the same number of rows even if the batch is short.
        y = np.zeros((len(list_IDs), self.n_classes), dtype=int)

        for row, sample_id in enumerate(list_IDs):
            # Set all target positions of this sample in a single assignment;
            # the explicit integer dtype keeps an empty target list valid.
            y[row, np.asarray(self.train_y[sample_id], dtype=np.intp)] = 1

        return X, y

In [14]:
from keras.models import Sequential

# Generators
num_samples = len(train_x)   # one training sample per entry of train_x
num_classes = len(users)     # every user index is both a possible input and a possible target
num_dimension = 100          # size of the learned embedding vectors


training_generator = DataGenerator(num_samples, num_classes, train_x, train_y, batch_size=64)

# Design model
model = keras.models.Sequential()
embedding = keras.layers.Embedding(num_classes, num_dimension, input_length=1)
# The input is a single user index; the embedding layer maps it to a vector.
model.add(embedding)
model.add(keras.layers.Flatten())
# With an input length of one, flattening only removes the length-1 axis so the
# Dense layer receives one vector per sample.
model.add(keras.layers.Dense(num_classes, activation='sigmoid'))
# train_y lists the other users who reviewed the same book as the input user,
# so the target is a vector of zeros and ones (multi-label) -> one sigmoid per
# class with binary cross-entropy.

model.compile(optimizer='adam', loss='binary_crossentropy')
# Train model on dataset.
# Model.fit accepts a keras.utils.Sequence directly; fit_generator is
# deprecated and no longer available in recent TensorFlow/Keras releases.
model.fit(training_generator, epochs=3)

Using TensorFlow backend.


Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x172bf9fcd48>

In [26]:
# Sanity check: the embedding layer was created as
# Embedding(num_classes, num_dimension), so its weight matrix is expected to
# have shape (num_classes, num_dimension).
embedding.weights[0].numpy().shape

(51184, 100)

In [30]:
# Pull the embedding layer's first weight tensor out as a NumPy array;
# it is indexed by the dense user index from look_up.
weights = embedding.weights[0].numpy()

In [32]:
# Map every raw user id to its embedding vector (as a plain list of values),
# looking the vector up through the user's dense index.
test = {raw_id: list(weights[row_idx]) for raw_id, row_idx in look_up.items()}

In [34]:
import pickle
# Persist the {raw user id: embedding vector} dictionary to disk.
with open("user_embedding.firefire", 'wb') as out_file:
    pickle.dump(test, out_file)

In [35]:
# Read the file back to verify the round trip.
# NOTE: pickle.load executes arbitrary code from the file; only use it on
# files produced by this notebook or another trusted source.
with open('user_embedding.firefire', 'rb') as in_file:
    x = pickle.load(in_file)