In [None]:
import numpy as np
import matplotlib.pyplot as plt  
import gzip
import os
import tensorflow as tf
import keras
# Directory that contains the data file. Leave empty to keep the current working directory.
DATA_DIR = ''  ##set to directory
if DATA_DIR:
    # os.chdir('') raises FileNotFoundError, so only change directory once a path has been set
    os.chdir(DATA_DIR)

## Read in data

In [None]:
def parse(path):
    """Yield one record per line of a gzip-compressed file of Python literals.

    Parameters
    ----------
    path : str or path-like
        Location of the gzip file. Each non-blank line must hold one Python
        literal (later cells index the records by key, e.g. ``d['title']``).

    Yields
    ------
    object
        The value described by each non-blank line, in file order.

    Raises
    ------
    ValueError, SyntaxError
        If a line is not a valid Python literal.
    """
    import ast  # local import keeps this cell independent of the import cell

    # The context manager closes the file once the generator is exhausted or closed
    with gzip.open(path, mode='rt', encoding='utf-8') as g:
        #generate records from the opened file, one line at a time
        for l in g:
            if not l.strip():
                # blank lines carry no record
                continue
            # SECURITY: eval() would execute arbitrary code embedded in the data file;
            # literal_eval() only accepts literals (dicts, lists, strings, numbers, ...)
            yield ast.literal_eval(l)

# Prepare dummy variable structure
Cycle through all the data once to collect the unique words and count how many times each one occurs.
We will later remove words that appear only a few times. What is the cutoff for removing items? It depends on how many items your RAM can handle.
We can do this with a dictionary:

In [None]:
# Two empty lookup tables: word -> number of occurrences and category -> number of occurrences.
# The counting loop reads the current count with .get(key, 0) and writes back count + 1,
# so a key is created the first time an item is seen and incremented afterwards.
X_unique_words, Y_unique_categories = dict(), dict()

In [None]:
# Tokens that are never counted (a set gives O(1) membership tests and is built only once)
stop_words = {'with', 'the', 'we', 'you', 'and'}
# Items seen this many times or fewer are dropped at every pruning step
min_count = 5
# Prune after every `prune_every` records so the dictionaries stay small enough for RAM
prune_every = 1000


def _prune_rare(counts, threshold):
    """Delete, in place, every key of `counts` whose value is <= `threshold`."""
    # list() takes a snapshot of the items so the dict can be modified while looping
    for key, value in list(counts.items()):
        if value <= threshold:
            counts.pop(key)


#counter
i = 0

## For-loop to iterate through the dataset one row at a time so that we do not overload RAM
for d in parse('meta_Clothing_Shoes_and_Jewelry.json.gz'):
    i += 1

    #X
    ##############################################################
    X = np.array(d['title'])

    # np.char.split: split text by any whitespace. Results in numpy array.
    # item(): turn the 0-d numpy array into a plain list of tokens
    for j in np.char.split(X).item():
        # text processing: lower-case every token and skip stop words
        word = j.lower()
        if word in stop_words:
            continue
        # dictionary.get(keyname, value) returns `value` when the key does not exist yet,
        # so a new word starts at 0 and every occurrence adds 1
        X_unique_words[word] = X_unique_words.get(word, 0) + 1

    #Y
    ##############################################################
    Y = np.array(d['category'])

    for jj in Y:
        category = jj.lower()
        Y_unique_categories[category] = Y_unique_categories.get(category, 0) + 1

    # every `prune_every` records remove words and categories seen `min_count` times or fewer
    # NOTE: counts restart after a key is pruned, so an item only survives if it is seen
    # more than `min_count` times before the next pruning step
    if (i % prune_every) == 0:
        _prune_rare(X_unique_words, min_count)
        _prune_rare(Y_unique_categories, min_count)

    #print progress every 100k records
    if (i % 100000) == 0:
        print(i)

# Records after the last multiple of `prune_every` were never pruned; prune once more so the
# final vocabulary does not keep rare items just because they appeared near the end of the file
_prune_rare(X_unique_words, min_count)
_prune_rare(Y_unique_categories, min_count)

# Only the last expression of a cell is displayed, so print every summary value explicitly
print('records read:', i)
print('unique words kept:', len(X_unique_words))
print('unique categories kept:', len(Y_unique_categories))
if X_unique_words:
    print('median word count:', np.median(list(X_unique_words.values())))

In [None]:
# Modelling note: a category that is attached to every item gives the model nothing to
# discriminate on, so removing such a category can help it learn.

# Freeze the dictionary keys into arrays so that every word / category has a fixed position.
# Dictionaries keep insertion order (guaranteed by the language from Python 3.7), so this step
# may no longer be strictly necessary.
X_unique_words_ordered = np.array([*X_unique_words])
Y_unique_categories_ordered = np.array([*Y_unique_categories.keys()])

## Build Model

In [None]:
# Layer sizes come from the lookup arrays built above. (The original names `uniquewords`
# and `uniquecategories` were never defined and raised NameError on a fresh kernel.)
n_inputs = len(X_unique_words_ordered)
n_outputs = len(Y_unique_categories_ordered)

# Create Layers
# For a two dimensional input dataset, use (Nbrvariables,) for shape.
inputs = tf.keras.layers.Input(shape=(n_inputs,), name='input')
# Each hidden layer is narrower than the previous one: 75%, 50% and 25% of the input width
hidden1 = tf.keras.layers.Dense(units=round(n_inputs*.75), activation="elu", name='hidden1')(inputs)
hidden2 = tf.keras.layers.Dense(units=round(n_inputs*.5), activation="elu", name='hidden2')(hidden1)
hidden3 = tf.keras.layers.Dense(units=round(n_inputs*.25), activation="elu", name='hidden3')(hidden2)
# One output unit per category
# NOTE(review): the training cell can set several 1s per target row (multi-label); sigmoid with
# binary cross entropy is the usual pairing for that - confirm softplus + categorical is intended
outputs = tf.keras.layers.Dense(units=n_outputs, activation="softplus", name='output')(hidden3)
# Create model
model = tf.keras.Model(inputs=inputs, outputs=outputs)
# Compile model
# opted for categorical cross entropy as opposed to binary cross entropy
model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.SGD(learning_rate=0.001))


## Train Model

In [None]:
# Tokens ignored when building the input vector (same words as in the counting cell)
stop_words = {'with', 'the', 'we', 'you', 'and'}

# Column lookups built once: item -> column index. A dict lookup per token replaces a scan of
# the whole vocabulary array per token; keys are unique, so the result is the same.
word_position = {str(w): k for k, w in enumerate(X_unique_words_ordered)}
category_position = {str(c): k for k, c in enumerate(Y_unique_categories_ordered)}

# Print the record counter every `progress_every` records
progress_every = 10
# Stop each epoch after this many records
records_per_epoch = 2000

avg_loss_per_epoch = []

epochs = 5
for epoch in range(epochs):
    i = 0
    avg_loss = 0
    for d in parse('meta_Clothing_Shoes_and_Jewelry.json.gz'):
        i += 1

        #X
        ##############################################################
        X = np.array(d['title'])

        #initialize array with 0
        X_dummies = np.zeros((1, len(X_unique_words)))

        #write 1 where word appears
        for j in np.char.split(X).item():
            # same text processing as in the counting cell: lower case, skip stop words
            word = j.lower()
            if word in stop_words:
                continue

            #check if word appears in the lookup (it might be an uncommon word that we deleted)
            # if present write 1, otherwise skip to next word
            position = word_position.get(word)
            if position is None:
                continue

            X_dummies[0, position] = 1

        #Y
        ##############################################################
        Y = np.array(d['category'])

        #initialize array with 0
        Y_dummies = np.zeros((1, len(Y_unique_categories)))

        #write 1 where category appears
        for jj in Y:
            category = jj.lower()

            #check if category appears in the lookup (it might be an uncommon category that we deleted)
            # if present write 1, otherwise skip to next category
            position = category_position.get(category)
            if position is None:
                continue

            Y_dummies[0, position] = 1

        #Fit model on this single record
        modinfo = model.fit(x=X_dummies, y=Y_dummies, batch_size=1, epochs=1, verbose=0)
        #update the running mean of the loss over the records seen in this epoch
        loss = modinfo.history['loss'][0]
        avg_loss = avg_loss + (1/i)*(loss - avg_loss)

        #print progress every `progress_every` records
        if (i % progress_every) == 0:
            print(i)
        #stop the epoch after `records_per_epoch` records
        if i == records_per_epoch:
            break

    # after every epoch, redraw the curve of average loss per epoch so far
    avg_loss_per_epoch.append(avg_loss)
    plt.plot(avg_loss_per_epoch)
    plt.xlabel('epoch')
    plt.ylabel('average loss')
    plt.show()

## Variable Importance

In [None]:
# Permutation importance: shuffle one input column at a time and measure how the loss changes.
# This needs a feature matrix with several rows (shuffling a column of a single-row matrix
# changes nothing), so an evaluation batch is built from the first records of the file.
n_eval_records = 200   # number of records in the evaluation batch
max_features = 100     # number of input columns to test

stop_words = {'with', 'the', 'we', 'you', 'and'}
word_position = {str(w): k for k, w in enumerate(X_unique_words_ordered)}
category_position = {str(c): k for k, c in enumerate(Y_unique_categories_ordered)}


def _multi_hot(items, positions):
    """Return a 1-D 0/1 vector with a 1 at positions[item] for every item found in `positions`."""
    row = np.zeros(len(positions))
    for item in items:
        column = positions.get(item)
        if column is not None:
            row[column] = 1
    return row


# Encode the evaluation records with the same preprocessing as the training cell
X_rows = []
Y_rows = []
for d in parse('meta_Clothing_Shoes_and_Jewelry.json.gz'):
    words = [w.lower() for w in np.char.split(np.array(d['title'])).item()]
    words = [w for w in words if w not in stop_words]
    categories = [c.lower() for c in np.array(d['category'])]
    X_rows.append(_multi_hot(words, word_position))
    Y_rows.append(_multi_hot(categories, category_position))
    if len(X_rows) == n_eval_records:
        break
# reshape keeps the matrices 2-D even when no record was read
X_eval = np.array(X_rows).reshape(-1, len(word_position))
Y_eval = np.array(Y_rows).reshape(-1, len(category_position))

yhat = model.predict(X_eval)
# Loss on the untouched evaluation batch
performance_before = model.evaluate(X_eval, Y_eval, verbose=0)
print(performance_before)

# Seeded generator so the shuffles, and therefore the importances, are reproducible
rng = np.random.default_rng(0)

importance = list()
k = 0
# Iterate over feature columns (axis 1), not over rows
for i in range(min(max_features, X_eval.shape[1])):
    # Copy the feature matrix (the original copied the raw title string `X` by mistake)
    X_copy = np.copy(X_eval)
    X_copy[:, i] = rng.permutation(X_copy[:, i])

    performance_after = model.evaluate(X_copy, Y_eval, verbose=0)

    # loss before minus loss after: the more negative the value, the more the loss grew
    # when this column was shuffled
    importance.append(performance_before - performance_after)
    print(performance_before - performance_after)
    k += 1
    print(k)