In [1]:
import os

import numpy as np
import pandas as pd
from nutshell import ModelData, Learner

Using TensorFlow backend.


In [2]:
# Fathom concepts

# FactSet is a dataset with three columns: subject, fact, value
# You can generate a FactSet from tabular data (rows and columns) by transposing it:
#  the source csv has one row per subject, with columns containing fact values
#  the destination dataset has one row per subject/fact



In [60]:
class FactSet:
    """A set of (subject, fact name, fact value) rows and the model trained on them.

    Typical use: create an instance, set the ``*_column`` attributes to the
    column names of the source DataFrame, call ``load_fact_rows``, then ``learn``.
    """

    def __init__(self):
        # These have to be set on ``self``: assigning to bare local names would
        # throw the values away when __init__ returns and leave the instance
        # without any of the attributes the other methods read.
        self.fact_data = pd.DataFrame()   # internal dataset with one row per subject/fact
        self.label_data = pd.DataFrame()  # internal dataset with one row per subject
        self.model_name = 'factset'       # override this value to the name of the model file that will be built
        self.model = None                 # Learner produced by the most recent learn() call
        self.subject_column = ''          # name of subject id column
        self.fact_name_column = ''        # name of the column holding the fact name
        self.fact_value_column = ''       # name of the column holding the fact value
        self.label_column = ''            # name of label column - this will not be used in training
        self.fact_colnames = []           # optional: list of fact columns; default is all non subject/label columns

    def load_subject_rows(self, df):
        """Load a dataset laid out as one row per subject, one fact per column.

        TODO: not implemented yet. The method only prints a progress message
        and leaves ``fact_data`` untouched.
        """
        print('Transposing data into fact rows...')

    def load_fact_rows(self, df):
        """Load a dataset laid out as one row per subject/fact.

        Copies the columns named by ``subject_column``, ``fact_name_column`` and
        ``fact_value_column`` out of ``df`` into ``fact_data`` under the fixed
        names 'subject', 'fact_name' and 'fact_value'.

        Raises:
            KeyError: if any of the configured column names is missing from ``df``.
        """
        self.fact_data = pd.DataFrame()
        self.fact_data['subject'] = df[self.subject_column]
        self.fact_data['fact_name'] = df[self.fact_name_column]
        self.fact_data['fact_value'] = df[self.fact_value_column]

        print(len(df), 'fact rows loaded')

    def learn(self, model_name=''):
        """Train a network to tell true facts from false ones and keep it on ``self.model``.

        Args:
            model_name: name passed to ``train_model``; when left empty,
                ``self.model_name`` is used.
        """
        # training a neural net to tell which facts are true and which are false about each subject
        #  so for every true fact about a subject, there should be an equal number of false facts
        #  false facts are taken from other subjects, so they are plausible facts

        if model_name == '':
            model_name = self.model_name

        print('Preparing data for model training...')

        # Every loaded row is a true fact; name and value are fused into a single
        # 'name/value' token so each distinct fact becomes one category.
        dfLearn = pd.DataFrame()
        dfLearn['subject'] = self.fact_data['subject']
        dfLearn['fact'] = self.fact_data['fact_name'].astype('str') + '/' + self.fact_data['fact_value'].astype('str')
        dfLearn['is_true'] = 1

        data = ModelData(dfLearn)
        data.category_columns = ['subject','fact']
        data.label_column = 'is_true'
        data.prepare_data()
        data.add_false_rows(['subject'])
        # NOTE(review): the exploratory cells in this notebook call prepare_data()
        # a second time and split_data(shuffle=True) before building the Learner;
        # this method does neither - confirm whether Learner needs them.

        print('Building neural network...')

        model = Learner(data)
        model.dropout = 0 # overfitting is the point
        model.batch_size = 1024
        model.build_model()

        print('Training neural network...')

        model.train_model(model_name, epochs=3, super_epochs=3)

        # Keep the trained learner reachable from the instance.
        self.model = model

        print ('Stored model to: ', model_name)
        
    
        
    



In [61]:
# as a sample, use instacart purchases
# we will use users as subjects and products purchased as facts

# The data directory can be overridden with the INSTACART_DATA_DIR environment
# variable so the notebook is not tied to one machine; the original location
# stays as the default. os.path.join copes with or without a trailing slash.
dataPath = os.environ.get('INSTACART_DATA_DIR', '/Users/Miles/Documents/Datasets/Instacart/')
dfProducts = pd.read_csv(os.path.join(dataPath, 'products.csv'))
dfOrders = pd.read_csv(os.path.join(dataPath, 'orders.csv'))
dfOrderProducts = pd.read_csv(os.path.join(dataPath, 'order_products__prior.csv'))

In [62]:
# full set has 206K unique users
dfOrders['user_id'].nunique()

# lets start with a sample set of (up to) 2000 users
# - random_state pins the sample so a re-run picks the same users
# - min(...) keeps sample() from raising when fewer than 2000 users are available
dfSample = pd.DataFrame()
dfSample['user_id'] = dfOrders['user_id'].unique()
dfSample = dfSample.sample(n=min(2000, len(dfSample)), random_state=42)

# Keep only the orders of the sampled users. NOTE: this overwrites dfOrders, so the
# full order table is only recoverable by reloading it from disk.
# drop=True: merge already returns a fresh 0..n-1 index, so keeping the old one
# would only add a redundant 'index' column (which also breaks repeated re-runs).
dfOrders = pd.merge(dfSample, dfOrders, on='user_id').reset_index(drop=True)

In [63]:
# Build the distinct (user, product) pairs: every product each sampled user has ordered.
# Only the two key columns survive the selection, so grouping on both and counting
# leaves no value columns - reset_index() then yields one row per distinct pair.
dfUserProducts = (
    dfOrders
    .merge(dfOrderProducts, on='order_id')[['user_id', 'product_id']]
    .sort_values(['user_id', 'product_id'])
    .groupby(['user_id', 'product_id'])
    .count()
    .reset_index()
)

In [64]:
# Every row describes a purchased product, so all rows share one constant fact name.
dfUserProducts.loc[:, 'fact_name'] = 'product'

In [65]:
# Point a FactSet at the columns of dfUserProducts, name the model, and load the rows.
facts = FactSet()
for attr_name, attr_value in [
    ('subject_column', 'user_id'),
    ('fact_name_column', 'fact_name'),
    ('fact_value_column', 'product_id'),
    ('model_name', 'grocery'),
]:
    setattr(facts, attr_name, attr_value)
facts.load_fact_rows(dfUserProducts)

126878 fact rows loaded


In [69]:
# Step-by-step replay of the data preparation inside FactSet.learn;
# `elf` presumably stands in for `self` so the method body can run at top level.
elf = facts

# One training row per true fact: subject id, a 'name/value' fact token, and label 1.
dfLearn = (
    pd.DataFrame()
    .assign(subject=elf.fact_data['subject'])
    .assign(fact=elf.fact_data['fact_name'].astype('str') + '/' + elf.fact_data['fact_value'].astype('str'))
    .assign(is_true=1)
)


In [70]:
# Preview the first five training rows.
dfLearn.head(5)

Unnamed: 0,subject,fact,is_true
0,23,product/2138,1
1,23,product/3108,1
2,23,product/3243,1
3,23,product/3518,1
4,23,product/3873,1


In [71]:
# Wrap the training frame in a nutshell ModelData: 'subject' and 'fact' are declared
# as category columns and 'is_true' as the label.
data = ModelData(dfLearn)
data.category_columns = ['subject','fact']
data.label_column = 'is_true'
# Per the cell output, prepare_data() tokenizes the category columns
# (2000 subjects, 20547 facts in this run).
data.prepare_data()
# Per the cell output, this added 126878 rows - the same number as the true fact rows loaded.
data.add_false_rows(['subject'])


Tokenizing category columns...
subject 2000 unique values
fact 20547 unique values
Done preparing data
Adding false rows
Added 126878 rows


In [76]:
# Prepare the data again now that false rows exist, then split into training and validation sets.
# NOTE(review): execution counts jump from 71 to 76 here, so cells run in between are not
#   shown - confirm this sequence reproduces on a fresh kernel.
# NOTE(review): the output reports 2 more unique values per column than the first
#   prepare_data() call (2002 vs 2000 subjects) - presumably tokens introduced by
#   add_false_rows or the pre-defined token map; confirm.
# NOTE(review): training + validation examples sum to 126878, the count of true rows
#   alone - confirm the added false rows are actually part of the split.
data.prepare_data()
data.split_data(shuffle=True)

Tokenizing category columns...
** Using pre-defined token map **
subject 2002 unique values
fact 20549 unique values
Done preparing data
Training examples: 114191
Validation examples: 12687


In [78]:
# Build and train the network on the prepared data; these are the same Learner calls
# that FactSet.learn makes, run here at top level on the `data` object from the cells above.
model_name = 'grocery'
model = Learner(data)
model.dropout = 0 # overfitting is the point
model.batch_size = 1024
model.build_model()
model.train_model(model_name, epochs=3, super_epochs=3)
        


Non-Sequential Merge Layer Shape: (?, 100)
Loss Function: mse
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_subject (InputLayer)       (None, 1)             0                                            
____________________________________________________________________________________________________
input_fact (InputLayer)          (None, 1)             0                                            
____________________________________________________________________________________________________
embed_subject (Embedding)        (None, 1, 50)         100100      input_subject[0][0]              
____________________________________________________________________________________________________
embed_fact (Embedding)           (None, 1, 50)         1027450     input_fact[0][0]                 
_____________________________