[View in Colaboratory](https://colab.research.google.com/github/iampatgrady/FeatureTools-Instacart-Demo/blob/master/Feature_Tools_Demo_Predicting_Basket_Orders.ipynb)

In [0]:
# clean project space, setup dependencies
!pip install dask
!pip install featuretools
!rm -rf top_features
!rm -rf data
!rm -rf partitioned_data
!rm data.zip
!pip install -U -q PyDrive

from IPython.display import clear_output
clear_output()

# FeatureTools

In [3]:
import featuretools as ft
import pandas as pd
import os
from random import sample
from tqdm import tqdm
ft.__version__

'0.1.20'

## Download partitioned data, and kaggle data

In [4]:
#TODO replace with bq connection

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# data.zip
file_id = '1-Xy-dO8-xuyrYGqCDk_AVQ3qXdry80S4'
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('data.zip')
!unzip -o data.zip
clear_output()
!ls

data  datalab  data.zip  __MACOSX


## Generate Data Partitions

In [0]:
def make_user_sample(orders, order_products, departments, products, user_ids, out_dir):
    orders_sample = orders[orders["user_id"].isin(user_ids)]

    orders_keep = orders_sample["order_id"].values
    order_products_sample = order_products[order_products["order_id"].isin(orders_keep)]

    try:
        os.mkdir(out_dir)
    except:
        pass
    order_products_sample.to_csv(os.path.join(out_dir, "order_products__prior.csv"), index=None)
    orders_sample.to_csv(os.path.join(out_dir, "orders.csv"), index=None)
    departments.to_csv(os.path.join(out_dir, "departments.csv"), index=None)
    products.to_csv(os.path.join(out_dir, "products.csv"), index=None)



def generateDataPartitions(data_dir,partition_dir,target_users,fullset=False, chunksize=1000):
    order_products = pd.concat([pd.read_csv(os.path.join(data_dir,"order_products__prior.csv")),
                                pd.read_csv(os.path.join(data_dir, "order_products__train.csv"))])
    orders = pd.read_csv(os.path.join(data_dir, "orders.csv"))
    departments = pd.read_csv(os.path.join(data_dir, "departments.csv"))
    products = pd.read_csv(os.path.join(data_dir, "products.csv"))

    import string
    printable = set(string.printable)
    products.product_name = products.product_name.apply( lambda pn: filter(lambda x: x in printable, pn) )
    
    users_unique = orders["user_id"].unique()
    if (fullset):
      target_users = len(users_unique)
    part_num = 0
    try:
        os.mkdir(partition_dir)
    except:
        pass
    for i in tqdm(range(0, target_users, chunksize)):
        users_keep = users_unique[i: i+chunksize]
        make_user_sample(orders, order_products, departments, products, users_keep, os.path.join(partition_dir, "part_%d" % part_num))
        part_num += 1

In [7]:
# target_users is number of users to generate partitions for to speed up demonstration  should process full data for best prections, flip to "fullset=True", takes ~10 mins  
generateDataPartitions("data", "partitioned_data", 10000, fullset=False, chunksize=1000)  

100%|██████████| 10/10 [00:30<00:00,  3.04s/it]


## Load Data

In [0]:
from dask import bag
from dask.diagnostics import ProgressBar
import numpy as np

from sklearn.ensemble import RandomForestClassifier

In [0]:
order_products = pd.read_csv(os.path.join("partitioned_data/part_1/order_products__prior.csv"))
orders = pd.read_csv(os.path.join("partitioned_data/part_1/orders.csv"))
departments = pd.read_csv(os.path.join("partitioned_data/part_1/departments.csv"))
products = pd.read_csv(os.path.join("partitioned_data/part_1/products.csv"))

order_products = order_products.merge(products).merge(departments)

In [0]:
def load_entityset(data_dir):
  order_products = pd.read_csv(os.path.join(data_dir, "order_products__prior.csv"))
  orders = pd.read_csv(os.path.join(data_dir, "orders.csv"))
  departments = pd.read_csv(os.path.join(data_dir, "departments.csv"))
  products = pd.read_csv(os.path.join(data_dir, "products.csv"))

  order_products = order_products.merge(products).merge(departments)

  def add_time(df):
    df.reset_index(drop=True)
    df["order_time"] = np.nan
    days_since = df.columns.tolist().index("days_since_prior_order")
    hour_of_day = df.columns.tolist().index("order_hour_of_day")
    order_time = df.columns.tolist().index("order_time")

    df.iloc[0, order_time] = pd.Timestamp('Jan 1, 2015') +  pd.Timedelta(df.iloc[0, hour_of_day], "h")
    for i in xrange(1, df.shape[0]):
      df.iloc[i, order_time] = df.iloc[i-1, order_time] \
                                + pd.Timedelta(df.iloc[i, days_since], "d") \
                                + pd.Timedelta(df.iloc[i, hour_of_day], "h")

    return df

  orders = orders.groupby("user_id").apply(add_time)
  order_products = order_products.merge(orders[["order_id", "order_time"]])
  order_products["order_product_id"] = order_products["order_id"].astype(str) + "_" + order_products["add_to_cart_order"].astype(str)
  order_products.drop(["product_id", "department_id", "add_to_cart_order"], axis=1, inplace=True)
  es = ft.EntitySet("instacart")


  es.entity_from_dataframe(entity_id="order_products",
                           dataframe=order_products,
                           index="order_product_id",
                           variable_types={"aisle_id": ft.variable_types.Categorical, "reordered": ft.variable_types.Boolean},
                           time_index="order_time")

  es.entity_from_dataframe(entity_id="orders",
                           dataframe=orders,
                           index="order_id",
                           time_index="order_time")

  es.add_relationship(ft.Relationship(es["orders"]["order_id"], es["order_products"]["order_id"]))

  es.normalize_entity(base_entity_id="orders", new_entity_id="users", index="user_id")
  es.add_last_time_indexes()

  es["order_products"]["department"].interesting_values = list(set(es["order_products"].df.department.values))
  es["order_products"]["product_name"].interesting_values = ["Banana", "Organic Strawberries", "Organic Avocado"] 
  return es

In [56]:
es = load_entityset("partitioned_data/part_1/")
es

Entityset: instacart
  Entities:
    order_products [Rows: 156605, Columns: 7]
    orders [Rows: 16328, Columns: 8]
    users [Rows: 1000, Columns: 2]
  Relationships:
    order_products.order_id -> orders.order_id
    orders.user_id -> users.user_id

## Make Labels

In [0]:
def make_labels(es, training_window, cutoff_time, prediction_window):
  prediction_window_end = cutoff_time + prediction_window
  t_start = cutoff_time - training_window

  orders = es["orders"].df
  ops = es["order_products"].df

  training_data = ops[(ops["order_time"] <= cutoff_time) & (ops["order_time"] > t_start)]
  prediction_data = ops[(ops["order_time"] > cutoff_time) & (ops["order_time"] < prediction_window_end)]

  users_in_training = training_data.merge(orders)["user_id"].unique()

  valid_pred_data = prediction_data.merge(orders)
  valid_pred_data = valid_pred_data[valid_pred_data["user_id"].isin(users_in_training)]

  # list of products  
  empty_dict = dict((p, 0) for p in es["order_products"]["product_name"].interesting_values)
  def bought_products(df, d):
    for p in df.product_name.values:
      if p in d:
        d[p] = 1
    if len(d.values()) == 1:
      return d.values()[0]
    else:
      return d.values()

  labels = valid_pred_data.groupby("user_id").apply((lambda x: bought_products(x, empty_dict.copy()))).reset_index()
  labels["cutoff_time"] = cutoff_time
  #  rename and reorder
  labels.columns = ["user_id", "label", "time",]
  labels = labels[["user_id", "time", "label"]]

  return labels

In [58]:
label_times = make_labels(es=es,
                          training_window = ft.Timedelta("30 days"), 
                          cutoff_time = pd.Timestamp('March 15, 2015'),
                          prediction_window = ft.Timedelta("30 days"))

label_times.sample(5)

Unnamed: 0,user_id,time,label
299,1369,2015-03-15,"[0, 0, 0]"
309,1380,2015-03-15,"[0, 0, 0]"
527,1667,2015-03-15,"[0, 1, 0]"
268,1335,2015-03-15,"[0, 0, 0]"
601,1759,2015-03-15,"[0, 0, 0]"


## Automated Feature Engineering

In [59]:
feature_matrix, features = ft.dfs(target_entity="users", 
                                  cutoff_time=label_times,
                                  training_window=ft.Timedelta("30 days"), # same as above
                                  entityset=es,
                                  verbose=True)
# encode categorical values
fm_encoded, features_encoded = ft.encode_features(feature_matrix, features)

Built 106 features
Elapsed: 00:16 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 10/10 chunks


## Machine Learning

In [0]:
X = fm_encoded
X["label"] = label_times["label"].values
X = X.fillna(0)
y = X.pop("label")


from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
label_encoder = LabelEncoder()

integer_encoded = label_encoder.fit_transform(y)

onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
encoded_y = onehot_encoder.fit_transform(integer_encoded)

from sklearn.preprocessing import StandardScaler 
x_scaler = StandardScaler()

normal_X = x_scaler.fit_transform(X=X)

In [0]:
y_banana = label_times.copy()
y_banana.label = y_banana.label.apply((lambda x: x[0]))

y_strawberries = label_times.copy()
y_strawberries.label = y_strawberries.label.apply((lambda x: x[1]))

y_avocado = label_times.copy()
y_avocado.label = y_avocado.label.apply((lambda x: x[2]))

### Random forest

In [0]:
clf = RandomForestClassifier(n_estimators=400, n_jobs=-1)

In [63]:
def feature_importances(model, features, n=10):
    importances = model.feature_importances_
    zipped = sorted(zip(features, importances), key=lambda x: -x[1])
    for i, f in enumerate(zipped[:n]):
        print "%d: Feature: %s, %.3f" % (i+1, f[0].get_name(), f[1])

    return [f[0] for f in zipped[:n]]

clf.fit(normal_X, encoded_y)
top_features = feature_importances(clf, features_encoded, n=20)

1: Feature: COUNT(order_products WHERE product_name = Organic Strawberries), 0.034
2: Feature: COUNT(order_products WHERE product_name = Banana), 0.029
3: Feature: COUNT(order_products WHERE product_name = Organic Avocado), 0.022
4: Feature: COUNT(order_products WHERE department = produce), 0.022
5: Feature: MODE(order_products.product_name) = Banana, 0.017
6: Feature: COUNT(order_products WHERE department = dairy eggs), 0.015
7: Feature: MEAN(orders.order_hour_of_day), 0.015
8: Feature: PERCENT_TRUE(order_products.reordered), 0.015
9: Feature: MEAN(orders.PERCENT_TRUE(order_products.reordered)), 0.015
10: Feature: SUM(orders.PERCENT_TRUE(order_products.reordered)), 0.015
11: Feature: MEAN(orders.NUM_UNIQUE(order_products.product_name)), 0.014
12: Feature: MIN(orders.PERCENT_TRUE(order_products.reordered)), 0.014
13: Feature: SUM(orders.NUM_UNIQUE(order_products.product_name)), 0.014
14: Feature: NUM_UNIQUE(order_products.product_name), 0.014
15: Feature: MIN(orders.order_hour_of_day),

In [0]:
ft.save_features(top_features, "top_features")

In [65]:
# take all except last order for model fit
# take only last order for user for predict
rf_banana = RandomForestClassifier(n_estimators=400, n_jobs=-1)
rf_banana.fit(X.values, y_banana.label.values)
pred_banana = rf_banana.predict(X.values)

rf_strawberries = RandomForestClassifier(n_estimators=400, n_jobs=-1)
rf_strawberries.fit(X.values, y_strawberries.label.values)
pred_strawberries = rf_strawberries.predict(X.values)

rf_avocado = RandomForestClassifier(n_estimators=400, n_jobs=-1)
rf_avocado.fit(X.values, y_avocado.label.values)
pred_avocado = rf_avocado.predict(X.values)


"""rf_milk = RandomForestClassifier(n_estimators=400, n_jobs=-1)
rf_milk.fit(X.values, y_milk.label.values)
pred_milk = rf_milk.predict(X.values)

rf_spinach = RandomForestClassifier(n_estimators=400, n_jobs=-1)
rf_spinach.fit(X.values, y_spinach.label.values)
pred_spinach = rf_spinach.predict(X.values)"""

'rf_milk = RandomForestClassifier(n_estimators=400, n_jobs=-1)\nrf_milk.fit(X.values, y_milk.label.values)\npred_milk = rf_milk.predict(X.values)\n\nrf_spinach = RandomForestClassifier(n_estimators=400, n_jobs=-1)\nrf_spinach.fit(X.values, y_spinach.label.values)\npred_spinach = rf_spinach.predict(X.values)'

In [66]:
users_buy = X_all.get(["user_id"])
users_buy["banana"] = pred_banana
users_buy["strawberries"] = pred_strawberries
users_buy["avocado"] = pred_avocado

#users_buy["milk"] = pred_milk
#users_buy["spinach"] = pred_spinach
#
users_buy.sample(5)

Unnamed: 0,user_id,banana,strawberries,avocado
757,1955,0,1,0
293,1362,0,0,0
381,1475,0,0,0
641,1814,0,0,0
456,1570,0,0,0


In [0]:
product_ids = []
for i in range(0, len(users_buy)):
  d = users_buy.iloc[i].drop("user_id").to_dict()
  product_id_list = []
  for product_id, buyTF in d.iteritems():
    if buyTF == 1:
      product_id_list.append(product_id)
  if len(product_id_list) > 0:
    product_ids.append(product_id_list)
  else:
    product_ids.append(["None"])
users_buy["product_ids"] = product_ids
users_buy = users_buy.get(["user_id", "product_ids"])

In [74]:
users_buy.sample(10)

Unnamed: 0,user_id,product_ids
378,1470,[None]
20,1023,[None]
414,1518,"[strawberries, avocado, banana]"
494,1621,[None]
678,1858,[None]
365,1451,[None]
326,1402,[strawberries]
110,1139,[avocado]
12,1014,[None]
214,1265,[None]
