In [1]:
# data munging and visualization
import numpy as np
import pandas as pd
import matplotlib
%matplotlib notebook


# feature engineering
import featuretools as ft

# machine learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import os

## 1. Load data

This is a transnational data set which contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail.The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers.

In [2]:
data = pd.read_csv('online_retail_data.csv', encoding = "ISO-8859-1")
len(data)

541909

In [3]:
# deleting all rows without a CustomerID
len(data[data['CustomerID'].isnull()])

135080

In [4]:
data = data[data['CustomerID'].isnull() == False]
len(data)

406829

In [5]:
# sample the data (1/2 of data)
# data = data[0:int(len(data) / 4)]

In [6]:
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [7]:
# converting the InvoiceDate column to a pandas datetime
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])

In [8]:
# extracting month from InvoiceDate
data['InvoiceMonth'] = data['InvoiceDate'].dt.to_period('M')

In [9]:
# building an additional feature
data['Value'] = data['Quantity'] * data['UnitPrice']
data['Value'].mean()

20.401853884564613

In [10]:
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,InvoiceMonth,Value
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,2010-12,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,2010-12,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,2010-12,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,2010-12,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,2010-12,20.34


## 2. EDA

In [11]:
# countries
len(data['Country'].unique())

37

In [12]:
# unique descriptions
len(data['Description'].unique())

3896

In [13]:
# 3684 unique stock codes
len(data['StockCode'].unique())

3684

In [14]:
# rows of data
len(data)

406829

In [15]:
# invoices per customer
int(len(data)*1.0 / len(data['CustomerID'].unique())*1.0)

93

In [16]:
# unique customers 
len(data['CustomerID'].unique())

4372

In [17]:
# first invoice timestamp to last invoice timestamp
data['InvoiceDate'].min(), ' -> ', data['InvoiceDate'].max()

(Timestamp('2010-12-01 08:26:00'), ' -> ', Timestamp('2011-12-09 12:50:00'))

In [18]:
# rows of data for the first customer by index
len(data[data['CustomerID'] == data['CustomerID'][0]])

312

In [19]:
# invoices for the first customer by index
len(data[data['CustomerID'] == data['CustomerID'][0]]['InvoiceNo'].unique())

35

In [20]:
# % of orders with a negative quantity
len(data[data['Quantity'] < 0])*1.0 /  len(data[data['Quantity'] > 0])*1.0

0.02237864516842412

In [21]:
data.to_csv('data.csv')

In [22]:
# if it loads too long
# data = pd.read_csv('data.csv')

## 3. Building an EntitySet

In [23]:
es = ft.EntitySet("online_retail")

In [24]:
orders = data[['InvoiceNo', 'CustomerID', 'InvoiceDate']]
orders.head()

Unnamed: 0,InvoiceNo,CustomerID,InvoiceDate
0,536365,17850.0,2010-12-01 08:26:00
1,536365,17850.0,2010-12-01 08:26:00
2,536365,17850.0,2010-12-01 08:26:00
3,536365,17850.0,2010-12-01 08:26:00
4,536365,17850.0,2010-12-01 08:26:00


In [25]:
len(orders)

406829

In [26]:
orders = orders.groupby('InvoiceNo').mean().reset_index()
orders.head()

Unnamed: 0,InvoiceNo,CustomerID
0,536365,17850.0
1,536366,17850.0
2,536367,13047.0
3,536368,13047.0
4,536369,13047.0


In [27]:
len(orders)

22190

In [28]:
# InvoiceNo | CustomerID | InvoiceDate
invoice_dates = data[['InvoiceNo', 'InvoiceDate']].groupby('InvoiceNo').min().reset_index()
orders = pd.merge(orders, invoice_dates, on='InvoiceNo', how='left')

In [29]:
orders.head()

Unnamed: 0,InvoiceNo,CustomerID,InvoiceDate
0,536365,17850.0,2010-12-01 08:26:00
1,536366,17850.0,2010-12-01 08:28:00
2,536367,13047.0,2010-12-01 08:34:00
3,536368,13047.0,2010-12-01 08:34:00
4,536369,13047.0,2010-12-01 08:35:00


In [30]:
len(orders) == len(data['InvoiceNo'].unique())

True

In [31]:
es.entity_from_dataframe(entity_id='orders',
                        dataframe=orders,
                        index='InvoiceNo',
                        time_index='InvoiceDate')

Entityset: online_retail
  Entities:
    orders (shape = [22190, 3])
  Relationships:
    No relationships

In [32]:
order_products = data[['InvoiceNo', 'Description', 'InvoiceDate', 'InvoiceMonth', 'Country', 'UnitPrice', 'Quantity', 'Value']]
order_products.head()

Unnamed: 0,InvoiceNo,Description,InvoiceDate,InvoiceMonth,Country,UnitPrice,Quantity,Value
0,536365,WHITE HANGING HEART T-LIGHT HOLDER,2010-12-01 08:26:00,2010-12,United Kingdom,2.55,6,15.3
1,536365,WHITE METAL LANTERN,2010-12-01 08:26:00,2010-12,United Kingdom,3.39,6,20.34
2,536365,CREAM CUPID HEARTS COAT HANGER,2010-12-01 08:26:00,2010-12,United Kingdom,2.75,8,22.0
3,536365,KNITTED UNION FLAG HOT WATER BOTTLE,2010-12-01 08:26:00,2010-12,United Kingdom,3.39,6,20.34
4,536365,RED WOOLLY HOTTIE WHITE HEART.,2010-12-01 08:26:00,2010-12,United Kingdom,3.39,6,20.34


In [33]:
es.entity_from_dataframe(entity_id='order_products',
                        dataframe=order_products,
                        index='order_product_id',
                        time_index='InvoiceDate')



Entityset: online_retail
  Entities:
    order_products (shape = [406829, 9])
    orders (shape = [22190, 3])
  Relationships:
    No relationships

In [34]:
# add a relationship between entities -> orders and order_products
es.add_relationship(ft.Relationship(es['orders']['InvoiceNo'], es['order_products']['InvoiceNo']))

Entityset: online_retail
  Entities:
    order_products (shape = [406829, 9])
    orders (shape = [22190, 3])
  Relationships:
    order_products.InvoiceNo -> orders.InvoiceNo

In [35]:
# add customers entity (which we'll use as an entity to predict later on)
es.normalize_entity(base_entity_id='orders', new_entity_id='customers', index='CustomerID')

Entityset: online_retail
  Entities:
    order_products (shape = [406829, 9])
    orders (shape = [22190, 3])
    customers (shape = [4372, 2])
  Relationships:
    order_products.InvoiceNo -> orders.InvoiceNo
    orders.CustomerID -> customers.CustomerID

In [36]:
len(data['CustomerID'].unique()) == len(es['customers'].df)

True

## 4. Building labels

In [37]:
# dataset length
data['InvoiceDate'].max() - data['InvoiceDate'].min()

Timedelta('373 days 04:24:00')

* *training_window* = the amount of historical data we want to use when calculating features
* *cutoff_time* = the point in time to calculate the features

We use cutoff times to avoid the risk of leaking labels into prediction (and overfit the data)

In [38]:
cutoff_time = pd.Timestamp('June 10, 2011')
# 1 month prediction window
prediction_window = ft.Timedelta("4 weeks")
# 6 months of training data
training_window = ft.Timedelta("180 days")

prediction_window_end = cutoff_time + prediction_window
t_start = cutoff_time - training_window

In [39]:
prediction_window_end

Timestamp('2011-07-08 00:00:00')

In [40]:
t_start

Timestamp('2010-12-12 00:00:00')

In [41]:
orders = es['orders'].df
ops = es['order_products'].df

In [42]:
ops.head()

Unnamed: 0_level_0,order_product_id,InvoiceNo,Description,InvoiceDate,InvoiceMonth,Country,UnitPrice,Quantity,Value
order_product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,536365,WHITE HANGING HEART T-LIGHT HOLDER,2010-12-01 08:26:00,2010-12,United Kingdom,2.55,6,15.3
1,1,536365,WHITE METAL LANTERN,2010-12-01 08:26:00,2010-12,United Kingdom,3.39,6,20.34
2,2,536365,CREAM CUPID HEARTS COAT HANGER,2010-12-01 08:26:00,2010-12,United Kingdom,2.75,8,22.0
3,3,536365,KNITTED UNION FLAG HOT WATER BOTTLE,2010-12-01 08:26:00,2010-12,United Kingdom,3.39,6,20.34
4,4,536365,RED WOOLLY HOTTIE WHITE HEART.,2010-12-01 08:26:00,2010-12,United Kingdom,3.39,6,20.34


In [43]:
orders.head()

Unnamed: 0_level_0,InvoiceNo,CustomerID,InvoiceDate
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
536365,536365,17850.0,2010-12-01 08:26:00
536366,536366,17850.0,2010-12-01 08:28:00
536367,536367,13047.0,2010-12-01 08:34:00
536368,536368,13047.0,2010-12-01 08:34:00
536369,536369,13047.0,2010-12-01 08:35:00


In [44]:
training_data = ops[(ops['InvoiceDate'] <= cutoff_time) & (ops['InvoiceDate'] > t_start)]
prediction_data = ops[(ops['InvoiceDate'] > cutoff_time) & (ops['InvoiceDate'] < prediction_window_end)]

In [45]:
users_in_training = training_data.merge(orders)['CustomerID'].unique()

In [46]:
valid_pred_data = prediction_data.merge(orders)
valid_pred_data = valid_pred_data[valid_pred_data['CustomerID'].isin(users_in_training)]

In [47]:
valid_pred_data.head()

Unnamed: 0,order_product_id,InvoiceNo,Description,InvoiceDate,InvoiceMonth,Country,UnitPrice,Quantity,Value,CustomerID
13,158578,556283,DOORMAT KEEP CALM AND COME IN,2011-06-10 08:50:00,2011-06,United Kingdom,7.95,2,15.9,15628.0
14,158579,556283,HANGING HEART JAR T-LIGHT HOLDER,2011-06-10 08:50:00,2011-06,United Kingdom,1.25,12,15.0,15628.0
15,158580,556283,GARDENERS KNEELING PAD KEEP CALM,2011-06-10 08:50:00,2011-06,United Kingdom,1.65,24,39.6,15628.0
16,158581,556283,RETROSPOT TEA SET CERAMIC 11 PC,2011-06-10 08:50:00,2011-06,United Kingdom,4.95,3,14.85,15628.0
17,158582,556283,FELTCRAFT BUTTERFLY HEARTS,2011-06-10 08:50:00,2011-06,United Kingdom,1.45,12,17.4,15628.0


In [48]:
# which value do we want to predict?
label = 'high_value_customer'

### High value customer?

In [49]:
# get average spending per each month - build labels
# total spending / unique months as a customer
# binary labels > 80th percentile

In [50]:
# print("Training/validation set size: ", len(training_data),",", len(valid_pred_data))

In [51]:
# why calculating labels on the validation set?

In [52]:
customer_months = pd.DataFrame(valid_pred_data.groupby('CustomerID')['InvoiceMonth'].nunique())
customer_months.reset_index(inplace=True)
customer_months.head()

Unnamed: 0,CustomerID,InvoiceMonth
0,12362.0,1
1,12395.0,1
2,12407.0,1
3,12408.0,1
4,12415.0,1


In [53]:
total_spending = pd.DataFrame(valid_pred_data.groupby('CustomerID')['Value'].sum())
total_spending.reset_index(inplace=True)
total_spending.head()

Unnamed: 0,CustomerID,Value
0,12362.0,303.76
1,12395.0,163.9
2,12407.0,719.06
3,12408.0,652.53
4,12415.0,23426.81


In [54]:
value_per_customer = pd.merge(customer_months, total_spending)
value_per_customer.columns = ['CustomerID', 'CustomerMonths', 'TotalSpending']
value_per_customer['ValuePerMonth'] = value_per_customer['TotalSpending'] /  value_per_customer['CustomerMonths']
value_per_customer.head()

Unnamed: 0,CustomerID,CustomerMonths,TotalSpending,ValuePerMonth
0,12362.0,1,303.76,303.76
1,12395.0,1,163.9,163.9
2,12407.0,1,719.06,719.06
3,12408.0,1,652.53,652.53
4,12415.0,1,23426.81,23426.81


In [55]:
# extracting top 20% spenders
high_value = int(value_per_customer['ValuePerMonth'].quantile(0.80))

In [56]:
value_per_customer['HighValueCustomer'] = value_per_customer['ValuePerMonth'] > high_value

In [57]:
value_per_customer.head()

Unnamed: 0,CustomerID,CustomerMonths,TotalSpending,ValuePerMonth,HighValueCustomer
0,12362.0,1,303.76,303.76,False
1,12395.0,1,163.9,163.9,False
2,12407.0,1,719.06,719.06,True
3,12408.0,1,652.53,652.53,True
4,12415.0,1,23426.81,23426.81,True


In [58]:
if label == 'high_value_customer':
    # label_times = pd.DataFrame((valid_pred_data.groupby('CustomerID').sum()['Value'] / 2) > high_value).reset_index()
    label_times = value_per_customer[['CustomerID', 'HighValueCustomer']]
    label_times['cutoff_time'] = cutoff_time
    label_times.columns = ['CustomerID', 'label', 'time']

In [59]:
len(label_times['CustomerID'].unique())

722

In [60]:
label_times.head()

Unnamed: 0,CustomerID,label,time
0,12362.0,False,2011-06-10
1,12395.0,False,2011-06-10
2,12407.0,True,2011-06-10
3,12408.0,True,2011-06-10
4,12415.0,True,2011-06-10


### Repurchase product?

In [61]:
# data['Description'].value_counts()[0:20]

In [62]:
# len(data[data['Description'] == 'LUNCH BAG PINK POLKADOT']['CustomerID'].unique())

In [63]:
# product_name = 'LUNCH BAG PINK POLKADOT'

In [64]:
"""

# bought a product?
if label == 'repurchase_product':
    customer_bought_product = valid_pred_data[valid_pred_data['Description'] ==  'LUNCH BAG PINK POLKADOT']['CustomerID'].tolist()
    customer_bought_product_binary = valid_pred_data['CustomerID'].isin(customer_bought_product)
    label_times = pd.DataFrame(valid_pred_data['CustomerID'].unique())
    label_times['label'] = customer_bought_product_binary
    label_times['cutoff_time'] = cutoff_time
    label_times.columns = ['CustomerID', 'label', 'time']
    
    
"""

"\n\n# bought a product?\nif label == 'repurchase_product':\n    customer_bought_product = valid_pred_data[valid_pred_data['Description'] ==  'LUNCH BAG PINK POLKADOT']['CustomerID'].tolist()\n    customer_bought_product_binary = valid_pred_data['CustomerID'].isin(customer_bought_product)\n    label_times = pd.DataFrame(valid_pred_data['CustomerID'].unique())\n    label_times['label'] = customer_bought_product_binary\n    label_times['cutoff_time'] = cutoff_time\n    label_times.columns = ['CustomerID', 'label', 'time']\n    \n    \n"

In [65]:
label_times.head()

Unnamed: 0,CustomerID,label,time
0,12362.0,False,2011-06-10
1,12395.0,False,2011-06-10
2,12407.0,True,2011-06-10
3,12408.0,True,2011-06-10
4,12415.0,True,2011-06-10


In [66]:
# class balance
label_times['label'].value_counts()[1] / label_times['label'].value_counts()[0]

0.25129982668977469

## 5. Automated feature engineering

In [67]:
feature_matrix, features = ft.dfs(target_entity='customers',
                                 cutoff_time=label_times,
                                 training_window=training_window,
                                 entityset=es,
                                 verbose=True)

Building features: 203it [00:00, 1285.29it/s]
Progress: 100%|██████████| 1/1 [01:24<00:00, 84.70s/cutoff time]


In [68]:
# encode categorical features
fm_encoded, features_encoded = ft.encode_features(feature_matrix,
                                                 features)

print('Number of features %s' % len(features_encoded))
fm_encoded.head()

Number of features 264


Unnamed: 0_level_0,COUNT(orders),SKEW(order_products.Quantity),MONTH(first_orders_time) = 1.0,MONTH(first_orders_time) = 12.0,MONTH(first_orders_time) = 3.0,MONTH(first_orders_time) = 2.0,MONTH(first_orders_time) = 4.0,MONTH(first_orders_time) = 5.0,MONTH(first_orders_time) = 6.0,MONTH(first_orders_time) = unknown,...,MIN(orders.SUM(order_products.Quantity)),SUM(orders.MEAN(order_products.Quantity)),MIN(orders.MEAN(order_products.Value)),MIN(orders.SUM(order_products.UnitPrice)),MAX(orders.MEAN(order_products.Quantity)),SKEW(orders.NUM_UNIQUE(order_products.Country)),NUM_UNIQUE(orders.MODE(order_products.Country)),MAX(orders.NUM_UNIQUE(order_products.Description)),SUM(orders.NUM_UNIQUE(order_products.Description)),MIN(orders.NUM_UNIQUE(order_products.Description))
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12362.0,3,2.059116,0,0,0,1,0,0,0,0,...,-2.0,16.96424,-2.45,4.9,9.482759,0.0,1.0,29.0,58.0,2.0
12407.0,2,1.409863,0,0,1,0,0,0,0,0,...,204.0,31.333333,15.086667,35.71,17.0,0.0,1.0,15.0,27.0,12.0
12408.0,4,0.335942,0,0,1,0,0,0,0,0,...,-5.0,12.8125,-14.85,4.95,10.3125,0.0,1.0,32.0,42.0,1.0
12415.0,9,2.093069,1,0,0,0,0,0,0,0,...,-110.0,502.383639,-425.0,0.85,125.268293,0.0,1.0,82.0,279.0,1.0
12421.0,1,1.097479,1,0,0,0,0,0,0,0,...,104.0,9.454545,14.313636,45.73,9.454545,0.0,1.0,11.0,11.0,11.0


In [69]:
pd.Series(fm_encoded.columns)[0:10]

0                         COUNT(orders)
1         SKEW(order_products.Quantity)
2        MONTH(first_orders_time) = 1.0
3       MONTH(first_orders_time) = 12.0
4        MONTH(first_orders_time) = 3.0
5        MONTH(first_orders_time) = 2.0
6        MONTH(first_orders_time) = 4.0
7        MONTH(first_orders_time) = 5.0
8        MONTH(first_orders_time) = 6.0
9    MONTH(first_orders_time) = unknown
dtype: object

## 6. Machine Learning

In [70]:
# with the generated features we use the feature matrix
# merge all features into one
X = fm_encoded.reset_index().merge(label_times)
X.drop(['CustomerID', 'time'], axis=1, inplace=True)
X = X.fillna(0)
y = X.pop('label').astype('int')

In [71]:
y.value_counts()

0    577
1    145
Name: label, dtype: int64

In [84]:
y.value_counts()[1] / y.value_counts()[0]

0.25129982668977469

In [72]:
# random forest classifier, 5-fold cross validation
clf = RandomForestClassifier(n_estimators=400, n_jobs=-1)
scores = cross_val_score(estimator=clf, X=X, y=y, cv=5, scoring='roc_auc', verbose=True)
"AUC %.2f +/- %.2f" % (scores.mean(), scores.std())

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   21.1s finished


'AUC 0.70 +/- 0.11'

In [73]:
scores

array([ 0.77036266,  0.81168252,  0.77106447,  0.65217391,  0.5       ])

In [74]:
# F1 score -> harmonic mean of precision and recall
# precision -> how certain when we predict positive
# recall -> how certain we are aoubt predicting 

In [75]:
# based on the std check for variance
# solution: more data or less features (decrease the degrees of freedom)

In [85]:
clf.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [77]:
# feature importance calculation
importances = clf.feature_importances_
# number of top features to keep
n = 20

zipped = sorted(zip(features, importances), key=lambda x: - x[1])

for i, f in enumerate(zipped[:n]):
    print("%d: Feature: %s, %.3f" % (i+1, f[0].get_name(), f[1]))

top_features = [f[0] for f in zipped[:n]]
top_features

1: Feature: SUM(orders.SKEW(order_products.UnitPrice)), 0.029
2: Feature: STD(order_products.Value), 0.015
3: Feature: STD(orders.NUM_UNIQUE(order_products.InvoiceMonth)), 0.012
4: Feature: SKEW(orders.NUM_UNIQUE(order_products.Description)), 0.011
5: Feature: SUM(orders.MIN(order_products.UnitPrice)), 0.011
6: Feature: SUM(orders.MAX(order_products.UnitPrice)), 0.011
7: Feature: MIN(order_products.Quantity), 0.011
8: Feature: SUM(order_products.UnitPrice), 0.010
9: Feature: MIN(orders.SUM(order_products.UnitPrice)), 0.009
10: Feature: MEAN(orders.MEAN(order_products.Value)), 0.009
11: Feature: STD(orders.MIN(order_products.Quantity)), 0.009
12: Feature: SUM(orders.MEAN(order_products.Value)), 0.009
13: Feature: MEAN(orders.SUM(order_products.Value)), 0.009
14: Feature: MAX(orders.STD(order_products.Quantity)), 0.009
15: Feature: MIN(orders.MAX(order_products.UnitPrice)), 0.009
16: Feature: MIN(orders.MAX(order_products.Value)), 0.009
17: Feature: MIN(orders.SKEW(order_products.Quantit

[<Feature: SUM(orders.SKEW(order_products.UnitPrice))>,
 <Feature: STD(order_products.Value)>,
 <Feature: STD(orders.NUM_UNIQUE(order_products.InvoiceMonth))>,
 <Feature: SKEW(orders.NUM_UNIQUE(order_products.Description))>,
 <Feature: SUM(orders.MIN(order_products.UnitPrice))>,
 <Feature: SUM(orders.MAX(order_products.UnitPrice))>,
 <Feature: MIN(order_products.Quantity)>,
 <Feature: SUM(order_products.UnitPrice)>,
 <Feature: MIN(orders.SUM(order_products.UnitPrice))>,
 <Feature: MEAN(orders.MEAN(order_products.Value))>,
 <Feature: STD(orders.MIN(order_products.Quantity))>,
 <Feature: SUM(orders.MEAN(order_products.Value))>,
 <Feature: MEAN(orders.SUM(order_products.Value))>,
 <Feature: MAX(orders.STD(order_products.Quantity))>,
 <Feature: MIN(orders.MAX(order_products.UnitPrice))>,
 <Feature: MIN(orders.MAX(order_products.Value))>,
 <Feature: MIN(orders.SKEW(order_products.Quantity))>,
 <Feature: MIN(orders.NUM_UNIQUE(order_products.Country))>,
 <Feature: MAX(order_products.UnitPrice

In [78]:
top_features[0]

<Feature: SUM(orders.SKEW(order_products.UnitPrice))>

In [79]:
ft.save_features(top_features, 'top features')

In [80]:
# feature engineering -> aggregations and stacking of aggregations across relationships in the dataset

In [81]:
# rerun the classifier with one the top features to reduce the variance

In [82]:
top_features = ft.load_features('top_features', es)

In [83]:
label_times, es = 

SyntaxError: invalid syntax (<ipython-input-83-1e571318a923>, line 1)