In [1]:
import pandas as pd
import numpy as np

from Standard_functions import get_user_split_data, fit_score_pred_G_NB, fit_score_pred_log

import os

%matplotlib inline

In [2]:
os.chdir('Data/')

In [3]:
df = pd.read_pickle('investigation_df')

# Establishing our Baseline

## Without Categorical Data

In [5]:
X_tr, X_val, y_tr, y_val = get_user_split_data(df, val_size=.2, seed=42)
print('Gaussian Naive Bayes:')
fit_score_pred_G_NB(X_tr, X_val, y_tr, y_val)
print()
print('Logistic Regression:')
fit_score_pred_log(df, X_tr, X_val, y_tr, y_val)

Gaussian Naive Bayes:
Our f1-score is 0.3947739449196353
And we've predicted 44330 non-re-orders and 5818 re-orders.

Logistic Regression:
Our f1-score is 0.25419039869812854
The coefficients are: 
                                        Features  Coefficients
0                     num_ord_per_user_per_prod        0.0029
1                          product_total_orders        0.0417
2                 product_avg_add_to_cart_order       -0.0727
3                             user_total_orders        0.0066
4                             user_avg_cartsize        0.0204
5                           user_total_products       -0.0025
6               user_avg_days_since_prior_order       -0.0200
7            user_product_avg_add_to_cart_order       -0.0003
8                       user_product_order_freq        1.2600
9      number_orders_since_product_last_ordered       -0.1155
10              percent_of_time_product_ordered        1.3527
11               mean_order_hour_of_day_by_user       -0.



## Let's try adding some dummies

In [17]:
products = pd.read_csv('products.csv')
products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [5]:
products.drop(['product_name','department_id'],axis=1,inplace=True)

In [19]:
product_dummies = pd.get_dummies(products.aisle_id)
product_dummies = product_dummies.merge(products.product_id,
                                       left_index=True,
                                       right_index=True)
product_dummies.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,126,127,128,129,130,131,132,133,134,product_id
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5


In [20]:
df = df.merge(product_dummies,on='product_id')

In [10]:
X_tr, X_val, y_tr, y_val = get_user_split_data(df, val_size=.2, seed=42)
print('Gaussian Naive Bayes:')
fit_score_pred_G_NB(X_tr, X_val, y_tr, y_val)
print()
print('Logistic Regression:')
fit_score_pred_log(df, X_tr, X_val, y_tr, y_val)

Gaussian Naive Bayes:
Our f1-score is 0.21030370097135132
And we've predicted 14380 non-re-orders and 35844 re-orders.

Logistic Regression:
Our f1-score is 0.2574481865284974
The coefficients are: 
                                         Features  Coefficients
0                      num_ord_per_user_per_prod        0.0012
1                           product_total_orders        0.0345
2                  product_avg_add_to_cart_order       -0.0563
3                              user_total_orders        0.0061
4                              user_avg_cartsize        0.0209
5                            user_total_products       -0.0024
6                user_avg_days_since_prior_order       -0.0232
7             user_product_avg_add_to_cart_order       -0.0003
8                        user_product_order_freq        1.2652
9       number_orders_since_product_last_ordered       -0.1171
10               percent_of_time_product_ordered        1.3642
11                mean_order_hour_of_day_by_



## Let's try something a little different

In [22]:
df = pd.read_pickle('investigation_df')
ord_prod_prior_df = pd.read_pickle('products_prior_reduced')
ord_prod_prior_df = ord_prod_prior_df.merge(products,on='product_id')

In [23]:
reordered_by_aisle = ord_prod_prior_df.groupby('aisle_id',as_index=False)['reordered'].sum()
reordered_by_aisle.columns = ['aisle_id','reordered_by_aisle']
reordered_by_aisle.head()

Unnamed: 0,aisle_id,reordered_by_aisle
0,1,808
1,2,732
2,3,5173
3,4,1786
4,5,314


In [24]:
reordered_by_aisle.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 134 entries, 0 to 133
Data columns (total 2 columns):
aisle_id              134 non-null int64
reordered_by_aisle    134 non-null int64
dtypes: int64(2)
memory usage: 3.1 KB


In [25]:
products.head()

Unnamed: 0,product_id,aisle_id
0,1,61
1,2,104
2,3,94
3,4,38
4,5,5


In [26]:
reordered_by_aisle = reordered_by_aisle.merge(products,on='aisle_id')
reordered_by_aisle.head()

Unnamed: 0,aisle_id,reordered_by_aisle,product_id
0,1,808,209
1,1,808,554
2,1,808,886
3,1,808,1600
4,1,808,2539


In [27]:
reordered_by_aisle.drop('aisle_id',inplace=True,axis=1)
reordered_by_aisle.head()

Unnamed: 0,reordered_by_aisle,product_id
0,808,209
1,808,554
2,808,886
3,808,1600
4,808,2539


In [28]:
df = df.merge(reordered_by_aisle,on='product_id')

In [18]:
X_tr, X_val, y_tr, y_val = get_user_split_data(df, val_size=.2, seed=42)
print('Gaussian Naive Bayes:')
fit_score_pred_G_NB(X_tr, X_val, y_tr, y_val)
print()
print('Logistic Regression:')
fit_score_pred_log(df, X_tr, X_val, y_tr, y_val)

Gaussian Naive Bayes:
Our f1-score is 0.3488501277635818
And we've predicted 46044 non-re-orders and 4180 re-orders.

Logistic Regression:
Our f1-score is 0.13421828908554573
The coefficients are: 
                                        Features  Coefficients
0                     num_ord_per_user_per_prod        0.1261
1                          product_total_orders        0.0242
2                 product_avg_add_to_cart_order       -0.0723
3                             user_total_orders       -0.0083
4                             user_avg_cartsize        0.0447
5                           user_total_products       -0.0046
6               user_avg_days_since_prior_order       -0.0185
7            user_product_avg_add_to_cart_order       -0.0230
8                       user_product_order_freq        0.0108
9      number_orders_since_product_last_ordered       -0.1524
10              percent_of_time_product_ordered        0.0119
11               mean_order_hour_of_day_by_user       -0.



## How about Ordinal values

In [30]:
df = pd.read_pickle('investigation_df')
ord_prod_prior_df = pd.read_pickle('products_prior_reduced')
ord_prod_prior_df = ord_prod_prior_df.merge(products,on='product_id')

In [31]:
reordered_by_aisle = ord_prod_prior_df.groupby('aisle_id',as_index=False)['reordered'].sum()
reordered_by_aisle.columns = ['aisle_id','reordered_by_aisle']
reordered_by_aisle.head()

Unnamed: 0,aisle_id,reordered_by_aisle
0,1,808
1,2,732
2,3,5173
3,4,1786
4,5,314


In [32]:
ordinal_by_aisle = (reordered_by_aisle.sort_values('reordered_by_aisle')
                    .reset_index(drop=True).reset_index())
ordinal_by_aisle.drop('reordered_by_aisle',axis=1,inplace=True)
ordinal_by_aisle.columns = ['ordinal_by_aisle','aisle_id']
ordinal_by_aisle.head()

Unnamed: 0,ordinal_by_aisle,aisle_id
0,0,44
1,1,118
2,2,132
3,3,10
4,4,55


In [33]:
ordinal_by_aisle.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134 entries, 0 to 133
Data columns (total 2 columns):
ordinal_by_aisle    134 non-null int64
aisle_id            134 non-null int64
dtypes: int64(2)
memory usage: 2.2 KB


In [34]:
products.head()

Unnamed: 0,product_id,aisle_id
0,1,61
1,2,104
2,3,94
3,4,38
4,5,5


In [35]:
ordinal_by_aisle = ordinal_by_aisle.merge(products,on='aisle_id')
ordinal_by_aisle.head()

Unnamed: 0,ordinal_by_aisle,aisle_id,product_id
0,0,44,292
1,0,44,819
2,0,44,1591
3,0,44,1596
4,0,44,2171


In [36]:
ordinal_by_aisle.drop('aisle_id',inplace=True,axis=1)
ordinal_by_aisle.head()

Unnamed: 0,ordinal_by_aisle,product_id
0,0,292
1,0,819
2,0,1591
3,0,1596
4,0,2171


In [37]:
df = df.merge(ordinal_by_aisle,on='product_id')

In [27]:
X_tr, X_val, y_tr, y_val = get_user_split_data(df, val_size=.2, seed=42)
print('Gaussian Naive Bayes:')
fit_score_pred_G_NB(X_tr, X_val, y_tr, y_val)
print()
print('Logistic Regression:')
fit_score_pred_log(df, X_tr, X_val, y_tr, y_val)

Gaussian Naive Bayes:
Our f1-score is 0.3946876169098391
And we've predicted 44353 non-re-orders and 5871 re-orders.

Logistic Regression:
Our f1-score is 0.24699884887354054
The coefficients are: 
                                        Features  Coefficients
0                     num_ord_per_user_per_prod        0.0387
1                          product_total_orders        0.0608
2                 product_avg_add_to_cart_order       -0.0986
3                             user_total_orders       -0.0036
4                             user_avg_cartsize        0.0093
5                           user_total_products       -0.0006
6               user_avg_days_since_prior_order       -0.0265
7            user_product_avg_add_to_cart_order       -0.0065
8                       user_product_order_freq        0.8719
9      number_orders_since_product_last_ordered       -0.1743
10              percent_of_time_product_ordered        0.9369
11               mean_order_hour_of_day_by_user       -0.



## Wait... why didn't it get better???  
## Maybe this is actually a good user-product feature...

In [39]:
df = pd.read_pickle('investigation_df')
ord_prod_prior_df = pd.read_pickle('products_prior_reduced')
ord_prod_prior_df = ord_prod_prior_df.merge(products,on='product_id')

In [40]:
reordered_by_aisle = ord_prod_prior_df.groupby(['aisle_id','user_id'],as_index=False)['reordered'].sum()
reordered_by_aisle.columns = ['aisle_id','user_id','reordered_by_aisle_by_user']
reordered_by_aisle.head()

Unnamed: 0,aisle_id,user_id,reordered_by_aisle_by_user
0,1,29,1
1,1,64,0
2,1,655,0
3,1,2012,3
4,1,2345,0


In [41]:
reordered_by_aisle.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110681 entries, 0 to 110680
Data columns (total 3 columns):
aisle_id                      110681 non-null int64
user_id                       110681 non-null int64
reordered_by_aisle_by_user    110681 non-null int64
dtypes: int64(3)
memory usage: 3.4 MB


In [42]:
reordered_by_aisle = reordered_by_aisle.merge(products,on='aisle_id')
reordered_by_aisle.head()

Unnamed: 0,aisle_id,user_id,reordered_by_aisle_by_user,product_id
0,1,29,1,209
1,1,29,1,554
2,1,29,1,886
3,1,29,1,1600
4,1,29,1,2539


In [43]:
reordered_by_aisle.drop('aisle_id',inplace=True,axis=1)
reordered_by_aisle.head()

Unnamed: 0,user_id,reordered_by_aisle_by_user,product_id
0,29,1,209
1,29,1,554
2,29,1,886
3,29,1,1600
4,29,1,2539


In [44]:
df = df.merge(reordered_by_aisle,on=['user_id','product_id'])

In [34]:
X_tr, X_val, y_tr, y_val = get_user_split_data(df, val_size=.2, seed=42)
print('Gaussian Naive Bayes:')
fit_score_pred_G_NB(X_tr, X_val, y_tr, y_val)
print()
print('Logistic Regression:')
fit_score_pred_log(df, X_tr, X_val, y_tr, y_val)

Gaussian Naive Bayes:
Our f1-score is 0.3928069682495083
And we've predicted 44292 non-re-orders and 5856 re-orders.

Logistic Regression:
Our f1-score is 0.25147155003270105
The coefficients are: 
                                        Features  Coefficients
0                     num_ord_per_user_per_prod       -0.0050
1                          product_total_orders        0.0360
2                 product_avg_add_to_cart_order       -0.0936
3                             user_total_orders        0.0055
4                             user_avg_cartsize        0.0188
5                           user_total_products       -0.0025
6               user_avg_days_since_prior_order       -0.0220
7            user_product_avg_add_to_cart_order        0.0045
8                       user_product_order_freq        1.2633
9      number_orders_since_product_last_ordered       -0.1146
10              percent_of_time_product_ordered        1.3511
11               mean_order_hour_of_day_by_user       -0.



## Why don't we think more about what we're trying to predict?

In [4]:
df = pd.read_pickle('investigation_df')

In [5]:
products = pd.read_csv('products.csv')

In [6]:
products.drop(['product_name','department_id'],axis=1,inplace=True)

``` python
# create dummies variables for every aisle
product_dummies = pd.get_dummies(products.aisle_id)

# merge dummies with aisle_id to merge
product_dummies = product_dummies.merge(products.aisle_id,
                                       left_index=True,
                                       right_index=True)

# get in_cart to my dummies dataset
# create df with aisle_id to match and my target variable
aisle_and_in_cart = df.merge(products,on='product_id')[['in_cart','aisle_id']]

# join that with my dummies
dummies_and_target = aisle_and_in_cart.merge(product_dummies,on='aisle_id').drop('aisle_id',axis=1)

from collections import defaultdict

out = defaultdict(list)
in_cart = dummies_and_target.in_cart
dummies_len = len(dummies_and_target.columns)

for i in range(1,dummies_len):
    dummy = dummies_and_target.loc[:,i]
    corr_dic = {'in_cart':in_cart,i:dummy}
    corr_df = pd.DataFrame(corr_dic).corr()
    
    out[i] = corr_df[i]['in_cart']*1000
    
aisles_corr = pd.DataFrame(out,index=range(0,1)).T.reset_index()

aisles_corr.columns = ['aisle_id','aisle_target_correlation']

pd.to_pickle(aisles_corr,'aisle_correlations')
``` 

In [7]:
aisle_corrs = pd.read_pickle('aisle_correlations')

In [8]:
df = df.merge(products,on='product_id')

In [9]:
df = df.merge(aisle_corrs,on='aisle_id')

In [10]:
df.drop('aisle_id',axis=1,inplace=True)

In [11]:
X_tr, X_val, y_tr, y_val = get_user_split_data(df, val_size=.2, seed=42)
print('Gaussian Naive Bayes:')
fit_score_pred_G_NB(X_tr, X_val, y_tr, y_val)
print()
print('Logistic Regression:')
fit_score_pred_log(df, X_tr, X_val, y_tr, y_val)

Gaussian Naive Bayes:
Our f1-score is 0.4048183628948117
And we've predicted 46704 non-re-orders and 5787 re-orders.

Logistic Regression:
Our f1-score is 0.25306392845313014
The coefficients are: 
                                        Features  Coefficients
0                     num_ord_per_user_per_prod        0.0085
1                          product_total_orders        0.0159
2                 product_avg_add_to_cart_order       -0.0710
3                             user_total_orders        0.0041
4                             user_avg_cartsize        0.0192
5                           user_total_products       -0.0021
6               user_avg_days_since_prior_order       -0.0237
7            user_product_avg_add_to_cart_order        0.0018
8                       user_product_order_freq        1.1836
9      number_orders_since_product_last_ordered       -0.1205
10              percent_of_time_product_ordered        1.3038
11               mean_order_hour_of_day_by_user       -0.

