In [1]:
import os

from collections import OrderedDict

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import scipy
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

from Standard_functions import plot_features, get_user_split_data, plot_fit_score_pred, fit_score_pred, kfold_val_fit_score_pred_log, kfold_val_fit_score_pred_G_NB, kfold_val_fit_score_pred_RF, fit_score_pred_RF, kfold_val_fit_score_pred_M_NB

import warnings

# Notebook-wide configuration.
# 'always' makes every matching warning print each time it is triggered,
# instead of only once per code location.
warnings.filterwarnings('always') 
# Silence numpy's floating-point warnings for divide-by-zero and invalid
# operations (e.g. 0/0); the affected results are left as inf/nan.
np.seterr(divide='ignore', invalid='ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Logistic Regression
Logistic Regression achieved the following score with the features listed below:  
Average f1-score: 0.144 +- 0.016
* num_ord_per_user_per_prod  
* product_total_orders  
* product_avg_add_to_cart_order  
* user_total_orders  
* user_avg_cartsize  
* user_total_products  
* user_avg_days_since_prior_order  
* user_product_avg_add_to_cart_order  
* user_product_order_freq  
* number_orders_since_product_last_ordered  
* percent_of_time_product_ordered  
* mean_order_hour_of_day_by_user  
* diff_between_average_and_current_order_time  
* max_order_number  
* recent_order_weight  

After dropping max_order_number the following score was achieved:  
Average f1-score: 0.250 +- 0.001
   
## Gaussian Naive Bayes - All Features
Average f1-score: 0.347 +- 0.001


After dropping max_order_number the following score was achieved:  
Average f1-score: 0.400 +- 0.001


## Random Forest n_estimators = 10  - All Features
Average f1-score: 0.278 +- 0.002

## Multinomial Naive Bayes - All Features
Average f1-score: 0.183 +- 0.001


After dropping max_order_number the following score was achieved:  
Average f1-score: 0.286 +- 0.000



In [2]:
# Make ../Data/ the working directory so the bare pickle file names passed to
# pd.read_pickle in later cells resolve inside it.
os.chdir('../Data/')

In [3]:
# Load the pickled feature table; the file name is relative to the working
# directory set by the os.chdir cell.
# Note: pickle files can execute arbitrary code on load, so only read
# pickles produced by this project.
df = pd.read_pickle('full_features')

In [None]:
# K-fold evaluation on the full feature set.
# The original call used `kfold_val_fit_score_pred`, which is not among the
# names imported from Standard_functions and is not defined in this notebook,
# so it raises NameError on a fresh kernel. `kfold_val_fit_score_pred_log` is
# the imported helper (presumably the logistic-regression variant).
# NOTE(review): confirm this helper accepts `val_size` and `seed`.
kfold_val_fit_score_pred_log(df, val_size=.2, seed=42)

### Dropping max_order_number increases the f1-score by ~0.10 (about 10 percentage points)

In [4]:
# Remove the max_order_number feature before re-scoring the models.
# Rebinding `df` (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it after the column is already gone
# no longer raises KeyError.
df = df.drop(columns=['max_order_number'], errors='ignore')

In [None]:
# K-fold evaluation after max_order_number has been dropped from df.
# The original call used `kfold_val_fit_score_pred`, which is not among the
# names imported from Standard_functions and is not defined in this notebook,
# so it raises NameError on a fresh kernel. `kfold_val_fit_score_pred_log` is
# the imported helper (presumably the logistic-regression variant).
# NOTE(review): confirm this helper accepts `val_size` and `seed`.
kfold_val_fit_score_pred_log(df, val_size=.2, seed=42)

In [14]:
# Reads the same 'full_features' pickle into a separate variable.
# NOTE(review): `test` is not referenced by any other cell visible in this
# notebook -- looks like a leftover; confirm before removing.
test = pd.read_pickle('full_features')

In [5]:
# Project helper from Standard_functions; per the output below it prints the
# individual per-fold f1-scores and their average. The 0.400 average matches
# the "Gaussian Naive Bayes ... after dropping max_order_number" entry in the
# summary at the top of the notebook.
kfold_val_fit_score_pred_G_NB(df)

1
1
1
1
1
Individual f-1 score:  [0.40078980926654684, 0.398303739110914, 0.3996617211169772, 0.39949921512655223, 0.39952020112117387]
Average f1-score: 0.400 +- 0.001


In [7]:
# Project helper from Standard_functions; per the output below it prints the
# individual per-fold f1-scores and their average. The 0.286 average matches
# the "Multinomial Naive Bayes ... after dropping max_order_number" entry in
# the summary at the top of the notebook.
kfold_val_fit_score_pred_M_NB(df)

1
1
1
1
1
Individual f-1 score:  [0.28663903956515396, 0.28639571016703874, 0.28620533573544504, 0.28682152283079676, 0.28556139654162765]
Average f1-score: 0.286 +- 0.000


In [4]:
# Project helper from Standard_functions; per the output below it prints the
# individual per-fold f1-scores and their average (0.278, listed under
# "Random Forest n_estimators = 10 - All Features" in the summary).
# NOTE(review): this cell's execution count (In [4]) collides with the
# column-drop cell, so it was run in a different kernel session; which
# columns `df` contained at that point cannot be told from the notebook.
kfold_val_fit_score_pred_RF(df)

1
1
1
1
1
Individual f-1 score:  [0.27658351749496574, 0.27635676415414906, 0.2769318277183592, 0.2784755989847244, 0.28041748437230507]
Average f1-score: 0.278 +- 0.002


In [12]:
def fit_score_pred_RF(df, X_train, X_val, y_train, y_val, n_estimators=25, random_state=None):
    """
    Fit a random forest on the training split and report validation results.

    Prints the validation f1-score and how many validation rows were predicted
    as label 0 (non-re-orders) and label 1 (re-orders). Nothing is returned.

    Note: this definition shadows the `fit_score_pred_RF` imported from
    Standard_functions at the top of the notebook.

    Parameters
    ----------
    df : unused; kept so existing calls keep working.
    X_train, y_train : training features and labels passed to `fit`.
    X_val, y_val : validation features and their true labels.
    n_estimators : number of trees in the forest (default 25, the value that
        was previously hard-coded).
    random_state : optional seed forwarded to RandomForestClassifier; the
        default None keeps the previous unseeded behaviour, pass an int for
        reproducible scores.
    """

    rfc = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    rfc.fit(X_train, y_train)

    # Predict once and reuse the result for both the score and the counts.
    y_pred = rfc.predict(X_val)
    vals = pd.Series(y_pred).value_counts()

    # f1_score expects (y_true, y_pred). For binary labels F1 is symmetric in
    # the two arguments, so the printed score is unchanged by this reordering.
    print('Our f1-score is', f1_score(y_val, y_pred))

    # .get(label, 0) avoids a KeyError when a class is never predicted.
    print('And we\'ve predicted', vals.get(0, 0), 'non-re-orders and',
          vals.get(1, 0), 're-orders.')

In [11]:
# Single train/validation split via the project helper get_user_split_data
# (presumably split by user -- confirm in Standard_functions), then fit and
# score the notebook-local random forest helper on that split.
# NOTE(review): the forest is built without a random_state, so the printed
# score is not expected to be reproducible run-to-run.
X_tr, X_val, y_tr, y_val = get_user_split_data(df)
fit_score_pred_RF(df, X_tr, X_val, y_tr, y_val)

Our f1-score is 0.24183378964972507
And we've predicted 1647128 non-re-orders and 48606 re-orders.


In [13]:
# Same split-and-score step as the previous cell, executed again (In [13])
# after fit_score_pred_RF was re-defined (In [12]).
# NOTE(review): the score differs from the earlier run (0.272 vs 0.242);
# whether that comes from an edited n_estimators or from the unseeded forest
# cannot be told from the notebook.
X_tr, X_val, y_tr, y_val = get_user_split_data(df)
fit_score_pred_RF(df, X_tr, X_val, y_tr, y_val)

Our f1-score is 0.27204383282364936
And we've predicted 1643243 non-re-orders and 52491 re-orders.


In [2]:
# Same chdir as near the top of the notebook. The repeated execution count
# (In [2]) suggests this part was run in a fresh kernel session -- confirm.
os.chdir('../Data/')

In [9]:
# Rebinds `df` to a different pickled table ('no_feature_reduction'), so the
# cells below no longer operate on the 'full_features' data loaded earlier.
# Note: only read pickles produced by this project (pickle can run code).
df = pd.read_pickle('no_feature_reduction')

### Dropping max order number increases score by ~10%

In [10]:
# Remove the max_order_number feature from the 'no_feature_reduction' table.
# Rebinding `df` (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it after the column is already gone
# no longer raises KeyError.
df = df.drop(columns=['max_order_number'], errors='ignore')

In [11]:
# Same Gaussian-NB k-fold helper as above, now on the 'no_feature_reduction'
# table with max_order_number dropped; the output below reports an average
# f1-score of 0.323.
kfold_val_fit_score_pred_G_NB(df)

1
1
1
1
1
Individual f-1 score:  [0.3227816853433815, 0.32301157731730695, 0.3229468818648686, 0.3222449635975556, 0.32207629585636083]
Average f1-score: 0.323 +- 0.000
