In [52]:
import os

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

import xgboost as xgb
# NOTE(review): this binds the xgboost *module* to the name XGBClassifier, not the
# classifier class; `from xgboost import XGBClassifier` was probably intended — confirm
# before calling XGBClassifier(...) anywhere.
import xgboost as XGBClassifier


from Standard_functions import plot_features, get_user_split_data, plot_fit_score_pred, fit_score_pred, kfold_val_fit_score_pred_log, kfold_val_fit_score_pred_G_NB, kfold_val_fit_score_pred_RF, fit_score_pred_RF, kfold_val_fit_score_pred_M_NB

import warnings

# Show every warning each time it is triggered instead of only the first occurrence.
warnings.filterwarnings('always') 
# Silence NumPy's floating-point divide-by-zero and invalid-operation warnings.
np.seterr(divide='ignore', invalid='ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Work from the data directory so the relative pickle filenames read below resolve.
os.chdir('../Data/')

In [3]:
# Load the pickled feature frame used by the baseline model runs below.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
df = pd.read_pickle('full_features')

## Logistic Regression
Logistic Regression achieved the following score with the features listed below:  
Average f1-score: 0.144 +- 0.016
* num_ord_per_user_per_prod  
* product_total_orders  
* product_avg_add_to_cart_order  
* user_total_orders  
* user_avg_cartsize  
* user_total_products  
* user_avg_days_since_prior_order  
* user_product_avg_add_to_cart_order  
* user_product_order_freq  
* number_orders_since_product_last_ordered  
* percent_of_time_product_ordered  
* mean_order_hour_of_day_by_user  
* diff_between_average_and_current_order_time  
* max_order_number  
* recent_order_weight  

After dropping max_order_number the following score was achieved:  
Average f1-score: 0.250 +- 0.001
   
## Gaussian Naive Bayes - All Features
Average f1-score: 0.347 +- 0.001


After dropping max_order_number the following score was achieved:  
Average f1-score: 0.400 +- 0.001


## Random Forest n_estimators = 10  - All Features
Average f1-score: 0.278 +- 0.002

## Multinomial Naive Bayes - All Features
Average f1-score: 0.183 +- 0.001


After dropping max_order_number the following score was achieved:  
Average f1-score: 0.286 +- 0.000



# Testing Models with KFold Cross Validation

### Logistic Regression

In [6]:
# K-fold logistic regression on the frame loaded from 'full_features'; the per-fold and
# average F1 scores appear in the output below.
kfold_val_fit_score_pred_log(df)



Individual f-1 score:  [0.24854027402000123, 0.25192092680257866, 0.2501146657493407, 0.24945508310876874, 0.2505835158567738]
Average f1-score: 0.250 +- 0.001


### Gaussian Naive Bayes

In [7]:
# K-fold Gaussian Naive Bayes on the frame loaded from 'full_features'; the per-fold and
# average F1 scores appear in the output below.
kfold_val_fit_score_pred_G_NB(df)

Individual f-1 score:  [0.40078980926654684, 0.398303739110914, 0.3996617211169772, 0.39949921512655223, 0.39952020112117387]
Average f1-score: 0.400 +- 0.001


### Multinomial Naive Bayes

In [8]:
# K-fold Multinomial Naive Bayes on the frame loaded from 'full_features'; the per-fold and
# average F1 scores appear in the output below.
kfold_val_fit_score_pred_M_NB(df)

Individual f-1 score:  [0.28663903956515396, 0.28639571016703874, 0.28620533573544504, 0.28682152283079676, 0.28556139654162765]
Average f1-score: 0.286 +- 0.000


### Random Forest Classifier

In [None]:
# K-fold Random Forest on the frame loaded from 'full_features'.
# NOTE(review): this cell has no execution count or output in the saved notebook, so no
# score is recorded for this run.
kfold_val_fit_score_pred_RF(df)

# Adding Departments

In [9]:
# Rebind df to the pickled frame that includes the department features; every model run
# from here on uses this frame instead of the one loaded earlier.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
df = pd.read_pickle('full_features_with_dep')

### Logistic Regression

In [10]:
# K-fold logistic regression on the frame loaded from 'full_features_with_dep'; compare
# the average F1 in the output below with the earlier run without department features.
kfold_val_fit_score_pred_log(df)



Individual f-1 score:  [0.24800991306779555, 0.2538462999544142, 0.24596284035381905, 0.24186042030004914, 0.24486354987113715]
Average f1-score: 0.247 +- 0.004


### Gaussian Naive Bayes

In [11]:
# K-fold Gaussian Naive Bayes on the frame loaded from 'full_features_with_dep'; the
# per-fold and average F1 scores appear in the output below.
kfold_val_fit_score_pred_G_NB(df)

Individual f-1 score:  [0.40018050686112994, 0.40026553556336997, 0.4007927546897124, 0.40028699762080033, 0.40172806861198523]
Average f1-score: 0.401 +- 0.001


### Multinomial Naive Bayes

In [12]:
# K-fold Multinomial Naive Bayes on the frame loaded from 'full_features_with_dep'; the
# per-fold and average F1 scores appear in the output below.
kfold_val_fit_score_pred_M_NB(df)

Individual f-1 score:  [0.28628313217835794, 0.2875283548821056, 0.2876747609777055, 0.2869519269685659, 0.28720741658977583]
Average f1-score: 0.287 +- 0.000


### Random Forest Classifier

In [None]:
# K-fold Random Forest on the frame loaded from 'full_features_with_dep'.
# NOTE(review): this cell has no execution count or output in the saved notebook, so no
# score is recorded for this run.
kfold_val_fit_score_pred_RF(df)

# Scaling Data

In [47]:
# Min-max scale the predictive features while leaving identifier/target columns untouched.
# NOTE: this cell rebinds `df`, so it is not idempotent — reload the pickle before re-running it.

# Identifier and target columns that must keep their original values.
non_scaled_columns = ['product_id','latest_cart','in_cart','user_id']
non_scaled_df = df.loc[:, non_scaled_columns]
predict_feats_df = df.drop(non_scaled_columns, axis=1)
columns = predict_feats_df.columns.tolist()

# Build the 'scaled_<name>' labels in their own list. The previous loop appended to
# `columns` while iterating over it, which never terminates and left this list empty.
scaled_columns = ['scaled_' + col for col in columns]

# Fit the scaler on the feature frame (the previous code read `scaled_df` before it existed).
# NOTE(review): the scaler is fit on all rows before cross-validation, so each validation
# fold's min/max is seen during scaling — consider fitting inside each fold.
scaler = MinMaxScaler()
scaled_df = scaler.fit_transform(predict_feats_df.values)

# Keep the original index so an index-based merge with non_scaled_df lines rows up correctly.
df = pd.DataFrame(scaled_df, columns=scaled_columns, index=predict_feats_df.index)

In [55]:
# Display the generated 'scaled_*' column names. `new_scaled_columns` is never defined in
# this notebook; the list built in the scaling cell is called `scaled_columns`.
scaled_columns

['scaled_num_ord_per_user_per_prod',
 'scaled_product_total_orders',
 'scaled_product_avg_add_to_cart_order',
 'scaled_user_total_orders',
 'scaled_user_avg_cartsize',
 'scaled_user_total_products',
 'scaled_user_avg_days_since_prior_order',
 'scaled_user_product_avg_add_to_cart_order',
 'scaled_user_product_order_freq',
 'scaled_number_orders_since_product_last_ordered',
 'scaled_percent_of_time_product_ordered',
 'scaled_mean_order_hour_of_day_by_user',
 'scaled_diff_between_average_and_current_order_time',
 'scaled_recent_order_weight',
 'scaled_negatively_corr_departments',
 'scaled_positvely_corr_departments']

In [48]:
# Re-attach the identifier/target columns to the scaled features, pairing rows by index.
df = df.merge(non_scaled_df, left_index=True, right_index=True)

### Logistic Regression

In [49]:
# K-fold logistic regression on the min-max-scaled frame; the average F1 in the output
# below (0.267) is higher than the unscaled department run (0.247).
kfold_val_fit_score_pred_log(df)



Individual f-1 score:  [0.27027103557961407, 0.2657603374256232, 0.26698453899108454, 0.2643937928463923, 0.2685578929027214]
Average f1-score: 0.267 +- 0.002


### Gaussian Naive Bayes

In [50]:
# K-fold Gaussian Naive Bayes on the min-max-scaled frame.
# NOTE(review): the fold scores in the output below match the unscaled department run
# digit for digit — presumably because this model is insensitive to per-feature rescaling;
# confirm the cell really ran on the scaled frame.
kfold_val_fit_score_pred_G_NB(df)

Individual f-1 score:  [0.40018050686112994, 0.40026553556336997, 0.4007927546897124, 0.40028699762080033, 0.40172806861198523]
Average f1-score: 0.401 +- 0.001


### Multinomial Naive Bayes

In [51]:
# K-fold Multinomial Naive Bayes on the min-max-scaled frame; the average F1 in the output
# below drops to 0.139 from 0.287 on the unscaled department run.
kfold_val_fit_score_pred_M_NB(df)

Individual f-1 score:  [0.14125383841952227, 0.13835858230370232, 0.13965824271587163, 0.13766116925278168, 0.14041412911084045]
Average f1-score: 0.139 +- 0.001


### Random Forest Classifier

In [None]:
# K-fold Random Forest on the min-max-scaled frame.
# NOTE(review): this cell has no execution count or output in the saved notebook, so no
# score is recorded for this run.
kfold_val_fit_score_pred_RF(df)