In [1]:
import os

from collections import defaultdict

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from Standard_functions import get_user_split_data, plot_fit_score_pred,fit_score_pred_G_NB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, f1_score
import copy

import warnings

# Install an 'always' filter so matching warnings are shown on every
# occurrence rather than only the first time they are raised.
warnings.filterwarnings('always')

In [2]:
# Anchor on the directory the kernel was in the first time this cell ran.
# A bare relative chdir is not idempotent: re-running the cell would resolve
# '../Data/' against the directory a previous run already moved into.
NOTEBOOK_DIR = globals().get('NOTEBOOK_DIR', os.getcwd())
os.chdir(os.path.join(NOTEBOOK_DIR, '../Data/'))

## Aisle Ordinal

In [3]:
# Feature table, loaded from pickle with the 'max_order_number' column removed.
df = pd.read_pickle('full_features').drop('max_order_number', axis=1)

# Prior order lines, loaded before the product lookup is attached.
ord_prod_prior_df = pd.read_pickle('products_prior_reduced')

# Product lookup without 'department_id' and 'product_name'; the columns that
# remain are joined onto the prior order lines via 'product_id'.
products = pd.read_csv('products.csv').drop(
    ['department_id', 'product_name'], axis=1
)
ord_prod_prior_df = pd.merge(ord_prod_prior_df, products, on='product_id')

In [4]:
# Sum of the 'reordered' column per aisle, kept as a flat two-column frame
# (aisle_id, reordered_by_aisle).
reordered_by_aisle = (
    ord_prod_prior_df
    .groupby('aisle_id', as_index=False)['reordered']
    .sum()
    .rename(columns={'reordered': 'reordered_by_aisle'})
)

In [5]:
# Rank aisles by their reorder total: sort ascending, renumber the rows from
# 0, then expose that fresh row number as a column. The total itself is
# dropped, leaving only (rank, aisle_id).
ordinal_by_aisle = (
    reordered_by_aisle
    .sort_values('reordered_by_aisle')
    .reset_index(drop=True)
    .reset_index()
    .drop('reordered_by_aisle', axis=1)
)
ordinal_by_aisle.columns = ['ordinal_by_aisle', 'aisle_id']

In [6]:
# Fan the per-aisle rank out to every row of `products` that shares the aisle.
ordinal_by_aisle = pd.merge(ordinal_by_aisle, products, on='aisle_id')

In [7]:
# The aisle key has served its purpose as a join column; remove it.
ordinal_by_aisle = ordinal_by_aisle.drop('aisle_id', axis=1)

In [8]:
# Add the aisle rank to the feature table, matching rows on 'product_id'.
df = pd.merge(df, ordinal_by_aisle, on='product_id')

In [9]:
# Split with an explicit validation fraction and seed, then run the project's
# Gaussian Naive Bayes fit/score helper on that split.
X_tr, X_val, y_tr, y_val = get_user_split_data(
    df,
    val_size=0.2,
    seed=42,
)

print('Gaussian Naive Bayes:')
fit_score_pred_G_NB(X_tr, X_val, y_tr, y_val)

Gaussian Naive Bayes:
Our f1-score is 0.4006502754265753
And we've predicted 1490769 non-re-orders and 204910 re-orders.


## Messing with thresholds

In [10]:
# Re-split the feature table for the threshold experiments that follow.
# NOTE(review): unlike the earlier call, no val_size/seed is passed here, so
# the split depends on get_user_split_data's defaults (not visible in this
# notebook) — confirm they yield the intended, reproducible split.
X_tr, X_val, y_tr, y_val = get_user_split_data(df)

In [11]:
# Fit a Gaussian Naive Bayes model on the training split (fit returns the
# estimator itself, so construction and fitting can be chained).
clf = GaussianNB(var_smoothing=1e-9).fit(X_tr, y_tr)

# Keep only the second probability column for each validation row
# (presumably the re-order class — verify against clf.classes_).
val_proba = clf.predict_proba(X_val)
probabilities = val_proba[:, 1]


In [12]:
# Area under the ROC curve for the validation probabilities.
roc_auc_score(y_true=y_val, y_score=probabilities)

0.79698487743988

In [13]:
# False-positive rates, true-positive rates and the score cut-offs that
# produce them, computed on the validation split.
fpr, tpr, thresholds = roc_curve(y_true=y_val, y_score=probabilities)

In [14]:
# Wrap the probability vector in a one-column frame named 'score'.
probs = pd.DataFrame(probabilities, columns=['score'])

# Independent scratch copy, so edits to `test` never touch `probs`.
test = probs.copy(deep=True)

In [15]:
# Sweep 30 evenly spaced probability cut-offs and record the validation F1
# obtained when a row is labelled 1 only if its score is strictly above the
# cut-off.
threshold = np.linspace(.1, .9, 30)
dict_thresholdf1 = dict()

# Pull the scores out once; the sweep only needs to read them, so there is
# no reason to deep-copy the whole frame on every iteration.
scores = probs['score'].values

for cutoff in threshold:
    # Strictly greater -> 1, everything else (including a score exactly
    # equal to the cut-off) -> 0.
    predicted = (scores > cutoff).astype(int)
    dict_thresholdf1[cutoff] = f1_score(y_val, predicted)

dict_thresholdf1

{0.1: 0.39474871740949574,
 0.12758620689655173: 0.3978379067585905,
 0.15517241379310345: 0.3994436170511704,
 0.1827586206896552: 0.40049866122382566,
 0.21034482758620693: 0.40087926640165333,
 0.23793103448275865: 0.40126180159973124,
 0.2655172413793104: 0.4015118541976216,
 0.2931034482758621: 0.4015085405553068,
 0.3206896551724138: 0.4015983811791638,
 0.34827586206896555: 0.401388222796351,
 0.3758620689655173: 0.40136565629249804,
 0.403448275862069: 0.4013639675625239,
 0.4310344827586208: 0.40117034423753334,
 0.45862068965517244: 0.4008946594363433,
 0.4862068965517242: 0.400587105638839,
 0.5137931034482759: 0.4005521882616855,
 0.5413793103448277: 0.4002648999432745,
 0.5689655172413793: 0.39986558405367895,
 0.5965517241379311: 0.39944790724401785,
 0.6241379310344828: 0.3992726407182811,
 0.6517241379310346: 0.3991185917525371,
 0.6793103448275862: 0.39883892137478444,
 0.706896551724138: 0.39846509618561154,
 0.7344827586206897: 0.39815865271155165,
 0.762068965517241