### Finding customer buying propenstiy from yesterdays data who did not purchase vs training set (today)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report 

In [2]:
train_data = pd.read_csv(r'C:\Users\kami.cheung\Documents\GitHub\Customer_propensity_to_purchase_dataset\training_sample.csv')
test = pd.read_csv(r'C:\Users\kami.cheung\Documents\GitHub\Customer_propensity_to_purchase_dataset\testing_sample.csv')

In [3]:
train_data.head()

Unnamed: 0,UserID,basket_icon_click,basket_add_list,basket_add_detail,sort_by,image_picker,account_page_click,promo_banner_click,detail_wishlist_add,list_size_dropdown,...,saw_sizecharts,saw_delivery,saw_account_upgrade,saw_homepage,device_mobile,device_computer,device_tablet,returning_user,loc_uk,ordered
0,a720-6b732349-a720-4862-bd21-644732,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,a0c0-6b73247c-a0c0-4bd9-8baa-797356,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
2,86a8-6b735c67-86a8-407b-ba24-333055,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,1,0
3,6a3d-6b736346-6a3d-4085-934b-396834,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,1,0
4,b74a-6b737717-b74a-45c3-8c6a-421140,0,1,0,1,0,0,0,0,1,...,0,0,0,1,0,0,1,0,1,1


In [4]:
print(train_data.shape)
print(test.shape)

(455401, 25)
(151655, 25)


In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 455401 entries, 0 to 455400
Data columns (total 25 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   UserID                   455401 non-null  object
 1   basket_icon_click        455401 non-null  int64 
 2   basket_add_list          455401 non-null  int64 
 3   basket_add_detail        455401 non-null  int64 
 4   sort_by                  455401 non-null  int64 
 5   image_picker             455401 non-null  int64 
 6   account_page_click       455401 non-null  int64 
 7   promo_banner_click       455401 non-null  int64 
 8   detail_wishlist_add      455401 non-null  int64 
 9   list_size_dropdown       455401 non-null  int64 
 10  closed_minibasket_click  455401 non-null  int64 
 11  checked_delivery_detail  455401 non-null  int64 
 12  checked_returns_detail   455401 non-null  int64 
 13  sign_in                  455401 non-null  int64 
 14  saw_checkout        

In [6]:
train_data.nunique()

UserID                     455401
basket_icon_click               2
basket_add_list                 2
basket_add_detail               2
sort_by                         2
image_picker                    2
account_page_click              2
promo_banner_click              2
detail_wishlist_add             2
list_size_dropdown              2
closed_minibasket_click         2
checked_delivery_detail         2
checked_returns_detail          2
sign_in                         2
saw_checkout                    2
saw_sizecharts                  2
saw_delivery                    2
saw_account_upgrade             2
saw_homepage                    2
device_mobile                   2
device_computer                 2
device_tablet                   2
returning_user                  2
loc_uk                          2
ordered                         2
dtype: int64

In [7]:
train_data.corr()['ordered'].sort_values(ascending=False)

ordered                    1.000000
checked_delivery_detail    0.798720
saw_checkout               0.708986
sign_in                    0.665556
basket_icon_click          0.428334
basket_add_detail          0.414420
basket_add_list            0.287666
saw_homepage               0.157778
list_size_dropdown         0.154867
closed_minibasket_click    0.140011
image_picker               0.071492
returning_user             0.060295
checked_returns_detail     0.059484
account_page_click         0.057279
promo_banner_click         0.056533
sort_by                    0.054636
device_computer            0.049208
loc_uk                     0.031643
saw_delivery               0.031461
saw_account_upgrade        0.025857
detail_wishlist_add        0.023516
device_tablet              0.016939
saw_sizecharts             0.007548
device_mobile             -0.042907
Name: ordered, dtype: float64

We can see that checked out delivery detail, visited checkout page and people who signed in have strong correlation to ordered, however whether users use mobile has low correlation to ordered - so we can remove this column.

In [8]:
# Drop columns with low correlation
predictors = train_data.drop(['ordered','UserID','device_mobile'], axis=1)
# predictors = train[['checked_delivery_pdp', 'basket_icon_click', 'sign_in', 'saw_checkout']]
targets = train_data.ordered

In [9]:
X_train, X_test, y_train, y_test = train_test_split(predictors, targets, test_size=.3)

print("Predictor - Training: ", X_train.shape, "Predictor - Testing:", X_test.shape )

Predictor - Training:  (318780, 22) Predictor - Testing: (136621, 22)


### Model selection & hyperparameter tuning

In [10]:
model_params = {
    'gaussian_nb': {
        'model': GaussianNB(),
        'params' : {
            'var_smoothing': np.logspace(0,-9, num=100)
        }  
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,5,10]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [1,5,10,100,500,1000,5e10],
            'penalty':['l2','l1','elasticnet']
        }
    },
    'lightgbm' : {
        'model': LGBMClassifier(num_leaves = 500, num_iterations =200,  random_state=0),
        'params': {
            'learning_rate': [0.1,0.5,1,5,10]
        }
    }
}

In [11]:
from sklearn.model_selection import GridSearchCV
scores = []

for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df.sort_values('best_score',ascending=False)

35 fits failed out of a total of 105.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\kami.cheung\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\kami.cheung\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\kami.cheung\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 457, in _check_solver
    raise ValueError(
ValueError: Only 'saga' solver supports elasticnet penalty, got solver=liblinear.

 0.99308

Unnamed: 0,model,best_score,best_params
2,logistic_regression,0.993092,"{'C': 1, 'penalty': 'l2'}"
1,random_forest,0.992678,{'n_estimators': 10}
3,lightgbm,0.992597,{'learning_rate': 0.1}
0,gaussian_nb,0.988384,{'var_smoothing': 0.0006579332246575676}


In [16]:
model=LogisticRegression(C=1,penalty='l2')
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.9931708888091875

In [17]:
model=LogisticRegression(C=0.01,penalty='l2')
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.9931196521764589

In [19]:
model=LogisticRegression(solver='sag',C=5e10,penalty='l2')
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.9931708888091875

Tried tuning the regularization strength to a very small amount so as to balance out the strong regularization from default setting of LogisticRegression model C=1.0. As we can see the score actually lowers a little when the strength is strong C=0.01. We have 21 predictors, it might help to reduce some noise - but in the case the strong regularization made the score lower. I used sag solver due to a large dataset (no difference in score).

In [26]:
model=LogisticRegression()
model.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [27]:
model=GaussianNB(var_smoothing= 0.0006579332246575676)
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.9884790771550493

In [28]:
model=LogisticRegression(C=1,penalty='l2')
model=model.fit(X_train,y_train)

predictions=model.predict(X_test)
print(confusion_matrix(y_test,predictions))
print()
print(classification_report(y_test,predictions))

[[129974    843]
 [    90   5714]]

              precision    recall  f1-score   support

           0       1.00      0.99      1.00    130817
           1       0.87      0.98      0.92      5804

    accuracy                           0.99    136621
   macro avg       0.94      0.99      0.96    136621
weighted avg       0.99      0.99      0.99    136621



Now to predict on the previous days visitors!
#### Start by loading in our sample data of the previous days visitors who did not order.

In [29]:
yesterdays_prospects = pd.read_csv(r'C:\Users\kami.cheung\Documents\GitHub\Customer_propensity_to_purchase_dataset\testing_sample.csv')

In [30]:
yesterdays_prospects.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151655 entries, 0 to 151654
Data columns (total 25 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   UserID                   151655 non-null  object
 1   basket_icon_click        151655 non-null  int64 
 2   basket_add_list          151655 non-null  int64 
 3   basket_add_detail        151655 non-null  int64 
 4   sort_by                  151655 non-null  int64 
 5   image_picker             151655 non-null  int64 
 6   account_page_click       151655 non-null  int64 
 7   promo_banner_click       151655 non-null  int64 
 8   detail_wishlist_add      151655 non-null  int64 
 9   list_size_dropdown       151655 non-null  int64 
 10  closed_minibasket_click  151655 non-null  int64 
 11  checked_delivery_detail  151655 non-null  int64 
 12  checked_returns_detail   151655 non-null  int64 
 13  sign_in                  151655 non-null  int64 
 14  saw_checkout        

We're going to drop UserID before we predict on this data, so that is matches our training set, but before we do let's pop it into another variable, so we can pull back this identifier later. Once that's done we can drop our unwanted fields and print the head() to check our data

In [31]:
userids = yesterdays_prospects.UserID

yesterdays_prospects = yesterdays_prospects.drop(columns=['UserID','ordered','device_mobile'],axis=1)

yesterdays_prospects.head()

Unnamed: 0,basket_icon_click,basket_add_list,basket_add_detail,sort_by,image_picker,account_page_click,promo_banner_click,detail_wishlist_add,list_size_dropdown,closed_minibasket_click,...,sign_in,saw_checkout,saw_sizecharts,saw_delivery,saw_account_upgrade,saw_homepage,device_computer,device_tablet,returning_user,loc_uk
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [32]:
yesterdays_prospects.shape

(151655, 22)

Now we'll run our predictions on the best performing and tuned model, and insert them into a field called 'propensity', print the head

In [33]:
model=LogisticRegression(solver='sag',C=5e10,penalty='l2')
model.fit(X_train,y_train)
yesterdays_prospects['propensity']=model.predict_proba(yesterdays_prospects)[:,1]

yesterdays_prospects.head()

Unnamed: 0,basket_icon_click,basket_add_list,basket_add_detail,sort_by,image_picker,account_page_click,promo_banner_click,detail_wishlist_add,list_size_dropdown,closed_minibasket_click,...,saw_checkout,saw_sizecharts,saw_delivery,saw_account_upgrade,saw_homepage,device_computer,device_tablet,returning_user,loc_uk,propensity
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1.280195e-10
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1.280195e-10
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1.280195e-10
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,2.262691e-10
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1.280195e-10


In [34]:
pd.DataFrame(userids)
results = pd.concat([userids, yesterdays_prospects], axis=1)

In [35]:
print(results[['UserID','propensity']].sort_values('propensity',ascending=False).head(20))

                                     UserID  propensity
61686   62d7-k40b-86b462d7-k40b-4d79-325833    0.990224
19282   7d4k-5058-46bk7d4k-5058-4j5b-892506    0.980185
30932    4jk6-b11k-dk774jk6-b11k-4825-64817    0.979476
9807    d7d3-7k0b-77b3d7d3-7k0b-4d7j-277864    0.966530
22137   0071-474d-k7740071-474d-40d7-136068    0.965503
32662   5143-72kd-d4535143-72kd-40j8-689442    0.962705
60361   747k-d723-b145747k-d723-4bb7-708577    0.960426
47568   k36b-3736-9d6jk36b-3736-4414-924963    0.960426
73743   k34d-151j-9j89k34d-151j-4bbk-290563    0.959922
101194  k45d-7k17-6k74k45d-7k17-4064-526618    0.959333
128454  k787-1450-37dbk787-1450-4318-853205    0.959333
118877  bdkd-4jk6-244bbdkd-4jk6-4j54-600198    0.959333
52271   kd4d-437d-j4dbkd4d-437d-4k64-689351    0.958978
50105   44bk-231b-j16344bk-231b-4407-278970    0.958024
19057   d477-d2k3-4665d477-d2k3-4727-128917    0.957590
66681   1484-2j6b-87441484-2j6b-4683-737330    0.957590
25483   7jk7-9842-d3b77jk7-9842-45b3-929164    0

In [36]:
results.sort_values('propensity',ascending=False).head(20)

Unnamed: 0,UserID,basket_icon_click,basket_add_list,basket_add_detail,sort_by,image_picker,account_page_click,promo_banner_click,detail_wishlist_add,list_size_dropdown,...,saw_checkout,saw_sizecharts,saw_delivery,saw_account_upgrade,saw_homepage,device_computer,device_tablet,returning_user,loc_uk,propensity
61686,62d7-k40b-86b462d7-k40b-4d79-325833,1,0,1,0,0,0,0,0,0,...,1,0,0,0,1,1,0,1,1,0.990224
19282,7d4k-5058-46bk7d4k-5058-4j5b-892506,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,1,0,1,1,0.980185
30932,4jk6-b11k-dk774jk6-b11k-4825-64817,0,0,1,0,0,0,0,0,0,...,1,0,0,0,1,1,0,0,1,0.979476
9807,d7d3-7k0b-77b3d7d3-7k0b-4d7j-277864,1,1,1,0,1,0,0,0,0,...,1,0,0,0,1,1,0,1,1,0.96653
22137,0071-474d-k7740071-474d-40d7-136068,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,1,1,0.965503
32662,5143-72kd-d4535143-72kd-40j8-689442,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,1,0.962705
60361,747k-d723-b145747k-d723-4bb7-708577,1,1,1,1,0,0,0,0,0,...,1,0,0,0,1,1,0,1,1,0.960426
47568,k36b-3736-9d6jk36b-3736-4414-924963,1,1,1,1,0,0,0,0,0,...,1,0,0,0,1,1,0,1,1,0.960426
73743,k34d-151j-9j89k34d-151j-4bbk-290563,1,0,1,1,1,0,0,0,1,...,1,0,0,0,1,1,1,1,1,0.959922
101194,k45d-7k17-6k74k45d-7k17-4064-526618,1,1,1,0,1,0,0,0,1,...,1,0,0,0,1,1,0,1,1,0.959333


## GaussianNB model

In [37]:
X_train, X_test, y_train, y_test = train_test_split(predictors, targets, test_size=.3)

print("Predictor - Training: ", X_train.shape, "Predictor - Testing:", X_test.shape )

Predictor - Training:  (318780, 22) Predictor - Testing: (136621, 22)


In [38]:
model=GaussianNB(var_smoothing= 0.0006579332246575676)
model=model.fit(X_train,y_train)

predictions=model.predict(X_test)
print(confusion_matrix(y_test,predictions))
print()
print(classification_report(y_test,predictions))

[[129436   1483]
 [    71   5631]]

              precision    recall  f1-score   support

           0       1.00      0.99      0.99    130919
           1       0.79      0.99      0.88      5702

    accuracy                           0.99    136621
   macro avg       0.90      0.99      0.94    136621
weighted avg       0.99      0.99      0.99    136621



In [39]:
yesterdays_prospects = pd.read_csv(r'C:\Users\kami.cheung\Documents\GitHub\Customer_propensity_to_purchase_dataset\testing_sample.csv')

userids = yesterdays_prospects.UserID

yesterdays_prospects = yesterdays_prospects.drop(columns=['UserID','ordered','device_mobile'],axis=1)

yesterdays_prospects.head()

Unnamed: 0,basket_icon_click,basket_add_list,basket_add_detail,sort_by,image_picker,account_page_click,promo_banner_click,detail_wishlist_add,list_size_dropdown,closed_minibasket_click,...,sign_in,saw_checkout,saw_sizecharts,saw_delivery,saw_account_upgrade,saw_homepage,device_computer,device_tablet,returning_user,loc_uk
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [40]:
model=LogisticRegression(solver='sag',C=5e10,penalty='l2')
model.fit(X_train,y_train)
yesterdays_prospects['propensity']=model.predict_proba(yesterdays_prospects)[:,1]

yesterdays_prospects.head()

Unnamed: 0,basket_icon_click,basket_add_list,basket_add_detail,sort_by,image_picker,account_page_click,promo_banner_click,detail_wishlist_add,list_size_dropdown,closed_minibasket_click,...,saw_checkout,saw_sizecharts,saw_delivery,saw_account_upgrade,saw_homepage,device_computer,device_tablet,returning_user,loc_uk,propensity
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1.700048e-09
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1.700048e-09
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1.700048e-09
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,2.98877e-09
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1.700048e-09


In [41]:
print(results[['UserID','propensity']].sort_values('propensity',ascending=False).head(20))

                                     UserID  propensity
61686   62d7-k40b-86b462d7-k40b-4d79-325833    0.990224
19282   7d4k-5058-46bk7d4k-5058-4j5b-892506    0.980185
30932    4jk6-b11k-dk774jk6-b11k-4825-64817    0.979476
9807    d7d3-7k0b-77b3d7d3-7k0b-4d7j-277864    0.966530
22137   0071-474d-k7740071-474d-40d7-136068    0.965503
32662   5143-72kd-d4535143-72kd-40j8-689442    0.962705
60361   747k-d723-b145747k-d723-4bb7-708577    0.960426
47568   k36b-3736-9d6jk36b-3736-4414-924963    0.960426
73743   k34d-151j-9j89k34d-151j-4bbk-290563    0.959922
101194  k45d-7k17-6k74k45d-7k17-4064-526618    0.959333
128454  k787-1450-37dbk787-1450-4318-853205    0.959333
118877  bdkd-4jk6-244bbdkd-4jk6-4j54-600198    0.959333
52271   kd4d-437d-j4dbkd4d-437d-4k64-689351    0.958978
50105   44bk-231b-j16344bk-231b-4407-278970    0.958024
19057   d477-d2k3-4665d477-d2k3-4727-128917    0.957590
66681   1484-2j6b-87441484-2j6b-4683-737330    0.957590
25483   7jk7-9842-d3b77jk7-9842-45b3-929164    0

In [42]:
results.sort_values('propensity',ascending=False).head()

Unnamed: 0,UserID,basket_icon_click,basket_add_list,basket_add_detail,sort_by,image_picker,account_page_click,promo_banner_click,detail_wishlist_add,list_size_dropdown,...,saw_checkout,saw_sizecharts,saw_delivery,saw_account_upgrade,saw_homepage,device_computer,device_tablet,returning_user,loc_uk,propensity
61686,62d7-k40b-86b462d7-k40b-4d79-325833,1,0,1,0,0,0,0,0,0,...,1,0,0,0,1,1,0,1,1,0.990224
19282,7d4k-5058-46bk7d4k-5058-4j5b-892506,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,1,0,1,1,0.980185
30932,4jk6-b11k-dk774jk6-b11k-4825-64817,0,0,1,0,0,0,0,0,0,...,1,0,0,0,1,1,0,0,1,0.979476
9807,d7d3-7k0b-77b3d7d3-7k0b-4d7j-277864,1,1,1,0,1,0,0,0,0,...,1,0,0,0,1,1,0,1,1,0.96653
22137,0071-474d-k7740071-474d-40d7-136068,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,1,1,0.965503


Both models have high score of >99%, however Logistic regression achieved a higher f1 score of 93% and a softer regularization strength vs GaussianNB at 88%.

## Handling imbalanced datasets

In [43]:
train_data = pd.read_csv(r'C:\Users\kami.cheung\Documents\GitHub\Customer_propensity_to_purchase_dataset\training_sample.csv')
test = pd.read_csv(r'C:\Users\kami.cheung\Documents\GitHub\Customer_propensity_to_purchase_dataset\testing_sample.csv')

In [44]:
# Drop columns with low correlation
X = train_data.drop(['ordered','UserID','device_mobile'], axis=1)
# predictors = train[['checked_delivery_pdp', 'basket_icon_click', 'sign_in', 'saw_checkout']]
y = train_data.ordered

In [45]:
from sklearn.utils import resample
from sklearn.model_selection import train_test_split

# Drop columns with low correlation
X = train_data.drop(['ordered','UserID','device_mobile'], axis=1)
# predictors = train[['checked_delivery_pdp', 'basket_icon_click', 'sign_in', 'saw_checkout']]
y = train_data.ordered

# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)

# concatenate our training data back together
X = pd.concat([X_train, y_train], axis=1)

# separate minority and majority classes
not_buy = X[X.ordered==0]
buy = X[X.ordered==1]

# upsample minority
buy_upsampled = resample(buy,
                          replace=True, # sample with replacement
                          n_samples=len(not_buy), # match number in majority class
                          random_state=27) # reproducible results

# combine majority and upsampled minority
upsampled = pd.concat([not_buy, buy_upsampled])

# check new class counts
upsampled.ordered.value_counts()

0    327313
1    327313
Name: ordered, dtype: int64

In [46]:
y_train = upsampled.ordered
X_train = upsampled.drop(columns=['ordered'])

upsampled = LogisticRegression(solver='sag',C=5e10,penalty='l2').fit(X_train, y_train)

upsampled_pred = upsampled.predict(X_test)

# Checking accuracy
print(classification_report(y_test, upsampled_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00    108995
           1       0.85      0.99      0.91      4856

    accuracy                           0.99    113851
   macro avg       0.92      0.99      0.95    113851
weighted avg       0.99      0.99      0.99    113851



In [47]:
#still using our separated classes fraud and not_fraud from above

# downsample majority
not_buy_downsampled = resample(not_buy,
                                replace = False, # sample without replacement
                                n_samples = len(buy), # match minority n
                                random_state = 27) # reproducible results

# combine minority and downsampled majority
downsampled = pd.concat([not_buy_downsampled, buy])

# checking counts
downsampled.ordered.value_counts()

0    14237
1    14237
Name: ordered, dtype: int64

In [48]:
y_train = downsampled.ordered
X_train = downsampled.drop(columns=['ordered'])

downsampled = LogisticRegression(solver='sag',C=5e10,penalty='l2').fit(X_train, y_train)

downsampled_pred = downsampled.predict(X_test)

# Checking accuracy
print(classification_report(y_test, downsampled_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00    108995
           1       0.84      0.99      0.91      4856

    accuracy                           0.99    113851
   macro avg       0.92      0.99      0.95    113851
weighted avg       0.99      0.99      0.99    113851





In [49]:
from imblearn.over_sampling import SMOTE

X = train_data.drop(['ordered','UserID','device_mobile'], axis=1)
y = train_data.ordered

# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)

sm = SMOTE(random_state=27,k_neighbors=11)
X_train, y_train = sm.fit_resample(X_train, y_train)

In [50]:
smote = LogisticRegression(solver='liblinear').fit(X_train, y_train)

smote_pred = smote.predict(X_test)

# Checking accuracy
print(classification_report(y_test, downsampled_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00    108995
           1       0.84      0.99      0.91      4856

    accuracy                           0.99    113851
   macro avg       0.92      0.99      0.95    113851
weighted avg       0.99      0.99      0.99    113851



Upsampling, downsampling and SMOTE achieved 0.91 f1 score, but Upsampling had the highest accuracy. However, the metrics are slightly lower than the imbalanced logistic regression f1 score at 0.93. Balancing the datasets decreased f1 score by 0.2.