# MVP

MVP for predicting buy or not buy based on a web session at an e-commerce site.

In [14]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import f1_score

Read in observation created from the feature generation process.

In [2]:
observations_df = pd.read_pickle('../data/design.pkl')
observations_df.head()

Unnamed: 0,buy_event,view_count,session_length,avg_len_per_pg
0_1,0,3.0,327.736,163.868
1000008_1,0,1.0,0.0,0.0
100000_1,0,1.0,0.0,0.0
1000025_1,0,1.0,0.0,0.0
1000026_1,0,1.0,0.0,0.0


Create `X` / `y`, and the train/test split 

In [3]:
X, y = observations_df.drop(columns='buy_event'), observations_df.buy_event
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1234)

Create baseline classification models

In [4]:
log_model = LogisticRegression()
svm_model = SVC()
svm_linear_model = SVC(kernel='linear')
rf_model = RandomForestClassifier()
gboost_model = GradientBoostingClassifier()

Fit all models and calcualte F1 score

In [11]:
models = {'Logistic': log_model, 
#           'SVM RBF': svm_model, 
#           'SVM Linear': svm_linear_model, 
          'Random Forrest': rf_model, 
          'Gradient Boost': gboost_model}

for k, m in models.items():
    m.fit(X_train, y_train)
    y_pred = m.predict(X_train)
    print(f'{k} F1 score: {f1_score(y_train, y_pred)}')

Logistic F1 score: 0.0
Random Forrest F1 score: 0.9665907365223994
Gradient Boost F1 score: 0.6451313037415757


In [12]:
rf_model.feature_importances_

array([0.28774709, 0.37301378, 0.33923913])

In [23]:
cv_rf_model = cross_val_score(rf_model, X_train, y_train, scoring='f1', cv=10)

print(f'10=fold F1 score: {cv_rf_model.mean():.2%} +- {cv_rf_model.std():.2%}')

Avg F1 score: 70.49% +- 2.29%


In [22]:
cv_gboost_model = cross_val_score(gboost_model, X_train, y_train, scoring='f1', cv=10)

print(f'10-fold F1 score: {cv_gboost_model.mean():.2%} +- {cv_gboost_model.std():.2%}')

Avg F1 score: 62.90% +- 4.13%
