In [17]:
import time
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from imblearn.over_sampling import SMOTE

from imblearn.pipeline import Pipeline as IMBPipeline
from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB

In [18]:
df = pd.read_csv("online_shoppers_intention.csv")

In [19]:
df.head()
# Ctrl+] to indent and Ctrl+[ 

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [20]:
df.shape

(12330, 18)

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

In [22]:
df.Weekend.unique()

array([False,  True])

In [23]:
df.Weekend = df.Weekend.replace((True, False),(1,0))

In [24]:
df.Weekend.info()

<class 'pandas.core.series.Series'>
RangeIndex: 12330 entries, 0 to 12329
Series name: Weekend
Non-Null Count  Dtype
--------------  -----
12330 non-null  int64
dtypes: int64(1)
memory usage: 96.5 KB


In [25]:
df.Revenue.unique()

array([False,  True])

In [26]:
df.Revenue = df.Revenue.replace((True, False), (1,0))

In [27]:
df.Revenue.info()

<class 'pandas.core.series.Series'>
RangeIndex: 12330 entries, 0 to 12329
Series name: Revenue
Non-Null Count  Dtype
--------------  -----
12330 non-null  int64
dtypes: int64(1)
memory usage: 96.5 KB


In [28]:
df.VisitorType.unique()

array(['Returning_Visitor', 'New_Visitor', 'Other'], dtype=object)

In [29]:
condition = df.VisitorType == "Returning_Visitor"
df['Returning_Visitor'] = np.where(condition, 1,0)

In [30]:
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue,Returning_Visitor
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,0,0,1
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,0,0,1
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,0,0,1
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,0,0,1
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,1,0,1


In [31]:
df = df.drop(columns = ['VisitorType'])

In [32]:
df.columns

Index(['Administrative', 'Administrative_Duration', 'Informational',
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
       'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Month',
       'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'Weekend',
       'Revenue', 'Returning_Visitor'],
      dtype='object')

In [33]:
df.dtypes

Administrative               int64
Administrative_Duration    float64
Informational                int64
Informational_Duration     float64
ProductRelated               int64
ProductRelated_Duration    float64
BounceRates                float64
ExitRates                  float64
PageValues                 float64
SpecialDay                 float64
Month                       object
OperatingSystems             int64
Browser                      int64
Region                       int64
TrafficType                  int64
Weekend                      int64
Revenue                      int64
Returning_Visitor            int32
dtype: object

In [34]:
df.Month.unique()

array(['Feb', 'Mar', 'May', 'Oct', 'June', 'Jul', 'Aug', 'Nov', 'Sep',
       'Dec'], dtype=object)

In [35]:
ordinal_encoder = OrdinalEncoder()
df['Month'] = ordinal_encoder.fit_transform(df[['Month']])

In [36]:
df.Month.unique()

array([2., 5., 6., 8., 4., 3., 0., 7., 9., 1.])

In [37]:
df.Revenue.value_counts()

Revenue
0    10422
1     1908
Name: count, dtype: int64

In [38]:
result=df[df.columns[1:]].corr()['Revenue']

In [39]:
result.sort_values(ascending=False)

Revenue                    1.000000
PageValues                 0.492569
ProductRelated             0.158538
ProductRelated_Duration    0.152373
Informational              0.095200
Administrative_Duration    0.093587
Month                      0.080150
Informational_Duration     0.070345
Weekend                    0.029295
Browser                    0.023984
TrafficType               -0.005113
Region                    -0.011595
OperatingSystems          -0.014668
SpecialDay                -0.082305
Returning_Visitor         -0.103843
BounceRates               -0.150673
ExitRates                 -0.207071
Name: Revenue, dtype: float64

In [40]:
X = df.drop(['Revenue'], axis=1)
y = df['Revenue']

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
test_size = 0.3, random_state = 0)

In [42]:
def model_pipeline(X, model):
    n_c = X.select_dtypes(exclude=['object']).columns.tolist()
    c_c = X.select_dtypes(include=['object']).columns.tolist()

    numeric_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy = 'constant')),
        ('scaler', MinMaxScaler())
    ])

    categorical_pipeline = Pipeline([
        ('encoder', OneHotEncoder(handle_unknown = 'ignore'))
    ])

    preprocessor = ColumnTransformer([
        ('numeric', numeric_pipeline, n_c),
        ('categorical', categorical_pipeline, c_c)
    ], remainder = 'passthrough')

    final_steps = [
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state = 1)), 
        ('feature_selection', SelectKBest(score_func = chi2, k=6)),
        ('model', model)
    ]

    return IMBPipeline(steps=final_steps)

In [43]:
def select_model(X, y, pipeline=None):
    classifiers = {}
    c_d1 = {"RandomForestClassifier": RandomForestClassifier()}
    classifiers.update(c_d1)

    c_d2 = {"DecisionTreeClassifier": DecisionTreeClassifier()}
    classifiers.update(c_d2)

    c_d3 = {"KNeighborsClassifier": KNeighborsClassifier()}
    classifiers.update(c_d3)

    c_d4 = {"RidgeClassifier": RidgeClassifier()}
    classifiers.update(c_d4)

    c_d5 = {"BernoulliNB": BernoulliNB()}
    classifiers.update(c_d5)

    c_d6 = {"SVC": SVC()}
    classifiers.update(c_d6)

    cols = ['model', 'run_time', 'roc_auc']
    df_models = pd.DataFrame(columns=cols)

    for key in classifiers:
        start_time = time.time()
        pipeline = model_pipeline(X_train, classifiers[key])

        cv = cross_val_score(pipeline, X, y, cv=10, scoring = 'roc_auc')

        row = {'model': key,
               'run_time': format(round((time.time()-start_time)/60,2)),
               'roc_auc': cv.mean(),
               }
        
        df_models = pd.concat([df_models, pd.DataFrame([row])], ignore_index=True)
    df_models = df_models.sort_values(by='roc_auc', ascending = False)

    return df_models



In [44]:
import warnings
warnings.filterwarnings('ignore')

In [45]:
models= select_model(X_train, y_train)

In [46]:
print(models)

                    model run_time   roc_auc
5                     SVC     1.69  0.889927
0  RandomForestClassifier     1.23  0.886134
4             BernoulliNB     0.02  0.870663
3         RidgeClassifier     0.02  0.856459
2    KNeighborsClassifier     0.05  0.840885
1  DecisionTreeClassifier     0.05  0.736964


In [47]:
selected_model = SVC()
bundled_pipeline = model_pipeline(X_train, selected_model)
bundled_pipeline.fit(X_train, y_train)

In [48]:
y_pred = bundled_pipeline.predict(X_test)

In [49]:
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [50]:
roc_auc = roc_auc_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
f1_score = f1_score(y_test, y_pred)

In [51]:
print('ROC/AUC:', roc_auc)
print('Accuracy:', accuracy)
print('F1 score:', f1_score)

ROC/AUC: 0.7872136596906621
Accuracy: 0.8775344687753447
F1 score: 0.6413301662707839


In [52]:
classif_report = classification_report(y_test, y_pred)

In [53]:
print(classif_report)

              precision    recall  f1-score   support

           0       0.93      0.92      0.93      3077
           1       0.63      0.65      0.64       622

    accuracy                           0.88      3699
   macro avg       0.78      0.79      0.78      3699
weighted avg       0.88      0.88      0.88      3699

