In [1]:
%%bash
OUTDIR="./data/ecommerce_shopping_intent"

if [ ! -d $OUTDIR ]; then
    file="https://archive.ics.uci.edu/ml/machine-learning-databases/00468/online_shoppers_intention.csv"

    wget $file -P $OUTDIR
fi

In [2]:
import pandas as pd
import numpy as np

from pathlib import Path

In [3]:
seed = 83282168
np.random.seed(seed)

In [4]:
datadir = Path('./data/ecommerce_shopping_intent')

In [5]:
df = pd.read_csv(datadir / 'online_shoppers_intention.csv')

In [6]:
df.head(10)

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False
5,0,0.0,0,0.0,19,154.216667,0.015789,0.024561,0.0,0.0,Feb,2,2,1,3,Returning_Visitor,False,False
6,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.4,Feb,2,4,3,3,Returning_Visitor,False,False
7,1,0.0,0,0.0,0,0.0,0.2,0.2,0.0,0.0,Feb,1,2,1,5,Returning_Visitor,True,False
8,0,0.0,0,0.0,2,37.0,0.0,0.1,0.0,0.8,Feb,2,2,2,3,Returning_Visitor,False,False
9,0,0.0,0,0.0,3,738.0,0.0,0.022222,0.0,0.4,Feb,2,4,1,2,Returning_Visitor,False,False


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

In [8]:
def unique_values(X):
    return ', '.join([str(v) for v in X.sort_values().unique()])

def quartile(X):
    if X.dtype not in ['object', 'bool']:
        return str(np.quantile(X, [0.,0.25,0.5,0.75,1.]).tolist())
    
    return ''

df.agg(['dtype', 'nunique', unique_values, quartile]).T

Unnamed: 0,dtype,nunique,unique_values,quartile
Administrative,int64,27,"0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ...","[0.0, 0.0, 1.0, 4.0, 27.0]"
Administrative_Duration,float64,3335,"0.0, 1.333333333, 2.0, 3.0, 3.5, 4.0, 4.333333...","[0.0, 0.0, 7.5, 93.25625, 3398.75]"
Informational,int64,17,"0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ...","[0.0, 0.0, 0.0, 0.0, 24.0]"
Informational_Duration,float64,1258,"0.0, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 5.0, 5...","[0.0, 0.0, 0.0, 0.0, 2549.375]"
ProductRelated,int64,311,"0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ...","[0.0, 7.0, 18.0, 38.0, 705.0]"
ProductRelated_Duration,float64,9551,"0.0, 0.5, 1.0, 2.333333333, 2.666666667, 3.0, ...","[0.0, 184.1375, 598.9369047499999, 1464.157213..."
BounceRates,float64,1872,"0.0, 2.7300000000000003e-05, 3.35e-05, 3.82999...","[0.0, 0.0, 0.0031124675, 0.016812558499999998,..."
ExitRates,float64,4777,"0.0, 0.00017559299999999998, 0.000250438, 0.00...","[0.0, 0.014285714, 0.0251564025, 0.05, 0.2]"
PageValues,float64,2704,"0.0, 0.038034542000000005, 0.067049546, 0.0935...","[0.0, 0.0, 0.0, 0.0, 361.76374189999996]"
SpecialDay,float64,6,"0.0, 0.2, 0.4, 0.6, 0.8, 1.0","[0.0, 0.0, 0.0, 0.0, 1.0]"


In [9]:
from sklearn.model_selection import train_test_split

In [10]:
train, test = train_test_split(
    df, 
    test_size = 0.3, 
    stratify = df[['Revenue']],
    random_state = seed)

In [11]:
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTENC

from lib.custom_transforms import DtypeMapper, GroupMinority, DropColumn, TransformByDtype, PdDummyEncoder

In [12]:
def encode_features(X, copy=False):
    if copy:
        X = X.copy()

    X['Month'] = X['Month'].map(
        {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'June': 6,
        'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12})
    
    X['VistorType'] = X['VisitorType'].map(
        {'New_Visitor': 1, 'Returning_Visitor': 2, 'Other': 3})
    
    X['Weekend'] = X['Weekend'].map({False: 0, True: 1})
    X['Revenue'] = X['Revenue'].map({False: 0, True: 1})

    return X


In [13]:
def generate_features(X, copy=False):
    if copy:
        X = X.copy()

    X['AvgAdminTime'] = (X['Administrative_Duration'] / X['Administrative']).fillna(0.)
    X['AvgInfoTime'] = (X['Informational_Duration'] / X['Informational']).fillna(0.)
    X['AvgProductTime'] = (X['ProductRelated_Duration'] / X['ProductRelated']).fillna(0.)
    X['Quarter'] = np.select(
        condlist = [X['Month'] <= 3, X['Month'] <= 6, X['Month'] <= 9, X['Month'] <= 12],
        choicelist = [1, 2, 3, 4],
        default = -1
    )

    return X

In [14]:
feature_gen = Pipeline([
    ('encode_features', FunctionTransformer(encode_features, kw_args={'copy': True})), 
    ('new_features', FunctionTransformer(generate_features)),
    ('set_dtypes', DtypeMapper(
        {'category': ['Month', 'Quarter', 'OperatingSystems', 'Browser', 'Region', 
            'TrafficType', 'VisitorType', 'Weekend']}))
])

In [15]:
train_mod = feature_gen.fit_transform(train)

In [16]:
oversampler = SMOTENC(categorical_features=[train_mod.columns.get_loc(c) for c in train_mod.select_dtypes('category')], random_state=seed)

In [17]:
train_mod = oversampler.fit_resample(train_mod, train_mod['Revenue'])[0]

In [18]:
normaliser = Pipeline([
    ('drop_col', DropColumn(
        ['Administrative_Duration', 'Informational_Duration','ProductRelated_Duration', 'Month'])),
    ('minmax_normalise', TransformByDtype(
        transformer = MinMaxScaler(), 
        include_dtypes = ['number'],
        combine_strategy = 'reassign')),
    ('dummy_encoding', PdDummyEncoder(dummy_na=True, drop_first=True))
])

In [19]:
train_mod = normaliser.fit_transform(train_mod)

In [20]:
predictor = LogisticRegression(
    C = 1e12,
    fit_intercept = True,
    class_weight = 'balanced',
    random_state = seed)

In [21]:
predictor.fit(train_mod.loc[:, ~train_mod.columns.isin(['Revenue'])], train_mod['Revenue'])

LogisticRegression(C=1000000000000.0, class_weight='balanced',
                   random_state=83282168)

In [22]:
temp = predictor.predict(train_mod.loc[:, ~train_mod.columns.isin(['Revenue'])])

In [23]:
(temp == train_mod['Revenue']).sum() / len(train_mod)

0.8689513365318712

In [24]:
test = feature_gen.transform(test)

In [25]:
test = normaliser.transform(test)

In [26]:
test['predicted'] = predictor.predict(test.loc[:, ~test.columns.isin(['Revenue'])])

In [27]:
pd.crosstab(test['Revenue'], test['predicted'])

predicted,0.0,1.0
Revenue,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,2891,236
1.0,250,322


In [28]:
(test.predicted == test.Revenue).sum() / len(test)

0.8686131386861314