In [1]:
import pandas as pd
import numpy as np

from pandas_profiling import ProfileReport
from lib.pandas_util import pd_summary
from pathlib import Path

In [2]:
seed = 83282168
np.random.seed(seed)

# Loading of Data

In [3]:
%%bash
OUTDIR="./data/ecommerce_shopping_intent"

if [ ! -f $OUTDIR/online_shoppers_intention.csv ]; then
    file="https://archive.ics.uci.edu/ml/machine-learning-databases/00468/online_shoppers_intention.csv"

    curl $file -P $OUTDIR
fi

In [4]:
datadir = Path('./data/ecommerce_shopping_intent')

In [5]:
df = pd.read_csv(datadir / 'online_shoppers_intention.csv')

In [6]:
df.head(10)

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False
5,0,0.0,0,0.0,19,154.216667,0.015789,0.024561,0.0,0.0,Feb,2,2,1,3,Returning_Visitor,False,False
6,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.4,Feb,2,4,3,3,Returning_Visitor,False,False
7,1,0.0,0,0.0,0,0.0,0.2,0.2,0.0,0.0,Feb,1,2,1,5,Returning_Visitor,True,False
8,0,0.0,0,0.0,2,37.0,0.0,0.1,0.0,0.8,Feb,2,2,2,3,Returning_Visitor,False,False
9,0,0.0,0,0.0,3,738.0,0.0,0.022222,0.0,0.4,Feb,2,4,1,2,Returning_Visitor,False,False


In [7]:
pd_summary(df)

----- Administrative -----
0%       0.000000
25%      0.000000
50%      1.000000
mean     2.315166
75%      4.000000
100%    27.000000
NaN      0.000000
Name: Administrative, dtype: float64


----- Administrative_Duration -----
0%         0.000000
25%        0.000000
50%        7.500000
mean      80.818611
75%       93.256250
100%    3398.750000
NaN        0.000000
Name: Administrative_Duration, dtype: float64


----- Informational -----
0%       0.000000
25%      0.000000
50%      0.000000
mean     0.503569
75%      0.000000
100%    24.000000
NaN      0.000000
Name: Informational, dtype: float64


----- Informational_Duration -----
0%         0.000000
25%        0.000000
50%        0.000000
mean      34.472398
75%        0.000000
100%    2549.375000
NaN        0.000000
Name: Informational_Duration, dtype: float64


----- ProductRelated -----
0%        0.000000
25%       7.000000
50%      18.000000
mean     31.731468
75%      38.000000
100%    705.000000
NaN       0.000000
Name: Produc

# Splitting Your Data

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
train, test = train_test_split(
    df, 
    test_size = 0.3, 
    stratify = df[['Revenue']],
    random_state = seed)

# Creating a Custom Pipeline

In [10]:
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTENC

from lib.custom_transforms import DtypeMapper, GroupMinority, DropColumn, TransformByDtype, PdDummyEncoder

In [11]:
def encode_features(X, copy=False):
    if copy:
        X = X.copy()

    X['Month'] = X['Month'].map(
        {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'June': 6,
        'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12})
    
    X['VistorType'] = X['VisitorType'].map(
        {'New_Visitor': 1, 'Returning_Visitor': 2, 'Other': 3})
    
    X['Weekend'] = X['Weekend'].map({False: 0, True: 1})
    X['Revenue'] = X['Revenue'].map({False: 0, True: 1})

    return X


In [12]:
def generate_features(X, copy=False):
    if copy:
        X = X.copy()

    X['AvgAdminTime'] = (X['Administrative_Duration'] / X['Administrative']).fillna(0.)
    X['AvgInfoTime'] = (X['Informational_Duration'] / X['Informational']).fillna(0.)
    X['AvgProductTime'] = (X['ProductRelated_Duration'] / X['ProductRelated']).fillna(0.)
    X['Quarter'] = np.select(
        condlist = [X['Month'] <= 3, X['Month'] <= 6, X['Month'] <= 9, X['Month'] <= 12],
        choicelist = [1, 2, 3, 4],
        default = -1
    )

    return X

In [13]:
feature_gen = Pipeline([
    ('encode_features', FunctionTransformer(encode_features, kw_args={'copy': True})), 
    ('new_features', FunctionTransformer(generate_features)),
    ('set_dtypes', DtypeMapper(
        {'category': ['Month', 'Quarter', 'OperatingSystems', 'Browser', 'Region', 
            'TrafficType', 'VisitorType', 'Weekend']}))
])

In [14]:
normaliser = Pipeline([
    ('drop_col', DropColumn(
        ['Administrative_Duration', 'Informational_Duration','ProductRelated_Duration', 'Month'])),
    ('minmax_normalise', TransformByDtype(
        transformer = MinMaxScaler(), 
        include_dtypes = ['number'],
        combine_strategy = 'reassign')),
    ('dummy_encoding', PdDummyEncoder(dummy_na=True, drop_first=True))
])

In [15]:
predictor = LogisticRegression(
    C = 1e16,
    max_iter = 10000,
    fit_intercept = True,
    class_weight = 'balanced',
    random_state = seed)

# Transform training data

In [16]:
train = feature_gen.fit_transform(train)

In [17]:
oversampler = SMOTENC(categorical_features=[train.columns.get_loc(c) for c in train.select_dtypes('category')], random_state=seed)

In [18]:
train = oversampler.fit_resample(train, train['Revenue'])[0]
train = normaliser.fit_transform(train)

In [19]:
train.head(10)

Unnamed: 0,Administrative,Informational,ProductRelated,BounceRates,ExitRates,PageValues,SpecialDay,Revenue,VistorType,AvgAdminTime,...,TrafficType_nan,VisitorType_Other,VisitorType_Returning_Visitor,VisitorType_nan,Weekend_1.0,Weekend_nan,Quarter_2.0,Quarter_3.0,Quarter_4.0,Quarter_nan
0,0.0,0.0,0.004255,1.0,1.0,0.0,0.0,0.0,0.5,0.0,...,0,0,1,0,0,0,1,0,0,0
1,0.148148,0.0,0.052482,0.026316,0.085526,0.035839,0.0,1.0,0.5,0.016916,...,0,0,1,0,0,0,0,0,1,0
2,0.074074,0.0,0.01844,0.0,0.142857,0.0,0.0,0.0,0.5,0.088319,...,0,0,1,0,0,0,0,0,0,0
3,0.037037,0.083333,0.041135,0.240909,0.37971,0.0,0.0,0.0,0.5,0.028134,...,0,0,1,0,0,0,0,0,1,0
4,0.0,0.0,0.038298,0.12963,0.222222,0.0,0.0,0.0,0.5,0.0,...,0,0,1,0,1,0,0,0,1,0
5,0.296296,0.0,0.069504,0.0,0.042358,0.034758,0.0,0.0,0.5,0.024786,...,0,0,1,0,0,0,0,0,1,0
6,0.074074,0.0,0.015603,0.0,0.090909,0.150746,0.0,1.0,0.0,0.012108,...,0,0,0,0,0,0,1,0,0,0
7,0.0,0.0,0.017021,0.083333,0.333333,0.0,0.0,0.0,0.5,0.0,...,0,0,1,0,0,0,0,0,1,0
8,0.111111,0.0,0.01844,0.071429,0.214286,0.0,0.2,0.0,0.5,0.024691,...,0,0,1,0,0,0,1,0,0,0
9,0.37037,0.083333,0.069504,0.068555,0.275636,0.004027,1.0,0.0,0.5,0.02205,...,0,0,1,0,1,0,1,0,0,0


In [20]:
train.columns

Index(['Administrative', 'Informational', 'ProductRelated', 'BounceRates',
       'ExitRates', 'PageValues', 'SpecialDay', 'Revenue', 'VistorType',
       'AvgAdminTime', 'AvgInfoTime', 'AvgProductTime', 'OperatingSystems_2.0',
       'OperatingSystems_3.0', 'OperatingSystems_4.0', 'OperatingSystems_5.0',
       'OperatingSystems_6.0', 'OperatingSystems_7.0', 'OperatingSystems_8.0',
       'OperatingSystems_nan', 'Browser_2.0', 'Browser_3.0', 'Browser_4.0',
       'Browser_5.0', 'Browser_6.0', 'Browser_7.0', 'Browser_8.0',
       'Browser_10.0', 'Browser_11.0', 'Browser_12.0', 'Browser_13.0',
       'Browser_nan', 'Region_2.0', 'Region_3.0', 'Region_4.0', 'Region_5.0',
       'Region_6.0', 'Region_7.0', 'Region_8.0', 'Region_9.0', 'Region_nan',
       'TrafficType_2.0', 'TrafficType_3.0', 'TrafficType_4.0',
       'TrafficType_5.0', 'TrafficType_6.0', 'TrafficType_7.0',
       'TrafficType_8.0', 'TrafficType_9.0', 'TrafficType_10.0',
       'TrafficType_11.0', 'TrafficType_12.0', 'Traf

In [21]:
predictor.fit(train.loc[:, ~train.columns.isin(['Revenue'])], train['Revenue'])

LogisticRegression(C=1e+16, class_weight='balanced', max_iter=10000,
                   random_state=83282168)

# Testing Data

In [22]:
test = feature_gen.transform(test)

In [23]:
test = normaliser.transform(test)

In [24]:
test['predicted'] = predictor.predict(test.loc[:, ~test.columns.isin(['Revenue'])])

In [25]:
(test.predicted == test.Revenue).sum() / len(test)

0.8680724520140578