In [27]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV, KFold

In [28]:
class Cluster_Log(BaseEstimator,TransformerMixin):
    def __init__(self, k=5):
        self.k = k
        self.model = KMeans(n_clusters=self.k,
                            random_state=0,
                            max_iter=2000)
        self.scaler = MinMaxScaler()   
    
    # Fits kmeans clusters based on Min-Max scaled data
    def fit(self, X, y=None):
        self.scaled = self.scaler.fit_transform(X)
        self.model.fit(self.scaled)
        return self
    
    # Adds cluster labels, log-transformed product-related features, and drops the original product-related features
    def transform(self, X, y=None):
        self.scaled = self.scaler.fit_transform(X)
        labels = self.model.predict(self.scaled)
        return pd.concat(
            [X.drop(columns=['ProductRelated','ProductRelated_Duration']).reset_index(drop=True),
             pd.get_dummies(labels, prefix='cluster', dtype='int').reset_index(drop=True),
             np.log1p(X[['ProductRelated','ProductRelated_Duration']]).reset_index(drop=True)], axis=1)

In [29]:
raw_data = pd.read_csv("online_shoppers_intention.csv")
# Converting certain numerical columns to categorical 
num_to_cat_cols = ["OperatingSystems", "Browser", "Region", "TrafficType"]
raw_data[num_to_cat_cols] = raw_data[num_to_cat_cols].astype("category")


raw_data["OperatingSystems"].dtype
num_df = raw_data.select_dtypes(include=["int64", "float64"])
print(raw_data.shape, raw_data.columns)
data = raw_data.drop_duplicates().reset_index(drop=True)
print(data.shape)

(12330, 18) Index(['Administrative', 'Administrative_Duration', 'Informational',
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
       'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Month',
       'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType',
       'Weekend', 'Revenue'],
      dtype='object')
(12205, 18)


In [30]:
X = data.iloc[:,:-1]
y = data.iloc[:,-1]

cat_cols = ["Month","OperatingSystems","Browser","Region","TrafficType","VisitorType","Weekend"]

dummy_cols = pd.get_dummies(data[cat_cols])
X = pd.concat([X.drop(columns=cat_cols), dummy_cols.astype(int)], axis=1)

In [31]:
num_columns = num_df.columns

preprocessor = ColumnTransformer([("add_clusters_and_log", Cluster_Log(k=5), num_columns)],
                                 remainder="passthrough")

In [32]:
pipe = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("estimator", LogisticRegression(solver='liblinear'))
])

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,shuffle=True)

In [34]:
model = pipe.fit(X_train,y_train)
preds = model.predict(X_test)

In [35]:
print(accuracy_score(y_test,preds))
train_preds = model.predict(X_train)
print(accuracy_score(y_train,train_preds))

0.8713641950020483
0.8840639082343302


In [36]:
probs = model.predict_proba(X_test)[:,1]

In [37]:
print(roc_auc_score(y_test, probs))

0.8942582269815658


In [38]:
kf = KFold(10, shuffle=True)
params = {'estimator__C': np.logspace(-4, 2, 20)}
results = GridSearchCV(pipe, param_grid=params, cv=kf).fit(X_train,y_train)

In [39]:
print(results.best_params_)
print(results.best_score_)

{'estimator__C': np.float64(100.0)}
0.883860135577294
