In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer

In [2]:
# Show up to 50 columns when a wide DataFrame is displayed.
pd.options.display.max_columns = 50

In [3]:
class Cluster_Log(BaseEstimator,TransformerMixin):
    """Append one-hot KMeans cluster labels and log-transform the product features.

    ``fit`` Min-Max scales the input and fits KMeans on the scaled values.
    ``transform`` scales the data it is given with the *fitted* scaler,
    assigns each row to a cluster, and returns the input frame with:

    * the columns ``ProductRelated`` and ``ProductRelated_Duration`` removed,
    * one indicator column per cluster (``cluster_0`` ... ``cluster_{k-1}``),
    * ``log1p`` versions of the two removed columns appended at the end.

    Parameters
    ----------
    k : int, default=5
        Number of KMeans clusters.

    Notes
    -----
    ``X`` must be a pandas DataFrame containing the two product-related
    columns, because ``transform`` selects and drops columns by name.
    """

    def __init__(self, k=5):
        self.k = k
        # Sub-estimators are still created here so the ``model`` / ``scaler``
        # attributes exist before fitting, as they did originally; ``fit``
        # re-syncs the cluster count with ``k``.
        self.model = KMeans(n_clusters=self.k,
                            random_state=0,
                            max_iter=2000)
        self.scaler = MinMaxScaler()

    def fit(self, X, y=None):
        """Fit the Min-Max scaler on ``X`` and KMeans on the scaled values.

        Parameters
        ----------
        X : pandas.DataFrame
            Numeric feature columns.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        # ``set_params(k=...)`` (e.g. from a grid search) only updates
        # ``self.k``; push the current value into the KMeans instance so the
        # fitted model never uses a stale cluster count.
        self.model.set_params(n_clusters=self.k)
        # ``scaled`` is kept as an attribute for backward compatibility only;
        # ``transform`` no longer reads it.
        self.scaled = self.scaler.fit_transform(X)
        self.model.fit(self.scaled)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with cluster indicators and log-transformed product columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Data with the same columns that were passed to ``fit``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        pandas.DataFrame
            Same index as ``X``. Columns are ordered: remaining original
            columns, cluster indicators, log1p product columns.
        """
        product_cols = ['ProductRelated', 'ProductRelated_Duration']

        # Scale the data being transformed with the fitted scaler. Predicting
        # on the cached training matrix would return the training rows'
        # labels regardless of what ``X`` contains.
        labels = self.model.predict(self.scaler.transform(X))

        # Build the indicators against the full set of fitted cluster ids so
        # the output always has one column per cluster, even when some
        # cluster has no member in ``X``. Reusing ``X.index`` keeps the concat
        # below row-aligned for frames with a non-default index.
        cluster_ids = np.arange(self.model.n_clusters)
        cluster_dummies = pd.DataFrame(
            (labels[:, None] == cluster_ids).astype('int'),
            index=X.index,
            columns=[f'cluster_{cid}' for cid in cluster_ids])

        return pd.concat(
            [X.drop(columns=product_cols),
             cluster_dummies,
             np.log1p(X[product_cols])], axis=1)
        
    

# Testing preprocessor

In [4]:
# Numerically coded columns that should be treated as categorical.
num_to_cat_cols = ["OperatingSystems", "Browser", "Region", "TrafficType"]

# Load the data and cast the coded columns to "category" in one step, so they
# are left out of the numeric selection below.
raw_data = pd.read_csv("online_shoppers_intention.csv").astype(
    {col: "category" for col in num_to_cat_cols}
)

# Only the int64/float64 columns are kept as numeric features.
num_df = raw_data.select_dtypes(include=["int64", "float64"])
print(raw_data.shape, raw_data.columns)

(12330, 18) Index(['Administrative', 'Administrative_Duration', 'Informational',
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
       'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Month',
       'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType',
       'Weekend', 'Revenue'],
      dtype='object')


In [5]:
# Names of the numeric columns routed through the custom transformer.
num_columns = num_df.columns

# Numeric columns get cluster indicators plus log-transformed product features;
# every other column is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("add_clusters_and_log", Cluster_Log(k=5), num_columns),
    ],
    remainder="passthrough",
)



In [6]:
# Fit the preprocessor on the full frame and compare shapes before and after.
transformed = preprocessor.fit_transform(raw_data)
print(raw_data.shape, transformed.shape, sep="\n")

(12330, 18)
(12330, 23)
