# Setup

In [148]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [163]:
# Read the online-shoppers-intention CSV directly from the UCI machine-learning archive.
url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "00468/online_shoppers_intention.csv"
)
df = pd.read_csv(url)

## Data preprocessing

In [67]:
def ordinal_encode(df, column, ordering):
    """Return a copy of ``df`` with ``column`` replaced by ordinal codes.

    Every value in ``column`` is replaced by its zero-based position in
    ``ordering``. The input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to encode.
    column : label
        Name of the column whose values are looked up in ``ordering``.
    ordering : sequence
        Categories in rank order; its ``index`` method performs the lookup.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which ``column`` holds the looked-up positions.

    Raises
    ------
    ValueError
        When ``ordering`` is a list and a value in ``column`` is not in it
        (propagated from ``list.index``).
    """
    encoded = df.copy()
    # The bound method does the same per-value lookup a lambda wrapper would.
    encoded[column] = encoded[column].map(ordering.index)
    return encoded

def onehot_encode(df, column, prefix):
    """Return a new frame with ``column`` expanded into indicator columns.

    ``pd.get_dummies`` builds one indicator column per distinct value of
    ``column`` (named with ``prefix``); these are appended to the right of the
    frame and the original ``column`` is then removed. The input frame is not
    modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the categorical column.
    column : label
        Name of the column to expand.
    prefix : str
        Prefix handed to ``pd.get_dummies`` for the new column names.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns followed by the indicator columns.
    """
    indicators = pd.get_dummies(df[column], prefix=prefix)
    # Append first, drop afterwards -- keeps the remaining columns in place
    # and the indicators at the end.
    return pd.concat([df, indicators], axis=1).drop(columns=column)

In [68]:
# Calendar order of the month labels; a label's position in this list is its ordinal code.
# The sixth entry is spelled "June" while every other month is a three-letter
# abbreviation -- presumably this mirrors the spelling used in the data; TODO confirm.
month_ordering = [
    "Jan", "Feb", "Mar", "Apr", "May", "June",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
]

# Prefix for the indicator columns produced by one-hot encoding.
visitor_prefix = "V"

In [69]:
# Encode the categorical columns, then cast the two boolean columns to 0/1 integers.
# NOTE(review): this cell rebinds `df`, so running it twice on the same kernel
# fails (the columns are already encoded) -- re-run the load cell first.
df = (
    df.pipe(ordinal_encode, "Month", month_ordering)
    .pipe(onehot_encode, "VisitorType", visitor_prefix)
    .astype({"Weekend": int, "Revenue": int})
)

## Splitting and Scaling

In [74]:
# Predictors: every column except the target (drop already returns a new frame).
x = df.drop(columns=["Revenue"])
# Target: the "Revenue" column, copied so it is independent of `df`.
y = df["Revenue"].copy()

In [78]:
# Standardise the feature columns; `x` becomes the array returned by the scaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so statistics of the held-out rows influence the scaling (mild data leakage).
# Fitting on the training rows only would need the split to happen first.
scaler = StandardScaler().fit(x)
x = scaler.transform(x)

In [123]:
# 70/30 train/test split with a fixed seed so the partition is reproducible.
# The call is wrapped in parentheses: an assignment that ends after `=` and
# continues on the next line without them is a SyntaxError.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.7, random_state=20
)

# Training

## K-Nearest Neighbors

In [160]:
# Fit a 9-nearest-neighbours classifier on the training split and score it
# on the held-out split.
knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(x_train, y_train)
knnScore = knn.score(x_test, y_test)

# Cross Validation Score: 5-fold accuracy over the full data set.
# The per-fold scores are averaged with .mean() so the result stays correct
# if `cv` is changed (a hard-coded divisor would silently go stale).
knnValidationList = cross_val_score(knn, x, y, cv=5, scoring="accuracy")
knnValidation = knnValidationList.mean()

print("KNN Accuracy: " + str(knnScore))
print("KNN Validation: " + str(knnValidation))

KNN Accuracy: 0.8799675587996756
KNN Validation: 0.8759935117599351


## Naive Bayes

In [161]:
# Fit a Gaussian naive Bayes classifier on the training split and score it
# on the held-out split.
nb = GaussianNB()
nb.fit(x_train, y_train)
nbScore = nb.score(x_test, y_test)

# Cross Validation Score: 5-fold accuracy over the full data set.
# The per-fold scores are averaged with .mean() so the result stays correct
# if `cv` is changed (a hard-coded divisor would silently go stale).
nbValidationList = cross_val_score(nb, x, y, cv=5, scoring="accuracy")
nbValidation = nbValidationList.mean()

print("NB Accuracy: " + str(nbScore))
print("NB Validation: " + str(nbValidation))

NB Accuracy: 0.7675047310083807
NB Validation: 0.7695863746958637


## Support Vector Machine

In [162]:
# Fit a support-vector classifier (library defaults) on the training split
# and score it on the held-out split.
sv = SVC()
sv.fit(x_train, y_train)
svScore = sv.score(x_test, y_test)

# Cross Validation Score: 5-fold accuracy over the full data set.
# The per-fold scores are averaged with .mean() so the result stays correct
# if `cv` is changed (a hard-coded divisor would silently go stale).
svValidationList = cross_val_score(sv, x, y, cv=5, scoring="accuracy")
svValidation = svValidationList.mean()

print("SV Accuracy: " + str(svScore))
print("SV Validation: " + str(svValidation))

SV Accuracy: 0.8959178156258448
SV Validation: 0.8882400648824007
