### Importing necessary libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Seed NumPy's global random number generator so random draws repeat across runs.
np.random.seed(42)

### Paths to various files

In [None]:
# Locations of the competition CSV files and of the submission file to write.
input_dir = "/kaggle/input/e-commerce-shoppers-behaviour-understanding"
test_data_file = input_dir + "/test_data_v2.csv"
train_data_file = input_dir + "/train_data_v2.csv"
output_file = "/kaggle/working/submissions.csv"

### Importing data

In [None]:
# Read the training CSV and the test CSV into DataFrames (in that order).
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_data_file, test_data_file)
)

### Checking for imbalance in the data

In [None]:
# Class balance of the target: count the True and False labels in the training
# set, print the absolute and relative frequencies, and show them as a bar chart.
purchase_labels = train_data["Made_Purchase"]
T = int(purchase_labels.eq(True).sum())
F = int(purchase_labels.eq(False).sum())
# Every label must be exactly True or False; anything else (e.g. a missing
# value) would silently distort the fractions printed below.
assert T + F == len(purchase_labels)
print("True: ", T)
print("False: ", F)
print("Percentage of Trues: ", T/(T+F))
print("Percentage of Falses: ", F/(T+F))
X = ["True", "False"]
Y = [T/(T+F), F/(T+F)]
plt.bar(X, Y)
plt.show()

### Observing the distributions of various features

In [None]:
def count(col_name, data=None):
    """Sum the last column of a frame for every distinct value of one feature.

    The last column is treated as the label (NOTE(review): presumably
    ``Made_Purchase`` -- confirm it is still the final column when this is
    called). Rows whose feature value is missing are ignored.

    Parameters
    ----------
    col_name : str
        Name of the feature column to group by.
    data : pandas.DataFrame, optional
        Frame to analyse. Defaults to the module-level ``train_data``.

    Returns
    -------
    dict
        Maps ``str(feature value)`` to the summed label, ordered from the
        largest total to the smallest. Ties keep their order of first
        appearance in the data.
    """
    frame = train_data if data is None else data
    label_values = frame.iloc[:, -1]

    # Start every category at 0 so the first row contributes its own label
    # value rather than a hard-coded 1.
    totals = {}
    for key, label in zip(frame[col_name], label_values):
        if pd.isnull(key):
            continue
        totals[key] = totals.get(key, 0) + label

    ranked = sorted(totals.items(), key=lambda item: item[1], reverse=True)

    sorted_dict = {}
    for key, total in ranked:
        # setdefault keeps the first (largest) entry if two keys share a string form.
        sorted_dict.setdefault(str(key), total)

    return sorted_dict


In [None]:
# Features that are run through count() one by one; each result is printed
# and drawn as a bar chart.
self_explanatory_variables = [
    "OS", "SearchEngine", "Zone", "Type of Traffic", "WeekendPurchase",
    "SeasonalPurchase", "Month_SeasonalPurchase", "CustomerType", "Gender",
    "Cookies Setting", "Education", "Marital Status",
]
for feature_name in self_explanatory_variables:
    print(feature_name, ":")

    return_dict = count(feature_name)
    print(return_dict)

    X, Y = return_dict.keys(), return_dict.values()
    plt.bar(X, Y)
    plt.show()

    print("------------------------------------")
    
    
    
# CONCLUSION: Marital Status, Cookies Setting, Education, and Gender have no bearing on whether a user made a purchase.
# Therefore, drop them.

In [None]:
# CONCLUSION: Marital Status, Cookies Setting, Education, and Gender have no
# bearing on whether a user made a purchase, so remove them from both sets.
uninformative_columns = ["Marital Status", "Cookies Setting", "Education", "Gender"]
for frame in (train_data, test_data):
    frame.drop(columns=uninformative_columns, inplace=True)


### Checking for null values and imputing them

In [None]:
# Number of missing values in each training column (shown as the cell output).
train_data.isna().sum()

In [None]:
def simple_imputer(data, strategy, columns, fit_data=None):
    """Fill missing values column by column with scikit-learn's SimpleImputer.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are filled. It is modified in place and returned.
    strategy : str
        Passed straight to ``SimpleImputer(strategy=...)``, e.g. ``"mean"``
        or ``"most_frequent"``.
    columns : iterable of str
        Names of the columns to impute.
    fit_data : pandas.DataFrame, optional
        Frame the fill values are learned from. Defaults to ``data`` itself.
        Pass the training frame here when imputing a test frame so both
        frames are filled with the same statistics.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the listed columns imputed.
    """
    from sklearn.impute import SimpleImputer

    reference = data if fit_data is None else fit_data
    for column in columns:
        # A fresh imputer per column keeps each column's statistic independent.
        imputer = SimpleImputer(strategy=strategy)
        imputer.fit(reference[column].values.reshape(-1, 1))
        filled = imputer.transform(data[column].values.reshape(-1, 1))
        # transform returns shape (n, 1); flatten it to a plain 1-D column.
        data[column] = filled.ravel()
    return data

In [None]:
# Fill gaps in the object-dtype columns with each column's most frequent value.
cat_columns = list(train_data.select_dtypes(include=['object']).columns)
train_data, test_data = (
    simple_imputer(frame, "most_frequent", cat_columns)
    for frame in (train_data, test_data)
)

In [None]:
# Fill gaps in the float columns with each column's mean.
num_columns = list(train_data.select_dtypes(include=['float']).columns)
train_data, test_data = (
    simple_imputer(frame, "mean", num_columns)
    for frame in (train_data, test_data)
)

In [None]:
# Re-check the per-column missing-value counts after imputation.
train_data.isna().sum()

### Converting categorical variables to numerical using get_dummies function of pandas

In [None]:
# Object-dtype columns of the training set; these are the ones to one-hot encode.
cat_columns = train_data.select_dtypes(include=['object']).columns.to_list()

def encode_categorical_variables(data, columns):
    """Replace each listed column with its one-hot indicator columns.

    For every name in ``columns`` the indicator columns produced by
    ``pd.get_dummies`` are appended on the right and the source column is
    removed. Indicator columns are named after the category values, without
    any prefix. The frame passed in is left untouched whenever at least one
    column is encoded; the encoded frame is returned.
    """
    for source_column in columns:
        indicator_columns = pd.get_dummies(data[source_column])
        widened = pd.concat([data, indicator_columns], axis=1)
        data = widened.drop(columns=source_column)
    return data

# One-hot encode the categorical columns of both data sets.
train_data = encode_categorical_variables(data=train_data, columns=cat_columns)
test_data = encode_categorical_variables(data=test_data, columns=cat_columns)

In [None]:
# Annotated heatmap of the pairwise correlations between training columns.
plt.figure(figsize=(20, 18))
corr = train_data.corr()
sns.heatmap(corr, annot=True)
plt.show()

In [None]:
# Bounce Rates and Exit Rates are highly correlated, so only one is kept:
# remove Exit Rates from both data sets.
for frame in (train_data, test_data):
    frame.drop(columns="GoogleMetric:Exit Rates", inplace=True)

In [None]:
# DROP the 2 Pages features, and replace them by one.

def AvgMinutes(Count, Duration):
    """Return ``Duration / Count`` as a float, or 0 when either value equals 0."""
    # Guard clauses: nothing to average when there is no duration or no count.
    if Duration == 0:
        return 0
    if Count == 0:
        return 0
    return float(Duration) / float(Count)

# One [count column, duration column, new average column] triple per page type.
Columns = [
    [page, page + '_Duration', 'Avg' + page]
    for page in ('HomePage', 'LandingPage', 'ProductDescriptionPage')
]


def transform_page_data(data, columns):
    """Replace each (count, duration) column pair with one average column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to transform. It is modified in place and also returned.
    columns : iterable of [count, duration, label]
        For each triple, ``data[label]`` is set to ``duration / count`` per
        row, or 0 where the count or the duration is 0, and the ``count``
        and ``duration`` columns are then dropped.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.
    """
    for count, duration, label in columns:
        visits = data[count]
        time_spent = data[duration]
        # Same rule as the row-wise AvgMinutes helper, applied to whole columns.
        # This also works on an empty frame, where a row-wise apply does not
        # return a single column.
        no_activity = (time_spent == 0) | (visits == 0)
        data[label] = (time_spent / visits).mask(no_activity, 0.0)
        data.drop(columns=[count, duration], inplace=True)
    return data
        
# Swap the page count/duration pairs for per-page averages in both data sets.
train_data = transform_page_data(data=train_data, columns=Columns)
test_data = transform_page_data(data=test_data, columns=Columns)

In [None]:
# Redraw the correlation heatmap now that the features have been reworked.
plt.figure(figsize=(20, 18))
corr = train_data.corr()
sns.heatmap(corr, annot=True)
plt.show()

### Separate the label from the features

In [None]:
# "Made_Purchase" is the target; every other training column is a feature.
y = train_data["Made_Purchase"]
X = train_data.drop(columns="Made_Purchase")

### Scale the data

In [None]:
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler

# Scale the features. Each scaler learns its statistics from the training
# features only and then applies those same statistics to the test features.
# Re-fitting on the test set would put the two sets on different scales, so
# the model would be trained and applied on inconsistent inputs.
# NOTE(review): transform() expects test_data to have the same columns as X.
scalers = [RobustScaler()]
for scaler in scalers:
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    test_data = pd.DataFrame(scaler.transform(test_data), columns=test_data.columns)

### Splitting the dataset into Train and Test

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the labelled rows for evaluation; the fixed seed makes
# the split repeatable.
split_options = dict(test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

### Linear Model

In [None]:

from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score

# Baseline: an SGD linear classifier with default settings, scored with F1 on
# the training split and on the held-out split.
sgd = SGDClassifier(random_state=42).fit(X_train, y_train)
y_pred_train, y_pred_test = sgd.predict(X_train), sgd.predict(X_test)

print("F1 Score of Train Data: ", f1_score(y_train, y_pred_train))
print("F1 Score of Test Data: ", f1_score(y_test, y_pred_test))

### Linear Model after Hyperparameter Tuning (HPT)

In [None]:

from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score

# The same linear model with explicit regularisation settings, scored the same
# way as the baseline.
sgd = SGDClassifier(
    alpha=0.001,
    max_iter=1000,
    penalty='l2',
    random_state=42,
).fit(X_train, y_train)
y_pred_train, y_pred_test = sgd.predict(X_train), sgd.predict(X_test)

print("F1 Score of Train Data: ", f1_score(y_train, y_pred_train))
print("F1 Score of Test Data: ", f1_score(y_test, y_pred_test))

### Feature Selection

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Score every feature against the target with the ANOVA F-test (f_classif).
# Only the scores are used here; the k=5 selection itself is never applied.
feature_ranking = SelectKBest(f_classif, k=5)
feature_ranking.fit(X, y)

# (score, feature name) pairs in the same order as the columns of X.
list_one = list(zip(feature_ranking.scores_, X.columns))

# Column 0 holds the score and column 1 the feature name; print the
# highest-scoring features first.
dfObj = pd.DataFrame(list_one)
dfObj = dfObj.sort_values(by=[0], ascending=False)
print(dfObj)

In [None]:
# Remove four columns from both the training features and the test features.
# NOTE(review): presumably chosen from the ranking printed above -- confirm.
columns_to_remove = ['OS', 'Jul', 'Other', 'Sep']
for frame in (X, test_data):
    frame.drop(columns=columns_to_remove, inplace=True)


### Splitting the dataset

In [None]:
from sklearn.model_selection import train_test_split

# Redo the 75/25 split on the reduced feature set, using the same seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

### Import AdaBoost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Boosting builds models in sequence: the first model is fitted to the training
# data, and each following model is built to correct the errors of the ones
# before it. This continues until the errors are minimized. The resulting weak
# learners are combined into a single strong learner for the final prediction.

# AdaBoost with default settings; the seed is fixed so results can be reproduced.
ab = AdaBoostClassifier(random_state = 42)

In [None]:
def get_score(estimator):
    """Print train/hold-out F1 scores for ``estimator`` and predict the test set.

    The estimator is first fitted on the module-level ``X_train``/``y_train``
    split and its F1 score is printed for that split and for
    ``X_test``/``y_test``. It is then re-fitted on all labelled data
    (``X``, ``y``) and used to predict the module-level ``test_data``.

    Returns the predictions for ``test_data``. The estimator passed in is left
    fitted on the full labelled data.
    """
    from sklearn.metrics import f1_score

    estimator.fit(X_train, y_train)
    train_f1 = f1_score(y_train, estimator.predict(X_train))
    holdout_f1 = f1_score(y_test, estimator.predict(X_test))
    print("F1-Score on Train Data: ", train_f1, "F1-Score on Test Data : ", holdout_f1)

    # Use every labelled row for the model that produces the final predictions.
    estimator.fit(X, y)
    return estimator.predict(test_data)

In [None]:
# Evaluate the default AdaBoost model; the returned predictions for test_data
# are shown as the cell output.
get_score(ab)

### Hyperparameter Tuning (HPT) on AdaBoost

In [None]:
# Candidate AdaBoost settings to sample from.
parameters = {'learning_rate':[0.1,0.2,0.3],'n_estimators':[100,200,300,400]}

from sklearn.model_selection import RandomizedSearchCV

# Randomized search with 4-fold cross-validation, optimising F1. The seed is
# fixed (42, as everywhere else in this notebook) so the sampled parameter
# combinations, and therefore the chosen estimator, are reproducible.
gcv = RandomizedSearchCV(ab, param_distributions = parameters, scoring='f1', cv=4, random_state=42)
gcv.fit(X_train, y_train)
# NOTE(review): this is the F1 score on the same split the search was fitted
# on, so it is optimistic; compare against the hold-out split before drawing
# conclusions.
gcv.score(X_train, y_train)

In [None]:
# Show the estimator chosen by the randomized search above.
gcv.best_estimator_

# Author's conclusion: the default AdaBoost settings perform better than the tuned ones.

### Making the Submission File

In [None]:
# Predict the test set with the default AdaBoost model and write the result as
# a CSV with an "id" index column and a "Made_Purchase" column.
sub = pd.DataFrame({"Made_Purchase": get_score(ab)}).rename_axis("id")
sub.to_csv(output_file, encoding='utf-8')