### **Importing necessary libraries**

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

### **Importing data**

In [None]:
# All three competition files live in the same Kaggle input directory.
DATA_DIR = "/kaggle/input/e-commerce-shoppers-behaviour-understanding"

train_data = pd.read_csv(f"{DATA_DIR}/train_data_v2.csv")
test_data = pd.read_csv(f"{DATA_DIR}/test_data_v2.csv")
sample_data = pd.read_csv(f"{DATA_DIR}/sample.csv")

### **Data exploration**

In [None]:
# Print the column dtypes and per-column non-null counts of the training frame.
train_data.info()
# Author's observation from the output: some columns contain null values
# (their non-null count is lower than the number of rows).

In [None]:
# Summary statistics of the training frame (count, mean, std, quartiles, ...).
train_data.describe()

### Correlation between features using Heatmap

In [None]:
# Plot a heatmap of pairwise correlations to spot highly correlated features.
#
# Correlation is only defined for numeric data, so restrict the frame to
# numeric/boolean columns first. The frame still contains the categorical
# columns that are one-hot encoded later (e.g. 'Gender'); if those are stored
# as strings, calling .corr() on the whole frame raises in pandas >= 2.0.
numeric_data = train_data.select_dtypes(include=['number', 'bool'])
correlation = numeric_data.corr()

plt.figure(figsize=(12,9))
sns.heatmap(correlation, annot=True)

### Dropping columns with high correlation with other columns

In [None]:
# Remove the features judged redundant from the correlation heatmap.
# NOTE: this mutates train_data in place, so the cell cannot be re-run
# without reloading the data (the columns would already be gone).
train_data.drop(
    columns=['ProductDescriptionPage', 'GoogleMetric:Exit Rates', 'OS'],
    inplace=True,
)

### Separating features from labels

In [None]:
# The last column of the training frame is the label; everything before it
# is treated as a feature.
*feature_names, target_name = train_data.columns
X = train_data[feature_names]
y = train_data[target_name]

### Checking for imbalance in the data

In [None]:
# Count how many sessions fall into each class of the label.
f = (y == False).sum()
t = (y == True).sum()

# Share of each class relative to all rows of y.
print("% of False:", f / len(y) * 100)
print("% of True:", t / len(y) * 100)

# Pie chart of the two class counts.
d = np.array([f, t])
labels = np.array(["Not made purchase", "Made purchase"])
plt.pie(d, labels=labels)
plt.show()

In [None]:
# Separate categorical and numerical variables, since they are preprocessed
# differently (imputation + encoding vs. imputation + scaling).
categorical_columns = [
    'Month_SeasonalPurchase',
    'Gender',
    'CustomerType',
    'Cookies Setting',
    'Marital Status',
    'Education',
]
Xcat = X[categorical_columns]
Xnum = X.drop(columns=categorical_columns)

### Imputing null values and scaling for numerical data. Imputation only for categorical data.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler

# Numeric preprocessing: fill missing values with the column mean, then scale
# with RobustScaler. The fitted pipeline is kept in `numpipe` so it can be
# applied to other data later.
numpipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scaler', RobustScaler()),
])

Xnum_scaled = numpipe.fit_transform(Xnum)
Xnum_trans = pd.DataFrame(Xnum_scaled)

In [None]:
# Fill missing categorical values with the most frequent value of each column.
# Wrapping the result in a new DataFrame replaces the original column names
# with integer positions.
cat_imputer = SimpleImputer(strategy='most_frequent')
Xcat_imputed = cat_imputer.fit_transform(Xcat)
Xcat = pd.DataFrame(Xcat_imputed)

### Encoding categorical data using OneHotEncoder

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the imputed categorical columns, dropping the first level of
# each feature to avoid perfectly collinear dummy columns.
#
# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the new name
# first and fall back to the old one on older installations.
try:
    ohe = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    ohe = OneHotEncoder(sparse=False, drop='first')

Xcat_trans = pd.DataFrame(ohe.fit_transform(Xcat))

### Stacking processed data together

In [None]:
# Place the scaled numeric block and the one-hot encoded block side by side
# (column-wise) to form the final training feature matrix.
stacked_features = np.concatenate([Xnum_trans, Xcat_trans], axis=1)
Xfin = pd.DataFrame(stacked_features)
Xfin

### Splitting the data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for evaluation. The split is stratified on the
# label so that both parts keep the same class proportions; the classes were
# checked for imbalance above, and an unstratified split can skew the share
# of the minority class in the small hold-out set.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    Xfin, y, test_size=0.15, random_state=1729, stratify=y
)

### Fitting on train data

In [None]:
# scikit-learn has no `RandomForest` class; the classifier is
# `RandomForestClassifier` (the label is a binary purchase flag).
from sklearn.ensemble import RandomForestClassifier

# Baseline model. Fit on the training split only, so that the hold-out split
# (Xtest, ytest) remains unseen and its evaluation is meaningful.
# The seed matches the one used for the train/test split for reproducibility.
rf = RandomForestClassifier(random_state=1729)
rf.fit(Xtrain, ytrain)

In [None]:
from sklearn.metrics import classification_report

# Evaluate the baseline on the hold-out split. Predictions must be made on
# Xtest so that they line up row-for-row with ytest; predicting on Xtrain
# yields a different number of rows and classification_report raises.
ypred = rf.predict(Xtest)
print(classification_report(ytest, ypred))

### Hyperparameter tuning (HPT) on the chosen model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the random forest.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}

# The grid above holds random-forest parameters, so the estimator being tuned
# must be a random forest (the previously referenced `mlpc` is not defined
# anywhere in this notebook). A fresh estimator is created here so this cell
# does not depend on the baseline model cell.
#
# The search is fitted on the training split only: the cells below score the
# tuned model on (Xtest, ytest), which is only a fair estimate if those rows
# were not used for tuning.
gcv = GridSearchCV(RandomForestClassifier(random_state=1729), param_grid, cv=4)
gcv.fit(Xtrain, ytrain)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 of the tuned model on the hold-out split.
ypred = gcv.predict(Xtest)
tuned_report = classification_report(y_true=ytest, y_pred=ypred)
print(tuned_report)

In [None]:
# Score of the grid search's refitted best estimator on the hold-out split.
# No `scoring` was passed to GridSearchCV, so this is the estimator's own
# default score.
gcv.score(Xtest, ytest)

In [None]:
# Hyperparameter combination that achieved the best cross-validated score.
gcv.best_params_

### Preprocessing test data similar to train data

In [None]:
# Drop the same columns that were removed from the training data so both
# frames keep an identical feature layout.
# NOTE: in-place mutation - re-running this cell without reloading fails.
test_data.drop(
    columns=['ProductDescriptionPage', 'GoogleMetric:Exit Rates', 'OS'],
    inplace=True,
)

In [None]:
# Split the test frame into the same categorical / numerical groups that were
# used for the training features.
test_categorical_columns = [
    'Month_SeasonalPurchase',
    'Gender',
    'CustomerType',
    'Cookies Setting',
    'Marital Status',
    'Education',
]
test_cat = test_data[test_categorical_columns]
test_num = test_data.drop(columns=test_categorical_columns)

In [None]:
# Apply the numeric pipeline that was already fitted on the training data.
# Use transform(), not fit_transform(): refitting here would compute the
# imputation means and scaling statistics from the test set, so the test
# features would no longer be on the scale the model was trained on.
test_num_trans = pd.DataFrame(numpipe.transform(test_num))

In [None]:
# Fill missing categorical values in the test set with the most frequent
# value of each column in the TRAINING data. Fitting the imputer on the test
# set would use test-set statistics and could fill with different categories
# than those used when the model was trained.
# Plain arrays are passed on both sides so the imputer matches columns by
# position (same column order as the training categorical block).
train_cat_columns = ['Month_SeasonalPurchase','Gender', 'CustomerType', 'Cookies Setting','Marital Status','Education']
train_cat_imputer = SimpleImputer(strategy='most_frequent')
train_cat_imputer.fit(train_data[train_cat_columns].to_numpy())
test_cat = pd.DataFrame(train_cat_imputer.transform(test_cat.to_numpy()))

In [None]:
# Encode the test categories with the encoder fitted on the training data.
# Creating and fitting a new OneHotEncoder on the test set would derive the
# categories (and therefore the number, order and meaning of the output
# columns) from the test data, so the encoded columns could silently stop
# matching the features the model was trained on.
# With the encoder's default settings, a category never seen during training
# raises an error here instead of being mis-encoded.
test_cat_trans = pd.DataFrame(ohe.transform(test_cat))

In [None]:
# Combine the numeric and encoded categorical blocks column-wise, in the same
# order as for the training feature matrix.
stacked_test_features = np.concatenate([test_num_trans, test_cat_trans], axis=1)
test_fin = pd.DataFrame(stacked_test_features)
test_fin

### **Predicting on test data**

In [None]:
# Predict the label for every row of the preprocessed test features using the
# fitted grid-search object.
test_pred = gcv.predict(test_fin)

In [None]:
# Build the submission table: one 'Made_Purchase' prediction per test row,
# with the row position written out as the 'id' column.
sub = pd.DataFrame({'Made_Purchase': test_pred})
sub = sub.rename_axis('id')
sub.to_csv("submission.csv", encoding='utf-8')

# Read the file back so the written result is available for inspection.
submission = pd.read_csv("submission.csv")