## Data Description

| Column Name | Criteria |
|------------|----------|
| customer_id | Integer. Unique identifier for each customer. No missing values. |
| time_spent | Float. Minutes spent on website per session. Missing values should be replaced with median. |
| pages_viewed | Integer. Number of pages viewed in session. Missing values should be replaced with mean. |
| basket_value | Float. Value of items in basket. Missing values should be replaced with 0. |
| device_type | String. One of: Mobile, Desktop, Tablet. Missing values should be replaced with "Unknown". |
| customer_type | String. One of: New, Returning. Missing values should be replaced with "New". |
| purchase | Binary. Whether customer made a purchase (1) or not (0). Target variable. |

In [4]:
import pandas as pd
df = pd.read_csv('raw_customer_data.csv')
clean_data = df.copy()
clean_data['customer_id'] = df['customer_id'].astype(int)
clean_data['time_spent'] = df['time_spent'].fillna(df['time_spent'].median()).astype(float)
clean_data['pages_viewed'] = df['pages_viewed'].fillna(df['pages_viewed'].mean()).astype(int)
clean_data['basket_value'] = df['basket_value'].fillna(0).astype(float)
clean_data['device_type'] = df['device_type'].fillna("Unknown")
clean_data['customer_type'] = df['customer_type'].fillna("New")
clean_data['purchase'] = df['purchase'].astype(int)

clean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   customer_id    500 non-null    int64  
 1   time_spent     500 non-null    float64
 2   pages_viewed   500 non-null    int64  
 3   basket_value   500 non-null    float64
 4   device_type    500 non-null    object 
 5   customer_type  500 non-null    object 
 6   purchase       500 non-null    int64  
dtypes: float64(2), int64(3), object(2)
memory usage: 27.5+ KB


In [6]:
df = pd.read_csv('model_data.csv')
not_feature_col = ['customer_id','purchase']
from sklearn.preprocessing import MinMaxScaler

num_col = [col for col in df.select_dtypes('number').columns if col not in not_feature_col]
scaler = MinMaxScaler()
df[num_col] = scaler.fit_transform(df[num_col])

cat_col = [col for col in df.select_dtypes('object').columns if col not in not_feature_col + num_col]
model_feature_set = pd.get_dummies(df[cat_col])

model_feature_set[not_feature_col+num_col] = df[not_feature_col+num_col]
all_col = list(model_feature_set.columns[-5:-4]) + list(model_feature_set.columns[-3:]) + list(model_feature_set.columns[:-5]) + list(model_feature_set.columns[-4:-3])
model_feature_set = model_feature_set[all_col]
display(model_feature_set.head())
display(model_feature_set.info())

Unnamed: 0,customer_id,time_spent,pages_viewed,basket_value,device_type_Desktop,device_type_Mobile,device_type_Tablet,device_type_Unknown,customer_type_New,customer_type_Returning,purchase
0,501,0.664167,0.5,0.0,True,False,False,False,True,False,1
1,502,0.483681,0.222222,0.524981,False,True,False,False,False,True,1
2,503,0.231359,0.111111,0.457291,False,True,False,False,False,True,0
3,504,0.792944,0.277778,0.0,False,False,False,True,True,False,1
4,505,0.64921,0.166667,0.484283,False,False,True,False,True,False,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   customer_id              500 non-null    int64  
 1   time_spent               500 non-null    float64
 2   pages_viewed             500 non-null    float64
 3   basket_value             500 non-null    float64
 4   device_type_Desktop      500 non-null    bool   
 5   device_type_Mobile       500 non-null    bool   
 6   device_type_Tablet       500 non-null    bool   
 7   device_type_Unknown      500 non-null    bool   
 8   customer_type_New        500 non-null    bool   
 9   customer_type_Returning  500 non-null    bool   
 10  purchase                 500 non-null    int64  
dtypes: bool(6), float64(3), int64(2)
memory usage: 22.6 KB


None

In [16]:
import numpy as np
import torch
from torch import nn
from torch import optim
from torch.nn import init
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# load data
df = pd.read_csv('input_model_features.csv')

# prep data loader
X_train = df[df.drop(columns=not_feature_col).columns].to_numpy()
y_train = df['purchase'].to_numpy()

tensor_X_train = torch.tensor(X_train, dtype=torch.float32)
tensor_y_train = torch.tensor(y_train, dtype=torch.float32).reshape(-1,1)

train_dataset = TensorDataset(tensor_X_train, tensor_y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# build model
input_shape=len(df.columns)-2

class ClassificationModel(nn.Module):
    def __init__(self, input_dim=input_shape):
        super(ClassificationModel, self).__init__()
        self.fc1_layer = nn.Linear(input_dim, 8)
        self.fc2_layer = nn.Linear(8, 256)
        self.fc3_layer = nn.Linear(256, 512)
        self.fc4_layer = nn.Linear(512, 256)
        self.fc5_layer = nn.Linear(256, 128)
        self.fc6_layer = nn.Linear(128, 56)
        self.fc7_layer = nn.Linear(56, 32)
        self.fc8_layer = nn.Linear(32, 16)
        self.fc9_layer = nn.Linear(16, 8)
        self.output_layer = nn.Linear(8, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fc1_layer(x))
        
        x = self.relu(self.fc2_layer(x))
        
        x = self.relu(self.fc3_layer(x))
        
        x = self.relu(self.fc4_layer(x))
        
        x = self.relu(self.fc5_layer(x))

        x = self.relu(self.fc6_layer(x))

        x = self.relu(self.fc7_layer(x))

        x = self.relu(self.fc8_layer(x))
        x = self.dropout(x)

        x = self.relu(self.fc9_layer(x))
        x = self.dropout(x)
        
        x = self.sigmoid(self.output_layer(x))
        
        return x

# assign custom weight init
def init_weights(m):
    if isinstance(m, nn.Linear):
        init.xavier_uniform_(m.weight)
        if m.bias is not None:
            init.constant_(m.bias, 0)

# init model
purchase_model = ClassificationModel()
purchase_model.apply(init_weights)

# add compiler to model
loss_fn = nn.BCELoss()
optimizer = optim.Adam(purchase_model.parameters(), lr=0.001)
threshold=0.5

# train model
loss_total = []
accuracy_scores = []
epochs = 250
purchase_model.train()
for epoch in range(epochs):
    print(f"Epoch {epoch+1}")
    for i, (x, y) in enumerate(train_loader):
        output = purchase_model(x)
        loss = loss_fn(output, y)
        loss_score = loss.item()
        loss_total.append(loss_score)
        accuracy = accuracy_score(y.detach().numpy().flatten(), (output>=threshold).int().detach().numpy().flatten())
        accuracy_scores.append(accuracy)
        
        if i % 10 == 0:
            print(f"\tStep {i}")
            print(f"\t\tLoss={loss_score:.4f}")
            print(f"\t\tAccuracy={loss_score:.4f}")
            
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

loss_avg = np.mean(loss_total)
print(f"\n\nLoss Average={loss_avg}")

# create inference function
def predict(X,threshold=threshold):
    tensor_X = torch.tensor(X[X.drop(columns='customer_id').columns].to_numpy(), dtype=torch.float32)
    proba = purchase_model(tensor_X)
    output = (proba>=threshold).int()
    return proba.detach().numpy().flatten(), output.detach().numpy().flatten()

# output train_data
df['purchase_pred'] = predict(df.drop(columns=['purchase']))[1]

# load val data
df_val = pd.read_csv('validation_features.csv')

# output val data
validation_predictions = df_val.copy()
validation_predictions['purchase'] = predict(df_val)[1]
display(validation_predictions.head())

print("Evalution metrics")
print(classification_report(df['purchase'],df['purchase_pred']))
print(f"Accuracy score={accuracy_score(df['purchase'],df['purchase_pred'])}")

Epoch 1
	Step 0
		Loss=0.6943
		Accuracy=0.6943
	Step 10
		Loss=0.4637
		Accuracy=0.4637
	Step 20
		Loss=0.4442
		Accuracy=0.4442
Epoch 2
	Step 0
		Loss=0.5307
		Accuracy=0.5307
	Step 10
		Loss=0.3936
		Accuracy=0.3936
	Step 20
		Loss=0.5998
		Accuracy=0.5998
Epoch 3
	Step 0
		Loss=0.6339
		Accuracy=0.6339
	Step 10
		Loss=0.5195
		Accuracy=0.5195
	Step 20
		Loss=0.4913
		Accuracy=0.4913
Epoch 4
	Step 0
		Loss=0.4735
		Accuracy=0.4735
	Step 10
		Loss=0.4326
		Accuracy=0.4326
	Step 20
		Loss=0.5187
		Accuracy=0.5187
Epoch 5
	Step 0
		Loss=0.5723
		Accuracy=0.5723
	Step 10
		Loss=0.3646
		Accuracy=0.3646
	Step 20
		Loss=0.5708
		Accuracy=0.5708
Epoch 6
	Step 0
		Loss=0.5327
		Accuracy=0.5327
	Step 10
		Loss=0.4532
		Accuracy=0.4532
	Step 20
		Loss=0.3612
		Accuracy=0.3612
Epoch 7
	Step 0
		Loss=0.5395
		Accuracy=0.5395
	Step 10
		Loss=0.5116
		Accuracy=0.5116
	Step 20
		Loss=0.4757
		Accuracy=0.4757
Epoch 8
	Step 0
		Loss=0.4879
		Accuracy=0.4879
	Step 10
		Loss=0.3740
		Accuracy=0.3740
	

Unnamed: 0,customer_id,time_spent,pages_viewed,basket_value,device_type_Desktop,device_type_Mobile,device_type_Tablet,device_type_Unknown,customer_type_New,customer_type_Returning,purchase
0,1801,0.951925,0.166667,0.062782,0,0,0,1,0,1,0
1,1802,0.708313,0.944444,0.0,0,0,1,0,0,1,1
2,1803,0.280857,0.277778,0.115038,0,1,0,0,0,1,0
3,1804,0.217765,0.055556,0.268982,1,0,0,0,1,0,0
4,1805,0.27691,0.555556,0.493252,0,1,0,0,0,1,1


Evalution metrics
              precision    recall  f1-score   support

           0       0.83      0.85      0.84       168
           1       0.96      0.95      0.96       632

    accuracy                           0.93       800
   macro avg       0.90      0.90      0.90       800
weighted avg       0.93      0.93      0.93       800

Accuracy score=0.9325
