# PART 3 : USING THE MODEL ON THE TEST FILE

In [1]:
# Load the required libraries
import warnings
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score, ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings

In [2]:
# Read the test-set CSV into a DataFrame
TEST_DATA_PATH = "src/conversion_data_test.csv"
dataset = pd.read_csv(TEST_DATA_PATH)

In [3]:
# Preview the first five rows to sanity-check the load
dataset.head(n=5)

Unnamed: 0,country,age,new_user,source,total_pages_visited
0,UK,28,0,Seo,16
1,UK,22,1,Direct,5
2,China,32,1,Seo,1
3,US,32,1,Ads,6
4,China,25,0,Seo,3


In [4]:
# Show the (rows, columns) dimensions of the loaded test set
dataset.shape

(31620, 5)

STEP 2 : MISSING VALUE MANAGEMENT

In [5]:
# Count the missing values in each column (isna is an alias of isnull)
dataset.isna().sum()

country                0
age                    0
new_user               0
source                 0
total_pages_visited    0
dtype: int64

STEP 3 : REPLACING THE NUMERIC CODES WITH THE CORRESPONDING CATEGORY NAMES

In [7]:
# Recode the `new_user` flag as "Yes" (value 1) / "No" (any other value) so that
# it is picked up as a categorical feature by the dtype-based detection below.
# The dtype guard makes this cell idempotent: without it, running the cell a
# second time turned every value into "No", because the already-recoded strings
# never compare equal to 1.
if pd.api.types.is_numeric_dtype(dataset["new_user"]):
    dataset["new_user"] = np.where(dataset["new_user"] == 1, "Yes", "No")

In [8]:
# Check that new_user now holds "Yes"/"No" labels
dataset.head(n=5)

Unnamed: 0,country,age,new_user,source,total_pages_visited
0,UK,28,No,Seo,16
1,UK,22,Yes,Direct,5
2,China,32,Yes,Seo,1
3,US,32,Yes,Ads,6
4,China,25,No,Seo,3


In [10]:
# Use every column of the test frame as a feature.
# Note: this binds X to the same object as `dataset` (no copy is made).
X = dataset

STEP 4 : PREPROCESSING

In [13]:
# Split the columns of X into numeric and categorical feature names.
# dtype introspection replaces substring matching on the dtype name, which was
# case-sensitive (nullable "Int64"/"Float64" columns landed in the categorical
# list) and also matched unrelated dtype names such as "interval[...]".
# Boolean columns stay out of the numeric list, as before.
# NOTE(review): these lists should match the ones used when the model was
# trained - confirm against the training notebook.
numeric_features = [
    name
    for name, dtype in X.dtypes.items()
    if pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)
]
categorical_features = [name for name in X.columns if name not in numeric_features]

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

Found numeric features  ['age', 'total_pages_visited']
Found categorical features  ['country', 'new_user', 'source']


In [14]:
# Transformers used only when no fitted preprocessor from training is available
categorical_transformer = OneHotEncoder(drop='first')
numeric_transformer = StandardScaler()

# The loaded model expects features scaled/encoded exactly as during training,
# so the preprocessor fitted on the TRAIN set should be reused here rather than
# re-fitted on the test data (a re-fit uses the test set's own mean/std and
# category levels, which can silently shift or reorder the feature columns).
# NOTE(review): this path is an assumption - point it at wherever the training
# notebook persists its fitted preprocessor (e.g. via joblib.dump).
PREPROCESSOR_PATH = Path('src/preprocessor.joblib')

# Always start from the untouched DataFrame so the cell can be re-run safely
# (the previous `X = f(X)` form failed once X had become a matrix).
feature_columns = numeric_features + categorical_features

if PREPROCESSOR_PATH.exists():
    # Only load artifacts you trust: joblib.load unpickles arbitrary objects.
    preprocessor = joblib.load(PREPROCESSOR_PATH)
    X = preprocessor.transform(dataset[feature_columns])
else:
    warnings.warn(
        f"No fitted preprocessor found at {PREPROCESSOR_PATH}; fitting a new one on "
        "the test data. Predictions may be unreliable because the features will "
        "not be scaled/encoded with the training-set statistics."
    )
    # Fallback: same treatment as before, fitted on the test set itself
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])
    X = preprocessor.fit_transform(dataset[feature_columns])

STEP 5 : LOADING AND APPLYING THE MODEL ON THE TEST SET

In [15]:
# Load the serialized model from disk.
# joblib.load unpickles the file, so only load artifacts you trust.
MODEL_PATH = 'src/model.joblib'
model = joblib.load(MODEL_PATH)

# Predict on the preprocessed feature matrix, then attach the predictions to
# the original frame as a new `converted` column
y_pred = model.predict(X)
dataset['converted'] = y_pred

In [16]:
# Preview the frame with the new `converted` prediction column
dataset.head(n=5)

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,UK,28,No,Seo,16,1
1,UK,22,Yes,Direct,5,0
2,China,32,Yes,Seo,1,0
3,US,32,Yes,Ads,6,0
4,China,25,No,Seo,3,0


In [17]:
# Show the (rows, columns) dimensions again, now including `converted`
dataset.shape

(31620, 6)

STEP 6 : SAVE DATASET FOR ANALYSIS

In [19]:
# Write the test set together with its predictions to CSV, without the row index
PREDICTIONS_PATH = 'src/conversion_best_model.csv'
dataset.to_csv(PREDICTIONS_PATH, index=False)

# END PART 3