*CONVERSION RATE CHALLENGE*

In [261]:
# Import libraries
import pandas as pd
import numpy as np

import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

import matplotlib
import matplotlib.pyplot as plt

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

import seaborn as sns

from IPython.display import display
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings

pio.templates.default = "plotly_dark"
matplotlib.style.use('dark_background')

In [262]:
!pip install plotly



In [263]:
# Let's import the dataset
data = pd.read_csv('conversion_data_train.csv')
print('Set with labels (our train+test) :', data.shape)

Set with labels (our train+test) : (284580, 6)


##### *Part 1 : Presentation of the dataset* ####

In [264]:
# Let's check the firest 5 rows
data.head()

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,China,22,1,Direct,2,0
1,UK,21,1,Ads,3,0
2,Germany,20,0,Seo,14,1
3,US,23,1,Seo,3,0
4,US,28,1,Direct,3,0


In [265]:
# The dataset is quite big : you must create a sample of the dataset before making any visualizations !
data_sample = data.sample(10000)

In [266]:
# Basic stats & info
print("Number of rows : {}".format(data_sample.shape[0]))
print("Number of columns : {}".format(data_sample.shape[1]))
print()

Number of rows : 10000
Number of columns : 6



In [267]:
print("Basics statistics: ")
data_desc = data_sample.describe(include='all')
display(data_desc)
print()

Basics statistics: 


Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
count,10000,10000.0,10000.0,10000,10000.0,10000.0
unique,4,,,3,,
top,US,,,Seo,,
freq,5604,,,4846,,
mean,,30.7102,0.6875,,4.9027,0.0347
std,,8.317616,0.463536,,3.393698,0.183028
min,,17.0,0.0,,1.0,0.0
25%,,24.0,0.0,,2.0,0.0
50%,,30.0,1.0,,4.0,0.0
75%,,36.0,1.0,,7.0,0.0





In [268]:
# Let's check how many missing value we have
data_sample.isna().sum()

country                0
age                    0
new_user               0
source                 0
total_pages_visited    0
converted              0
dtype: int64

##### *Part 2 : EDA* #####

In [269]:
# Our target variable is the column "converted"
# Let's explore our target with some EDA's

In [270]:
# Let's see the average age present in the dataset
fig = px.box(data_sample, x="age", title = "animation_group=Boxplot of ages in the dataset")
fig.show()

We can see that people are mostly between 20 and 40 years old

In [271]:
# Let's the distribution of converted users (people who subscribed to the site's newsletter) for each countries
fig = px.histogram(data_sample, x="country", y="new_user", title="Distribution of converted users per country", width= 650,
                      height = 500)
fig.show()

We can see a huge distribution of the converted users in the U.S

In [272]:
# Let's see how many converted users we have in each countries
fig = px.histogram(data_sample, x="country", y="converted", title="Distribution of subscribed newsletters", width= 650,
                      height = 500)
fig.show()

US have the most newsletter subscribers

In [273]:
# Let's see how users go to the newsletter site
fig = px.histogram(data_sample, x="source", title="Distribution of the source used", width= 500,
                      height = 500)
fig.show()

We can see that a majority of people use SEO (Search Engine Optimisation) to connect to the site, rather than ads or direct connections

In [274]:
# Previously, we got : 
# f1-score on train set :  0.6938517686692869
# f1-score on test set :  0.7060240963855423

In [275]:
# Let's do a logistic regression with all the features

##### *Part 3 : Logistic Regression with all the features* #####

*Preprocessing*

In [276]:
features_list = ['country','age','new_user','source','total_pages_visited']
target_variable = ['converted']

numeric_features = [1,4]
categorical_features = [0,2,3]

In [277]:
X = data.loc[:, features_list]
Y = data.loc[:, target_variable]

print('Explanatory variables : ', X.columns)
print('Target : ', Y.columns)

Explanatory variables :  Index(['country', 'age', 'new_user', 'source', 'total_pages_visited'], dtype='object')
Target :  Index(['converted'], dtype='object')


In [278]:
# Divide dataset Train set & Test set 
print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=0, stratify=Y)
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [279]:
# Convert pandas DataFrames to numpy arrays before using scikit-learn
print("Convert pandas DataFrames to numpy arrays...")
X_train = X_train.values
X_test = X_test.values
Y_train = Y_train.values
Y_test = Y_test.values
print("...Done")

print(X_train[0:5,:])
print(X_test[0:2,:])
print()
print(Y_train[0:5])
print(Y_test[0:2])

Convert pandas DataFrames to numpy arrays...
...Done
[['China' 23 0 'Direct' 2]
 ['China' 28 0 'Ads' 2]
 ['China' 30 1 'Seo' 7]
 ['UK' 37 1 'Seo' 3]
 ['US' 31 1 'Ads' 8]]
[['China' 24 1 'Ads' 14]
 ['US' 35 0 'Direct' 5]]

[[0]
 [0]
 [0]
 [0]
 [0]]
[[0]
 [0]]


In [280]:
# Training pipeline
# Put here all the preprocessings
print("Encoding categorical features and standardizing numerical features...")

numeric_indices = [1,4]
numeric_transformer = StandardScaler()

categorical_indices = [0,2,3]
categorical_transformer = OneHotEncoder(drop='first') # first column will be dropped to avoid creating correlations between features

feature_encoder = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_indices),('cat', categorical_transformer, categorical_indices)])

X_train = feature_encoder.fit_transform(X_train)

print("...Done")
print('X_train')
print(X_train[0:5,:])

Encoding categorical features and standardizing numerical features...
...Done
X_train
[[-0.91458053 -0.85939501  0.          0.          0.          0.
   1.          0.        ]
 [-0.30994956 -0.85939501  0.          0.          0.          0.
   0.          0.        ]
 [-0.06809718  0.63639894  0.          0.          0.          1.
   0.          1.        ]
 [ 0.77838618 -0.56023622  0.          1.          0.          1.
   0.          1.        ]
 [ 0.05282902  0.93555773  0.          0.          1.          1.
   0.          0.        ]]


##### *Training pipeline* #####

*Model Logistic Regression*

In [281]:
# Train model
print("Train model...")
classifier = LogisticRegression()
classifier.fit(X_train, Y_train)
print("...Done.")

Train model...



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



...Done.


In [282]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = classifier.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

Predictions on training set...
...Done.
[0 0 0 ... 0 0 0]



##### *Test pipeline* #####

In [283]:
# Test pipeline
# Use X_test, and the same preprocessings as in training pipeline, 
# but call "transform()" instead of "fit_transform" methods (see example below)

print("Encoding categorical features and standardizing numerical features...")

X_test = feature_encoder.transform(X_test)
print("...Done")
print("X_test")
print(X_test[0:5,:])

Encoding categorical features and standardizing numerical features...
...Done
X_test
[[-0.79365434  2.73051047  0.          0.          0.          1.
   0.          0.        ]
 [ 0.53653379  0.03808136  0.          0.          1.          0.
   1.          0.        ]
 [-0.18902337 -0.26107743  0.          0.          1.          0.
   1.          0.        ]
 [ 0.05282902  0.93555773  0.          0.          0.          1.
   0.          1.        ]
 [-0.30994956 -0.85939501  0.          0.          1.          1.
   0.          1.        ]]


In [284]:
# Predictions on test set
print("Predictions on test set...")
Y_test_pred = classifier.predict(X_test)
print("...Done.")
print(Y_test_pred)

Predictions on test set...
...Done.
[0 0 0 ... 0 0 0]


##### *Performance assessment* #####

In [285]:
# Performance assessment
# WARNING : Use the same score as the one that will be used by Kaggle !
# Here, the f1-score will be used to assess the performances on the leaderboard
print("f1-score on train set : ", f1_score(Y_train, Y_train_pred))
print("f1-score on test set : ", f1_score(Y_test, Y_test_pred))

f1-score on train set :  0.7640283915896611
f1-score on test set :  0.7595396729254997


*Our baseline model reaches a better f1-score than the previous one🚀🚀*

##### *Train best classifier on all data and use it to make predictions on X_without_labels* #####

*Before making predictions on the file conversion_data_test.csv, let's train our model on ALL the data that was in conversion_data_train.csv. Sometimes, this allows to make tiny improvements in the score because we're using more examples to train the model.*



In [288]:
# Before making predictions on the file conversion_data_test.csv, let's train our model on ALL the data that was in conversion_data_train.csv. Sometimes, 
# this allows to make tiny improvements in the score because we're using more examples to train the model.

# Concatenate our train and test set to train your best classifier on all data with labels
X = np.append(X_train,X_test,axis=0)
Y = np.append(Y_train,Y_test)

classifier.fit(X,Y)

LogisticRegression()

In [289]:
# Read data without labels
data_without_labels = pd.read_csv('conversion_data_test.csv')
print('Prediction set (without labels) :', data_without_labels.shape)

# Warning : check consistency of features_list (must be the same than the features 
# used by your best classifier)
features_list = ['total_pages_visited']
X_without_labels = data_without_labels.loc[:, features_list]

# Convert pandas DataFrames to numpy arrays before using scikit-learn
print("Convert pandas DataFrames to numpy arrays...")
X_without_labels = X_without_labels.values
print("...Done")

print(X_without_labels[0:5,:])

Prediction set (without labels) : (31620, 5)
Convert pandas DataFrames to numpy arrays...
...Done
[[16]
 [ 5]
 [ 1]
 [ 6]
 [ 3]]


In [290]:
data_without_labels.head()

Unnamed: 0,country,age,new_user,source,total_pages_visited
0,UK,28,0,Seo,16
1,UK,22,1,Direct,5
2,China,32,1,Seo,1
3,US,32,1,Ads,6
4,China,25,0,Seo,3


In [291]:
# Warning : check consistency of features_list (must be the same than the features 
# used by your best classifier)
features_list = ['country','age','new_user','source','total_pages_visited']
X_without_labels = data_without_labels.loc[:, features_list]

# Convert pandas DataFrames to numpy arrays before using scikit-learn
print("Convert pandas DataFrames to numpy arrays...")
X_without_labels = X_without_labels.values
print("...Done")
print(X_without_labels[0:5,:])

Convert pandas DataFrames to numpy arrays...
...Done
[['UK' 28 0 'Seo' 16]
 ['UK' 22 1 'Direct' 5]
 ['China' 32 1 'Seo' 1]
 ['US' 32 1 'Ads' 6]
 ['China' 25 0 'Seo' 3]]


In [292]:
# WARNING : PUT HERE THE SAME PREPROCESSING AS FOR YOUR TEST SET
# CHECK YOU ARE USING X_without_labels
print("Encoding categorical features and standardizing numerical features...")

X_without_labels = feature_encoder.transform(X_without_labels)
print("...Done")
print('X_without_labels')
print(X_without_labels[0:5,:])

Encoding categorical features and standardizing numerical features...
...Done
X_without_labels
[[-0.30994956  3.32882805  0.          1.          0.          0.
   0.          1.        ]
 [-1.03550673  0.03808136  0.          1.          0.          1.
   1.          0.        ]
 [ 0.17375521 -1.1585538   0.          0.          0.          1.
   0.          1.        ]
 [ 0.17375521  0.33724015  0.          0.          1.          1.
   0.          0.        ]
 [-0.67272814 -0.56023622  0.          0.          0.          0.
   0.          1.        ]]


##### *Part 4 : Convert our predictions into a csv file* ######

In [293]:
# Make predictions and dump to file
data = {'converted': classifier.predict(X_without_labels)}

Y_predictions = pd.DataFrame(columns=['converted'],data=data)
Y_predictions.to_csv('conversion_data_test_predictions_Haikel.csv', index=False)

In [294]:
Y_predictions.head()

Unnamed: 0,converted
0,1
1,0
2,0
3,0
4,0
