# KAGGLE-LIKE CHALLENGE

Ici, on vous propose d'essayer de créer le meilleur modèle pour prédire des 
conversions en fonction de différentes variables explicatives. Vos modèles seront 
évalués à l'aide du f1-score.

In [18]:
#import libraries
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import sys
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.tree import plot_tree
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

from scipy import stats

from IPython.display import display

In [19]:
# Abbreviate printed NumPy arrays: any array with more than 15 elements is
# summarized with "..." instead of being printed in full.
np.set_printoptions(threshold=15)

In [20]:
# Load the labelled training set (path is relative to the working directory).
df = pd.read_csv('conversion_data_train.csv')
# Bare trailing expression: shows the first five rows as the cell output.
df.head()

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,China,22,1,Direct,2,0
1,UK,21,1,Ads,3,0
2,Germany,20,0,Seo,14,1
3,US,23,1,Seo,3,0
4,US,28,1,Direct,3,0


In [21]:
#explore df

In [22]:
# (number of rows, number of columns) of the training frame.
df.shape

(284580, 6)

In [23]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns; useful here to spot the class imbalance in 'converted' and the
# suspicious maximum in 'age'.
df.describe()

Unnamed: 0,age,new_user,total_pages_visited,converted
count,284580.0,284580.0,284580.0,284580.0
mean,30.564203,0.685452,4.873252,0.032258
std,8.266789,0.464336,3.341995,0.176685
min,17.0,0.0,1.0,0.0
25%,24.0,0.0,2.0,0.0
50%,30.0,1.0,4.0,0.0
75%,36.0,1.0,7.0,0.0
max,123.0,1.0,29.0,1.0


In [24]:
# Per-column flag: True when the column holds at least one missing value.
df.isnull().any()

country                False
age                    False
new_user               False
source                 False
total_pages_visited    False
converted              False
dtype: bool

In [25]:
# Column dtypes, non-null counts and memory footprint of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284580 entries, 0 to 284579
Data columns (total 6 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   country              284580 non-null  object
 1   age                  284580 non-null  int64 
 2   new_user             284580 non-null  int64 
 3   source               284580 non-null  object
 4   total_pages_visited  284580 non-null  int64 
 5   converted            284580 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 13.0+ MB


In [26]:
# One-hot encode the categorical 'source' column. The last indicator column
# is discarded so that level acts as the implicit baseline (presumably to
# avoid redundant, perfectly collinear dummies).
df_dummies = pd.get_dummies(df['source']).iloc[:, :-1]
# Swap the raw 'source' column for its indicator columns.
df_new = pd.concat([df.drop(columns='source'), df_dummies], axis=1)
df_new.head()

Unnamed: 0,country,age,new_user,total_pages_visited,converted,Ads,Direct
0,China,22,1,2,0,0,1
1,UK,21,1,3,0,1,0
2,Germany,20,0,14,1,0,0
3,US,23,1,3,0,0,0
4,US,28,1,3,0,0,1


In [27]:
# One-hot encode the categorical 'country' column in the same way as 'source':
# the last indicator column is discarded and serves as the implicit baseline.
df_dummies = pd.get_dummies(df_new['country']).iloc[:, :-1]
# Swap the raw 'country' column for its indicator columns.
df_complete = pd.concat([df_new.drop(columns='country'), df_dummies], axis=1)
df_complete.head()

Unnamed: 0,age,new_user,total_pages_visited,converted,Ads,Direct,China,Germany,UK
0,22,1,2,0,0,1,1,0,0
1,21,1,3,0,1,0,0,0,1
2,20,0,14,1,0,0,0,1,0
3,23,1,3,0,0,0,0,0,0
4,28,1,3,0,0,1,0,0,0


In [28]:
#try some feature engineering

In [29]:
# Engineered feature: pages visited per year of age, stored under the column
# name 'corr'.
# NOTE(review): the name 'corr' is easy to confuse with DataFrame.corr();
# a descriptive name (e.g. a pages-per-age label) would be clearer.
df_complete['corr'] = df_complete['total_pages_visited']/df_complete['age']

In [30]:
# Pairwise correlation matrix of every column in the encoded frame.
df_complete.corr()

# Observation: the engineered ratio column (stored as 'corr'; the original
# note called it 'page_age', a name that does not exist in the frame) is
# slightly more correlated with 'converted' than 'total_pages_visited' is,
# yet the f1 score went way down when it was tried, so it is left out of
# the feature list.

Unnamed: 0,age,new_user,total_pages_visited,converted,Ads,Direct,China,Germany,UK,corr
age,1.0,0.011676,-0.045365,-0.088265,-0.002583,-0.000902,0.006839,-0.00266,-0.00672,-0.36745
new_user,0.011676,1.0,-0.082986,-0.152115,0.001301,0.00082,0.01668,-0.003317,-0.006318,-0.083697
total_pages_visited,-0.045365,-0.082986,1.0,0.529192,0.005228,-0.00941,-0.054513,0.020712,0.0262,0.906133
converted,-0.088265,-0.152115,0.529192,1.0,0.009001,-0.013696,-0.099283,0.035349,0.04875,0.545628
Ads,-0.002583,0.001301,0.005228,0.009001,1.0,-0.340569,0.001479,0.003575,-0.00222,0.005391
Direct,-0.000902,0.00082,-0.00941,-0.013696,-0.340569,1.0,-0.001608,-0.004716,0.000962,-0.008517
China,0.006839,0.01668,-0.054513,-0.099283,0.001479,-0.001608,1.0,-0.117246,-0.241057,-0.055023
Germany,-0.00266,-0.003317,0.020712,0.035349,0.003575,-0.004716,-0.117246,1.0,-0.088098,0.021211
UK,-0.00672,-0.006318,0.0262,0.04875,-0.00222,0.000962,-0.241057,-0.088098,1.0,0.027985
corr,-0.36745,-0.083697,0.906133,0.545628,0.005391,-0.008517,-0.055023,0.021211,0.027985,1.0


In [31]:
# Names of the explanatory column(s) and of the label column. Both are kept
# as lists so that selecting them with .loc yields 2-D frames.
features_list, target_variable = ['total_pages_visited'], ['converted']

In [32]:
# Build the design matrix / label frame, split them, and standardize features.
x = df_complete.loc[:, features_list]
y = df_complete.loc[:, target_variable]

# Hold out 10% of the rows for evaluation. Stratifying on the label keeps the
# proportion of converted users the same in both splits; random_state makes
# the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=0,
    stratify=y,
)

# Convert to plain NumPy arrays. Because both selections above use a list of
# column names, the arrays are already 2-D: (n_samples, n_columns).
# No reshape is applied: reshape(-1, 1) is a no-op for a single feature, but
# with several features it would silently stack all values into one long
# column and break the row alignment with y.
x_train = x_train.to_numpy()
x_test = x_test.to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

# Fit the scaler on the training rows only so no information from the
# held-out rows leaks into the preprocessing; x_test is transformed later
# with this same fitted scaler.
featureencoder = StandardScaler()
x_train = featureencoder.fit_transform(x_train)
x_train

array([[-0.85939501],
       [-0.85939501],
       [ 0.63639894],
       ...,
       [-1.1585538 ],
       [ 1.23471652],
       [ 0.93555773]])

In [43]:
# Logistic regression with built-in cross-validation, all settings left at
# the library defaults.
# NOTE(review): the default model-selection criterion is believed to be
# accuracy, while this challenge is judged on f1 -- confirm, and consider
# passing scoring='f1'.
classifier_log = LogisticRegressionCV()
# ravel() flattens the (n, 1) label column into the 1-D vector fit() expects.
classifier_log.fit(x_train, y_train.ravel())

LogisticRegressionCV()

In [34]:
# Predicted class labels for the (standardized) training rows; kept for the
# train-set f1 computation.
y_train_pred = classifier_log.predict(x_train)
y_train_pred

array([0, 0, 0, ..., 0, 0, 0])

In [35]:
# Standardize the held-out features with the scaler fitted on the training
# rows (transform only -- no refit).
# NOTE: this overwrites x_test in place, so re-running the cell would scale
# the data a second time.
x_test = featureencoder.transform(x_test)
x_test

array([[ 2.73051047],
       [ 0.03808136],
       [-0.26107743],
       ...,
       [ 0.63639894],
       [-0.85939501],
       [ 0.63639894]])

In [36]:
# Predicted class labels for the standardized held-out rows.
y_test_pred = classifier_log.predict(x_test)
y_test_pred

array([1, 0, 0, ..., 0, 0, 0])

In [37]:
# Held-out score as reported by the estimator's own score() method.
# NOTE(review): for scikit-learn classifiers this is mean accuracy, which is
# inflated on this data set because only ~3% of rows are conversions; the f1
# score is the metric that matters for the challenge.
classifier_log.score(x_test,y_test)

0.9820437135427648

In [44]:
# Report the challenge metric (f1) on the training rows and on the held-out
# rows; a large gap between the two would indicate over-fitting.
for label, truth, predicted in (
    ("f1-score train set : ", y_train, y_train_pred),
    ("f1-score test set : ", y_test, y_test_pred),
):
    print(label, f1_score(truth, predicted))

f1-score train set :  0.6971029668411867
f1-score test set :  0.6771951989892608


In [40]:
# Refit on every labelled row (training + held-out) before predicting the
# unlabelled set. Both feature arrays have already been standardized with
# the same scaler at this point.
x = np.concatenate((x_train, x_test), axis=0)
# Labels are flattened to one 1-D vector, as fit() expects.
y = np.concatenate((y_train.ravel(), y_test.ravel()))

classifier_log.fit(x, y)

LogisticRegressionCV()

In [45]:
# Load the unlabelled competition set and keep only the column(s) the model
# was trained on, as a 2-D NumPy array.
features_list = ['total_pages_visited']
data_no_labels = (
    pd.read_csv('conversion_data_test.csv')
    .loc[:, features_list]
    .to_numpy()
)

print(data_no_labels)

[[16]
 [ 5]
 [ 1]
 ...
 [ 5]
 [14]
 [ 2]]


In [42]:
# Standardize the unlabelled features with the scaler fitted on the training
# rows, so they are on the same scale the model was trained on.
# NOTE: this overwrites data_no_labels, so re-running the cell would scale
# the data a second time.
data_no_labels = featureencoder.transform(data_no_labels)

In [None]:
# Predict conversions for the unlabelled rows and write the submission file.
# Fix: the fitted model is named `classifier_log`; the previous reference to
# an undefined `classifier` raised NameError, so no file was ever produced.
data = {
    'converted': classifier_log.predict(data_no_labels)
}

# Single-column frame in the row order of the unlabelled CSV; the index is
# left out of the file so it contains only the 'converted' column.
Y_predictions = pd.DataFrame(columns=['converted'], data=data)
Y_predictions.to_csv('conversion_data_test_predictions_haske.csv', index=False)