In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import sys
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.tree import plot_tree
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OrdinalEncoder

from scipy import stats

from IPython.display import display

In [2]:
# Read the labelled conversion data that the model is trained on and preview
# its first rows.
train_csv_path = 'conversion_data_train.csv'
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,China,22,1,Direct,2,0
1,UK,21,1,Ads,3,0
2,Germany,20,0,Seo,14,1
3,US,23,1,Seo,3,0
4,US,28,1,Direct,3,0


In [3]:
# One-hot encode the two categorical columns (`source`, then `country`) of the
# training frame. For each of them the last dummy column is dropped so that
# level acts as the reference category, and the original text column is
# replaced by the remaining indicator columns appended on the right.
#
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 returns booleans
# by default, which would mix bool and int columns in the frame and no longer
# match the 0/1 preview shown under this cell.
#
# Nothing is mutated in place, so the cell can be re-run safely.
df_dummies = pd.get_dummies(df['source'], dtype=int).iloc[:, :-1]
df_new = pd.concat([df.drop(columns='source'), df_dummies], axis=1)

df_dummies = pd.get_dummies(df_new['country'], dtype=int).iloc[:, :-1]
df_complete = pd.concat([df_new.drop(columns='country'), df_dummies], axis=1)
df_complete.head()

Unnamed: 0,age,new_user,total_pages_visited,converted,Ads,Direct,China,Germany,UK
0,22,1,2,0,0,1,1,0,0
1,21,1,3,0,1,0,0,0,1
2,20,0,14,1,0,0,0,1,0
3,23,1,3,0,0,0,0,0,0
4,28,1,3,0,0,1,0,0,0


In [5]:
# Columns fed to the model, in the order the feature matrix is built.
features_list = [
    'age',
    'new_user',
    'total_pages_visited',
    'Ads', 'Direct',            # indicator columns derived from `source`
    'China', 'Germany', 'UK',   # indicator columns derived from `country`
]
# Label column; kept as a one-element list so selecting it yields a DataFrame.
target_variable = ['converted']

In [6]:
# Separate the model inputs from the label column.
x = df_complete.loc[:, features_list]
y = df_complete.loc[:, target_variable]

# Hold out 30% of the rows, stratifying on the label, and convert each of the
# four resulting pieces to a plain NumPy array straight away.
x_train, x_test, y_train, y_test = (
    np.array(part)
    for part in train_test_split(
        x, y, test_size=0.3, random_state=42, stratify=y
    )
)

# Learn the scaling statistics from the training rows only, then apply them.
# x_test is left unscaled here; it is transformed in a later cell.
featureencoder = StandardScaler().fit(x_train)
x_train = featureencoder.transform(x_train)
x_train

array([[-0.06905373, -1.47802048, -0.56068654, ..., -0.56644798,
        -0.20739455, -0.42636791],
       [ 1.86294015, -1.47802048,  1.5340925 , ..., -0.56644798,
         4.82172755, -0.42636791],
       [ 0.65544398,  0.67658061, -1.15919483, ..., -0.56644798,
        -0.20739455, -0.42636791],
       ...,
       [-1.03505067,  0.67658061,  0.63633006, ...,  1.76538719,
        -0.20739455, -0.42636791],
       [-0.18980335, -1.47802048,  0.03782176, ..., -0.56644798,
        -0.20739455, -0.42636791],
       [-0.06905373, -1.47802048, -0.26143239, ..., -0.56644798,
        -0.20739455, -0.42636791]])

In [7]:
# Decision tree split on Gini impurity, capped at 10 leaves, with a fixed seed
# so repeated fits give the same tree.
tree_params = {
    "criterion": "gini",
    "max_leaf_nodes": 10,
    "random_state": 42,
}
classifier_gini = DecisionTreeClassifier(**tree_params)
classifier_gini.fit(x_train, y_train)

DecisionTreeClassifier(max_leaf_nodes=10, random_state=42)

In [8]:
# Predict on the rows the tree was fitted on; used below as the training-set
# reference when comparing against the held-out score.
y_train_pred = classifier_gini.predict(x_train)
y_train_pred

array([0, 0, 0, ..., 0, 0, 0])

In [9]:
# Scale the held-out features with the already-fitted `featureencoder`
# (transform only, the scaler is not refitted on these rows).
# NOTE(review): this rebinds x_test to its scaled version, so executing the
# cell a second time without re-running the split would scale the data twice.
x_test = featureencoder.transform(x_test)
x_test

array([[ 0.17244551,  0.67658061, -0.85994069, ..., -0.56644798,
        -0.20739455, -0.42636791],
       [ 1.74219054,  0.67658061, -0.56068654, ...,  1.76538719,
        -0.20739455, -0.42636791],
       [ 0.77619359, -1.47802048, -0.56068654, ..., -0.56644798,
        -0.20739455,  2.34539225],
       ...,
       [-1.27654991,  0.67658061,  0.93558421, ..., -0.56644798,
        -0.20739455, -0.42636791],
       [ 0.89694321, -1.47802048,  0.03782176, ..., -0.56644798,
        -0.20739455, -0.42636791],
       [-0.18980335,  0.67658061,  0.33707591, ..., -0.56644798,
        -0.20739455, -0.42636791]])

In [10]:
# Predict on the scaled held-out split.
y_test_pred = classifier_gini.predict(x_test)
y_test_pred

array([0, 0, 0, ..., 0, 0, 0])

In [11]:
# Mean accuracy of the tree on the held-out split.
# NOTE(review): the prediction previews are almost all zeros, so accuracy is
# presumably dominated by the majority class — read it alongside the F1 score.
classifier_gini.score(x_test,y_test)

0.9849954318645021

In [12]:
# F1 on both splits; a large gap between the two would point to overfitting.
train_f1 = f1_score(y_train, y_train_pred)
test_f1 = f1_score(y_test, y_test_pred)

print("f1-score train set : ", train_f1)
print("f1-score test set : ", test_f1)

f1-score train set :  0.7596169720983985
f1-score test set :  0.7536064627813039


In [13]:
# Stack the scaled training and held-out rows back together and flatten the
# labels to 1-D, then refit the same estimator on every labelled row before
# predicting the unlabeled file.
# NOTE(review): `x` / `y` previously held the unscaled DataFrames, and the
# refit replaces the model evaluated above — re-run from the split cell before
# re-evaluating.
x = np.concatenate((x_train, x_test), axis=0)
y = np.concatenate((np.ravel(y_train), np.ravel(y_test)))

classifier_gini.fit(x, y)

DecisionTreeClassifier(max_leaf_nodes=10, random_state=42)

In [14]:
# Load the unlabeled rows to predict and one-hot encode `source`, then
# `country`. For each of them the last dummy column is dropped as the reference
# level and the original text column is replaced by the remaining indicators.
#
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 returns booleans
# by default, which would mix bool and int columns in the frame and no longer
# match the 0/1 preview shown under this cell.
#
# NOTE(review): the dummy columns are derived from this file alone; this
# assumes it contains the same `source` / `country` categories as the training
# file — verify, otherwise a different level would be dropped.
data_no_labels = pd.read_csv('conversion_data_test.csv')

df_dummies2 = pd.get_dummies(data_no_labels['source'], dtype=int).iloc[:, :-1]
df_new2 = pd.concat([data_no_labels.drop(columns='source'), df_dummies2], axis=1)

df_dummies3 = pd.get_dummies(df_new2['country'], dtype=int).iloc[:, :-1]
df_complete2 = pd.concat([df_new2.drop(columns='country'), df_dummies3], axis=1)
df_complete2.head()

Unnamed: 0,age,new_user,total_pages_visited,Ads,Direct,China,Germany,UK
0,28,0,16,0,0,0,0,1
1,22,1,5,0,1,0,0,1
2,32,1,1,0,0,1,0,0
3,32,1,6,1,0,0,0,0
4,25,0,3,0,0,1,0,0


In [16]:
# Keep only the model's input columns, in the order listed here, and hand the
# result over as a NumPy array.
# NOTE(review): df_complete2 changes from a DataFrame to an ndarray in this
# cell, so it cannot be executed twice without re-running the encoding cell.
features_list = [
    'age', 'new_user', 'total_pages_visited',
    'Ads', 'Direct', 'China', 'Germany', 'UK',
]
df_complete2 = df_complete2[features_list].to_numpy()

print(df_complete2)

[[28  0 16 ...  0  0  1]
 [22  1  5 ...  0  0  1]
 [32  1  1 ...  1  0  0]
 ...
 [33  1  5 ...  0  0  1]
 [25  1 14 ...  0  0  1]
 [22  1  2 ...  0  0  0]]


In [17]:
# Scale the unlabeled feature matrix with the already-fitted `featureencoder`
# (transform only, no refit).
# NOTE(review): rebinds df_complete2 to its scaled version; executing this cell
# twice in a row would scale the data twice.
df_complete2 = featureencoder.transform(df_complete2)

In [19]:
# Predict the unlabeled rows and write them out as a single `converted` column,
# without the DataFrame index.
test_predictions = classifier_gini.predict(df_complete2)
data = {'converted': test_predictions}

Y_predictions = pd.DataFrame(data, columns=['converted'])
Y_predictions.to_csv(
    'conversion_data_test_predictions_haske-modelgini.csv',
    index=False,
)

In [20]:
# Display the prediction dictionary built for the output CSV as a quick check.
data

{'converted': array([1, 0, 0, ..., 0, 0, 0])}