In [75]:
### Importing libraries 

import numpy as np

import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

import scipy.optimize as opt
from sklearn import preprocessing

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

from sklearn.metrics import jaccard_similarity_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from scipy import stats

In [6]:
data = pd.read_csv('conversion_data.csv')

In [8]:
data.head(10)

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,UK,25,1,Ads,1,0
1,US,23,1,Seo,5,0
2,US,28,1,Seo,4,0
3,China,39,1,Seo,5,0
4,US,30,1,Seo,6,0
5,US,31,0,Seo,1,0
6,China,27,1,Seo,4,0
7,US,23,0,Ads,4,0
8,UK,29,0,Direct,4,0
9,US,25,0,Ads,2,0


### Exploratory Data Analysis

In [11]:
### number of examples of each categories of conversion

data.groupby('converted').count()

Unnamed: 0_level_0,country,age,new_user,source,total_pages_visited
converted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,306000,306000,306000,306000,306000
1,10200,10200,10200,10200,10200


In [69]:
data.describe()

Unnamed: 0,age,new_user,total_pages_visited,converted
count,316200.0,316200.0,316200.0,316200.0
mean,30.569858,0.685465,4.872966,0.032258
std,8.271802,0.464331,3.341104,0.176685
min,17.0,0.0,1.0,0.0
25%,24.0,0.0,2.0,0.0
50%,30.0,1.0,4.0,0.0
75%,36.0,1.0,7.0,0.0
max,123.0,1.0,29.0,1.0


In [63]:
data.corr()

Unnamed: 0,age,new_user,total_pages_visited,converted
age,1.0,0.012343,-0.045922,-0.088797
new_user,0.012343,1.0,-0.082541,-0.152374
total_pages_visited,-0.045922,-0.082541,1.0,0.528994
converted,-0.088797,-0.152374,0.528994,1.0


In [76]:
pearson_coef, p_value = stats.pearsonr(data['new_user'], data['converted'])
print("The Pearson Correlation Coefficient between conversion and new user is", pearson_coef, " with a P-value of P =", p_value)  

The Pearson Correlation Coefficient is -0.15237386775904088  with a P-value of P = 0.0


In [77]:
pearson_coef, p_value = stats.pearsonr(data['age'], data['converted'])
print("The Pearson Correlation Coefficient between conversion and age is", pearson_coef, " with a P-value of P =", p_value)  

The Pearson Correlation Coefficient is -0.088797350840055  with a P-value of P = 0.0


In [78]:
pearson_coef, p_value = stats.pearsonr(data['total_pages_visited'], data['converted'])
print("The Pearson Correlation Coefficient between conversion and total pages visited is", pearson_coef, " with a P-value of P =", p_value)  

The Pearson Correlation Coefficient is 0.528993955018847  with a P-value of P = 0.0


#### The features "new user", "age", and "total pages visited" are correlated with conversion because the P-value < 0. Total pages visited is positively correlated while age and new user is negavtively correlated with conversion, as can be inferred from the Pearson Correlation Coefficient.

### Splitting the data into train-test sets to evaluate models 

In [30]:
Y=data[['converted']]
Y.head()

Unnamed: 0,converted
0,0
1,0
2,0
3,0
4,0


In [32]:
X=data[['country','age','new_user','source','total_pages_visited']]
X.head()

Unnamed: 0,country,age,new_user,source,total_pages_visited
0,UK,25,1,Ads,1
1,US,23,1,Seo,5
2,US,28,1,Seo,4
3,China,39,1,Seo,5
4,US,30,1,Seo,6


In [36]:
### train-test split to evaluate models

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(211854, 5)
(104346, 5)
(211854, 1)
(104346, 1)


### Logistic Regression classfier 

#### Since this is a classification problem, a logistic regression classifier seems most appropriate.  

In [90]:
LR = LogisticRegression(solver='liblinear').fit(X_train[['new_user']],Y_train)
LR

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [91]:
Yhat = LR.predict(X_test[['new_user']])
Yhat

array([0, 0, 0, ..., 0, 0, 0])

In [92]:
jaccard_similarity_score(Y_test, Yhat)



0.9685565330726621

In [87]:
LR1 = LogisticRegression(solver='liblinear').fit(X_train[['age']],Y_train)
LR1

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [95]:
Yhat1 = LR1.predict(X_test[['age']])
Yhat1

array([0, 0, 0, ..., 0, 0, 0])

In [96]:
jaccard_similarity_score(Y_test, Yhat1)



0.9685565330726621

In [100]:
LR2 = LogisticRegression(solver='liblinear').fit(X_train[['total_pages_visited']],Y_train)
LR2

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [101]:
Yhat2 = LR2.predict(X_test[['total_pages_visited']])
Yhat2

array([0, 0, 0, ..., 0, 0, 0])

In [102]:
jaccard_similarity_score(Y_test, Yhat2)



0.9832863741782147

#### Since the jaccard index is close to 1 for the features "new user", "age", and "total pages visited", the model is accurate. 

### Recommendations based on the correlations:

#### - focus on younger users and customers who are loyal who keep returning to the website - they tend to convert more than others. 