In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
import graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from graphviz import Source

In [2]:
data = pd.read_csv('emails.csv')

In [3]:
data.head()

Unnamed: 0,email_id,email_text,email_version,hour,weekday,user_country,user_past_purchases,clicked
0,8,short_email,generic,9,Thursday,US,3,0
1,33,long_email,personalized,6,Monday,US,0,0
2,46,short_email,generic,14,Tuesday,US,3,0
3,49,long_email,personalized,11,Thursday,US,10,0
4,65,short_email,generic,8,Wednesday,UK,3,0


In [4]:
data['clicked'].value_counts()

0    97881
1     2069
Name: clicked, dtype: int64

In [5]:
data.describe()

Unnamed: 0,email_id,hour,user_past_purchases,clicked
count,99950.0,99950.0,99950.0,99950.0
mean,498695.729065,9.0591,3.878559,0.0207
std,289226.115244,4.439618,3.196324,0.14238
min,8.0,1.0,0.0,0.0
25%,246721.5,6.0,1.0,0.0
50%,498441.5,9.0,3.0,0.0
75%,749936.75,12.0,6.0,0.0
max,999998.0,24.0,22.0,1.0


In [6]:
data[data['clicked']==1]['user_past_purchases'].describe()

count    2069.00000
mean        6.22958
std         3.59897
min         0.00000
25%         3.00000
50%         6.00000
75%         9.00000
max        22.00000
Name: user_past_purchases, dtype: float64

In [7]:
data[data['clicked']==0]['user_past_purchases'].describe()

count    97881.000000
mean         3.828864
std          3.168514
min          0.000000
25%          1.000000
50%          3.000000
75%          6.000000
max         21.000000
Name: user_past_purchases, dtype: float64

In [8]:
data_categorical = data.select_dtypes(['object']).astype("category")

In [9]:
data['email_text'].value_counts()

long_email     50248
short_email    49702
Name: email_text, dtype: int64

In [10]:
data['email_version'].value_counts()

generic         50178
personalized    49772
Name: email_version, dtype: int64

In [11]:
data['weekday'].value_counts()

Saturday     14564
Sunday       14374
Monday       14358
Thursday     14274
Friday       14165
Tuesday      14137
Wednesday    14078
Name: weekday, dtype: int64

In [12]:
data['user_country'].value_counts()

US    60069
UK    19928
FR     9989
ES     9964
Name: user_country, dtype: int64

In [13]:
print(data_categorical.apply(lambda x: x.cat.categories[0]))


email_text       long_email
email_version       generic
weekday              Friday
user_country             ES
dtype: object


In [14]:
### Converting to Dummy Vavriables

In [15]:
data_d = pd.get_dummies(data, drop_first=True)

In [16]:
data_d

Unnamed: 0,email_id,hour,user_past_purchases,clicked,email_text_short_email,email_version_personalized,weekday_Monday,weekday_Saturday,weekday_Sunday,weekday_Thursday,weekday_Tuesday,weekday_Wednesday,user_country_FR,user_country_UK,user_country_US
0,8,9,3,0,1,0,0,0,0,1,0,0,0,0,1
1,33,6,0,0,0,1,1,0,0,0,0,0,0,0,1
2,46,14,3,0,1,0,0,0,0,0,1,0,0,0,1
3,49,11,10,0,0,1,0,0,0,1,0,0,0,0,1
4,65,8,3,0,1,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99945,999969,21,0,0,1,0,0,0,0,1,0,0,0,0,1
99946,999972,6,5,0,0,1,0,0,0,0,1,0,0,0,1
99947,999976,5,0,0,0,1,0,0,0,0,0,1,0,1,0
99948,999980,10,4,0,0,0,0,0,0,1,0,0,1,0,0


In [17]:
data_d['intercept'] = 1

## Dropping Target Variable

In [18]:
train = data_d.drop('clicked',axis=1)

## Logit Function

In [19]:
logit = sm.Logit(data_d['clicked'],train)
output = logit.fit()

Optimization terminated successfully.
         Current function value: 0.092770
         Iterations 9


## One way to view output

In [20]:
output.summary()

0,1,2,3
Dep. Variable:,clicked,No. Observations:,99950.0
Model:,Logit,Df Residuals:,99935.0
Method:,MLE,Df Model:,14.0
Date:,"Wed, 02 Feb 2022",Pseudo R-squ.:,0.07923
Time:,13:07:29,Log-Likelihood:,-9272.4
converged:,True,LL-Null:,-10070.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
email_id,-3.849e-08,7.78e-08,-0.495,0.621,-1.91e-07,1.14e-07
hour,0.0167,0.005,3.337,0.001,0.007,0.027
user_past_purchases,0.1878,0.006,32.801,0.000,0.177,0.199
email_text_short_email,0.2793,0.045,6.165,0.000,0.191,0.368
email_version_personalized,0.6387,0.047,13.615,0.000,0.547,0.731
weekday_Monday,0.5410,0.093,5.792,0.000,0.358,0.724
weekday_Saturday,0.2829,0.098,2.893,0.004,0.091,0.475
weekday_Sunday,0.1836,0.100,1.834,0.067,-0.013,0.380
weekday_Thursday,0.6254,0.092,6.773,0.000,0.444,0.806


In [21]:
output_table = pd.DataFrame(dict(coefficients = output.params,SE = output.bse, z_value = output.tvalues,p_value=output.pvalues))

In [22]:
output_table

Unnamed: 0,coefficients,SE,z_value,p_value
email_id,-3.848609e-08,7.780379e-08,-0.494656,0.6208432
hour,0.01670684,0.005005879,3.337445,0.0008455247
user_past_purchases,0.1878107,0.005725787,32.800855,5.725039e-236
email_text_short_email,0.2793085,0.04530477,6.165101,7.043829e-10
email_version_personalized,0.6387251,0.04691461,13.614631,3.2779889999999996e-42
weekday_Monday,0.5410326,0.09341014,5.792011,6.954864e-09
weekday_Saturday,0.2828638,0.09777629,2.892969,0.00381619
weekday_Sunday,0.1836278,0.1001194,1.834088,0.06664099
weekday_Thursday,0.625404,0.09233999,6.772839,1.26279e-11
weekday_Tuesday,0.6162222,0.09237223,6.671077,2.539336e-11


In [23]:
output_table[output_table['p_value']<0.05].sort_values(by='coefficients',ascending=False)

Unnamed: 0,coefficients,SE,z_value,p_value
user_country_UK,1.155255,0.12206,9.464618,2.946372e-21
user_country_US,1.14136,0.115963,9.842487,7.386228e-23
weekday_Wednesday,0.755464,0.090845,8.31595,9.102053000000001e-17
email_version_personalized,0.638725,0.046915,13.614631,3.2779889999999996e-42
weekday_Thursday,0.625404,0.09234,6.772839,1.26279e-11
weekday_Tuesday,0.616222,0.092372,6.671077,2.539336e-11
weekday_Monday,0.541033,0.09341,5.792011,6.954864e-09
weekday_Saturday,0.282864,0.097776,2.892969,0.00381619
email_text_short_email,0.279308,0.045305,6.165101,7.043829e-10
user_past_purchases,0.187811,0.005726,32.800855,5.725039e-236


## Now we make a tree to assess gain more insights in the data

In [24]:
data_dummy = pd.get_dummies(data,drop_first=True)

In [25]:
train = data_dummy.drop('clicked',axis = 1)

In [26]:
tree = DecisionTreeClassifier(max_depth=4,class_weight='balanced',min_impurity_decrease=0.001)

### What does class balanced do?

In [27]:
tree.fit(train,data_dummy['clicked'])

DecisionTreeClassifier(class_weight='balanced', max_depth=4,
                       min_impurity_decrease=0.001)

In [28]:
tree.get_params()

{'ccp_alpha': 0.0,
 'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 4,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.001,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [29]:
export_graphviz(tree, out_file="tree.dot",feature_names=train.columns,proportion=True,rotate=True)

In [30]:
s = Source.from_file('tree.dot')

In [31]:
s.view()

'tree.dot.pdf'