In [23]:
import pandas as pd
#read from google drive
data = pd.read_csv('https://drive.google.com/uc?export=download&id=1H_-mi6NzeZt1PhcanHofcf3H6UiItmxO')

In [24]:
data.head()

Unnamed: 0,user_id,source,device,operative_system,lat,long,weekday,yearweek,converted
0,3,seo_facebook,web,mac,38.89,-94.81,Friday,16,0
1,9,seo-google,mobile,android,41.68,-72.94,Friday,18,0
2,14,friend_referral,mobile,iOS,39.74,-75.53,Saturday,13,0
3,16,ads-google,mobile,android,37.99,-121.8,Friday,21,0
4,19,ads-google,mobile,android,41.08,-81.52,Wednesday,14,0


In [26]:
data.device.value_counts()

mobile    162321
web       113295
Name: device, dtype: int64

In [27]:
data.operative_system.value_counts()

windows    87721
iOS        82982
android    65305
mac        21831
other      14143
linux       3634
Name: operative_system, dtype: int64

In [28]:
data.weekday.value_counts()

Friday       80047
Saturday     64632
Sunday       48512
Thursday     32187
Tuesday      17104
Monday       16722
Wednesday    16412
Name: weekday, dtype: int64

In [25]:
data.source.value_counts()

direct_traffic     52594
ads-google         51576
ads_facebook       46365
ads_other          26084
seo-google         20157
ads-bing           19887
seo_facebook       18473
friend_referral    18011
seo-other           8058
ads-yahoo           6576
seo-yahoo           5961
seo-bing            1874
Name: source, dtype: int64

In [29]:
data.yearweek.value_counts()

15    21779
10    21489
19    21463
12    21332
13    21247
18    21215
22    21134
11    21081
16    21037
17    21034
20    21021
14    20895
21    20889
Name: yearweek, dtype: int64

In [30]:
data.converted.value_counts()

0    270597
1      5019
Name: converted, dtype: int64

In [77]:
converted_adsbysource = data.groupby('source')[['source','converted']].sum('converted').sort_values(by='converted',ascending=False)

In [76]:
data.groupby('source')[['source','converted']].sum('converted').sort_values(by='converted',ascending=False)

Unnamed: 0_level_0,converted
source,Unnamed: 1_level_1
ads-google,1102
ads_facebook,983
friend_referral,683
direct_traffic,639
ads_other,375
seo-google,343
seo_facebook,293
ads-bing,238
seo-other,128
seo-yahoo,99


## Data Preprocessing

In [32]:
data_dummy = pd.get_dummies(data,drop_first=True)

In [33]:
data_dummy.head()

Unnamed: 0,user_id,lat,long,yearweek,converted,source_ads-google,source_ads-yahoo,source_ads_facebook,source_ads_other,source_direct_traffic,...,operative_system_linux,operative_system_mac,operative_system_other,operative_system_windows,weekday_Monday,weekday_Saturday,weekday_Sunday,weekday_Thursday,weekday_Tuesday,weekday_Wednesday
0,3,38.89,-94.81,16,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,9,41.68,-72.94,18,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,14,39.74,-75.53,13,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,16,37.99,-121.8,21,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,19,41.08,-81.52,14,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


## Building a logistic Regression

In [57]:
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm

In [58]:
train = data_dummy.drop('converted',axis=1)

In [67]:
train['intercept'] = 1 

In [68]:
logit = sm.Logit(data_dummy['converted'],train)

In [69]:
output = logit.fit()

         Current function value: 0.089555
         Iterations: 35




In [70]:
output_df = pd.DataFrame(dict(coeff=output.params,se=output.bse,pvalue=output.pvalues,t_Value=output.tvalues))

In [138]:
output_df[output_df['pvalue']<0.05].sort_values(by='coeff',ascending=False)

Unnamed: 0,coeff,se,pvalue,t_Value
source_friend_referral,1.177281,0.076029,4.409649e-54,15.484575
source_ads-google,0.587031,0.071997,3.53316e-16,8.153577
source_ads_facebook,0.577365,0.072771,2.121665e-15,7.934018
source_seo-bing,0.5738,0.172699,0.0008920106,3.322543
operative_system_iOS,0.407536,0.040065,2.646118e-24,10.17193
source_seo-google,0.356743,0.084992,2.700258e-05,4.197376
source_seo-yahoo,0.336044,0.120564,0.005315571,2.787261
source_seo_facebook,0.290116,0.087898,0.0009647307,3.300615
source_seo-other,0.287583,0.110455,0.009224834,2.603607
source_ads_other,0.186544,0.083446,0.02538346,2.235517


In [84]:
converted_adsbysource.reset_index(inplace=True)

In [88]:
source_count = pd.DataFrame(data.source.value_counts())

In [91]:
source_count.reset_index(inplace=True)

In [93]:
percentage_conversion = pd.merge(converted_adsbysource,source_count,left_on='source',right_on='index')

In [95]:
percentage_conversion['perc'] = percentage_conversion['converted']/percentage_conversion['source_y']

In [98]:
percentage_conversion.sort_values('perc',ascending=False)

Unnamed: 0,source_x,converted,index,source_y,perc
2,friend_referral,683,friend_referral,18011,0.037921
0,ads-google,1102,ads-google,51576,0.021367
11,seo-bing,40,seo-bing,1874,0.021345
1,ads_facebook,983,ads_facebook,46365,0.021201
5,seo-google,343,seo-google,20157,0.017016
9,seo-yahoo,99,seo-yahoo,5961,0.016608
8,seo-other,128,seo-other,8058,0.015885
6,seo_facebook,293,seo_facebook,18473,0.015861
10,ads-yahoo,96,ads-yahoo,6576,0.014599
4,ads_other,375,ads_other,26084,0.014377


## Insights: 

#### 1. Friend Referrals are the way to go, we should talk to the manager regarding what kind of efforts are being taken in this direction and possibly spend more money in this program, if its justified by the Customer Lifetime Value.

#### 2. Coefficients of ADS by Google, Facebook and Bing are almost the same, so is the percentage of conversion however we are sending out a lot of "Other" ads we would like to check the spending on those ads, also on bing ads the conversion rate is too low, we maybe need to target the users in a different way.

#### 3. We see that iOS users are more likely to click on the ad, Could we compare the positioning, placement and UI for the ad? Maybe we could redesign the Windows/Android App.

#### 4. It seems like the days dont matter, but Tuesday is significantly worse than Friday.

## Creating a tree to check for additional Rules :

In [118]:
from sklearn.tree import DecisionTreeClassifier
from graphviz import Source
from sklearn.tree import export_graphviz

In [119]:
tree = DecisionTreeClassifier(max_depth=6,class_weight='balanced',min_impurity_decrease=0.001)

In [120]:
tree.fit(train,data_dummy['converted'])

DecisionTreeClassifier(class_weight='balanced', max_depth=6,
                       min_impurity_decrease=0.001)

In [121]:
export_graphviz(tree,out_file="ex_tree.dot",feature_names=train.columns,proportion=True,rotate=True)

In [122]:
S = Source.from_file("ex_tree.dot")

In [123]:
S.view()

'ex_tree.dot.pdf'

## Analysis

### 1. We see that as we guessed "Refered by a friend" is very important here as well and as we go down that is "Refer Friend" is true the probabllity  of correct prediction is incerased to 68%, Everything becomes irrelevant as well. Thus we should foucus on this area by both of our analysis

### 2. When a person directly visits a website without a referral, we see that only 30% people converted, we need to probably check in with the manager if the UI/UX is upto the mark.

### 3. Anything after that doesnt give us much information

## We already have a couple of insights by now, I will check on PDP

In [129]:
import pdp

In [133]:
feat_original = data_dummy.columns.drop('converted')

In [None]:
pdp.pdp_

In [136]:
for i in range(len(feat_original)):
    
    plot_variable = [e for e in train.columns if e.startswith(feat_original[i])]
   
    if len(plot_variable) == 1:
        pdp_iso = pdp.pdp_isolate( model=rf, dataset=train.columns, model_features=train.columns, feature=plot_variable[0], num_grid_points=50)
        pdp_dataset = pandas.Series(pdp_iso.pdp, index=pdp_iso.feature_grids)
        #pdpbox has several options if you want to use their built-in plots. I personally prefer just using .plot. It is totally subjective obviously.
        pdp_dataset.plot(title=feat_original[i])
        plt.show()
         
    #categorical variables with several levels
    else:
        pdp_iso = pdp.pdp_isolate( model=rf, dataset=train.columns, model_features=train.columns,  feature=plot_variable, num_grid_points=50)
        pdp_dataset = pandas.Series(pdp_iso.pdp, index=pdp_iso.display_columns)
        pdp_dataset.sort_values(ascending=False).plot(kind='bar', title=feat_original[i])
        plt.show()
    plt.close()   
    
    

AttributeError: module 'pdp' has no attribute 'pdp_isolate'

In [None]:
# Matplot Lib WHeel Error while installing pdpbox, still trying to fix it 

# My Top 3 Insights

### 1. Imporve referral rewards, introduce some kind of gamification where users can collect more points together

### 2. Improve UI/UX design, Make Better Funnel for people visiting the website

### 3. Target more iOS users, talk with the manager on why Windows/Android ads are not doing better

### 4. Saves costs on "other" types of ad