In [69]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score



In [70]:
df = pd.read_csv('../data/digital_marketing_campaign_dataset.csv')
df.head()

Unnamed: 0,CustomerID,Age,Gender,Income,CampaignChannel,CampaignType,AdSpend,ClickThroughRate,ConversionRate,WebsiteVisits,PagesPerVisit,TimeOnSite,SocialShares,EmailOpens,EmailClicks,PreviousPurchases,LoyaltyPoints,AdvertisingPlatform,AdvertisingTool,Conversion
0,8000,56,Female,136912,Social Media,Awareness,6497.870068,0.043919,0.088031,0,2.399017,7.396803,19,6,9,4,688,IsConfid,ToolConfid,1
1,8001,69,Male,41760,Email,Retention,3898.668606,0.155725,0.182725,42,2.917138,5.352549,5,2,7,2,3459,IsConfid,ToolConfid,1
2,8002,46,Female,88456,PPC,Awareness,1546.429596,0.27749,0.076423,2,8.223619,13.794901,0,11,2,8,2337,IsConfid,ToolConfid,1
3,8003,32,Female,44085,PPC,Conversion,539.525936,0.137611,0.088004,47,4.540939,14.688363,89,2,2,0,2463,IsConfid,ToolConfid,1
4,8004,60,Female,83964,PPC,Conversion,1678.043573,0.252851,0.10994,0,2.046847,13.99337,6,6,6,8,4345,IsConfid,ToolConfid,1


In [71]:
df.columns

Index(['CustomerID', 'Age', 'Gender', 'Income', 'CampaignChannel',
       'CampaignType', 'AdSpend', 'ClickThroughRate', 'ConversionRate',
       'WebsiteVisits', 'PagesPerVisit', 'TimeOnSite', 'SocialShares',
       'EmailOpens', 'EmailClicks', 'PreviousPurchases', 'LoyaltyPoints',
       'AdvertisingPlatform', 'AdvertisingTool', 'Conversion'],
      dtype='object')

In [72]:
df.Conversion.value_counts()

Conversion
1    7012
0     988
Name: count, dtype: int64

In [73]:
df.PreviousPurchases.value_counts()

PreviousPurchases
0    838
3    822
9    819
6    818
4    797
8    796
1    794
5    779
2    773
7    764
Name: count, dtype: int64

In [74]:
df['CustomerLifetimeValue'] = df.PreviousPurchases * np.random.randint(10000, 15001, size=len(df))

df['RevenuePerConversion'] = np.random.randint(10000, 15001, size=len(df))
df['Revenue'] = df['Conversion'] * df['RevenuePerConversion']
df['ROI'] = (df['Revenue'] - df['AdSpend']) / df['AdSpend'] * 100
df.head()

Unnamed: 0,CustomerID,Age,Gender,Income,CampaignChannel,CampaignType,AdSpend,ClickThroughRate,ConversionRate,WebsiteVisits,...,EmailClicks,PreviousPurchases,LoyaltyPoints,AdvertisingPlatform,AdvertisingTool,Conversion,CustomerLifetimeValue,RevenuePerConversion,Revenue,ROI
0,8000,56,Female,136912,Social Media,Awareness,6497.870068,0.043919,0.088031,0,...,9,4,688,IsConfid,ToolConfid,1,57040,14671,14671,125.781677
1,8001,69,Male,41760,Email,Retention,3898.668606,0.155725,0.182725,42,...,7,2,3459,IsConfid,ToolConfid,1,25404,10587,10587,171.554243
2,8002,46,Female,88456,PPC,Awareness,1546.429596,0.27749,0.076423,2,...,2,8,2337,IsConfid,ToolConfid,1,94376,11044,11044,614.16119
3,8003,32,Female,44085,PPC,Conversion,539.525936,0.137611,0.088004,47,...,2,0,2463,IsConfid,ToolConfid,1,0,10316,10316,1812.048952
4,8004,60,Female,83964,PPC,Conversion,1678.043573,0.252851,0.10994,0,...,6,8,4345,IsConfid,ToolConfid,1,83920,10739,10739,539.971463


In [75]:
df.drop(['AdvertisingPlatform', 'AdvertisingTool'], inplace = True, axis = 1)

In [76]:
df.head()

Unnamed: 0,CustomerID,Age,Gender,Income,CampaignChannel,CampaignType,AdSpend,ClickThroughRate,ConversionRate,WebsiteVisits,...,SocialShares,EmailOpens,EmailClicks,PreviousPurchases,LoyaltyPoints,Conversion,CustomerLifetimeValue,RevenuePerConversion,Revenue,ROI
0,8000,56,Female,136912,Social Media,Awareness,6497.870068,0.043919,0.088031,0,...,19,6,9,4,688,1,57040,14671,14671,125.781677
1,8001,69,Male,41760,Email,Retention,3898.668606,0.155725,0.182725,42,...,5,2,7,2,3459,1,25404,10587,10587,171.554243
2,8002,46,Female,88456,PPC,Awareness,1546.429596,0.27749,0.076423,2,...,0,11,2,8,2337,1,94376,11044,11044,614.16119
3,8003,32,Female,44085,PPC,Conversion,539.525936,0.137611,0.088004,47,...,89,2,2,0,2463,1,0,10316,10316,1812.048952
4,8004,60,Female,83964,PPC,Conversion,1678.043573,0.252851,0.10994,0,...,6,6,6,8,4345,1,83920,10739,10739,539.971463


In [77]:
df.ROI.describe()

count     8000.000000
mean       378.595100
std        953.771289
min       -100.000000
25%         45.698237
50%        113.224892
75%        309.553195
max      13858.661806
Name: ROI, dtype: float64

In [78]:
df.CampaignChannel.value_counts()

CampaignChannel
Referral        1719
PPC             1655
Email           1557
SEO             1550
Social Media    1519
Name: count, dtype: int64

In [79]:
#X = df[['Age', 'Gender', 'Income', 'CampaignChannel', 'CampaignType', 'CustomerLifetimeValue']]
X = df[['CampaignChannel', 'CampaignType']]
y = df['ROI']

In [80]:
X = pd.get_dummies(X)

In [81]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
scale = StandardScaler()
X_train = scale.fit_transform(X_train)
X_test = scale.transform(X_test)

In [82]:
rf = RandomForestRegressor(n_estimators = 100, random_state = 42)
rf.fit(X_train, y_train)


In [83]:
rf.score(X_train, y_train)

0.004923525094186809

In [84]:
importances = rf.feature_importances_
feature_names = X.columns
feature_importances = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)
print(feature_importances)

                        Feature  Importance
7       CampaignType_Conversion    0.391584
0         CampaignChannel_Email    0.098230
1           CampaignChannel_PPC    0.092473
4  CampaignChannel_Social Media    0.089890
3           CampaignChannel_SEO    0.084540
6    CampaignType_Consideration    0.071288
2      CampaignChannel_Referral    0.063553
8        CampaignType_Retention    0.058331
5        CampaignType_Awareness    0.050110
