In [1]:
# DataExplorer: package for exploratory data analysis.
# Useful documentation:
# https://cran.r-project.org/web/packages/DataExplorer/vignettes/dataexplorer-intro.html
#
# Install the package only when it is missing. requireNamespace() checks for
# availability without attaching anything, so library() below is the single
# place where the package is attached; it stops with an error if the
# installation did not succeed.
if (!requireNamespace("DataExplorer", quietly = TRUE)) {
  install.packages("DataExplorer")
}
library(DataExplorer)


Loading required package: DataExplorer
“there is no package called ‘DataExplorer’”also installing the dependency ‘gridExtra’

“installation of package ‘DataExplorer’ had non-zero exit status”Updating HTML index of packages in '.Library'
Making 'packages.html' ... done


ERROR: Error in library(DataExplorer): there is no package called ‘DataExplorer’


In [2]:
# dplyr: data wrangling package. Check the R Learning Guide for resources to
# quickly learn dplyr.
#
# Install the package only when it is missing. requireNamespace() checks for
# availability without attaching anything, so library() below is the single
# place where the package is attached; it stops with an error if the
# installation did not succeed.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)


Loading required package: dplyr

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



In [17]:
# Load the ad-conversion dataset. The CSV has to be downloaded and placed in
# the working directory first, because the path below is relative.
# Text columns are kept as character vectors instead of becoming factors.
data <- read.csv(
  file = "KAG_conversion_data_wrangled.csv",
  stringsAsFactors = FALSE
)


In [18]:
# Preview the first six rows to check the columns and their values.
head(data, n = 6L)

X,ad_id,campaign_id,age,gender,interest,Impressions,Clicks,Spent,Total_Conversion,Approved_Conversion,CTR,CPC,CostPerConv_Total,CostPerConv_Approved,CPM
1,708746,916,32,0,15,7350,1,1.43,2,1,0.0136,1.43,0.715,1.43,0.19
2,708749,916,32,0,16,17861,2,1.82,2,0,0.0112,0.91,0.91,1.82,0.1
3,708771,916,32,0,20,693,0,0.0,1,0,0.0,0.0,0.0,0.0,0.0
4,708815,916,32,0,28,4259,1,1.25,1,0,0.0235,1.25,1.25,1.25,0.29
5,708818,916,32,0,28,4133,1,1.29,1,1,0.0242,1.29,1.29,1.29,0.31
6,708820,916,32,0,29,1915,0,0.0,1,1,0.0,0.0,0.0,0.0,0.0


In [20]:
#Check the data dictionary provided with the code for meanings of various columns

#Columns: 
#
#1.) ad_id: unique ID for each ad. Acts as the primary key/sole identifier for that ad.
#
#2.) campaign_id: an ID associated with each ad campaign of a company.
#
#3.) age: age of the person to whom the ad is shown.
#
#4.) gender: gender of the person to whom the ad is shown.
#
#5.) interest: a code specifying the category to which the person's interest belongs (interests are as mentioned in the person's Facebook public profile).
#
#6.) Impressions: the number of times the ad was shown.
#
#7.) Clicks: number of clicks on that ad.
#
#8.) Spent: Amount paid by a company to Facebook, to show that ad.
#
#9.) Total conversion: Total number of people who enquired about the product after seeing the ad.
#
#10.) Approved conversion: Total number of people who bought the product after seeing the ad.


In [29]:
##### Creating Additional Useful Features

#1) Click-through rate (CTR). This is the percentage of impressions that became clicks.
#A high CTR is often seen as a sign of good creative being presented to a relevant audience. 
#A low click through rate is suggestive of less-than-engaging adverts (design and / or messaging) and / or 
#presentation of adverts to an inappropriate audience. What is seen as a good CTR will depend on the type of 
#advert (website banner, Google Shopping ad, search network text ad etc.) and can vary across sectors, but 2% would be a reasonable benchmark.

#2) Cost Per Click (CPC). This one is self-explanatory: how much each click cost on average.
#While it can often be seen as desirable to reduce the cost per click, the CPC needs to be considered along with
#other variables. For example, a campaign with an average CPC of £0.5 and a conversion rate (CR) of 5% is
#likely achieving more with its budget than one with a CPC of £0.2 and a CR of 1% (assuming the conversion value is the same).


#3) Cost per Conversion (CostPerConv_Total). This is the cost per 'conversion'. What a conversion is will be
#determined by the objectives of the campaign. It could be a partial sale, someone completing a contact form on a landing page,
#downloading an e-book, watching a video, or simply spending more than a particular amount of time or 
#viewing over a target number of pages on a website.

#4) Cost per Approved Conversion (CostPerConv_Approved). This is the cost per approved conversion (guaranteed sale).

#5) Cost Per Mille (CPM). This number is the cost of one thousand impressions.
#If your objective is ad exposure to increase brand awareness, this might be an important KPI for you to measure.


In [30]:
######## Analysis of the dataset through some questions


# How many ads belong to campaign (campaign_id) 1178?
# Keep only the rows of that campaign and count them.
nrow(filter(data, campaign_id == 1178))


In [31]:
# Which is the smallest and which is the largest campaign?
# One row per campaign; n_ads is the number of ad rows in that campaign.
data %>%
  group_by(campaign_id) %>%
  summarise(n_ads = n())
# Smallest - 916, Largest - 1178


campaign_id,n_ads
916,54
936,464
1178,625


In [32]:
# How many ads that never caused any approved conversion were targeted towards
# female audiences?
# The counts are reported for every gender code; the answer is read off the
# row for the female code.
data %>%
  filter(Approved_Conversion == 0) %>%
  group_by(gender) %>%
  summarise(count = n())
# 281 (the count for gender == 1; assumes 1 encodes female - TODO confirm
# against the data dictionary)

gender,count
0,278
1,281


In [33]:
# What percentage of all ads (one row per ad_id) never caused any approved
# conversion?
# dplyr version: rows with zero approved conversions over all rows, times 100.
(nrow(filter(data, Approved_Conversion == 0)) / nrow(data)) * 100
# Traditional base R subsetting - also works.
(nrow(data[data$Approved_Conversion == 0, ]) / nrow(data)) * 100
# 48.9064%

In [34]:
# Which campaign had the best brand-awareness result, measured by the average
# number of Impressions per ad?
data %>%
  group_by(campaign_id) %>%
  summarise(
    n_ads = n(),
    campaign_Impr = mean(Impressions)
  )
# 1178 - mean Impressions of 327718 per ad


campaign_id,n_ads,campaign_Impr
916,54,8943.056
936,464,17517.644
1178,625,327717.946


In [35]:
# Which campaign spent most efficiently on brand awareness on average
# (lowest cost per mille)?
# Campaigns are sorted by mean CPM in ascending order, so the answer is the
# first row.
data %>%
  group_by(campaign_id) %>%
  summarise(
    n_ads = n(),
    campaign_CPM = mean(CPM)
  ) %>%
  arrange(campaign_CPM)
# 936 - 0.224 CPM

campaign_id,n_ads,campaign_CPM
936,464,0.2240948
916,54,0.2405556
1178,625,0.250336


In [36]:
# Which ad was the most successful wrt approved conversions?
# Keep every row whose Approved_Conversion equals the column maximum, so tied
# ads would all be shown.
data %>%
  filter(Approved_Conversion == max(Approved_Conversion))
# 1121104 - Approved Conversion: 21 (its Total_Conversion is 40)

# Alternative: sort by approved conversions in descending order and read the
# first row. Only the top rows are printed, because the winner is the first
# one and the rest of the table adds nothing to the answer.
data %>%
  arrange(desc(Approved_Conversion)) %>%
  head(10)

X,ad_id,campaign_id,age,gender,interest,Impressions,Clicks,Spent,Total_Conversion,Approved_Conversion,CTR,CPC,CostPerConv_Total,CostPerConv_Approved,CPM
529,1121104,1178,32,0,16,2080666,202,360.15,40,21,0.0097,1.7829,9.0038,17.15,0.17


X,ad_id,campaign_id,age,gender,interest,Impressions,Clicks,Spent,Total_Conversion,Approved_Conversion,CTR,CPC,CostPerConv_Total,CostPerConv_Approved,CPM
529,1121104,1178,32,0,16,2080666,202,360.15,40,21,0.0097,1.7829,9.0038,17.1500,0.17
526,1121100,1178,32,0,15,3052003,340,639.95,60,17,0.0111,1.8822,10.6658,37.6441,0.21
519,1121091,1178,32,0,10,1194718,141,254.05,28,14,0.0118,1.8018,9.0732,18.1464,0.21
532,1121108,1178,32,0,16,984521,95,163.90,26,14,0.0096,1.7253,6.3038,11.7071,0.17
861,1121814,1178,32,1,27,2223278,421,612.30,38,13,0.0189,1.4544,16.1132,47.1000,0.28
580,1121206,1178,32,0,29,227925,22,35.31,22,12,0.0097,1.6050,1.6050,2.9425,0.15
525,1121098,1178,32,0,15,1267550,123,236.77,24,10,0.0097,1.9250,9.8654,23.6770,0.19
808,1121677,1178,47,0,29,1391924,258,422.84,17,10,0.0185,1.6389,24.8729,42.2840,0.30
868,1121824,1178,32,1,29,1705246,295,429.48,23,10,0.0173,1.4559,18.6730,42.9480,0.25
1128,1314398,1178,42,1,112,1137635,211,301.05,30,10,0.0185,1.4268,10.0350,30.1050,0.26


In [37]:
# How many ads that did not spend any money still led to approved conversions?
# Comma-separated conditions inside filter() must all hold for a row.
nrow(filter(data, Spent == 0, Approved_Conversion != 0))
# 71

In [38]:
# Which campaign ran most of the cost-effective ads, i.e. ads with zero spend
# and a non-zero number of approved conversions?
# Campaigns are sorted by the number of such ads, largest first.
data %>%
  filter(Spent == 0, Approved_Conversion != 0) %>%
  group_by(campaign_id) %>%
  summarise(n_ads = n()) %>%
  arrange(desc(n_ads))
# 936 - 60 ads

campaign_id,n_ads
936,60
916,8
1178,3


In [39]:
# Which ad generated the fewest impressions?
# Shows the full row(s) whose Impressions equal the column minimum.
filter(data, Impressions == min(Impressions))
# 951641

X,ad_id,campaign_id,age,gender,interest,Impressions,Clicks,Spent,Total_Conversion,Approved_Conversion,CTR,CPC,CostPerConv_Total,CostPerConv_Approved,CPM
501,951641,936,42,1,2,87,0,0,1,1,0,0,0,0,0


In [40]:
# Ad with the fewest impressions, keeping only the ad_id column:
data %>%
  filter(Impressions == min(Impressions)) %>%
  select(ad_id)


ad_id
951641


In [41]:
# Among the ads with the lowest CPC, which one led to the fewest impressions?
# The two filters are deliberately sequential: the second min() is taken over
# the lowest-CPC subset only, not over the whole dataset.
data %>%
  filter(CPC == min(CPC)) %>%
  filter(Impressions == min(Impressions)) %>%
  select(ad_id)
# 951641

ad_id
951641


In [136]:
# from HW3

# How many unique interests does the dataset cover?
# Reduce the interest column to its distinct values and count them.
data$interest %>%
  unique() %>%
  length()


In [137]:
# Which gender was targeted by the ad with the most Impressions among all ads?
# Shows the full row(s) with the maximum Impressions; read the gender column.
filter(data, Impressions == max(Impressions))


ad_id,campaign_id,age,gender,interest,Impressions,Clicks,Spent,Total_Conversion,Approved_Conversion,CTR,CPC,CostPerConv_Total,CostPerConv_Approved,CPM
1121100,1178,32,0,15,3052003,340,639.95,60,17,0.0111,1.8822,10.6658,37.6441,0.21


In [138]:
# Largest Impressions value across all ads.
max(data[["Impressions"]])

In [139]:
# How many Impressions does the ad with the most Approved Conversions have?
# Shows the full row(s) with the maximum Approved_Conversion; read the
# Impressions column.
filter(data, Approved_Conversion == max(Approved_Conversion))


ad_id,campaign_id,age,gender,interest,Impressions,Clicks,Spent,Total_Conversion,Approved_Conversion,CTR,CPC,CostPerConv_Total,CostPerConv_Approved,CPM
1121104,1178,32,0,16,2080666,202,360.15,40,21,0.0097,1.7829,9.0038,17.15,0.17


In [140]:
# Which ad (provide ad_id as the answer) among the ads with the lowest CPC led
# to the most Impressions?
# The filters are sequential on purpose: max() is evaluated over the
# lowest-CPC subset only, not over the whole dataset.
data %>%
  filter(CPC == min(CPC)) %>%
  filter(Impressions == max(Impressions))


ad_id,campaign_id,age,gender,interest,Impressions,Clicks,Spent,Total_Conversion,Approved_Conversion,CTR,CPC,CostPerConv_Total,CostPerConv_Approved,CPM
1121094,1178,32,0,10,24362,0,0,1,1,0,0,0,0,0


In [141]:
# Which campaign (provide campaign_id as the answer) spent least efficiently on
# brand awareness on average (i.e. highest cost per mille, CPM)?
# The table is sorted by mean CPM in ascending order, so the answer is the
# LAST row, not the first.
data %>%
  group_by(campaign_id) %>%
  summarise(
    n_ads = n(),
    campaign_CPM = mean(CPM)
  ) %>%
  arrange(campaign_CPM)
# 1178 - 0.250 CPM


campaign_id,n_ads,campaign_CPM
936,464,0.2240948
916,54,0.2405556
1178,625,0.250336


In [142]:
# How many unique ads in campaign 1178 that never caused any approved
# conversion were targeted towards the age of 32?
# Uniqueness is decided on ad_id rather than on whole rows, so rows that
# repeat an ad but differ in some other column are not counted twice.
data %>%
  filter(campaign_id == 1178, Approved_Conversion == 0, age == 32) %>%
  distinct(ad_id) %>%
  nrow()
