# EDA for promoted_content.csv

In [126]:
from pyspark.sql.types import *
import pyspark.sql.functions as F
import pandas as pd
import seaborn as sns

In [127]:
# bucket 설정

mangodm_bucket = "gs://upload-bigquery180927/"

In [128]:
# promoted_contetn 스키마 설정

promoted_content_schema = StructType(
                    [StructField("ad_id", IntegerType(), True),
                    StructField("document_id", IntegerType(), True),                    
                    StructField("campaign_id", IntegerType(), True),
                    StructField("advertiser_id", IntegerType(), True)]
                    )

# google storage에 저장된 promoted_content.csv 파일 로드

promoted_content_df = spark.read.schema(promoted_content_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(mangodm_bucket + "promoted_content.csv")

In [129]:
# spark의 dataframe을 pandas의 dataframe으로 변환

promoted = promoted_content_df.toPandas()

In [130]:
promoted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 559583 entries, 0 to 559582
Data columns (total 4 columns):
ad_id            559583 non-null int64
document_id      559583 non-null int64
campaign_id      559583 non-null int64
advertiser_id    559583 non-null int64
dtypes: int64(4)
memory usage: 17.1 MB


## advertiser_id

In [131]:
# advertiser_id로 groupby해서 campaign_id를 count

grouped_advertiser = promoted.groupby('advertiser_id').agg({'campaign_id' : 'count'}).reset_index()
grouped_advertiser.head()

Unnamed: 0,advertiser_id,campaign_id
0,2,2
1,3,12
2,4,168
3,5,45
4,6,15


In [132]:
# 광고주별 캠페인의 기초통계량.
# 광고주는 평균적으로 127개의 캠페인을 갖고 있고, 중위값은 8개이다.

grouped_advertiser.columns = ['advertiser_id', 'the_number_of_campaign'] # 변수명 재정의
grouped_advertiser['the_number_of_campaign'].describe().astype(int)

count     4385
mean       127
std        683
min          1
25%          2
50%          8
75%         36
max      16529
Name: the_number_of_campaign, dtype: int64

In [133]:
# 다시 advertiser_id로 groupby
# 즉, 캠페인을 1개만 갖고 있는 광고주는 704명이다.

re_grouped_advertiser = grouped_advertiser.groupby('the_number_of_campaign').agg({'advertiser_id' : 'count'}).reset_index()
re_grouped_advertiser.columns = ['the_number_of_campaign', 'the_number_of_advertiser']

print(re_grouped_advertiser.head())
print(re_grouped_advertiser.tail())

   the_number_of_campaign  the_number_of_advertiser
0                       1                       704
1                       2                       405
2                       3                       315
3                       4                       245
4                       5                       211
     the_number_of_campaign  the_number_of_advertiser
510                   10046                         1
511                   10552                         1
512                   14844                         1
513                   15450                         1
514                   16529                         1


In [134]:
# 시각화

#ax = sns.boxplot(grouped_advertiser['the_number_of_campaign'])
#ax = sns.distplot(re_grouped_advertiser, kde = True)
#ax.set_yscale('log')

## campaign_id

In [135]:
promoted.head()

Unnamed: 0,ad_id,document_id,campaign_id,advertiser_id
0,1,6614,1,7
1,2,471467,2,7
2,3,7692,3,7
3,4,471471,2,7
4,5,471472,2,7


In [136]:
# campaign_id로 groupby해서 document_id를 count

grouped_campaign = promoted.groupby('campaign_id').agg({'document_id' : 'count'}).reset_index()
grouped_campaign.head()

Unnamed: 0,campaign_id,document_id
0,1,31
1,2,57
2,3,2
3,4,267
4,5,16


In [137]:
# 캠페인별 랜딩페이지의 기초통계량
# 캠페인에는 평균적으로 16개의 랜딩페이지가 담겨 있고 중위값은 5개이다.

grouped_campaign.columns = ['campaign_id', 'the_number_of_document'] # 변수명 재정의
grouped_campaign['the_number_of_document'].describe().astype(int)

count    34675
mean        16
std         70
min          1
25%          2
50%          5
75%         12
max       3806
Name: the_number_of_document, dtype: int64