# EDA for Train set and Validation set

In [1]:
from pyspark.sql.types import *
from pyspark.sql.functions import isnan, when, count, col
import pyspark.sql.functions as f
from pyspark.sql.window import Window
import pandas as pd

In [2]:
# Load the input dataset from Parquet on GCS.
# NOTE(review): `spark` is not created anywhere in the visible notebook — presumably the kernel injects the session; confirm.

train_valid_merged_df = spark.read.parquet("gs://cap-18/output/train_subset_final")

In [3]:
# Cache the DataFrame for repeated operations and faster processing.

df = train_valid_merged_df.cache()

### 데이터 구조 확인

In [4]:
# Inspect the column names.

df.columns

['display_id',
 'ad_id',
 'label',
 'doc_id',
 'is_leak',
 'event_weekend',
 'user_views',
 'ad_views',
 'doc_views',
 'doc_event_hour',
 'pop_ad_id',
 'pop_ad_id_conf',
 'pop_ad_id_conf_multipl',
 'pop_document_id',
 'pop_document_id_conf',
 'pop_document_id_conf_multipl',
 'pop_publisher_id',
 'pop_publisher_id_conf',
 'pop_publisher_id_conf_multipl',
 'pop_advertiser_id',
 'pop_advertiser_id_conf',
 'pop_advertiser_id_conf_multipl',
 'pop_campaign_id',
 'pop_campaign_id_conf',
 'pop_campaign_id_conf_multipl',
 'pop_doc_event_doc_ad',
 'pop_doc_event_doc_ad_conf',
 'pop_doc_event_doc_ad_conf_multipl',
 'pop_source_id',
 'pop_source_id_conf',
 'pop_source_id_conf_multipl',
 'pop_source_id_country',
 'pop_source_id_country_conf',
 'pop_source_id_country_conf_multipl',
 'pop_topic_id',
 'pop_topic_id_conf',
 'pop_topic_id_conf_multipl',
 'pop_category_id',
 'pop_category_id_conf',
 'pop_category_id_conf_multipl',
 'pop_category_id_country',
 'pop_category_id_country_conf',
 'pop_categor

In [6]:
# Check the number of columns.

len(df.columns)

70

총 70개의 컬럼이 있다.

In [7]:
# Check the number of rows.

df.count()

87141731

87,141,731개의 row가 있다.

In [8]:
# Adjust the pandas display option so that up to 100 columns are shown.

pd.set_option('display.max_columns', 100)

### 변수별 결측치 개수 확인

In [9]:
# Count missing values per column.
# isnan() only accepts float/double input (Spark rejects it for e.g. timestamp or
# boolean columns), so NaN is tested only on those columns; NULL is tested on all.

float_columns = {name for name, dtype in df.dtypes if dtype in ('float', 'double')}
df_null = df.select([
    count(
        when((isnan(c) | col(c).isNull()) if c in float_columns else col(c).isNull(), c)
    ).alias(c)
    for c in df.columns
])  # one row holding the missing-value count of every column
df_null_pandas = df_null.toPandas()  # convert to a pandas DataFrame for easier viewing
df_null_pandas.transpose()

Unnamed: 0,0
display_id,0
ad_id,0
label,0
doc_id,0
is_leak,0
event_weekend,0
user_views,55311626
ad_views,33833186
doc_views,52700407
doc_event_hour,0


## 함수 정의

1. 기초통계량 및 중앙값 계산을 위한 함수 정의

In [19]:
def describe_median(variable, relative_error=0.0):
  """Print summary statistics and the median of one column of the global ``df``.

  Args:
    variable: Name of the column to summarise.
    relative_error: Relative error handed to ``DataFrame.approxQuantile``.
      The default 0.0 keeps the original behaviour (exact median), which can
      be expensive on large data; pass a small positive value such as 0.001
      to trade accuracy for speed.

  Returns:
    None. The results are printed.
  """
  print("Statistics for {}".format(variable))
  df.describe(variable).show()  # count, mean, stddev, min, max
  # approxQuantile returns a list with one entry per requested probability.
  variable_median = df.approxQuantile(variable, [0.5], relative_error)
  print("The median of this variable is {}".format(variable_median))

2. 빈도 계산을 위한 함수 정의

In [28]:
def count_freq(variable):
  """Show every distinct value of ``variable`` in the global ``df`` with its count and share.

  The share is ``count`` divided by the sum of all counts, cast to
  DecimalType(30, 10); rows are displayed in descending order of share.
  """
  # An empty partitionBy() makes the window span all grouped rows, so the
  # windowed sum is the grand total of the counts.
  whole_table = Window.partitionBy()
  share = (f.col('count') / f.sum('count').over(whole_table)).cast(DecimalType(30, 10))
  counts_with_share = df.groupBy(variable).count().withColumn('percent', share)
  counts_with_share.orderBy('percent', ascending=False).show()

3. CTR 기초통계량 계산을 위한 함수 정의

In [23]:
# Summary statistics for a CTR column, its confidence column, and the column
# holding CTR multiplied by confidence.

def _print_grouped_mean_stats(key, column, label):
  """Print row count, summary statistics and median of the per-``key`` mean of ``column``."""
  subset = df.select([key, column]).dropna()  # keep only rows where both values are present
  print("The number of rows is {:,d}".format(subset.count()))
  mean_column = 'avg(' + column + ')'  # column name produced by the dict-style 'mean' aggregate
  grouped = subset.groupBy(key).agg({column: 'mean'})
  grouped.describe(mean_column).show(30, False)
  median = grouped.approxQuantile(mean_column, [0.5], 0.0)  # relative error 0.0
  print("The median of {} is {}".format(label, median))


def grouped_stats(key, base, conf, multipl):
  """Print grouped summary statistics for three related columns of the global ``df``.

  For each of ``base``, ``conf`` and ``multipl`` the rows with a missing
  ``key`` or value are dropped, the column is averaged per ``key``, and the
  row count, ``describe`` output and median of those averages are printed.

  Args:
    key: Name of the column to group by.
    base: Name of the CTR column.
    conf: Name of the CTR confidence column.
    multipl: Name of the column holding CTR multiplied by confidence.

  Returns:
    None. The results are printed.
  """
  print('1. CTR by {}'.format(key))
  _print_grouped_mean_stats(key, base, 'CTR')

  print('')
  print('2. CTR confidence by {}'.format(key))
  _print_grouped_mean_stats(key, conf, 'CTR confidence')

  print('')
  print('3. CTR multiplied by confidence')
  _print_grouped_mean_stats(key, multipl, 'CTR multiplied by confidence')

## 변수별 EDA

### user_views

In [29]:
# Summary statistics and median of user_views.
describe_median('user_views')

Statistics for user_views
+-------+------------------+
|summary|        user_views|
+-------+------------------+
|  count|          31830105|
|   mean| 15.42115962231353|
| stddev|24.242983550755422|
|    min|                 1|
|    max|               660|
+-------+------------------+

The median of this variable is [7.0]


### traffic_source

In [26]:
# Count and share of each traffic_source value.
# NOTE(review): the saved output shows 4-decimal percentages, unlike the DecimalType(30, 10)
# cast in count_freq — presumably it came from an earlier version of the function; re-run to refresh.
count_freq('traffic_source')

+--------------+--------+-------+
|traffic_source|   count|percent|
+--------------+--------+-------+
|             1|64541983| 0.7407|
|             2|11304279| 0.1297|
|             3|11100006| 0.1274|
|          null|  195463| 0.0022|
+--------------+--------+-------+



### event_platform

In [27]:
# Count and share of each event_platform value.
# NOTE(review): the saved output shows 4-decimal percentages, unlike the DecimalType(30, 10)
# cast in count_freq — presumably it came from an earlier version of the function; re-run to refresh.
count_freq('event_platform')

+--------------+--------+-------+
|event_platform|   count|percent|
+--------------+--------+-------+
|             1|37519782| 0.4306|
|             2|36836541| 0.4227|
|             3|12785378| 0.1467|
|          null|      30| 0.0000|
+--------------+--------+-------+



### event_country

In [30]:
# Count and share of each event_country value (show() prints only the top 20 rows).
count_freq('event_country')

+-------------+--------+------------+
|event_country|   count|     percent|
+-------------+--------+------------+
|           US|69467046|0.7971731248|
|           CA| 4825996|0.0553809977|
|           GB| 4389876|0.0503762772|
|           AU| 1909614|0.0219138865|
|           IN|  904682|0.0103817309|
|           ZA|  435104|0.0049930612|
|           NZ|  431388|0.0049504181|
|           DE|  324978|0.0037293039|
|           PH|  323204|0.0037089463|
|           SG|  294342|0.0033777387|
|           MY|  200615|0.0023021691|
|           NL|  200305|0.0022986117|
|           NG|  153241|0.0017585260|
|           FR|  150387|0.0017257748|
|           IE|  149778|0.0017187861|
|           SE|  149531|0.0017159517|
|           MX|  133416|0.0015310231|
|           IT|  110850|0.0012720656|
|           AE|   99640|0.0011434246|
|           KE|   97972|0.0011242834|
+-------------+--------+------------+
only showing top 20 rows



### event_hour

In [31]:
# Count and share of each event_hour value.
count_freq('event_hour')

+----------+--------+------------+
|event_hour|   count|     percent|
+----------+--------+------------+
|         6|26693449|0.3063222258|
|         4|23853300|0.2737299308|
|         3|13769120|0.1580083370|
|         5|13632321|0.1564384921|
|         2| 5655401|0.0648988829|
|         1| 3538140|0.0406021313|
+----------+--------+------------+



### event_weekend

In [32]:
# Count and share of each event_weekend value.
count_freq('event_weekend')

+-------------+--------+------------+
|event_weekend|   count|     percent|
+-------------+--------+------------+
|            0|63515114|0.7288713831|
|            1|23626617|0.2711286169|
+-------------+--------+------------+



### doc_views

In [33]:
# Summary statistics and median of doc_views.
describe_median('doc_views')

Statistics for doc_views
+-------+-----------------+
|summary|        doc_views|
+-------+-----------------+
|  count|         34441324|
|   mean|3546.584138577251|
| stddev| 2915.35179302884|
|    min|                6|
|    max|             9996|
+-------+-----------------+

The median of this variable is [2857.0]


### pop_document_id

In [45]:
# Statistics of the per-ad_doc_id means of pop_document_id, its confidence, and the multiplied column.
grouped_stats('ad_doc_id', 'pop_document_id', 'pop_document_id_conf', 'pop_document_id_conf_multipl')

1. CTR by ad_doc_id
The number of rows is 86,574,723
+-------+--------------------+
|summary|avg(pop_document_id)|
+-------+--------------------+
|count  |74766               |
|mean   |0.15048819150738502 |
|stddev |0.12099630630684827 |
|min    |0.0                 |
|max    |1.0                 |
+-------+--------------------+

The median of CTR is [0.1317799985408783]

2. CTR confidence by ad_doc_id
The number of rows is 86,574,723
+-------+-------------------------+
|summary|avg(pop_document_id_conf)|
+-------+-------------------------+
|count  |74766                    |
|mean   |0.20448861286514278      |
|stddev |0.09744845066179102      |
|min    |0.041731998324394226     |
|max    |0.6821900010108948       |
+-------+-------------------------+

The median of CTR confidence is [0.17726999521255493]

3. CTR multiplied by confidence
The number of rows is 86,574,723
+-------+---------------------------------+
|summary|avg(pop_document_id_conf_multipl)|
+-------+------------------

### pop_ad_id

In [36]:
# Statistics of the per-ad_id means of pop_ad_id, its confidence, and the multiplied column.
grouped_stats('ad_id', 'pop_ad_id', 'pop_ad_id_conf', 'pop_ad_id_conf_multipl')

1. CTR by ad_id
The number of rows is 85,640,218
+-------+-------------------+
|summary|avg(pop_ad_id)     |
+-------+-------------------+
|count  |192107             |
|mean   |0.15528313130606444|
|stddev |0.13208383154722939|
|min    |0.0                |
|max    |1.0                |
+-------+-------------------+

The median of CTR is [0.13333000242710114]

2. CTR confidence by ad_id
The number of rows is 85,640,218
+-------+-------------------+
|summary|avg(pop_ad_id_conf)|
+-------+-------------------+
|count  |192107             |
|mean   |0.20855778710115186|
|stddev |0.09364983201711577|
|min    |0.11715999990701675|
|max    |1.0                |
+-------+-------------------+

The median of CTR confidence is [0.17726999521255493]

3. CTR multiplied by confidence
The number of rows is 85,640,218
+-------+---------------------------+
|summary|avg(pop_ad_id_conf_multipl)|
+-------+---------------------------+
|count  |192107                     |
|mean   |0.03370003487878561     

### pop_category_id

In [37]:
# Statistics of the per-ad_category_id means of pop_category_id, its confidence, and the multiplied column.
grouped_stats('ad_category_id', 'pop_category_id', 'pop_category_id_conf', 'pop_category_id_conf_multipl')

1. CTR by ad_category_id
The number of rows is 87,061,223
+-------+--------------------+
|summary|avg(pop_category_id)|
+-------+--------------------+
|count  |94                  |
|mean   |0.1900896306979002  |
|stddev |0.04358999906188991 |
|min    |0.11436964897595608 |
|max    |0.36764654483836684 |
+-------+--------------------+

The median of CTR is [0.19596156674394571]

2. CTR confidence by ad_category_id
The number of rows is 87,061,223
+-------+-------------------------+
|summary|avg(pop_category_id_conf)|
+-------+-------------------------+
|count  |94                       |
|mean   |0.095192389123402        |
|stddev |0.046730482678639586     |
|min    |0.01335848864799538      |
|max    |0.24068757745084507      |
+-------+-------------------------+

The median of CTR confidence is [0.08845278745405359]

3. CTR multiplied by confidence
The number of rows is 87,061,223
+-------+---------------------------------+
|summary|avg(pop_category_id_conf_multipl)|
+-------+-------

### pop_advertiser_id

In [38]:
# Statistics of the per-advertiser_id means of pop_advertiser_id, its confidence, and the multiplied column.
grouped_stats('advertiser_id', 'pop_advertiser_id', 'pop_advertiser_id_conf', 'pop_advertiser_id_conf_multipl')

1. CTR by advertiser_id
The number of rows is 87,103,542
+-------+----------------------+
|summary|avg(pop_advertiser_id)|
+-------+----------------------+
|count  |3620                  |
|mean   |0.1904422268724244    |
|stddev |0.1050305087575542    |
|min    |0.0                   |
|max    |0.7111200094223022    |
+-------+----------------------+

The median of CTR is [0.16899999976158142]

2. CTR confidence by advertiser_id
The number of rows is 87,103,542
+-------+---------------------------+
|summary|avg(pop_advertiser_id_conf)|
+-------+---------------------------+
|count  |3620                       |
|mean   |0.29160332148740303        |
|stddev |0.10601754957214017        |
|min    |0.04977099969983101        |
|max    |0.6821900010108948         |
+-------+---------------------------+

The median of CTR confidence is [0.2899700105190277]

3. CTR multiplied by confidence
The number of rows is 87,103,542
+-------+-----------------------------------+
|summary|avg(pop_advertis

### pop_category_id

In [39]:
# NOTE(review): this repeats the earlier pop_category_id cell with identical arguments — likely redundant; confirm before removing.
grouped_stats('ad_category_id', 'pop_category_id', 'pop_category_id_conf', 'pop_category_id_conf_multipl')

1. CTR by ad_category_id
The number of rows is 87,061,223
+-------+--------------------+
|summary|avg(pop_category_id)|
+-------+--------------------+
|count  |94                  |
|mean   |0.19008963069790016 |
|stddev |0.04358999906188991 |
|min    |0.11436964897595608 |
|max    |0.36764654483836684 |
+-------+--------------------+

The median of CTR is [0.19596156674394571]

2. CTR confidence by ad_category_id
The number of rows is 87,061,223
+-------+-------------------------+
|summary|avg(pop_category_id_conf)|
+-------+-------------------------+
|count  |94                       |
|mean   |0.09519238912340194      |
|stddev |0.04673048267863959      |
|min    |0.01335848864799538      |
|max    |0.24068757745084507      |
+-------+-------------------------+

The median of CTR confidence is [0.08845278745405359]

3. CTR multiplied by confidence
The number of rows is 87,061,223
+-------+---------------------------------+
|summary|avg(pop_category_id_conf_multipl)|
+-------+-------

### pop_topic_id

In [40]:
# Statistics of the per-ad_topic_id means of pop_topic_id, its confidence, and the multiplied column.
grouped_stats('ad_topic_id', 'pop_topic_id', 'pop_topic_id_conf', 'pop_topic_id_conf_multipl')

1. CTR by ad_topic_id
The number of rows is 86,254,382
+-------+--------------------+
|summary|avg(pop_topic_id)   |
+-------+--------------------+
|count  |300                 |
|mean   |0.20290310014369278 |
|stddev |0.041006311525321235|
|min    |0.09531529571612084 |
|max    |0.41986692493014743 |
+-------+--------------------+

The median of CTR is [0.2021033303788934]

2. CTR confidence by ad_topic_id
The number of rows is 86,254,382
+-------+----------------------+
|summary|avg(pop_topic_id_conf)|
+-------+----------------------+
|count  |300                   |
|mean   |0.006110534430288354  |
|stddev |0.0051224054120583016 |
|min    |1.4904477700033173E-4 |
|max    |0.02860257227891189   |
+-------+----------------------+

The median of CTR confidence is [0.004612766720973637]

3. CTR multiplied by confidence
The number of rows is 86,254,382
+-------+------------------------------+
|summary|avg(pop_topic_id_conf_multipl)|
+-------+------------------------------+
|count  |300  

### pop_campaign_id

In [41]:
# Statistics of the per-campaign_id means of pop_campaign_id, its confidence, and the multiplied column.
grouped_stats('campaign_id', 'pop_campaign_id', 'pop_campaign_id_conf', 'pop_campaign_id_conf_multipl')

1. CTR by campaign_id
The number of rows is 86,927,337
+-------+--------------------+
|summary|avg(pop_campaign_id)|
+-------+--------------------+
|count  |25270               |
|mean   |0.1723316806173752  |
|stddev |0.11288109336786849 |
|min    |0.0                 |
|max    |0.916670024394989   |
+-------+--------------------+

The median of CTR is [0.15306000411510468]

2. CTR confidence by campaign_id
The number of rows is 86,927,337
+-------+-------------------------+
|summary|avg(pop_campaign_id_conf)|
+-------+-------------------------+
|count  |25270                    |
|mean   |0.23342127954203312      |
|stddev |0.1196821580233413       |
|min    |0.041731998324394226     |
|max    |1.0                      |
+-------+-------------------------+

The median of CTR confidence is [0.20476999878883362]

3. CTR multiplied by confidence
The number of rows is 86,927,337
+-------+---------------------------------+
|summary|avg(pop_campaign_id_conf_multipl)|
+-------+-------------

### pop_source_id

In [42]:
# Statistics of the per-ad_source_id means of pop_source_id, its confidence, and the multiplied column.
grouped_stats('ad_source_id', 'pop_source_id', 'pop_source_id_conf', 'pop_source_id_conf_multipl')

1. CTR by ad_source_id
The number of rows is 87,066,991
+-------+-------------------+
|summary|avg(pop_source_id) |
+-------+-------------------+
|count  |5628               |
|mean   |0.17961432565981292|
|stddev |0.10526522880523281|
|min    |0.0                |
|max    |0.836359977722168  |
+-------+-------------------+

The median of CTR is [0.15906000137329102]

2. CTR confidence by ad_source_id
The number of rows is 87,066,991
+-------+-----------------------+
|summary|avg(pop_source_id_conf)|
+-------+-----------------------+
|count  |5628                   |
|mean   |0.2666226234684176     |
|stddev |0.1118473851645954     |
|min    |0.04358400031924248    |
|max    |0.6821900010108948     |
+-------+-----------------------+

The median of CTR confidence is [0.2628999948501587]

3. CTR multiplied by confidence
The number of rows is 87,066,991
+-------+-------------------------------+
|summary|avg(pop_source_id_conf_multipl)|
+-------+-------------------------------+
|count  |5

### pop_publisher_id

In [43]:
# Statistics of the per-ad_publisher_id means of pop_publisher_id, its confidence, and the multiplied column.
grouped_stats('ad_publisher_id','pop_publisher_id', 'pop_publisher_id_conf', 'pop_publisher_id_conf_multipl')

1. CTR by ad_publisher_id
The number of rows is 33,907,412
+-------+---------------------+
|summary|avg(pop_publisher_id)|
+-------+---------------------+
|count  |723                  |
|mean   |0.209063659794371    |
|stddev |0.10991537239565012  |
|min    |0.0                  |
|max    |0.836359977722168    |
+-------+---------------------+

The median of CTR is [0.18934999406337738]

2. CTR confidence by ad_publisher_id
The number of rows is 33,907,412
+-------+--------------------------+
|summary|avg(pop_publisher_id_conf)|
+-------+--------------------------+
|count  |723                       |
|mean   |0.2786183170355949        |
|stddev |0.10098877898002925       |
|min    |0.05516599863767624       |
|max    |0.5475599765777588        |
+-------+--------------------------+

The median of CTR confidence is [0.28299999237060547]

3. CTR multiplied by confidence
The number of rows is 33,907,412
+-------+----------------------------------+
|summary|avg(pop_publisher_id_conf_mult

### ad_views

In [44]:
# Summary statistics and median of ad_views.
describe_median('ad_views')

Statistics for ad_views
+-------+------------------+
|summary|          ad_views|
+-------+------------------+
|  count|          53308545|
|   mean| 3137.686828875183|
| stddev|2846.0630035507047|
|    min|                 6|
|    max|              9995|
+-------+------------------+

The median of this variable is [2292.0]
