In [1]:
from pyspark.sql.types import *
from pyspark.sql.functions import isnan, when, count, col
import pyspark.sql.functions as f
from pyspark.sql.window import Window
import pandas as pd

In [2]:
# data load

train_valid_merged_df = spark.read.parquet("gs://cap-18/output/train_subset_final")

In [3]:
# 캐싱

df = train_valid_merged_df.cache()

In [11]:
# 컬럼명 확인

df.columns

['display_id',
 'ad_id',
 'label',
 'doc_id',
 'is_leak',
 'event_weekend',
 'user_views',
 'ad_views',
 'doc_views',
 'doc_event_hour',
 'pop_ad_id',
 'pop_ad_id_conf',
 'pop_ad_id_conf_multipl',
 'pop_document_id',
 'pop_document_id_conf',
 'pop_document_id_conf_multipl',
 'pop_publisher_id',
 'pop_publisher_id_conf',
 'pop_publisher_id_conf_multipl',
 'pop_advertiser_id',
 'pop_advertiser_id_conf',
 'pop_advertiser_id_conf_multipl',
 'pop_campaign_id',
 'pop_campaign_id_conf',
 'pop_campaign_id_conf_multipl',
 'pop_doc_event_doc_ad',
 'pop_doc_event_doc_ad_conf',
 'pop_doc_event_doc_ad_conf_multipl',
 'pop_source_id',
 'pop_source_id_conf',
 'pop_source_id_conf_multipl',
 'pop_source_id_country',
 'pop_source_id_country_conf',
 'pop_source_id_country_conf_multipl',
 'pop_topic_id',
 'pop_topic_id_conf',
 'pop_topic_id_conf_multipl',
 'pop_category_id',
 'pop_category_id_conf',
 'pop_category_id_conf_multipl',
 'pop_category_id_country',
 'pop_category_id_country_conf',
 'pop_categor

In [8]:
# 총 60개의 컬럼이 있다.

len(df.columns)

60

총 60개의 컬럼이 있다.

In [9]:
# row 개수 확인

df.count()

87141731

87,141,731개의 row가 있다.

In [27]:
# 변수별 결측치 탐색

df_null=df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]) # 변수별 결측치 수 계산
df_null.show()

+----------+-----+-----+------------+-------+-------------+----------+--------+---------+---------+--------------+----------------------+---------------+--------------------+----------------------------+----------------+---------------------+-----------------------------+-----------------+----------------------+------------------------------+---------------+--------------------+----------------------------+--------------------+-------------------------+---------------------------------+-------------+------------------+--------------------------+------------+-----------------+-------------------------+--------------------+-------------------------+---------------------------------+---------------+--------------------+----------------------------+----------------------+-------------------+----------+--------------+--------------+--------+----------+-----------------+-------------+-----------------+-----------+-------------+---------------------+-----------------------+------------------+

In [29]:
# 판다스 데이터프레임으로 변환

%time df_null_pandas=df_null.toPandas()

CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 270 ms


In [18]:
# display 옵션을 조정

pd.set_option('display.max_columns', 100)

In [60]:
df_null_pandas.transpose()

Unnamed: 0,0
display_id,0
ad_id,0
label,0
doc_event_id,0
is_leak,0
event_weekend,0
user_views,55311626
ad_views,33833186
doc_views,52700407
pop_ad_id,1501513


### user_views

In [10]:
# user_views 기초통계량

df.describe('user_views').show()

+-------+------------------+
|summary|        user_views|
+-------+------------------+
|  count|          31830105|
|   mean| 15.42115962231353|
| stddev|24.242983550755422|
|    min|                 1|
|    max|               660|
+-------+------------------+



In [11]:
# user_views 중앙값

%time df.approxQuantile("user_views", [0.5], 0.0)

CPU times: user 44 ms, sys: 16 ms, total: 60 ms
Wall time: 2min 43s


[7.0]

### traffic_source

In [12]:
# traffic_source의 빈도 확인

a = df.groupBy('traffic_source').count()
a = a.withColumn('percent', f.col('count')/f.sum('count').over(Window.partitionBy()))
a.orderBy('percent', ascending=False).show()

+--------------+--------+--------------------+
|traffic_source|   count|             percent|
+--------------+--------+--------------------+
|             1|64541983|  0.7406552780091091|
|             2|11304279| 0.12972291082902634|
|             3|11100006| 0.12737876414229138|
|          null|  195463|0.002243047019573...|
+--------------+--------+--------------------+



### event_platform

In [13]:
# event_platform의 빈도 확인

b = df.groupBy('event_platform').count()
b = b.withColumn('percent', f.col('count')/f.sum('count').over(Window.partitionBy()))
b = b.withColumn('percent', b.percent.cast(DecimalType(30, 7)))
b.orderBy('percent', ascending=False).show()

+--------------+--------+---------+
|event_platform|   count|  percent|
+--------------+--------+---------+
|             1|37519782|0.4305604|
|             2|36836541|0.4227199|
|             3|12785378|0.1467193|
|          null|      30|     3E-7|
+--------------+--------+---------+



### event_country

In [14]:
# event_country의 빈도 확인

c = df.groupBy('event_country').count()
c = c.withColumn('percent', f.col('count')/f.sum('count').over(Window.partitionBy()))
c = c.withColumn('percent', c.percent.cast(DecimalType(30, 3)))
c.orderBy('percent', ascending=False).show()

+-------------+--------+-------+
|event_country|   count|percent|
+-------------+--------+-------+
|           US|69467046|  0.797|
|           CA| 4825996|  0.055|
|           GB| 4389876|  0.050|
|           AU| 1909614|  0.022|
|           IN|  904682|  0.010|
|           NZ|  431388|  0.005|
|           ZA|  435104|  0.005|
|           DE|  324978|  0.004|
|           PH|  323204|  0.004|
|           SG|  294342|  0.003|
|           MY|  200615|  0.002|
|           IE|  149778|  0.002|
|           NL|  200305|  0.002|
|           NG|  153241|  0.002|
|           FR|  150387|  0.002|
|           MX|  133416|  0.002|
|           SE|  149531|  0.002|
|           TH|   50145|  0.001|
|           NO|   89372|  0.001|
|           PK|   95565|  0.001|
+-------------+--------+-------+
only showing top 20 rows



In [15]:
# 전체 행 개수에서 US, CA, GB를 뺀 나머지의 행 개수

87141731 - (69467046 + 4825996 + 4389876)

8458813

### event_hour

In [5]:
# event_hour의 빈도 확인

e = df.groupBy('event_hour').count()
e = e.withColumn('percent', f.col('count')/f.sum('count').over(Window.partitionBy()))
e = e.withColumn('percent', e.percent.cast(DecimalType(30, 3)))
e.orderBy('percent', ascending=False).show()

+----------+--------+-------+
|event_hour|   count|percent|
+----------+--------+-------+
|         6|26693449|  0.306|
|         4|23853300|  0.274|
|         3|13769120|  0.158|
|         5|13632321|  0.156|
|         2| 5655401|  0.065|
|         1| 3538140|  0.041|
+----------+--------+-------+



### event_weekend

In [16]:
# event_weekend 빈도 확인

d = df.groupBy('event_weekend').count()
d = d.withColumn('percent', f.col('count')/f.sum('count').over(Window.partitionBy()))
d = d.withColumn('percent', d.percent.cast(DecimalType(30, 3)))
d.orderBy('percent', ascending=False).show()

+-------------+--------+-------+
|event_weekend|   count|percent|
+-------------+--------+-------+
|            0|63515114|  0.729|
|            1|23626617|  0.271|
+-------------+--------+-------+



### doc_views

In [17]:
# doc_views 기초통계량

df.describe('doc_views').show()

+-------+-----------------+
|summary|        doc_views|
+-------+-----------------+
|  count|         34441324|
|   mean|3546.584138577251|
| stddev| 2915.35179302884|
|    min|                6|
|    max|             9996|
+-------+-----------------+



In [18]:
# doc_views 중앙값

%time df.approxQuantile("doc_views", [0.5], 0.0)

CPU times: user 48 ms, sys: 8 ms, total: 56 ms
Wall time: 2min 53s


[2857.0]

### pop_document_id

In [26]:
# na값 제거 후, 남은 row 개수 카운트

aa = df.select(['ad_doc_id','pop_document_id']).dropna()
aa.count()

86574723

In [28]:
# 랜딩페이지의 id별로 groupby해서 

aa = aa.groupBy('ad_doc_id').agg({'pop_document_id' : 'mean'})
aa.describe('avg(pop_document_id)').show()

+-------+--------------------+
|summary|avg(pop_document_id)|
+-------+--------------------+
|  count|               74766|
|   mean| 0.15048819150738502|
| stddev|  0.1209963063068483|
|    min|                 0.0|
|    max|                 1.0|
+-------+--------------------+



In [29]:
# 중앙값 확인

%time aa.approxQuantile("avg(pop_document_id)", [0.5], 0.0)

CPU times: user 8 ms, sys: 12 ms, total: 20 ms
Wall time: 1.35 s


[0.1317799985408783]

In [37]:
aa.show()

+---------+--------------------+
|ad_doc_id|avg(pop_document_id)|
+---------+--------------------+
|    29834| 0.04761900007724762|
|    29993|                 0.0|
|   130003|                 0.0|
|   131931| 0.09459500014781952|
|   132406| 0.14285999536514282|
|   193640|                 0.0|
|   194034|  0.3333300054073334|
|   267740|                 0.0|
|   296807| 0.11676999926567078|
|   312132| 0.24490000307559967|
|   334772| 0.13333000242710114|
|   344121| 0.07142899930477142|
|   351175| 0.38266998529434204|
|   351750|                 0.0|
|   371765| 0.02430099993944168|
|   373035| 0.12213999778032303|
|   376650|  0.1212100014090538|
|   398542|  0.2617399990558624|
|   423444| 0.14285999536514282|
|   426301|  0.1899999976158142|
+---------+--------------------+
only showing top 20 rows



### pop_document_id_conf

### pop_publisher_id

In [19]:
# pop_publisher_id 기초통계량

df.describe('pop_publisher_id').show()

+-------+-------------------+
|summary|   pop_publisher_id|
+-------+-------------------+
|  count|           33907412|
|   mean|0.19122963722612996|
| stddev|0.10087599316192765|
|    min|                0.0|
|    max|            0.83636|
+-------+-------------------+



In [20]:
# pop_publisher_id 중앙값

%time df.approxQuantile("pop_publisher_id", [0.5], 0.0)

CPU times: user 56 ms, sys: 4 ms, total: 60 ms
Wall time: 2min 42s


[0.1783600002527237]

### pop_publisher_id_conf

### pop_publisher_id_conf_multipl

### pop_source_id

### pop_source_id_conf

### pop_source_id_conf_multipl

### pop_topic_id

### pop_topic_id_conf

### pop_topic_id_conf_multipl

### pop_category_id

### pop_category_id_conf

### pop_category_id_conf_multipl

### ad_views