# Validation Set Split

As of Dec 06, 2018

## Part A: 분석을 위한 환경 설정

### A-1) 모듈/패키지 로드

In [1]:
# Flags that are not referenced anywhere else in the visible notebook;
# presumably consumed by sibling notebooks of the same pipeline — TODO confirm.
evaluation = True
evaluation_verbose = False

# GCS prefix that the output paths in this notebook are built from.
OUTPUT_BUCKET_FOLDER = "gs://line_2018/output/"
# GCS prefix that the binned-feature CSV ("train_bin.csv") is read from.
DATA_BUCKET_FOLDER = "gs://upload-bigquery180927/data/"

In [2]:
from IPython.display import display

In [3]:
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.ml.linalg import Vectors, SparseVector, VectorUDT

In [4]:
from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all"  # display the result of every expression in a cell, not only the last one

In [5]:
import numpy as np
import scipy.sparse

In [6]:
import warnings 
warnings.filterwarnings('ignore')  # suppress all warning messages

In [7]:
import math
import datetime
import time
import itertools

In [8]:
import pickle

In [9]:
import random
random.seed(42)  # seeds Python's `random` module only; it does not seed NumPy or Spark

In [10]:
import pandas as pd
%matplotlib inline

In [11]:
# Load the source table from parquet.
train_valid_merged_df = spark.read.parquet("gs://line-2018-1/output/train_final_3")

## Part B: 테이블 합치기

In [12]:
# Exclude the CTR ("pop_*") columns from the table.
ctr_columns = [
    'pop_ad_id',
    'pop_ad_doc_id',
    'pop_ad_publisher_id',
    'pop_advertiser_id',
    'pop_campaign_id',
    'pop_view_doc_ad_doc',
    'pop_ad_source_id',
    'pop_ad_topic_id',
    'pop_ad_category_id',
]
train_valid_merged_df = train_valid_merged_df.drop(*ctr_columns)

In [13]:
# train_valid_merged_df.columns

['display_id',
 'ad_id',
 'label',
 'is_leak',
 'weekend',
 'view_publisher_id',
 'view_source_id',
 'hour',
 'platform',
 'traffic_source',
 'is_train',
 'country',
 'day_event',
 'campaign_id',
 'advertiser_id',
 'ad_category_id',
 'ad_topic_id',
 'view_category_id',
 'view_topic_id',
 'timestamp_event',
 'ad_source_id',
 'ad_publisher_id']

In [14]:
# column별 수준 수 세기
# for col in train_valid_merged_df.columns:
#   print(col, train_valid_merged_df.select(col).distinct().count())

In [15]:
# column별 NA 세기
# train_valid_merged_df.select([F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c) for c in train_valid_merged_df.columns]).show()

In [13]:
# Bind a second name to the same DataFrame. Later cells rebind only this name,
# so `train_valid_merged_df` keeps pointing at the table as it is here.
train_valid_merged_df2 = train_valid_merged_df

In [14]:
# Keep US, CA and GB (which account for most rows) and recode every other
# country as 'U'. A null country compares as null, so it stays null.
major_countries = ['US', 'CA', 'GB']
country_col = F.col("country")
train_valid_merged_df2 = train_valid_merged_df2.withColumn(
    "country",
    F.when(~country_col.isin(major_countries), 'U').otherwise(country_col),
)

In [18]:
# train_valid_merged_df2.select('country').distinct().count()

4

In [19]:
# train_valid_merged_df2.select('country').groupBy('country').count().show()

In [15]:
# Every binned popularity column is declared as a nullable string.
binned_column_names = [
    "pop_ad_id",
    "pop_ad_doc_id",
    "pop_ad_publisher_id",
    "pop_advertiser_id",
    "pop_campaign_id",
    "pop_view_doc_ad_doc",
    "pop_ad_source_id",
    "pop_ad_topic_id",
    "pop_ad_category_id",
]
cont_binned_schema = StructType(
    [StructField(name, StringType(), True) for name in binned_column_names]
)

# Read the CSV with the explicit schema (no inference): the first line is a
# header and the literal \N marks a null.
csv_reader = (
    spark.read
    .schema(cont_binned_schema)
    .option("header", "true")
    .option("inferschema", "false")
    .option("nullValue", "\\N")
)
cont_binned_df = csv_reader.csv(DATA_BUCKET_FOLDER + "train_bin.csv")

In [21]:
# # column별 수준 수 세기 
# for col in cont_binned_df.columns:
#   print(col, cont_binned_df.select(col).distinct().count())

In [22]:
# cont_binned_df.count()

In [23]:
# train_valid_merged_df2.count()

In [24]:
# column별 NA 세기
# cont_binned_df.select([F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c) for c in cont_binned_df.columns]).show()

In [25]:
# cont_binned_df.select('pop_ad_id').groupBy('pop_ad_id').count().show()

In [16]:
# Replace the placeholder value 'NA' with 'U' in every column.
# Only cells whose entire value is 'NA' are rewritten: an unanchored
# regexp_replace would also rewrite "NA" occurring inside a longer value.
# Null cells stay null (the comparison is null, so `otherwise` is taken).
cont_binned_df = cont_binned_df.select(
    [
        F.when(F.col(name) == 'NA', 'U').otherwise(F.col(name)).alias(name)
        for name in cont_binned_df.columns
    ]
)

In [27]:
# cont_binned_df.select('pop_ad_id').groupBy('pop_ad_id').count().show()

In [28]:
# cont_binned_df.count()

In [29]:
# train_valid_merged_df2.count()

In [17]:
from pyspark.sql.window import Window

# Number the rows 1..N so that this table can be joined by row position.
# Ordering the window by a constant leaves the row order undefined, so the
# numbering is driven by monotonically_increasing_id(), which grows with the
# partition id and the row's position inside its partition (i.e. read order).
# NOTE(review): the positional join still assumes train_bin.csv lists its rows
# in the same order as the parquet table — confirm.
# A window without partitionBy moves all rows into a single partition.
cont_binned_df = (
    cont_binned_df
    .withColumn('_read_order', F.monotonically_increasing_id())
    .withColumn('index', F.row_number().over(Window.orderBy('_read_order')))
    .drop('_read_order')
)

In [33]:
# cont_binned_df.select('index').describe().show()

+-------+--------------------+
|summary|               index|
+-------+--------------------+
|  count|            87141731|
|   mean|         4.3570866E7|
| stddev|2.5155651069590893E7|
|    min|                   1|
|    max|            87141731|
+-------+--------------------+



In [18]:
# Number the rows 1..N to serve as the positional join key.
# Ordering the window by a constant leaves the row order undefined, so the
# numbering is driven by monotonically_increasing_id(), which grows with the
# partition id and the row's position inside its partition (i.e. read order).
# NOTE(review): row alignment with the binned CSV is an assumption about how
# both files were produced — confirm.
# A window without partitionBy moves all rows into a single partition.
train_valid_merged_df2 = (
    train_valid_merged_df2
    .withColumn('_read_order', F.monotonically_increasing_id())
    .withColumn('index', F.row_number().over(Window.orderBy('_read_order')))
    .drop('_read_order')
)

In [35]:
# train_valid_merged_df2.select('index').describe().show()

+-------+--------------------+
|summary|               index|
+-------+--------------------+
|  count|            87141731|
|   mean|         4.3570866E7|
| stddev|2.5155651069590893E7|
|    min|                   1|
|    max|            87141731|
+-------+--------------------+



In [19]:
# Attach the binned popularity columns by row position: 'index' is the join
# key, and the left join keeps every row of train_valid_merged_df2.
train_final = train_valid_merged_df2.join(cont_binned_df, 'index', how = 'left')

In [38]:
# train_final.columns

['index',
 'display_id',
 'ad_id',
 'label',
 'is_leak',
 'weekend',
 'view_publisher_id',
 'view_source_id',
 'hour',
 'platform',
 'traffic_source',
 'is_train',
 'country',
 'day_event',
 'campaign_id',
 'advertiser_id',
 'ad_category_id',
 'ad_topic_id',
 'view_category_id',
 'view_topic_id',
 'timestamp_event',
 'ad_source_id',
 'ad_publisher_id',
 'pop_ad_id',
 'pop_ad_doc_id',
 'pop_ad_publisher_id',
 'pop_advertiser_id',
 'pop_campaign_id',
 'pop_view_doc_ad_doc',
 'pop_ad_source_id',
 'pop_ad_topic_id',
 'pop_ad_category_id']

In [39]:
# train_final.count() # 87141731 기원!

87141731

In [None]:
# train_final.select([F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c) for c in train_final.columns]).show()

## Part C: Validation / Train Set으로 구분

In [20]:
# One row per distinct (display_id, timestamp_event) pair.
dp_id_list = train_final.select('display_id', 'timestamp_event').distinct()

In [23]:
# Rank the displays chronologically (1 = earliest timestamp_event).
# The ordering is expressed in the window itself: a preceding sort() is not
# guaranteed to survive a window that is ordered by a constant. display_id
# breaks ties between equal timestamps so the ranking is reproducible.
dp_id_list = dp_id_list.withColumn(
    'index',
    F.row_number().over(Window.orderBy('timestamp_event', 'display_id')),
)

In [46]:
# display_ids whose index is in 1..20000 (inclusive), collected to the driver
# as a plain Python list.
train_index_range = F.col('index').between(1, 20000)
dp_id_train_sample_id = (
    dp_id_list.filter(train_index_range)
    .select('display_id')
    .toPandas()['display_id']
    .tolist()
)

In [47]:
# display_ids whose index is in 20001..25000 (inclusive), collected to the
# driver as a plain Python list.
validation_index_range = F.col('index').between(20001, 25000)
dp_id_validation_sample_id = (
    dp_id_list.filter(validation_index_range)
    .select('display_id')
    .toPandas()['display_id']
    .tolist()
)

In [48]:
# Keep every train_final row whose display_id is in the training id list.
# NOTE(review): the list is embedded in the query as literals; a left_semi
# join against a DataFrame of ids would avoid collecting them to the driver.
train_sample = train_final.filter(F.col('display_id').isin(dp_id_train_sample_id))

In [49]:
# Keep every train_final row whose display_id is in the validation id list.
# NOTE(review): the list is embedded in the query as literals; a left_semi
# join against a DataFrame of ids would avoid collecting them to the driver.
valid_sample = train_final.filter(F.col('display_id').isin(dp_id_validation_sample_id))

In [50]:
train_sample.count()  # hoped for 80000: failed

102091

In [51]:
valid_sample.count()  # number of rows in the validation sample

25656

정상적으로 테이블이 생성되었다.

`index` 컬럼을 제거하자.

In [52]:
# Remove the 'index' column from the training sample.
train_sample = train_sample.drop('index')

In [53]:
# Remove the 'index' column from the validation sample.
valid_sample = valid_sample.drop('index')

In [54]:
# csv 파일로 쓰기: validation_sample_df
# valid_sample.repartition(1).write.csv(OUTPUT_BUCKET_FOLDER + 'validation_sample_df.csv', header = True)

In [55]:
# csv 파일로 쓰기: train_sample_df
# train_sample.repartition(1).write.csv(OUTPUT_BUCKET_FOLDER + 'train_sample_df.csv', header = True)

In [56]:
# 파케이로 쓰기: validation_sample_df
# valid_sample.write.parquet(OUTPUT_BUCKET_FOLDER + 'validation_sample_parquet', mode='overwrite')

In [None]:
# # 파케이로 쓰기: train_sample_df
# train_sample.write.parquet(OUTPUT_BUCKET_FOLDER + 'train_sample_parquet', mode='overwrite')

## Part D: 최종 Train Set 생성

최종 Train Set을 파케이로 만들어보자.

In [None]:
# Remove the 'index' column.
train_final = train_final.drop('index')

In [None]:
# Write train_final as parquet, replacing any existing output at that path.
train_final.write.parquet(OUTPUT_BUCKET_FOLDER + 'train_final_parquet', mode='overwrite')