## Sampling & Splitting Data

## Part A: 분석을 위한 환경 설정

### 모듈/패키지 로드

In [87]:
# Notebook-level flags; neither is referenced again in the visible part of this notebook.
evaluation = True
evaluation_verbose = False

# GCS prefixes. DATA_BUCKET_FOLDER is prepended to the CSV file names loaded below;
# OUTPUT_BUCKET_FOLDER matches the prefix of the parquet dataset read in Part B.
OUTPUT_BUCKET_FOLDER = "gs://cap-18/output/"
DATA_BUCKET_FOLDER = "gs://cap-18/data/"

In [88]:
from IPython.display import display

In [89]:
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.ml.linalg import Vectors, SparseVector, VectorUDT

In [90]:
from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all" # show the result of every statement executed in a cell

In [91]:
import numpy as np
import scipy.sparse

In [92]:
import warnings 
warnings.filterwarnings('ignore') # suppress warning messages

In [93]:
import math
import datetime
import time
import itertools

In [94]:
import pickle

In [95]:
import random
random.seed(42)

In [96]:
import pandas as pd
%matplotlib inline

## Part B: 변수별 결측치 확인

In [97]:
# Load the merged train/validation feature table.
# The path is built from OUTPUT_BUCKET_FOLDER (same resulting URI as before) so the
# bucket location is defined in one place, like the CSV loads that use DATA_BUCKET_FOLDER.
train_valid_merged_df = spark.read.parquet(OUTPUT_BUCKET_FOLDER + "train_subset_final")

In [98]:
## cf) keep wide pandas DataFrames from being cut off when displayed (show up to 100 columns):
pd.set_option('display.max_columns', 100)

In [99]:
# Explicit three-column schema for documents_categories.csv (schema inference is turned off).
documents_categories_schema = StructType([
    StructField("document_id_cat", IntegerType(), True),
    StructField("category_id", IntegerType(), True),
    StructField("confidence_level_cat", FloatType(), True),
])

# Read the CSV (header row present, the literal string \N is treated as null),
# alias it and cache it.
documents_categories_df = (
    spark.read
    .schema(documents_categories_schema)
    .options(header='true', inferschema='false', nullValue='\\N')
    .csv(DATA_BUCKET_FOLDER + "documents_categories.csv")
    .alias('documents_categories')
    .cache()
)

# One row per document: parallel lists of category ids and confidence levels,
# plus a constant marker column set to 1.
# NOTE(review): this grouped frame is not used in the visible part of the notebook.
documents_categories_grouped_df = (
    documents_categories_df
    .groupBy('document_id_cat')
    .agg(
        F.collect_list('category_id').alias('category_id_list'),
        F.collect_list('confidence_level_cat').alias('confidence_level_cat_list'),
    )
    .withColumn('dummyDocumentsCategory', F.lit(1))
    .alias('documents_categories_grouped')
)

In [100]:
# Explicit three-column schema for documents_topics.csv (schema inference is turned off).
documents_topics_schema = StructType([
    StructField("document_id_top", IntegerType(), True),
    StructField("topic_id", IntegerType(), True),
    StructField("confidence_level_top", FloatType(), True),
])

# Read the CSV (header row present, the literal string \N is treated as null),
# alias it and cache it.
documents_topics_df = (
    spark.read
    .schema(documents_topics_schema)
    .options(header='true', inferschema='false', nullValue='\\N')
    .csv(DATA_BUCKET_FOLDER + "documents_topics.csv")
    .alias('documents_topics')
    .cache()
)

# One row per document: parallel lists of topic ids and confidence levels,
# plus a constant marker column set to 1.
# NOTE(review): this grouped frame is not used in the visible part of the notebook.
documents_topics_grouped_df = (
    documents_topics_df
    .groupBy('document_id_top')
    .agg(
        F.collect_list('topic_id').alias('topic_id_list'),
        F.collect_list('confidence_level_top').alias('confidence_level_top_list'),
    )
    .withColumn('dummyDocumentsTopics', F.lit(1))
    .alias('documents_topics_grouped')
)

다음의 코드로 변수별 결측치를 한 번에 확인할 수 있다.

In [101]:
## cf) show up to 100 rows of a pandas DataFrame:
pd.set_option('display.max_rows', 100)

In [102]:
# train_valid_merged_df.select([F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c) for c in train_valid_merged_df.columns]).toPandas().transpose()

### 필요없는 columns 제거

In [103]:
# Columns that are not needed for the rest of the notebook.
unneeded_columns = [
    'doc_event_hour',
    'pop_source_id_country',
    'pop_source_id_country_conf',
    'pop_source_id_country_conf_multipl',
    'pop_category_id_country',
    'pop_category_id_country_conf',
    'pop_category_id_country_conf_multipl',
    'platform_event',
    'doc_id',
    'document_id_cat',
    'document_id_top',
    'timestamp_event',
]
train_valid_merged_df = train_valid_merged_df.drop(*unneeded_columns)

In [104]:
# Number of columns remaining in train_valid_merged_df.
len(train_valid_merged_df.columns)

58

### 칼럼명 변경

In [105]:
# Continue under a new name for the renaming/imputation steps.
# This is a plain assignment: both names refer to the same DataFrame object.
train_imputing_df = train_valid_merged_df

In [106]:
# List the current column names.
train_imputing_df.schema.names

['display_id',
 'ad_id',
 'label',
 'is_leak',
 'event_weekend',
 'user_views',
 'ad_views',
 'doc_views',
 'pop_ad_id',
 'pop_ad_id_conf',
 'pop_ad_id_conf_multipl',
 'pop_document_id',
 'pop_document_id_conf',
 'pop_document_id_conf_multipl',
 'pop_publisher_id',
 'pop_publisher_id_conf',
 'pop_publisher_id_conf_multipl',
 'pop_advertiser_id',
 'pop_advertiser_id_conf',
 'pop_advertiser_id_conf_multipl',
 'pop_campaign_id',
 'pop_campaign_id_conf',
 'pop_campaign_id_conf_multipl',
 'pop_doc_event_doc_ad',
 'pop_doc_event_doc_ad_conf',
 'pop_doc_event_doc_ad_conf_multipl',
 'pop_source_id',
 'pop_source_id_conf',
 'pop_source_id_conf_multipl',
 'pop_topic_id',
 'pop_topic_id_conf',
 'pop_topic_id_conf_multipl',
 'pop_category_id',
 'pop_category_id_conf',
 'pop_category_id_conf_multipl',
 'doc_event_publisher_id',
 'doc_event_source_id',
 'event_hour',
 'event_platform',
 'traffic_source',
 'is_train',
 'uuid',
 'view_doc_id',
 'event_country',
 'day_event',
 'ad_doc_id',
 'campaign_i

In [107]:
# (old name, new name) pairs, applied one by one in the order listed.
column_renames = [
    ('pop_document_id', 'pop_ad_doc_id'),
    ('pop_document_id_conf', 'pop_ad_doc_id_conf'),
    ('pop_publisher_id', 'pop_ad_publisher_id'),
    ('pop_document_id_conf_multipl', 'pop_ad_doc_id_conf_multipl'),
    ('pop_publisher_id_conf', 'pop_ad_publisher_id_conf'),
    ('pop_publisher_id_conf_multipl', 'pop_ad_publisher_id_conf_multipl'),
    ('pop_doc_event_doc_ad', 'pop_view_doc_ad_doc'),
    ('pop_doc_event_doc_ad_conf', 'pop_view_doc_ad_doc_conf'),
    ('pop_doc_event_doc_ad_conf_multipl', 'pop_view_doc_ad_doc_conf_multipl'),
    ('pop_source_id', 'pop_ad_source_id'),
    ('pop_source_id_conf', 'pop_ad_source_id_conf'),
    ('pop_source_id_conf_multipl', 'pop_ad_source_id_conf_multipl'),
    ('pop_topic_id', 'pop_ad_topic_id'),
    ('pop_topic_id_conf', 'pop_ad_topic_id_conf'),
    ('pop_topic_id_conf_multipl', 'pop_ad_topic_id_conf_multipl'),
    ('pop_category_id', 'pop_ad_category_id'),
    ('pop_category_id_conf', 'pop_ad_category_id_conf'),
    ('pop_category_id_conf_multipl', 'pop_ad_category_id_conf_multipl'),
    ('doc_event_publisher_id', 'view_publisher_id'),
    ('doc_event_source_id', 'view_source_id'),
    ('event_hour', 'hour'),
    ('event_platform', 'platform'),
    ('event_country', 'country'),
    ('event_weekend', 'weekend'),
]

# Rename each column in turn; the DataFrame is rebound after every rename.
for _old_name, _new_name in column_renames:
    train_imputing_df = train_imputing_df.withColumnRenamed(_old_name, _new_name)

In [108]:
# Print the column names and display how many columns there are.
print(train_imputing_df.schema.names)
len(train_imputing_df.schema.names)

['display_id', 'ad_id', 'label', 'is_leak', 'weekend', 'user_views', 'ad_views', 'doc_views', 'pop_ad_id', 'pop_ad_id_conf', 'pop_ad_id_conf_multipl', 'pop_ad_doc_id', 'pop_ad_doc_id_conf', 'pop_ad_doc_id_conf_multipl', 'pop_ad_publisher_id', 'pop_ad_publisher_id_conf', 'pop_ad_publisher_id_conf_multipl', 'pop_advertiser_id', 'pop_advertiser_id_conf', 'pop_advertiser_id_conf_multipl', 'pop_campaign_id', 'pop_campaign_id_conf', 'pop_campaign_id_conf_multipl', 'pop_view_doc_ad_doc', 'pop_view_doc_ad_doc_conf', 'pop_view_doc_ad_doc_conf_multipl', 'pop_ad_source_id', 'pop_ad_source_id_conf', 'pop_ad_source_id_conf_multipl', 'pop_ad_topic_id', 'pop_ad_topic_id_conf', 'pop_ad_topic_id_conf_multipl', 'pop_ad_category_id', 'pop_ad_category_id_conf', 'pop_ad_category_id_conf_multipl', 'view_publisher_id', 'view_source_id', 'hour', 'platform', 'traffic_source', 'is_train', 'uuid', 'view_doc_id', 'country', 'day_event', 'ad_doc_id', 'campaign_id', 'advertiser_id', 'ad_category_id', 'ad_category_c

58

### NA처리

In [109]:
# Check the missing values once more: for every column, count the rows whose value
# is NaN or null, then show the counts as a pandas table with one row per column.
missing_count_exprs = [
    F.count(
        F.when(F.isnan(column_name) | F.col(column_name).isNull(), column_name)
    ).alias(column_name)
    for column_name in train_imputing_df.columns
]
train_imputing_df.select(missing_count_exprs).toPandas().transpose()

Unnamed: 0,0
display_id,0
ad_id,0
label,0
is_leak,0
weekend,0
user_views,55311626
ad_views,33833186
doc_views,52700407
pop_ad_id,1501513
pop_ad_id_conf,1501513


`user_views`, `ad_views`, `doc_views`는 참조할 사전 데이터가 없는 경우에 발생하기 때문에 1로 채워넣었다.

In [110]:
# Fill the three view-count columns with 1 where they are missing.
# NOTE: these three columns are dropped again later in this notebook.
view_count_defaults = dict.fromkeys(['user_views', 'ad_views', 'doc_views'], 1)
train_imputing_df = train_imputing_df.na.fill(view_count_defaults)

`traffic_source`와 `platform`(원래 컬럼명: `event_platform`)은 최빈값인 1로 채우자.

In [111]:
# Fill missing traffic_source / platform values with 1.
mode_defaults = dict.fromkeys(['traffic_source', 'platform'], 1)
train_imputing_df = train_imputing_df.na.fill(mode_defaults)

pop-으로 시작하는 CTR과 그 외 범주형 변수의 결측치는 알려지지 않음(U: Unknown)으로 대체하자.

In [112]:
# Replace missing values with 'U' (Unknown) in the categorical columns and the pop_* CTR columns.
# NOTE(review): the null counts printed by the check later in this notebook still show
# missing values in the pop_* columns (e.g. pop_ad_id), so this fill apparently has no
# effect on them — presumably they are numeric and cannot hold the string 'U'. Confirm,
# then decide on a numeric placeholder or an explicit cast to string.
categorical_fill_columns = [
    'ad_source_id',
    'view_topic_id',
    'view_category_id',
    'ad_topic_id',
    'ad_category_id',
    'view_source_id',
    'ad_publisher_id',
    'view_category_conf',
    'ad_topic_conf',
    'ad_category_conf',
    'view_publisher_id',
]
pop_fill_columns = [
    'pop_ad_category_id_conf_multipl',
    'pop_ad_category_id_conf',
    'pop_ad_category_id',
    'pop_ad_topic_id_conf_multipl',
    'pop_ad_topic_id_conf',
    'pop_ad_topic_id',
    'pop_ad_source_id_conf_multipl',
    'pop_ad_source_id_conf',
    'pop_ad_source_id',
    'pop_view_doc_ad_doc_conf_multipl',
    'pop_view_doc_ad_doc_conf',
    'pop_view_doc_ad_doc',
    'pop_campaign_id_conf_multipl',
    'pop_campaign_id_conf',
    'pop_campaign_id',
    'pop_advertiser_id_conf_multipl',
    'pop_advertiser_id_conf',
    'pop_advertiser_id',
    'pop_ad_publisher_id_conf_multipl',
    'pop_ad_publisher_id_conf',
    'pop_ad_publisher_id',
    'pop_ad_doc_id_conf_multipl',
    'pop_ad_doc_id_conf',
    'pop_ad_doc_id',
    'pop_ad_id_conf_multipl',
    'pop_ad_id_conf',
    'pop_ad_id',
]
unknown_fill_values = dict.fromkeys(categorical_fill_columns + pop_fill_columns, 'U')
train_imputing_df = train_imputing_df.na.fill(unknown_fill_values)

# ad_publisher_id has an unusually large number of NAs => needs checking => okay. The original data is like that too.
# Dropping document_id_top and document_id_cat => okay. Reflected in the Feature Enginnering_05 code; those variables should be gone after a re-run.

### 추가적으로 빼기로 한 변수 제거 + 모델에 필요없는 변수 제거

In [113]:
# Drop the variables the model does not need.
model_unneeded_columns = ['display_id', 'ad_id', 'ad_doc_id', 'uuid', 'view_doc_id']
train_imputing_df = train_imputing_df.drop(*model_unneeded_columns)

# 'is_train' and 'day_event' are kept because they are needed to split train/valid. They are not needed for modeling.

In [114]:
# Drop the additional variables that were decided to be excluded.
additional_excluded_columns = [
    'user_views',
    'ad_views',
    'doc_views',
    'pop_ad_id_conf',
    'pop_ad_id_conf_multipl',
    'pop_ad_doc_id_conf',
    'pop_ad_doc_id_conf_multipl',
    'pop_ad_publisher_id_conf_multipl',
    'pop_ad_publisher_id_conf',
    'pop_advertiser_id_conf_multipl',
    'pop_advertiser_id_conf',
    'pop_campaign_id_conf_multipl',
    'pop_campaign_id_conf',
    'pop_view_doc_ad_doc_conf_multipl',
    'pop_view_doc_ad_doc_conf',
    'pop_ad_source_id_conf_multipl',
    'pop_ad_source_id_conf',
    'pop_ad_topic_id_conf_multipl',
    'pop_ad_topic_id_conf',
    'pop_ad_category_id_conf_multipl',
    'pop_ad_category_id_conf',
    'view_category_conf',
    'ad_topic_conf',
    'ad_category_conf',
    'view_topic_conf',
]
train_imputing_df = train_imputing_df.drop(*additional_excluded_columns)

In [115]:
# Print the remaining column names and display how many there are.
print(train_imputing_df.columns)
len(train_imputing_df.columns)

['label', 'is_leak', 'weekend', 'pop_ad_id', 'pop_ad_doc_id', 'pop_ad_publisher_id', 'pop_advertiser_id', 'pop_campaign_id', 'pop_view_doc_ad_doc', 'pop_ad_source_id', 'pop_ad_topic_id', 'pop_ad_category_id', 'view_publisher_id', 'view_source_id', 'hour', 'platform', 'traffic_source', 'is_train', 'country', 'day_event', 'campaign_id', 'advertiser_id', 'ad_category_id', 'ad_topic_id', 'view_category_id', 'view_topic_id', 'ad_source_id', 'ad_publisher_id']


28

In [116]:
# Verify that the missing values were handled: for every remaining column, count the
# rows whose value is NaN or null, then show the counts with one row per column.
remaining_missing_exprs = [
    F.count(
        F.when(F.isnan(column_name) | F.col(column_name).isNull(), column_name)
    ).alias(column_name)
    for column_name in train_imputing_df.columns
]
train_imputing_df.select(remaining_missing_exprs).toPandas().transpose()

Unnamed: 0,0
label,0
is_leak,0
weekend,0
pop_ad_id,1501513
pop_ad_doc_id,567008
pop_ad_publisher_id,53234319
pop_advertiser_id,38189
pop_campaign_id,214394
pop_view_doc_ad_doc,43315606
pop_ad_source_id,74740


## Part C: OHE(One-Hot Encoding)

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler

`train`에서 `label`과 세 개의 설명 변수 컬럼만 뽑아서 logistic regression으로 적합시켜보자.

In [None]:
# Keep only the label and three feature columns for a quick trial fit.
# NOTE(review): `train` is not defined in the visible part of this notebook. If it is
# derived from train_imputing_df, note that 'event_weekend' was renamed to 'weekend'
# and 'doc_event_hour' was dropped in earlier cells — confirm the column names.
light_columns = ['label', 'event_weekend', 'doc_event_hour', 'pop_advertiser_id']
train_light = train.select(*light_columns)

결측치가 포함되어 있으면 VectorAssembler가 작동하지 않는다.

In [None]:
# Remove rows with missing values so that the VectorAssembler step can run.
train_light = train_light.dropna() ### created for testing purposes.

In [None]:
# Categorical inputs: each is string-indexed and then one-hot encoded.
categorical_columns = ['event_weekend', 'doc_event_hour']

# One StringIndexer per categorical column, writing to "<column>_indexed".
indexers = [
    StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
    for c in categorical_columns
]

# The notebook imports OneHotEncoderEstimator, not OneHotEncoder, so the original
# per-column OneHotEncoder(...) calls raised NameError. A single estimator encodes all
# indexed columns at once; it is kept in a list so the Pipeline cell can still
# concatenate `indexers + encoders + [assembler]`.
indexed_columns = [indexer.getOutputCol() for indexer in indexers]
encoders = [
    OneHotEncoderEstimator(
        inputCols=indexed_columns,
        outputCols=["{0}_encoded".format(c) for c in indexed_columns],
        dropLast=False,
    )
]

# Numeric inputs are passed to the assembler unchanged.
numericCols = ["pop_advertiser_id"]

# Assemble the encoded vectors followed by the numeric columns into "features".
assemblerInputs = encoders[0].getOutputCols() + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

In [None]:
# Stage order: all indexers, then the encoders, then the assembler.
pipeline_stages = []
pipeline_stages.extend(indexers)
pipeline_stages.extend(encoders)
pipeline_stages.append(assembler)

pipeline = Pipeline(stages=pipeline_stages)
# Fit the pipeline stages on the light training subset.
model = pipeline.fit(train_light)

In [None]:
# Apply the fitted pipeline to the same subset and preview the first 5 rows.
transformed = model.transform(train_light)
transformed.show(5)

In [None]:
# LogisticRegression is not imported anywhere in the visible notebook, so the original
# line raised NameError; import it here, next to its only use.
from pyspark.ml.classification import LogisticRegression

# Fit with default parameters on the assembled data. The defaults read the "features"
# and "label" columns, which is what the pipeline and the select above produce.
lrModel = LogisticRegression().fit(transformed)

In [None]:
# Print the fitted model's coefficient vector and intercept.
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))