## Sampling & Splitting Data

## Part A: 분석을 위한 환경 설정

### 모듈/패키지 로드

In [3]:
# Run-mode switches for the notebook. Neither flag is read by any of the
# cells visible in this notebook section.
evaluation, evaluation_verbose = True, False

# Google Cloud Storage locations for input data and generated outputs.
DATA_BUCKET_FOLDER = "gs://cap-18/data/"
OUTPUT_BUCKET_FOLDER = "gs://cap-18/output/"

In [4]:
from IPython.display import display

In [5]:
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.ml.linalg import Vectors, SparseVector, VectorUDT

In [6]:
from IPython.core.interactiveshell import InteractiveShell 
# Show the result of every expression evaluated in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [7]:
import numpy as np
import scipy.sparse

In [8]:
import warnings 
# Suppress warning messages so they do not clutter the cell output.
warnings.filterwarnings(action='ignore')

In [9]:
import math
import datetime
import time
import itertools

In [10]:
import pickle

In [11]:
import random
# Fix the seed of Python's global random generator for reproducible runs.
random.seed(a=42)

In [12]:
import pandas as pd
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## Part B: Validation Set 나누기

In [None]:
# Load the Parquet training subset.
# The path is built from OUTPUT_BUCKET_FOLDER so the bucket location is
# configured in a single place instead of being repeated as a literal.
train_valid_merged_df = spark.read.parquet(OUTPUT_BUCKET_FOLDER + "train_subset_final")

In [None]:
# cf) Viewing the DataFrame without its layout breaking:
# pull a single row into pandas so it can be displayed as a table.
train_pandas = (
    train_valid_merged_df
    .limit(1)
    .toPandas()
)

In [None]:
# Let pandas display up to 100 columns before truncating the output.
pd.set_option(
    'display.max_columns',
    100,
)

In [None]:
# Display the one-row pandas preview created above.
train_pandas

## Part C: OHE(One-Hot Encoding)

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler

`train`에서 레이블(`label`)과 세 개의 피처 컬럼(`event_weekend`, `doc_event_hour`, `pop_advertiser_id`)만 뽑아서 logistic regression으로 적합시켜보자.

In [None]:
# Keep only the label and three feature columns for a lightweight trial fit.
# NOTE(review): `train` is not defined in any visible cell of this notebook;
# presumably it is derived from `train_valid_merged_df` -- confirm.
train_light = train.select(
    'label',
    'event_weekend',
    'doc_event_hour',
    'pop_advertiser_id',
)

결측치가 포함되어 있으면 VectorAssembler가 작동하지 않는다.

In [None]:
# Drop rows that contain null values (this step was added for testing purposes).
train_light = train_light.na.drop()

In [None]:
# Categorical features: each one is string-indexed and then one-hot encoded.
categorical_columns = ['event_weekend', 'doc_event_hour']

# One StringIndexer per categorical column, writing to "<column>_indexed".
indexers = [
    StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
    for c in categorical_columns
]

# One encoder per indexed column, writing to "<column>_indexed_encoded".
# OneHotEncoderEstimator is the encoder class this notebook imports; it takes
# lists of columns (inputCols/outputCols), so each column is wrapped in a
# single-element list. `encoders` stays a list so it can be concatenated with
# the other pipeline stages.
encoders = [
    OneHotEncoderEstimator(
        dropLast=False,
        inputCols=[indexer.getOutputCol()],
        outputCols=["{0}_encoded".format(indexer.getOutputCol())],
    )
    for indexer in indexers
]

# Numeric features are passed to the assembler unchanged.
numericCols = ["pop_advertiser_id"]

# Assemble every encoded column plus the numeric columns into "features".
assemblerInputs = [
    encoded_col
    for encoder in encoders
    for encoded_col in encoder.getOutputCols()
] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

In [None]:
# Chain the stages in order -- indexers, then encoders, then the assembler --
# and fit the whole pipeline on the lightweight training frame.
pipeline = Pipeline(
    stages=list(itertools.chain(indexers, encoders, [assembler]))
)
model = pipeline.fit(train_light)

In [None]:
# Apply the fitted pipeline to the training frame and preview the first 5 rows.
transformed = model.transform(dataset=train_light)
transformed.show(n=5)

In [None]:
# LogisticRegression is not imported by any other cell, so import it here.
from pyspark.ml.classification import LogisticRegression

# Fit a logistic regression on the assembled "features" vector against the
# "label" column. These are the estimator's default column names; they are
# spelled out so the dependency on the pipeline's output is explicit.
lrModel = LogisticRegression(featuresCol="features", labelCol="label").fit(transformed)

In [None]:
# Report the fitted model's coefficient vector and intercept.
print("".join(["Coefficients: ", str(lrModel.coefficients)]))
print("".join(["Intercept: ", str(lrModel.intercept)]))