## set up

In [1]:
from IPython.display import display
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.ml.linalg import Vectors, SparseVector, VectorUDT

In [2]:
import numpy as np
import scipy.sparse
import math
import datetime
import time
import itertools
import pickle
import pandas as pd
%matplotlib inline

In [3]:
import random
random.seed(42)

In [4]:
# 해시 함수를 가진 모듈 hashlib
import hashlib
def hashstr(s, nr_bins):
    return int(hashlib.md5(s.encode('utf8')).hexdigest(), 16)%(nr_bins-1)+1

## train_feature_vectors_exported_df

In [5]:
train_feature_vectors_exported_df = spark.read.parquet("gs://capstone-01/output/train_feature_vectors_integral_eval")
%time train_feature_vectors_exported_df.take(1)

CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 9.24 s


[Row(uuid=u'10005a0add15f6', display_id=5686397, ad_id=61941, document_id_event=2135921, document_id=1017869, label=1, feature_vector=SparseVector(103, {0: 1.0, 3: 11370.0, 4: 15083.0, 5: 0.0, 6: 3.0, 7: 173.0, 8: 0.4069, 9: 0.5623, 10: 0.2288, 11: 0.3902, 12: 0.5131, 13: 0.2002, 14: 0.2691, 15: 0.3531, 16: 0.095, 17: 0.3902, 18: 0.5131, 19: 0.2002, 20: 0.3902, 21: 0.5131, 22: 0.2002, 26: 0.3902, 27: 0.5131, 28: 0.2002, 29: 0.3902, 30: 0.5131, 31: 0.2002, 32: 0.3826, 33: 0.2833, 34: 0.1084, 35: 0.3962, 36: 0.2833, 37: 0.1122, 38: 0.2738, 39: 0.0012, 40: 0.0003, 41: 0.2801, 42: 0.0012, 43: 0.0003, 44: 0.2298, 45: 0.0622, 46: 0.0143, 47: 0.2336, 48: 0.0595, 49: 0.0139, 59: 0.0, 60: 0.0004, 61: 0.0, 62: 0.0, 63: 0.0, 64: 0.0, 68: 2413.0, 69: 1403.0, 70: 1610.0, 72: 108.0, 75: 194.0, 76: 15.0, 81: 440.0, 82: 4016.0, 83: 1702.0, 84: 1707.0, 86: 137.0, 95: 723.0, 96: 4194.0, 97: 18595452.0, 98: 745661.0, 99: 33260.0, 100: 3.0, 101: 2.0, 102: 1.0}))]

## feature vector header

In [6]:
bool_feature_names = ['event_weekend',
                      'user_has_already_viewed_doc']

In [7]:
int_feature_names = ['user_views',
                    'ad_views',
                    'doc_views',
                    'doc_event_days_since_published',
                    'doc_event_hour',
                    'doc_ad_days_since_published', 
                    ]

In [8]:
float_feature_names = [                                
                'pop_ad_id',       
                'pop_ad_id_conf',   
                'pop_ad_id_conf_multipl', 
                'pop_document_id',                
                'pop_document_id_conf',
                'pop_document_id_conf_multipl',
                'pop_publisher_id',
                'pop_publisher_id_conf',
                'pop_publisher_id_conf_multipl',
                'pop_advertiser_id',
                'pop_advertiser_id_conf',
                'pop_advertiser_id_conf_multipl',
                'pop_campain_id',
                'pop_campain_id_conf',
                'pop_campain_id_conf_multipl',
                'pop_doc_event_doc_ad',
                'pop_doc_event_doc_ad_conf',
                'pop_doc_event_doc_ad_conf_multipl',
                'pop_source_id',  
                'pop_source_id_conf',
                'pop_source_id_conf_multipl',
                'pop_source_id_country',
                'pop_source_id_country_conf',
                'pop_source_id_country_conf_multipl',
                'pop_entity_id',    
                'pop_entity_id_conf',
                'pop_entity_id_conf_multipl',
                'pop_entity_id_country',
                'pop_entity_id_country_conf',
                'pop_entity_id_country_conf_multipl',
                'pop_topic_id', 
                'pop_topic_id_conf',
                'pop_topic_id_conf_multipl',
                'pop_topic_id_country',
                'pop_topic_id_country_conf',
                'pop_topic_id_country_conf_multipl',
                'pop_category_id', 
                'pop_category_id_conf',
                'pop_category_id_conf_multipl',
                'pop_category_id_country',
                'pop_category_id_country_conf',
                'pop_category_id_country_conf_multipl',
                'user_doc_ad_sim_categories',    
                'user_doc_ad_sim_categories_conf',
                'user_doc_ad_sim_categories_conf_multipl',
                'user_doc_ad_sim_topics',    
                'user_doc_ad_sim_topics_conf',
                'user_doc_ad_sim_topics_conf_multipl',
                'user_doc_ad_sim_entities',                    
                'user_doc_ad_sim_entities_conf',
                'user_doc_ad_sim_entities_conf_multipl',
                'doc_event_doc_ad_sim_categories',    
                'doc_event_doc_ad_sim_categories_conf',
                'doc_event_doc_ad_sim_categories_conf_multipl',
                'doc_event_doc_ad_sim_topics',    
                'doc_event_doc_ad_sim_topics_conf',
                'doc_event_doc_ad_sim_topics_conf_multipl',
                'doc_event_doc_ad_sim_entities',                    
                'doc_event_doc_ad_sim_entities_conf',
                'doc_event_doc_ad_sim_entities_conf_multipl'
               ]

In [9]:
TRAFFIC_SOURCE_FV='traffic_source'
EVENT_HOUR_FV='event_hour'
EVENT_COUNTRY_FV = 'event_country'
EVENT_COUNTRY_STATE_FV = 'event_country_state'
EVENT_GEO_LOCATION_FV = 'event_geo_location'
EVENT_PLATFORM_FV = 'event_platform'
AD_ADVERTISER_FV = 'ad_advertiser'
DOC_AD_SOURCE_ID_FV='doc_ad_source_id'
DOC_AD_PUBLISHER_ID_FV='doc_ad_publisher_id'
DOC_EVENT_SOURCE_ID_FV='doc_event_source_id'
DOC_EVENT_PUBLISHER_ID_FV='doc_event_publisher_id'
DOC_AD_CATEGORY_ID_FV='doc_ad_category_id'
DOC_AD_TOPIC_ID_FV='doc_ad_topic_id'
DOC_AD_ENTITY_ID_FV='doc_ad_entity_id'
DOC_EVENT_CATEGORY_ID_FV='doc_event_category_id'
DOC_EVENT_TOPIC_ID_FV='doc_event_topic_id'
DOC_EVENT_ENTITY_ID_FV='doc_event_entity_id'

In [10]:
category_feature_names_integral = ['ad_advertiser',
 'doc_ad_category_id_1',
 'doc_ad_category_id_2',
 'doc_ad_category_id_3',
 'doc_ad_topic_id_1',
 'doc_ad_topic_id_2',
 'doc_ad_topic_id_3',
 'doc_ad_entity_id_1', 
 'doc_ad_entity_id_2', 
 'doc_ad_entity_id_3', 
 'doc_ad_entity_id_4', 
 'doc_ad_entity_id_5', 
 'doc_ad_entity_id_6', 
 'doc_ad_publisher_id',
 'doc_ad_source_id', 
 'doc_event_category_id_1',
 'doc_event_category_id_2',
 'doc_event_category_id_3',
 'doc_event_topic_id_1',
 'doc_event_topic_id_2',
 'doc_event_topic_id_3',
 'doc_event_entity_id_1',
 'doc_event_entity_id_2',
 'doc_event_entity_id_3',
 'doc_event_entity_id_4',
 'doc_event_entity_id_5',
 'doc_event_entity_id_6',
 'doc_event_publisher_id',
 'doc_event_source_id', 
 'event_country',
 'event_country_state',
 'event_geo_location',
 'event_hour',
 'event_platform',
 'traffic_source']

In [11]:
feature_vector_labels_integral = bool_feature_names + int_feature_names + float_feature_names + \
                                 category_feature_names_integral

In [12]:
integral_headers = ['label', 'display_id', 'ad_id', 'doc_id', 'doc_event_id', 'is_leak'] + feature_vector_labels_integral
    
with open("train_feature_vectors_integral_eval.csv"+".header", 'w') as output:
    output.writelines('\n'.join(integral_headers))

In [13]:
def sparse_vector_to_csv_with_nulls_row(additional_column_values, vec, num_columns):    
    return ','.join([str(value) for value in additional_column_values] + 
                     list([ '{:.5}'.format(vec[x]) if x in vec.indices else '' for x in range(vec.size) ])[:num_columns]) \
            .replace('.0,',',')

In [14]:
train_feature_vectors_integral_csv_rdd = train_feature_vectors_exported_df.select(
     'label', 'display_id', 'ad_id', 'document_id', 'document_id_event', 'feature_vector').withColumn('is_leak', F.lit(-1)) \
     .rdd.map(lambda x: sparse_vector_to_csv_with_nulls_row([x['label'], x['display_id'], x['ad_id'], x['document_id'], x['document_id_event'], x['is_leak']], 
                                                  x['feature_vector'], len(integral_headers)))

## rdd to dataframe

In [37]:
# rdd를 dataframe으로 변환한 것 -> trainingData
trainingData = train_feature_vectors_integral_csv_rdd.map(lambda x : x.split(",")).toDF()

In [16]:
# spark dataframe으로 잘 변환된 것을 확인할 수 있다
type(trainingData)

pyspark.sql.dataframe.DataFrame

In [17]:
# 모든 컬럼의 datatype이 string인 것을 확인할 수 있다
trainingData.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: string (nullable = true)
 |-- _3: string (nullable = true)
 |-- _4: string (nullable = true)
 |-- _5: string (nullable = true)
 |-- _6: string (nullable = true)
 |-- _7: string (nullable = true)
 |-- _8: string (nullable = true)
 |-- _9: string (nullable = true)
 |-- _10: string (nullable = true)
 |-- _11: string (nullable = true)
 |-- _12: string (nullable = true)
 |-- _13: string (nullable = true)
 |-- _14: string (nullable = true)
 |-- _15: string (nullable = true)
 |-- _16: string (nullable = true)
 |-- _17: string (nullable = true)
 |-- _18: string (nullable = true)
 |-- _19: string (nullable = true)
 |-- _20: string (nullable = true)
 |-- _21: string (nullable = true)
 |-- _22: string (nullable = true)
 |-- _23: string (nullable = true)
 |-- _24: string (nullable = true)
 |-- _25: string (nullable = true)
 |-- _26: string (nullable = true)
 |-- _27: string (nullable = true)
 |-- _28: string (nullable = true)
 |-- _29: string (nullab

In [18]:
# show를 통해 데이터 형태를 보자
trainingData.show(1)

+---+-------+-----+-------+-------+---+---+---+---+---------+----------+---+---+---+-------+-------+-------+-------+-------+-------+------+-------+--------+-------+-------+-------+-------+-------+-------+---+---+---+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+---------+----------+-------+---------+----------+-------+------+--------+-------+--------+-------+---+---+---+---+---+---+---+---+---+---+----------+---+---+----------+---+---+---+---+----+----+----+---+---+---+---+---+---+---+---+---+---+---+----+----+----+---+---+---+---+---+---+---+---+----+----+----+----+----------+----------+---------+----+----+----+
| _1|     _2|   _3|     _4|     _5| _6| _7| _8| _9|      _10|       _11|_12|_13|_14|    _15|    _16|    _17|    _18|    _19|    _20|   _21|    _22|     _23|    _24|    _25|    _26|    _27|    _28|    _29|_30|_31|_32|    _33|    _34|    _35|    _36|    _37|    _38|    _39|    _40|    _41|    _42|    _43|    _44|    _45| 

In [19]:
# 이제 dataframe의 컬럼별로 datatype을 바꿔보자
# 테스트로, "_1" 열의 datatype을 string에서 integer로 바꿔보자
# datatype 변환에는 cast 함수를 사용한다
# printScheme()을 실행하면, datatype 변환이 잘 되었음을 알 수 있다
changedTypedf = trainingData.withColumn("label", trainingData["_1"].cast("integer"))
changedTypedf.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: string (nullable = true)
 |-- _3: string (nullable = true)
 |-- _4: string (nullable = true)
 |-- _5: string (nullable = true)
 |-- _6: string (nullable = true)
 |-- _7: string (nullable = true)
 |-- _8: string (nullable = true)
 |-- _9: string (nullable = true)
 |-- _10: string (nullable = true)
 |-- _11: string (nullable = true)
 |-- _12: string (nullable = true)
 |-- _13: string (nullable = true)
 |-- _14: string (nullable = true)
 |-- _15: string (nullable = true)
 |-- _16: string (nullable = true)
 |-- _17: string (nullable = true)
 |-- _18: string (nullable = true)
 |-- _19: string (nullable = true)
 |-- _20: string (nullable = true)
 |-- _21: string (nullable = true)
 |-- _22: string (nullable = true)
 |-- _23: string (nullable = true)
 |-- _24: string (nullable = true)
 |-- _25: string (nullable = true)
 |-- _26: string (nullable = true)
 |-- _27: string (nullable = true)
 |-- _28: string (nullable = true)
 |-- _29: string (nullab

In [38]:
# 이제 컬럼별로 datatype을 선언하자

trainingData_final = trainingData.select(
        trainingData._1.cast("Integer").alias("label"),
        trainingData._2.cast("Integer").alias("display_id"), 
        trainingData._3.cast("Integer").alias("ad_id"), 
        trainingData._4.cast("Integer").alias("doc_id"),
        trainingData._5.cast("Integer").alias("doc_event_id"), 
        trainingData._6.cast("Integer").alias("is_leak"),
        trainingData._7.cast("Integer").alias("event_weekend"),
        trainingData._8.cast("Integer").alias("user_has_already_viewed_doc"), 
        trainingData._9.cast("Integer").alias("user_views"), 
        trainingData._10.cast("Integer").alias("ad_views"),
        trainingData._11.cast("Integer").alias("doc_views"), 
        trainingData._12.cast("Integer").alias("doc_event_days_since_published"),
        trainingData._13.cast("Float").alias("doc_event_hour"),
        trainingData._14.cast("Float").alias("doc_ad_days_since_published"), 
        trainingData._15.cast("Float").alias("pop_ad_id"), 
        trainingData._16.cast("Float").alias("pop_ad_id_conf"),
        trainingData._17.cast("Float").alias("pop_ad_id_conf_multipl"), 
        trainingData._18.cast("Float").alias("pop_document_id"),
        trainingData._19.cast("Float").alias("pop_document_id_conf"),
        trainingData._20.cast("Float").alias("pop_document_id_conf_multipl"), 
        trainingData._21.cast("Float").alias("pop_publisher_id"), 
        trainingData._22.cast("Float").alias("pop_publisher_id_conf"),
        trainingData._23.cast("Float").alias("pop_publisher_id_conf_multipl"), 
        trainingData._24.cast("Float").alias("pop_advertiser_id"),
        trainingData._25.cast("Float").alias("pop_advertiser_id_conf"),
        trainingData._26.cast("Float").alias("pop_advertiser_id_conf_multipl"), 
        trainingData._27.cast("Float").alias("pop_campain_id"), 
        trainingData._28.cast("Float").alias("pop_campain_id_conf"),
        trainingData._29.cast("Float").alias("pop_campain_id_conf_multipl"), 
        trainingData._30.cast("Float").alias("pop_doc_event_doc_ad"),
        trainingData._31.cast("Float").alias("pop_doc_event_doc_ad_conf"),
        trainingData._32.cast("Float").alias("pop_doc_event_doc_ad_conf_multipl"), 
        trainingData._33.cast("Float").alias("pop_source_id"), 
        trainingData._34.cast("Float").alias("pop_source_id_conf"),
        trainingData._35.cast("Float").alias("pop_source_id_conf_multipl"), 
        trainingData._36.cast("Float").alias("pop_source_id_country"),
        trainingData._37.cast("Float").alias("pop_source_id_country_conf"),
        trainingData._38.cast("Float").alias("pop_source_id_country_conf_multipl"),
        trainingData._39.cast("Float").alias("pop_entity_id"),
        trainingData._40.cast("Float").alias("pop_entity_id_conf"),
        trainingData._41.cast("Float").alias("pop_entity_id_conf_multipl"),
        trainingData._42.cast("Float").alias("pop_entity_id_country"),
        trainingData._43.cast("Float").alias("pop_entity_id_country_conf"),
        trainingData._44.cast("Float").alias("pop_entity_id_country_conf_multipl"),
        trainingData._45.cast("Float").alias("pop_topic_id"),
        trainingData._46.cast("Float").alias("pop_topic_id_conf"),
        trainingData._47.cast("Float").alias("pop_topic_id_conf_multipl"),
        trainingData._48.cast("Float").alias("pop_topic_id_country"),
        trainingData._49.cast("Float").alias("pop_topic_id_country_conf"),
        trainingData._50.cast("Float").alias("pop_topic_id_country_conf_multipl"),
        trainingData._51.cast("Float").alias("pop_category_id"),
        trainingData._52.cast("Float").alias("pop_category_id_conf"),
        trainingData._53.cast("Float").alias("pop_category_id_conf_multipl"),
        trainingData._54.cast("Float").alias("pop_category_id_country"),
        trainingData._55.cast("Float").alias("pop_category_id_country_conf"),
        trainingData._56.cast("Float").alias("pop_category_id_country_conf_multipl"),
        trainingData._57.cast("Float").alias("user_doc_ad_sim_categories"),
        trainingData._58.cast("Float").alias("user_doc_ad_sim_categories_conf"),
        trainingData._59.cast("Float").alias("user_doc_ad_sim_categories_conf_multipl"),
        trainingData._60.cast("Float").alias("user_doc_ad_sim_topics"),
        trainingData._61.cast("Float").alias("user_doc_ad_sim_topics_conf"),
        trainingData._62.cast("Float").alias("user_doc_ad_sim_topics_conf_multipl"),
        trainingData._63.cast("Float").alias("user_doc_ad_sim_entities"),
        trainingData._64.cast("Float").alias("user_doc_ad_sim_entities_conf"),
        trainingData._65.cast("Float").alias("user_doc_ad_sim_entities_conf_multipl"),
        trainingData._66.cast("Float").alias("doc_event_doc_ad_sim_categories"),
        trainingData._67.cast("Float").alias("doc_event_doc_ad_sim_categories_conf"),
        trainingData._68.cast("Float").alias("doc_event_doc_ad_sim_categories_conf_multipl"),
        trainingData._69.cast("Float").alias("doc_event_doc_ad_sim_topics"),
        trainingData._70.cast("Float").alias("doc_event_doc_ad_sim_topics_conf"),
        trainingData._71.cast("Float").alias("doc_event_doc_ad_sim_topics_conf_multipl"),
        trainingData._72.cast("Float").alias("doc_event_doc_ad_sim_entities"),
        trainingData._73.cast("Float").alias("doc_event_doc_ad_sim_entities_conf"),
        trainingData._74.cast("Float").alias("doc_event_doc_ad_sim_entities_conf_multipl"),
        trainingData._75.cast("Integer").alias("ad_advertiser"),
        trainingData._76.cast("Integer").alias("doc_ad_category_id_1"),
        trainingData._77.cast("Integer").alias("doc_ad_category_id_2"),
        trainingData._78.cast("Integer").alias("doc_ad_category_id_3"),
        trainingData._79.cast("Integer").alias("doc_ad_topic_id_1"),
        trainingData._80.cast("Integer").alias("doc_ad_topic_id_2"),
        trainingData._81.cast("Integer").alias("doc_ad_topic_id_3"),
        trainingData._82.cast("Integer").alias("doc_ad_entity_id_1"),
        trainingData._83.cast("Integer").alias("doc_ad_entity_id_2"),
        trainingData._84.cast("Integer").alias("doc_ad_entity_id_3"),
        trainingData._85.cast("Integer").alias("doc_ad_entity_id_4"),
        trainingData._86.cast("Integer").alias("doc_ad_entity_id_5"),
        trainingData._87.cast("Integer").alias("doc_ad_entity_id_6"),
        trainingData._88.cast("Integer").alias("doc_ad_publisher_id"),
        trainingData._89.cast("Integer").alias("doc_ad_source_id"),
        trainingData._90.cast("Integer").alias("doc_event_category_id_1"),
        trainingData._91.cast("Integer").alias("doc_event_category_id_2"),
        trainingData._92.cast("Integer").alias("doc_event_category_id_3"),
        trainingData._93.cast("Integer").alias("doc_event_topic_id_1"),
        trainingData._94.cast("Integer").alias("doc_event_topic_id_2"),
        trainingData._95.cast("Integer").alias("doc_event_topic_id_3"),
        trainingData._96.cast("Integer").alias("doc_event_entity_id_1"),
        trainingData._97.cast("Integer").alias("doc_event_entity_id_2"),
        trainingData._98.cast("Integer").alias("doc_event_entity_id_3"),
        trainingData._99.cast("Integer").alias("doc_event_entity_id_4"),
        trainingData._100.cast("Integer").alias("doc_event_entity_id_5"),
        trainingData._101.cast("Integer").alias("doc_event_entity_id_6"),
        trainingData._102.cast("Integer").alias("doc_event_publisher_id"),
        trainingData._103.cast("Integer").alias("doc_event_source_id"),
        trainingData._104.cast("Integer").alias("event_country"),
        trainingData._105.cast("Integer").alias("event_country_state"),
        trainingData._106.cast("Integer").alias("event_geo_location"),
        trainingData._107.cast("Integer").alias("event_hour"),
        trainingData._108.cast("Integer").alias("event_platform"),
        trainingData._109.cast("Integer").alias("traffic_source")
    )

In [21]:
# spark dataframe 형태인 것을 한번 더 확인
type(trainingData_final)

pyspark.sql.dataframe.DataFrame

In [39]:
# 컬럼별 datatype이 잘 변경된 것을 확인할 수 있다
trainingData_final.printSchema()

root
 |-- label: integer (nullable = true)
 |-- display_id: integer (nullable = true)
 |-- ad_id: integer (nullable = true)
 |-- doc_id: integer (nullable = true)
 |-- doc_event_id: integer (nullable = true)
 |-- is_leak: integer (nullable = true)
 |-- event_weekend: integer (nullable = true)
 |-- user_has_already_viewed_doc: integer (nullable = true)
 |-- user_views: integer (nullable = true)
 |-- ad_views: integer (nullable = true)
 |-- doc_views: integer (nullable = true)
 |-- doc_event_days_since_published: integer (nullable = true)
 |-- doc_event_hour: float (nullable = true)
 |-- doc_ad_days_since_published: float (nullable = true)
 |-- pop_ad_id: float (nullable = true)
 |-- pop_ad_id_conf: float (nullable = true)
 |-- pop_ad_id_conf_multipl: float (nullable = true)
 |-- pop_document_id: float (nullable = true)
 |-- pop_document_id_conf: float (nullable = true)
 |-- pop_document_id_conf_multipl: float (nullable = true)
 |-- pop_publisher_id: float (nullable = true)
 |-- pop_publ

## drop some columns

In [40]:
trainingData_final_dropped = trainingData_final.drop('doc_event_entity_id_1', 'doc_event_entity_id_2', 'doc_event_entity_id_3', 'doc_event_entity_id_4', 'doc_event_entity_id_5', 'doc_event_entity_id_6', 'pop_entity_id', 'pop_entity_id_conf', 'pop_entity_id_conf_multipl', 'doc_ad_entity_id_1', 'doc_ad_entity_id_2', 'doc_ad_entity_id_3', 'doc_ad_entity_id_4', 'doc_ad_entity_id_5', 'doc_ad_entity_id_6', 'doc_ad_days_since_published', 'doc_event_days_since_published', 'user_has_already_viewed_doc', 'pop_entity_id', 'pop_entity_id_conf', 'pop_entity_id_conf_multipl', 'pop_entity_id_country', 'pop_entity_id_country_conf', 'pop_entity_id_country_conf_multipl', 'user_doc_ad_sim_categories', 'user_doc_ad_sim_categories_conf', 'user_doc_ad_sim_categories_conf_multipl', 'user_doc_ad_sim_topics', 'user_doc_ad_sim_topics_conf', 'user_doc_ad_sim_topics_conf_multipl', 'user_doc_ad_sim_entities', 'user_doc_ad_sim_entities_conf', 'user_doc_ad_sim_entities_conf_multipl', 'doc_event_doc_ad_sim_categories', 'doc_event_doc_ad_sim_categories_conf', 'doc_event_doc_ad_sim_categories_conf_multipl', 'doc_event_doc_ad_sim_topics', 'doc_event_doc_ad_sim_topics_conf', 'doc_event_doc_ad_sim_topics_conf_multipl', 'doc_event_doc_ad_sim_entities', 'doc_event_doc_ad_sim_entities_conf', 'doc_event_doc_ad_sim_entities_conf_multipl', 'ad_id')

In [41]:
trainingData_final_dropped.printSchema()

root
 |-- label: integer (nullable = true)
 |-- display_id: integer (nullable = true)
 |-- doc_id: integer (nullable = true)
 |-- doc_event_id: integer (nullable = true)
 |-- is_leak: integer (nullable = true)
 |-- event_weekend: integer (nullable = true)
 |-- user_views: integer (nullable = true)
 |-- ad_views: integer (nullable = true)
 |-- doc_views: integer (nullable = true)
 |-- doc_event_hour: float (nullable = true)
 |-- pop_ad_id: float (nullable = true)
 |-- pop_ad_id_conf: float (nullable = true)
 |-- pop_ad_id_conf_multipl: float (nullable = true)
 |-- pop_document_id: float (nullable = true)
 |-- pop_document_id_conf: float (nullable = true)
 |-- pop_document_id_conf_multipl: float (nullable = true)
 |-- pop_publisher_id: float (nullable = true)
 |-- pop_publisher_id_conf: float (nullable = true)
 |-- pop_publisher_id_conf_multipl: float (nullable = true)
 |-- pop_advertiser_id: float (nullable = true)
 |-- pop_advertiser_id_conf: float (nullable = true)
 |-- pop_advertiser

In [46]:
# 최종 train dataframe을 train_x이라 한다
train_x = trainingData_final_dropped.drop("label")
# label(clicked)는 train_y로 저장한다
train_y = trainingData_final_dropped.select("label")

In [48]:
# train의 컬럼명 확인
sorted(train_x.columns)

['ad_advertiser',
 'ad_views',
 'display_id',
 'doc_ad_category_id_1',
 'doc_ad_category_id_2',
 'doc_ad_category_id_3',
 'doc_ad_publisher_id',
 'doc_ad_source_id',
 'doc_ad_topic_id_1',
 'doc_ad_topic_id_2',
 'doc_ad_topic_id_3',
 'doc_event_category_id_1',
 'doc_event_category_id_2',
 'doc_event_category_id_3',
 'doc_event_hour',
 'doc_event_id',
 'doc_event_publisher_id',
 'doc_event_source_id',
 'doc_event_topic_id_1',
 'doc_event_topic_id_2',
 'doc_event_topic_id_3',
 'doc_id',
 'doc_views',
 'event_country',
 'event_country_state',
 'event_geo_location',
 'event_hour',
 'event_platform',
 'event_weekend',
 'is_leak',
 'pop_ad_id',
 'pop_ad_id_conf',
 'pop_ad_id_conf_multipl',
 'pop_advertiser_id',
 'pop_advertiser_id_conf',
 'pop_advertiser_id_conf_multipl',
 'pop_campain_id',
 'pop_campain_id_conf',
 'pop_campain_id_conf_multipl',
 'pop_category_id',
 'pop_category_id_conf',
 'pop_category_id_conf_multipl',
 'pop_category_id_country',
 'pop_category_id_country_conf',
 'pop_cate

## validation set (rdd -> dataframe)

In [50]:
test_validation_feature_vectors_exported_df = spark.read.parquet("gs://capstone-01/output/validation_feature_vectors_integral")
%time test_validation_feature_vectors_exported_df.take(1)

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 8.76 s


[Row(uuid=u'100289071872c9', display_id=3110701, ad_id=149004, document_id_event=1973614, document_id=1335842, label=1, is_leak=0, feature_vector=SparseVector(103, {0: 0.0, 1: 0.0, 2: 43.0, 3: 5042.0, 4: 15296.0, 5: 0.0, 6: 3.0, 7: 54.0, 8: 0.2582, 9: 0.5133, 10: 0.1326, 11: 0.2288, 12: 0.5384, 13: 0.1232, 14: 0.2288, 15: 0.5384, 16: 0.1232, 17: 0.1855, 18: 0.3432, 19: 0.0637, 20: 0.2378, 21: 0.332, 22: 0.079, 26: 0.2288, 27: 0.5384, 28: 0.1232, 29: 0.2288, 30: 0.5384, 31: 0.1232, 32: 0.2288, 33: 0.4539, 34: 0.1039, 35: 0.2288, 36: 0.4539, 37: 0.1039, 38: 0.1948, 39: 0.018, 40: 0.0035, 41: 0.2483, 42: 0.0167, 43: 0.0041, 44: 0.1838, 45: 0.1421, 46: 0.0261, 47: 0.1886, 48: 0.149, 49: 0.0281, 50: 0.0645, 51: 1.0, 52: 0.0645, 53: 0.0, 54: 0.0005, 55: 0.0, 56: 0.0, 57: 0.0, 58: 0.0, 59: 0.0136, 60: 0.9996, 61: 0.0136, 62: 0.0, 63: 0.0, 64: 0.0, 65: 0.0, 66: 0.0, 67: 0.0, 68: 709.0, 69: 1100.0, 70: 1408.0, 72: 277.0, 81: 509.0, 82: 3890.0, 83: 1403.0, 84: 1408.0, 86: 136.0, 95: 407.0, 96: 6

In [51]:
test_validation_feature_vector_integral_csv_folder_name = 'validation_feature_vectors_integral.csv'
integral_headers = ['label', 'display_id', 'ad_id', 'doc_id', 'doc_event_id', 'is_leak'] + feature_vector_labels_integral
    
with open(test_validation_feature_vector_integral_csv_folder_name+".header", 'w') as output:
    output.writelines('\n'.join(integral_headers))

In [52]:
test_validation_feature_vectors_integral_csv_rdd = test_validation_feature_vectors_exported_df.select(
     'label', 'display_id', 'ad_id', 'document_id', 'document_id_event', 'is_leak', 'feature_vector') \
     .rdd.map(lambda x: sparse_vector_to_csv_with_nulls_row([x['label'], x['display_id'], x['ad_id'], x['document_id'], x['document_id_event'], x['is_leak']], 
                                                  x['feature_vector'], len(integral_headers)))

In [54]:
# rdd를 dataframe으로 변환한 것 -> validationData
validationData = test_validation_feature_vectors_integral_csv_rdd.map(lambda x : x.split(",")).toDF()

In [55]:
# 이제 컬럼별로 datatype을 선언하자

validationData_final = validationData.select(
        validationData._1.cast("Integer").alias("label"),
        validationData._2.cast("Integer").alias("display_id"), 
        validationData._3.cast("Integer").alias("ad_id"), 
        validationData._4.cast("Integer").alias("doc_id"),
        validationData._5.cast("Integer").alias("doc_event_id"), 
        validationData._6.cast("Integer").alias("is_leak"),
        validationData._7.cast("Integer").alias("event_weekend"),
        validationData._8.cast("Integer").alias("user_has_already_viewed_doc"), 
        validationData._9.cast("Integer").alias("user_views"), 
        validationData._10.cast("Integer").alias("ad_views"),
        validationData._11.cast("Integer").alias("doc_views"), 
        validationData._12.cast("Integer").alias("doc_event_days_since_published"),
        validationData._13.cast("Float").alias("doc_event_hour"),
        validationData._14.cast("Float").alias("doc_ad_days_since_published"), 
        validationData._15.cast("Float").alias("pop_ad_id"), 
        validationData._16.cast("Float").alias("pop_ad_id_conf"),
        validationData._17.cast("Float").alias("pop_ad_id_conf_multipl"), 
        validationData._18.cast("Float").alias("pop_document_id"),
        validationData._19.cast("Float").alias("pop_document_id_conf"),
        validationData._20.cast("Float").alias("pop_document_id_conf_multipl"), 
        validationData._21.cast("Float").alias("pop_publisher_id"), 
        validationData._22.cast("Float").alias("pop_publisher_id_conf"),
        validationData._23.cast("Float").alias("pop_publisher_id_conf_multipl"), 
        validationData._24.cast("Float").alias("pop_advertiser_id"),
        validationData._25.cast("Float").alias("pop_advertiser_id_conf"),
        validationData._26.cast("Float").alias("pop_advertiser_id_conf_multipl"), 
        validationData._27.cast("Float").alias("pop_campain_id"), 
        validationData._28.cast("Float").alias("pop_campain_id_conf"),
        validationData._29.cast("Float").alias("pop_campain_id_conf_multipl"), 
        validationData._30.cast("Float").alias("pop_doc_event_doc_ad"),
        validationData._31.cast("Float").alias("pop_doc_event_doc_ad_conf"),
        validationData._32.cast("Float").alias("pop_doc_event_doc_ad_conf_multipl"), 
        validationData._33.cast("Float").alias("pop_source_id"), 
        validationData._34.cast("Float").alias("pop_source_id_conf"),
        validationData._35.cast("Float").alias("pop_source_id_conf_multipl"), 
        validationData._36.cast("Float").alias("pop_source_id_country"),
        validationData._37.cast("Float").alias("pop_source_id_country_conf"),
        validationData._38.cast("Float").alias("pop_source_id_country_conf_multipl"),
        validationData._39.cast("Float").alias("pop_entity_id"),
        validationData._40.cast("Float").alias("pop_entity_id_conf"),
        validationData._41.cast("Float").alias("pop_entity_id_conf_multipl"),
        validationData._42.cast("Float").alias("pop_entity_id_country"),
        validationData._43.cast("Float").alias("pop_entity_id_country_conf"),
        validationData._44.cast("Float").alias("pop_entity_id_country_conf_multipl"),
        validationData._45.cast("Float").alias("pop_topic_id"),
        validationData._46.cast("Float").alias("pop_topic_id_conf"),
        validationData._47.cast("Float").alias("pop_topic_id_conf_multipl"),
        validationData._48.cast("Float").alias("pop_topic_id_country"),
        validationData._49.cast("Float").alias("pop_topic_id_country_conf"),
        validationData._50.cast("Float").alias("pop_topic_id_country_conf_multipl"),
        validationData._51.cast("Float").alias("pop_category_id"),
        validationData._52.cast("Float").alias("pop_category_id_conf"),
        validationData._53.cast("Float").alias("pop_category_id_conf_multipl"),
        validationData._54.cast("Float").alias("pop_category_id_country"),
        validationData._55.cast("Float").alias("pop_category_id_country_conf"),
        validationData._56.cast("Float").alias("pop_category_id_country_conf_multipl"),
        validationData._57.cast("Float").alias("user_doc_ad_sim_categories"),
        validationData._58.cast("Float").alias("user_doc_ad_sim_categories_conf"),
        validationData._59.cast("Float").alias("user_doc_ad_sim_categories_conf_multipl"),
        validationData._60.cast("Float").alias("user_doc_ad_sim_topics"),
        validationData._61.cast("Float").alias("user_doc_ad_sim_topics_conf"),
        validationData._62.cast("Float").alias("user_doc_ad_sim_topics_conf_multipl"),
        validationData._63.cast("Float").alias("user_doc_ad_sim_entities"),
        validationData._64.cast("Float").alias("user_doc_ad_sim_entities_conf"),
        validationData._65.cast("Float").alias("user_doc_ad_sim_entities_conf_multipl"),
        validationData._66.cast("Float").alias("doc_event_doc_ad_sim_categories"),
        validationData._67.cast("Float").alias("doc_event_doc_ad_sim_categories_conf"),
        validationData._68.cast("Float").alias("doc_event_doc_ad_sim_categories_conf_multipl"),
        validationData._69.cast("Float").alias("doc_event_doc_ad_sim_topics"),
        validationData._70.cast("Float").alias("doc_event_doc_ad_sim_topics_conf"),
        validationData._71.cast("Float").alias("doc_event_doc_ad_sim_topics_conf_multipl"),
        validationData._72.cast("Float").alias("doc_event_doc_ad_sim_entities"),
        validationData._73.cast("Float").alias("doc_event_doc_ad_sim_entities_conf"),
        validationData._74.cast("Float").alias("doc_event_doc_ad_sim_entities_conf_multipl"),
        validationData._75.cast("Integer").alias("ad_advertiser"),
        validationData._76.cast("Integer").alias("doc_ad_category_id_1"),
        validationData._77.cast("Integer").alias("doc_ad_category_id_2"),
        validationData._78.cast("Integer").alias("doc_ad_category_id_3"),
        validationData._79.cast("Integer").alias("doc_ad_topic_id_1"),
        validationData._80.cast("Integer").alias("doc_ad_topic_id_2"),
        validationData._81.cast("Integer").alias("doc_ad_topic_id_3"),
        validationData._82.cast("Integer").alias("doc_ad_entity_id_1"),
        validationData._83.cast("Integer").alias("doc_ad_entity_id_2"),
        validationData._84.cast("Integer").alias("doc_ad_entity_id_3"),
        validationData._85.cast("Integer").alias("doc_ad_entity_id_4"),
        validationData._86.cast("Integer").alias("doc_ad_entity_id_5"),
        validationData._87.cast("Integer").alias("doc_ad_entity_id_6"),
        validationData._88.cast("Integer").alias("doc_ad_publisher_id"),
        validationData._89.cast("Integer").alias("doc_ad_source_id"),
        validationData._90.cast("Integer").alias("doc_event_category_id_1"),
        validationData._91.cast("Integer").alias("doc_event_category_id_2"),
        validationData._92.cast("Integer").alias("doc_event_category_id_3"),
        validationData._93.cast("Integer").alias("doc_event_topic_id_1"),
        validationData._94.cast("Integer").alias("doc_event_topic_id_2"),
        validationData._95.cast("Integer").alias("doc_event_topic_id_3"),
        validationData._96.cast("Integer").alias("doc_event_entity_id_1"),
        validationData._97.cast("Integer").alias("doc_event_entity_id_2"),
        validationData._98.cast("Integer").alias("doc_event_entity_id_3"),
        validationData._99.cast("Integer").alias("doc_event_entity_id_4"),
        validationData._100.cast("Integer").alias("doc_event_entity_id_5"),
        validationData._101.cast("Integer").alias("doc_event_entity_id_6"),
        validationData._102.cast("Integer").alias("doc_event_publisher_id"),
        validationData._103.cast("Integer").alias("doc_event_source_id"),
        validationData._104.cast("Integer").alias("event_country"),
        validationData._105.cast("Integer").alias("event_country_state"),
        validationData._106.cast("Integer").alias("event_geo_location"),
        validationData._107.cast("Integer").alias("event_hour"),
        validationData._108.cast("Integer").alias("event_platform"),
        validationData._109.cast("Integer").alias("traffic_source")
    )

In [56]:
# drop some columns
validationData_final_dropped = validationData_final.drop('doc_event_entity_id_1', 'doc_event_entity_id_2', 'doc_event_entity_id_3', 'doc_event_entity_id_4', 'doc_event_entity_id_5', 'doc_event_entity_id_6', 'pop_entity_id', 'pop_entity_id_conf', 'pop_entity_id_conf_multipl', 'doc_ad_entity_id_1', 'doc_ad_entity_id_2', 'doc_ad_entity_id_3', 'doc_ad_entity_id_4', 'doc_ad_entity_id_5', 'doc_ad_entity_id_6', 'doc_ad_days_since_published', 'doc_event_days_since_published', 'user_has_already_viewed_doc', 'pop_entity_id', 'pop_entity_id_conf', 'pop_entity_id_conf_multipl', 'pop_entity_id_country', 'pop_entity_id_country_conf', 'pop_entity_id_country_conf_multipl', 'user_doc_ad_sim_categories', 'user_doc_ad_sim_categories_conf', 'user_doc_ad_sim_categories_conf_multipl', 'user_doc_ad_sim_topics', 'user_doc_ad_sim_topics_conf', 'user_doc_ad_sim_topics_conf_multipl', 'user_doc_ad_sim_entities', 'user_doc_ad_sim_entities_conf', 'user_doc_ad_sim_entities_conf_multipl', 'doc_event_doc_ad_sim_categories', 'doc_event_doc_ad_sim_categories_conf', 'doc_event_doc_ad_sim_categories_conf_multipl', 'doc_event_doc_ad_sim_topics', 'doc_event_doc_ad_sim_topics_conf', 'doc_event_doc_ad_sim_topics_conf_multipl', 'doc_event_doc_ad_sim_entities', 'doc_event_doc_ad_sim_entities_conf', 'doc_event_doc_ad_sim_entities_conf_multipl')

In [57]:
# 최종 validation dataframe을 valid_x이라 한다
valid_x = validationData_final_dropped.drop("label")
# label(clicked)는 valid_y로 저장한다
valid_y = validationData_final_dropped.select("label")

## before modeling

In [59]:
cat_vars = ['ad_advertiser', 'display_id', 'doc_ad_category_id_1', 'doc_ad_category_id_2', 'doc_ad_category_id_3', 'doc_ad_source_id', 'doc_ad_topic_id_1', 'doc_ad_topic_id_2', 'doc_ad_topic_id_3', 'doc_event_category_id_1', 'doc_event_category_id_2', 'doc_event_category_id_3', 'doc_event_hour', 'doc_event_id', 'doc_event_publisher_id', 'doc_event_source_id', 'doc_event_topic_id_1', 'doc_event_topic_id_2', 'doc_event_topic_id_3', 'doc_id', 'event_country', 'event_country_state', 'event_geo_location', 'event_hour', 'event_platform', 'event_weekend', 'is_leak', 'traffic_source']
contin_vars = ['ad_views', 'doc_views', 'pop_ad_id', 'pop_ad_id_conf', 'pop_ad_id_conf_multipl', 'pop_advertiser_id', 'pop_advertiser_id_conf', 'pop_advertiser_id_conf_multipl', 'pop_campain_id', 'pop_campain_id_conf', 'pop_campain_id_conf_multipl', 'pop_category_id', 'pop_category_id_conf', 'pop_category_id_conf_multipl', 'pop_category_id_country', 'pop_category_id_country_conf', 'pop_category_id_country_conf_multipl', 'pop_doc_event_doc_ad', 'pop_doc_event_doc_ad_conf', 'pop_doc_event_doc_ad_conf_multipl', 'pop_document_id', 'pop_document_id_conf', 'pop_document_id_conf_multipl', 'pop_publisher_id', 'pop_publisher_id_conf', 'pop_publisher_id_conf_multipl', 'pop_source_id', 'pop_source_id_conf', 'pop_source_id_conf_multipl', 'pop_source_id_country', 'pop_source_id_country_conf', 'pop_source_id_country_conf_multipl', 'pop_topic_id', 'pop_topic_id_conf', 'pop_topic_id_conf_multipl', 'pop_topic_id_country', 'pop_topic_id_country_conf', 'pop_topic_id_country_conf_multipl', 'user_views']

In [60]:
# Importing the libraries for string indexing and applying it to categorical values
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
indexers = [StringIndexer(inputCol=cat_var, outputCol=cat_var+"Index4", handleInvalid="skip") for cat_var in cat_vars]

# defining the model features into a variable
features_set = [cat_var+"Index4" for cat_var in cat_vars]

In [63]:
cont_assembler = VectorAssembler(inputCols=contin_vars, outputCol="contin_test_vars")
contin_scaler = StandardScaler(inputCol="contin_test_vars", outputCol="scaledFeatures", withStd=True, withMean=True)
features_set.append('scaledFeatures')

In [64]:
assembler = VectorAssembler(inputCols=features_set, outputCol="features")

In [65]:
# import the libraries for random forest
from pyspark.ml.regression import RandomForestRegressor
rf = RandomForestRegressor(labelCol='train_y', featuresCol='features', maxBins=1500)
# append the model stage in to the indexer
indexers.append(rf)
# define the indexers as the pipeline
pipeline = Pipeline(stages=indexers)
# print the stages defined in the pipeline
print(indexers)

[StringIndexer_474c81d92fb4365d4596, StringIndexer_4258a4de1ceba015361c, StringIndexer_458d841cb7b060792f0a, StringIndexer_43de99d0005eb918cb33, StringIndexer_43799e6ad55e6c42c793, StringIndexer_4b3ca1476cf648e08453, StringIndexer_44388bd6f3c2660eaf93, StringIndexer_4ba48708c16889687fd5, StringIndexer_4e938f6a4d887574bf49, StringIndexer_446a9724c78823332c9e, StringIndexer_4dfb8844c1680ecb3dc8, StringIndexer_4d578fc396624498327a, StringIndexer_445ea9226c1fa857a248, StringIndexer_4eff9e2e723aefdeb8c1, StringIndexer_446a8f97f72c8d095445, StringIndexer_4a2898f45e16c840b9d3, StringIndexer_4a489c7ad28caaf886c8, StringIndexer_44afac73746d49b10f11, StringIndexer_42f5968c8c16cd7aeeb2, StringIndexer_49e88dd1f81ebe5bdeb8, StringIndexer_416b9ff3645d005b0ccc, StringIndexer_4be7a8dca94e5f5b5dd8, StringIndexer_49688fef69a5a91ff777, StringIndexer_4b81b3346c12d32d5b57, StringIndexer_4cf0882db3a3d162c63d, StringIndexer_4d29904f80068c57012f, StringIndexer_41fa916391b01307257a, StringIndexer_4ecfaae624098

In [None]:
# fit the stages in a pipeline
model = pipeline.fit(train_x)
# predict the values on the test set
predictions = model.transform(valid_x)

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import mean, min, max, stddev, ceil
evaluator = RegressionEvaluator(labelCol='Sales', predictionCol='prediction', metricName='rmse')
rmse = evaluator.evaluate(predictions)
print("Root Mean Square Error on test data = %g" % rmse)