In [1]:
evaluation = True
evaluation_verbose = False

OUTPUT_BUCKET_FOLDER = "gs://cap-18/output/"
DATA_BUCKET_FOLDER = "gs://cap-18/data/"

In [2]:
from IPython.display import display

In [3]:
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.ml.linalg import Vectors, SparseVector, VectorUDT

In [4]:
from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all" # 한 셀(cell)에서의 코드 실행 결과가 다 보이도록 설정

In [5]:
import numpy as np
import scipy.sparse

In [6]:
import warnings 
warnings.filterwarnings('ignore') # warning 메시지 반환되지 않도록 설정

In [7]:
import math
import datetime
import time
import itertools

In [8]:
import pickle

In [9]:
import random
random.seed(42)

In [10]:
import pandas as pd
%matplotlib inline

### 데이터 로드

In [11]:
extract_country_udf = F.udf(lambda geo: geo.strip()[:2] if geo != None else '', StringType())

In [12]:
events_schema = StructType(
                    [StructField("display_id", IntegerType(), True),
                    StructField("uuid_event", StringType(), True),                    
                    StructField("document_id_event", IntegerType(), True),
                    StructField("timestamp_event", IntegerType(), True),
                    StructField("platform_event", IntegerType(), True),
                    StructField("geo_location_event", StringType(), True)]
                    )

events_df = spark.read.schema(events_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER + "events.csv") \
                .withColumn('dummyEvents', F.lit(1)) \
                .withColumn('event_country', extract_country_udf('geo_location_event')) \
                .alias('events')               

In [13]:
promoted_content_schema = StructType(
                    [StructField("ad_id", IntegerType(), True),
                    StructField("document_id_promo", IntegerType(), True),                    
                    StructField("campaign_id", IntegerType(), True),
                    StructField("advertiser_id", IntegerType(), True)]
                    )

promoted_content_df = spark.read.schema(promoted_content_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER+"promoted_content.csv") \
                .withColumn('dummyPromotedContent', F.lit(1)).alias('promoted_content').cache()

In [14]:
documents_meta_schema = StructType(
                    [StructField("document_id_doc", IntegerType(), True),
                    StructField("source_id", IntegerType(), True),                    
                    StructField("publisher_id", IntegerType(), True)]
                    )

documents_meta_df = spark.read.schema(documents_meta_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER+"documents_meta.csv") \
                .withColumn('dummyDocumentsMeta', F.lit(1)).alias('documents_meta').cache()

In [15]:
page_views_schema = StructType(
                    [StructField("uuid_pv", StringType(), True),
                    StructField("document_id_pv", IntegerType(), True),
                    StructField("timestamp_pv", IntegerType(), True),
                    StructField("platform_pv", IntegerType(), True),
                    StructField("geo_location_pv", StringType(), True),
                    StructField("traffic_source_pv", IntegerType(), True)]
                    )
page_views_df = spark.read.schema(page_views_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER+"page_views.csv") \
                .alias('page_views')        
            
page_views_df.createOrReplaceTempView('page_views')

In [16]:
#Joining with Page Views to get traffic_source_pv
events_joined_df = events_df.join(documents_meta_df \
                                  .withColumnRenamed('source_id', 'source_id_doc_event') \
                                  .withColumnRenamed('publisher_id', 'publisher_doc_event')
                                  , on=F.col("document_id_event") == F.col("document_id_doc"), how='left') \
                            .join(page_views_df, 
                                           on=[F.col('uuid_event') == F.col('uuid_pv'),
                                               F.col('document_id_event') == F.col('document_id_pv'),
                                               F.col('platform_event') == F.col('platform_pv'),
                                               F.col('geo_location_event') == F.col('geo_location_pv')],
                                               how='left') \
                                    .alias('events').cache()

In [95]:
clicks_train_schema = StructType(
                    [StructField("display_id", IntegerType(), True),
                    StructField("ad_id", IntegerType(), True),                    
                    StructField("clicked", IntegerType(), True)]
                    )

clicks_train_df = spark.read.schema(clicks_train_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER+"clicks_train.csv") \
                .withColumn('dummyClicksTrain', F.lit(1)).alias('clicks_train').cache()

In [96]:
clicks_train_df.count()

87141731

In [18]:
clicks_train_joined_df = clicks_train_df \
                         .join(promoted_content_df, on='ad_id', how='left') \
                         .join(documents_meta_df, on=F.col("promoted_content.document_id_promo") == F.col("documents_meta.document_id_doc"), how='left') \
                         .join(events_joined_df, on='display_id', how='left').cache()                         
clicks_train_joined_df.createOrReplaceTempView('clicks_train_joined')

In [31]:
clicks_train_joined_df.show(1)

+----------+-----+-------+----------------+-----------------+-----------+-------------+--------------------+---------------+---------+------------+------------------+--------------+-----------------+---------------+--------------+------------------+-----------+-------------+---------------+-------------------+-------------------+------------------+--------------+--------------+------------+-----------+---------------+-----------------+
|display_id|ad_id|clicked|dummyClicksTrain|document_id_promo|campaign_id|advertiser_id|dummyPromotedContent|document_id_doc|source_id|publisher_id|dummyDocumentsMeta|    uuid_event|document_id_event|timestamp_event|platform_event|geo_location_event|dummyEvents|event_country|document_id_doc|source_id_doc_event|publisher_doc_event|dummyDocumentsMeta|       uuid_pv|document_id_pv|timestamp_pv|platform_pv|geo_location_pv|traffic_source_pv|
+----------+-----+-------+----------------+-----------------+-----------+-------------+--------------------+------------

In [169]:
train_feature_vectors_exported_df = spark.read.parquet("gs://capstone-01/output/train_feature_vectors_integral_eval")
validation_feature_vectors_exported_df = spark.read.parquet("gs://capstone-01/output/validation_feature_vectors_integral")

train_feature_vectors_exported_df.take(1)
validation_feature_vectors_exported_df.take(1)

[Row(uuid=u'10005a0add15f6', display_id=5686397, ad_id=61941, document_id_event=2135921, document_id=1017869, label=1, feature_vector=SparseVector(103, {0: 1.0, 3: 11370.0, 4: 15083.0, 5: 0.0, 6: 3.0, 7: 173.0, 8: 0.4069, 9: 0.5623, 10: 0.2288, 11: 0.3902, 12: 0.5131, 13: 0.2002, 14: 0.2691, 15: 0.3531, 16: 0.095, 17: 0.3902, 18: 0.5131, 19: 0.2002, 20: 0.3902, 21: 0.5131, 22: 0.2002, 26: 0.3902, 27: 0.5131, 28: 0.2002, 29: 0.3902, 30: 0.5131, 31: 0.2002, 32: 0.3826, 33: 0.2833, 34: 0.1084, 35: 0.3962, 36: 0.2833, 37: 0.1122, 38: 0.2738, 39: 0.0012, 40: 0.0003, 41: 0.2801, 42: 0.0012, 43: 0.0003, 44: 0.2298, 45: 0.0622, 46: 0.0143, 47: 0.2336, 48: 0.0595, 49: 0.0139, 59: 0.0, 60: 0.0004, 61: 0.0, 62: 0.0, 63: 0.0, 64: 0.0, 68: 2413.0, 69: 1403.0, 70: 1610.0, 72: 108.0, 75: 194.0, 76: 15.0, 81: 440.0, 82: 4016.0, 83: 1702.0, 84: 1707.0, 86: 137.0, 95: 723.0, 96: 4194.0, 97: 18595452.0, 98: 745661.0, 99: 33260.0, 100: 3.0, 101: 2.0, 102: 1.0}))]

[Row(uuid=u'100289071872c9', display_id=3110701, ad_id=149004, document_id_event=1973614, document_id=1335842, label=1, is_leak=0, feature_vector=SparseVector(103, {0: 0.0, 1: 0.0, 2: 43.0, 3: 5042.0, 4: 15296.0, 5: 0.0, 6: 3.0, 7: 54.0, 8: 0.2582, 9: 0.5133, 10: 0.1326, 11: 0.2288, 12: 0.5384, 13: 0.1232, 14: 0.2288, 15: 0.5384, 16: 0.1232, 17: 0.1855, 18: 0.3432, 19: 0.0637, 20: 0.2378, 21: 0.332, 22: 0.079, 26: 0.2288, 27: 0.5384, 28: 0.1232, 29: 0.2288, 30: 0.5384, 31: 0.1232, 32: 0.2288, 33: 0.4539, 34: 0.1039, 35: 0.2288, 36: 0.4539, 37: 0.1039, 38: 0.1948, 39: 0.018, 40: 0.0035, 41: 0.2483, 42: 0.0167, 43: 0.0041, 44: 0.1838, 45: 0.1421, 46: 0.0261, 47: 0.1886, 48: 0.149, 49: 0.0281, 50: 0.0645, 51: 1.0, 52: 0.0645, 53: 0.0, 54: 0.0005, 55: 0.0, 56: 0.0, 57: 0.0, 58: 0.0, 59: 0.0136, 60: 0.9996, 61: 0.0136, 62: 0.0, 63: 0.0, 64: 0.0, 65: 0.0, 66: 0.0, 67: 0.0, 68: 709.0, 69: 1100.0, 70: 1408.0, 72: 277.0, 81: 509.0, 82: 3890.0, 83: 1403.0, 84: 1408.0, 86: 136.0, 95: 407.0, 96: 6

In [None]:
bool_feature_names = ['event_weekend',
                      'user_has_already_viewed_doc']

In [171]:
int_feature_names = ['user_views',
                    'ad_views',
                    'doc_views',
                    'doc_event_days_since_published',
                    'doc_event_hour',
                    'doc_ad_days_since_published', 
                    ]

In [172]:
float_feature_names = [                                
                'pop_ad_id',       
                'pop_ad_id_conf',   
                'pop_ad_id_conf_multipl', 
                'pop_document_id',                
                'pop_document_id_conf',
                'pop_document_id_conf_multipl',
                'pop_publisher_id',
                'pop_publisher_id_conf',
                'pop_publisher_id_conf_multipl',
                'pop_advertiser_id',
                'pop_advertiser_id_conf',
                'pop_advertiser_id_conf_multipl',
                'pop_campaign_id',
                'pop_campaign_id_conf',
                'pop_campaign_id_conf_multipl',
                'pop_doc_event_doc_ad',
                'pop_doc_event_doc_ad_conf',
                'pop_doc_event_doc_ad_conf_multipl',
                'pop_source_id',  
                'pop_source_id_conf',
                'pop_source_id_conf_multipl',
                'pop_source_id_country',
                'pop_source_id_country_conf',
                'pop_source_id_country_conf_multipl',
                'pop_entity_id',    
                'pop_entity_id_conf',
                'pop_entity_id_conf_multipl',
                'pop_entity_id_country',
                'pop_entity_id_country_conf',
                'pop_entity_id_country_conf_multipl',
                'pop_topic_id', 
                'pop_topic_id_conf',
                'pop_topic_id_conf_multipl',
                'pop_topic_id_country',
                'pop_topic_id_country_conf',
                'pop_topic_id_country_conf_multipl',
                'pop_category_id', 
                'pop_category_id_conf',
                'pop_category_id_conf_multipl',
                'pop_category_id_country',
                'pop_category_id_country_conf',
                'pop_category_id_country_conf_multipl',
                'user_doc_ad_sim_categories',    
                'user_doc_ad_sim_categories_conf',
                'user_doc_ad_sim_categories_conf_multipl',
                'user_doc_ad_sim_topics',    
                'user_doc_ad_sim_topics_conf',
                'user_doc_ad_sim_topics_conf_multipl',
                'user_doc_ad_sim_entities',                    
                'user_doc_ad_sim_entities_conf',
                'user_doc_ad_sim_entities_conf_multipl',
                'doc_event_doc_ad_sim_categories',    
                'doc_event_doc_ad_sim_categories_conf',
                'doc_event_doc_ad_sim_categories_conf_multipl',
                'doc_event_doc_ad_sim_topics',    
                'doc_event_doc_ad_sim_topics_conf',
                'doc_event_doc_ad_sim_topics_conf_multipl',
                'doc_event_doc_ad_sim_entities',                    
                'doc_event_doc_ad_sim_entities_conf',
                'doc_event_doc_ad_sim_entities_conf_multipl'
               ]

In [173]:
TRAFFIC_SOURCE_FV='traffic_source'
EVENT_HOUR_FV='event_hour'
EVENT_COUNTRY_FV = 'event_country'
EVENT_COUNTRY_STATE_FV = 'event_country_state'
EVENT_GEO_LOCATION_FV = 'event_geo_location'
EVENT_PLATFORM_FV = 'event_platform'
AD_ADVERTISER_FV = 'ad_advertiser'
DOC_AD_SOURCE_ID_FV='doc_ad_source_id'
DOC_AD_PUBLISHER_ID_FV='doc_ad_publisher_id'
DOC_EVENT_SOURCE_ID_FV='doc_event_source_id'
DOC_EVENT_PUBLISHER_ID_FV='doc_event_publisher_id'
DOC_AD_CATEGORY_ID_FV='doc_ad_category_id'
DOC_AD_TOPIC_ID_FV='doc_ad_topic_id'
DOC_AD_ENTITY_ID_FV='doc_ad_entity_id'
DOC_EVENT_CATEGORY_ID_FV='doc_event_category_id'
DOC_EVENT_TOPIC_ID_FV='doc_event_topic_id'
DOC_EVENT_ENTITY_ID_FV='doc_event_entity_id'

In [174]:
category_feature_names_integral = ['ad_advertiser',
 'doc_ad_category_id_1',
 'doc_ad_category_id_2',
 'doc_ad_category_id_3',
 'doc_ad_topic_id_1',
 'doc_ad_topic_id_2',
 'doc_ad_topic_id_3',
 'doc_ad_entity_id_1', 
 'doc_ad_entity_id_2', 
 'doc_ad_entity_id_3', 
 'doc_ad_entity_id_4', 
 'doc_ad_entity_id_5', 
 'doc_ad_entity_id_6', 
 'doc_ad_publisher_id',
 'doc_ad_source_id', 
 'doc_event_category_id_1',
 'doc_event_category_id_2',
 'doc_event_category_id_3',
 'doc_event_topic_id_1',
 'doc_event_topic_id_2',
 'doc_event_topic_id_3',
 'doc_event_entity_id_1',
 'doc_event_entity_id_2',
 'doc_event_entity_id_3',
 'doc_event_entity_id_4',
 'doc_event_entity_id_5',
 'doc_event_entity_id_6',
 'doc_event_publisher_id',
 'doc_event_source_id', 
 'event_country',
 'event_country_state',
 'event_geo_location',
 'event_hour',
 'event_platform',
 'traffic_source']

In [42]:
feature_vector_labels_integral = bool_feature_names + int_feature_names + float_feature_names + \
                                 category_feature_names_integral

In [175]:
integral_headers = ['label', 'display_id', 'ad_id', 'doc_id', 'doc_event_id', 'is_leak'] + feature_vector_labels_integral
    
with open("train_feature_vectors_integral_eval.csv"+".header", 'w') as output:
    output.writelines('\n'.join(integral_headers))

In [None]:
def sparse_vector_to_csv_with_nulls_row(additional_column_values, vec, num_columns):    
    return ','.join([str(value) for value in additional_column_values] + 
                     list([ '{:.5}'.format(vec[x]) if x in vec.indices else '' for x in range(vec.size) ])[:num_columns]) \
            .replace('.0,',',')

In [177]:
train_feature_vectors_integral_csv_rdd = train_feature_vectors_exported_df.select(
     'label', 'display_id', 'ad_id', 'document_id', 'document_id_event', 'feature_vector').withColumn('is_leak', F.lit(-1)) \
     .rdd.map(lambda x: sparse_vector_to_csv_with_nulls_row([x['label'], x['display_id'], x['ad_id'], x['document_id'], x['document_id_event'], x['is_leak']], 
                                                  x['feature_vector'], len(integral_headers)))

In [None]:
# rdd를 dataframe으로 변환한 것 -> trainingData
trainingData = train_feature_vectors_integral_csv_rdd.map(lambda x : x.split(",")).toDF()

In [None]:
# spark dataframe으로 잘 변환된 것을 확인할 수 있다
type(trainingData)

In [None]:
# 이제 dataframe의 컬럼별로 datatype을 바꿔보자
# 테스트로, "_1" 열의 datatype을 string에서 integer로 바꿔보자
# datatype 변환에는 cast 함수를 사용한다
# printScheme()을 실행하면, datatype 변환이 잘 되었음을 알 수 있다
changedTypedf = trainingData.withColumn("label", trainingData["_1"].cast("integer"))
changedTypedf.printSchema()

In [None]:
# 이제 컬럼별로 datatype을 선언하자

trainingData_final = trainingData.select(
        trainingData._1.cast("Integer").alias("label"),
        trainingData._2.cast("Integer").alias("display_id"), 
        trainingData._3.cast("Integer").alias("ad_id"), 
        trainingData._4.cast("Integer").alias("doc_id"),
        trainingData._5.cast("Integer").alias("doc_event_id"), 
        trainingData._6.cast("Integer").alias("is_leak"),
        trainingData._7.cast("Integer").alias("event_weekend"),
        trainingData._8.cast("Integer").alias("user_has_already_viewed_doc"), 
        trainingData._9.cast("Integer").alias("user_views"), 
        trainingData._10.cast("Integer").alias("ad_views"),
        trainingData._11.cast("Integer").alias("doc_views"), 
        trainingData._12.cast("Integer").alias("doc_event_days_since_published"),
        trainingData._13.cast("Float").alias("doc_event_hour"),
        trainingData._14.cast("Float").alias("doc_ad_days_since_published"), 
        trainingData._15.cast("Float").alias("pop_ad_id"), 
        trainingData._16.cast("Float").alias("pop_ad_id_conf"),
        trainingData._17.cast("Float").alias("pop_ad_id_conf_multipl"), 
        trainingData._18.cast("Float").alias("pop_document_id"),
        trainingData._19.cast("Float").alias("pop_document_id_conf"),
        trainingData._20.cast("Float").alias("pop_document_id_conf_multipl"), 
        trainingData._21.cast("Float").alias("pop_publisher_id"), 
        trainingData._22.cast("Float").alias("pop_publisher_id_conf"),
        trainingData._23.cast("Float").alias("pop_publisher_id_conf_multipl"), 
        trainingData._24.cast("Float").alias("pop_advertiser_id"),
        trainingData._25.cast("Float").alias("pop_advertiser_id_conf"),
        trainingData._26.cast("Float").alias("pop_advertiser_id_conf_multipl"), 
        trainingData._27.cast("Float").alias("pop_campain_id"), 
        trainingData._28.cast("Float").alias("pop_campain_id_conf"),
        trainingData._29.cast("Float").alias("pop_campain_id_conf_multipl"), 
        trainingData._30.cast("Float").alias("pop_doc_event_doc_ad"),
        trainingData._31.cast("Float").alias("pop_doc_event_doc_ad_conf"),
        trainingData._32.cast("Float").alias("pop_doc_event_doc_ad_conf_multipl"), 
        trainingData._33.cast("Float").alias("pop_source_id"), 
        trainingData._34.cast("Float").alias("pop_source_id_conf"),
        trainingData._35.cast("Float").alias("pop_source_id_conf_multipl"), 
        trainingData._36.cast("Float").alias("pop_source_id_country"),
        trainingData._37.cast("Float").alias("pop_source_id_country_conf"),
        trainingData._38.cast("Float").alias("pop_source_id_country_conf_multipl"),
        trainingData._39.cast("Float").alias("pop_entity_id"),
        trainingData._40.cast("Float").alias("pop_entity_id_conf"),
        trainingData._41.cast("Float").alias("pop_entity_id_conf_multipl"),
        trainingData._42.cast("Float").alias("pop_entity_id_country"),
        trainingData._43.cast("Float").alias("pop_entity_id_country_conf"),
        trainingData._44.cast("Float").alias("pop_entity_id_country_conf_multipl"),
        trainingData._45.cast("Float").alias("pop_topic_id"),
        trainingData._46.cast("Float").alias("pop_topic_id_conf"),
        trainingData._47.cast("Float").alias("pop_topic_id_conf_multipl"),
        trainingData._48.cast("Float").alias("pop_topic_id_country"),
        trainingData._49.cast("Float").alias("pop_topic_id_country_conf"),
        trainingData._50.cast("Float").alias("pop_topic_id_country_conf_multipl"),
        trainingData._51.cast("Float").alias("pop_category_id"),
        trainingData._52.cast("Float").alias("pop_category_id_conf"),
        trainingData._53.cast("Float").alias("pop_category_id_conf_multipl"),
        trainingData._54.cast("Float").alias("pop_category_id_country"),
        trainingData._55.cast("Float").alias("pop_category_id_country_conf"),
        trainingData._56.cast("Float").alias("pop_category_id_country_conf_multipl"),
        trainingData._57.cast("Float").alias("user_doc_ad_sim_categories"),
        trainingData._58.cast("Float").alias("user_doc_ad_sim_categories_conf"),
        trainingData._59.cast("Float").alias("user_doc_ad_sim_categories_conf_multipl"),
        trainingData._60.cast("Float").alias("user_doc_ad_sim_topics"),
        trainingData._61.cast("Float").alias("user_doc_ad_sim_topics_conf"),
        trainingData._62.cast("Float").alias("user_doc_ad_sim_topics_conf_multipl"),
        trainingData._63.cast("Float").alias("user_doc_ad_sim_entities"),
        trainingData._64.cast("Float").alias("user_doc_ad_sim_entities_conf"),
        trainingData._65.cast("Float").alias("user_doc_ad_sim_entities_conf_multipl"),
        trainingData._66.cast("Float").alias("doc_event_doc_ad_sim_categories"),
        trainingData._67.cast("Float").alias("doc_event_doc_ad_sim_categories_conf"),
        trainingData._68.cast("Float").alias("doc_event_doc_ad_sim_categories_conf_multipl"),
        trainingData._69.cast("Float").alias("doc_event_doc_ad_sim_topics"),
        trainingData._70.cast("Float").alias("doc_event_doc_ad_sim_topics_conf"),
        trainingData._71.cast("Float").alias("doc_event_doc_ad_sim_topics_conf_multipl"),
        trainingData._72.cast("Float").alias("doc_event_doc_ad_sim_entities"),
        trainingData._73.cast("Float").alias("doc_event_doc_ad_sim_entities_conf"),
        trainingData._74.cast("Float").alias("doc_event_doc_ad_sim_entities_conf_multipl"),
        trainingData._75.cast("Integer").alias("ad_advertiser"),
        trainingData._76.cast("Integer").alias("doc_ad_category_id_1"),
        trainingData._77.cast("Integer").alias("doc_ad_category_id_2"),
        trainingData._78.cast("Integer").alias("doc_ad_category_id_3"),
        trainingData._79.cast("Integer").alias("doc_ad_topic_id_1"),
        trainingData._80.cast("Integer").alias("doc_ad_topic_id_2"),
        trainingData._81.cast("Integer").alias("doc_ad_topic_id_3"),
        trainingData._82.cast("Integer").alias("doc_ad_entity_id_1"),
        trainingData._83.cast("Integer").alias("doc_ad_entity_id_2"),
        trainingData._84.cast("Integer").alias("doc_ad_entity_id_3"),
        trainingData._85.cast("Integer").alias("doc_ad_entity_id_4"),
        trainingData._86.cast("Integer").alias("doc_ad_entity_id_5"),
        trainingData._87.cast("Integer").alias("doc_ad_entity_id_6"),
        trainingData._88.cast("Integer").alias("doc_ad_publisher_id"),
        trainingData._89.cast("Integer").alias("doc_ad_source_id"),
        trainingData._90.cast("Integer").alias("doc_event_category_id_1"),
        trainingData._91.cast("Integer").alias("doc_event_category_id_2"),
        trainingData._92.cast("Integer").alias("doc_event_category_id_3"),
        trainingData._93.cast("Integer").alias("doc_event_topic_id_1"),
        trainingData._94.cast("Integer").alias("doc_event_topic_id_2"),
        trainingData._95.cast("Integer").alias("doc_event_topic_id_3"),
        trainingData._96.cast("Integer").alias("doc_event_entity_id_1"),
        trainingData._97.cast("Integer").alias("doc_event_entity_id_2"),
        trainingData._98.cast("Integer").alias("doc_event_entity_id_3"),
        trainingData._99.cast("Integer").alias("doc_event_entity_id_4"),
        trainingData._100.cast("Integer").alias("doc_event_entity_id_5"),
        trainingData._101.cast("Integer").alias("doc_event_entity_id_6"),
        trainingData._102.cast("Integer").alias("doc_event_publisher_id"),
        trainingData._103.cast("Integer").alias("doc_event_source_id"),
        trainingData._104.cast("Integer").alias("event_country"),
        trainingData._105.cast("Integer").alias("event_country_state"),
        trainingData._106.cast("Integer").alias("event_geo_location"),
        trainingData._107.cast("Integer").alias("event_hour"),
        trainingData._108.cast("Integer").alias("event_platform"),
        trainingData._109.cast("Integer").alias("traffic_source")
    ).cache()

---

In [None]:
validation_feature_vector_integral_csv_folder_name = 'validation_feature_vectors_integral.csv'

integral_headers = ['label', 'display_id', 'ad_id', 'doc_id', 'doc_event_id', 'is_leak'] + feature_vector_labels_integral
    
with open(validation_feature_vector_integral_csv_folder_name+".header", 'w') as output:
    output.writelines('\n'.join(integral_headers))

In [None]:
validation_feature_vectors_integral_csv_rdd = validation_feature_vectors_exported_df.select(
     'label', 'display_id', 'ad_id', 'document_id', 'document_id_event', 'is_leak', 'feature_vector') \
     .rdd.map(lambda x: sparse_vector_to_csv_with_nulls_row([x['label'], x['display_id'], x['ad_id'], x['document_id'], x['document_id_event'], x['is_leak']], 
                                                  x['feature_vector'], len(integral_headers)))

In [None]:
# rdd를 dataframe으로 변환한 것 -> validationData
validationData = validation_feature_vectors_integral_csv_rdd.map(lambda x : x.split(",")).toDF()

In [None]:
# 이제 dataframe의 컬럼별로 datatype을 바꿔보자
# 테스트로, "_1" 열의 datatype을 string에서 integer로 바꿔보자
# datatype 변환에는 cast 함수를 사용한다
# printScheme()을 실행하면, datatype 변환이 잘 되었음을 알 수 있다
changedTypedf = validationData.withColumn("label", validationData["_1"].cast("integer"))
changedTypedf.printSchema()

In [None]:
# spark dataframe으로 잘 변환된 것을 확인할 수 있다
type(validationData)

In [None]:
# 이제 컬럼별로 datatype을 선언하자

validationData_final = validationData.select(
        validationData._1.cast("Integer").alias("label"),
        validationData._2.cast("Integer").alias("display_id"), 
        validationData._3.cast("Integer").alias("ad_id"), 
        validationData._4.cast("Integer").alias("doc_id"),
        validationData._5.cast("Integer").alias("doc_event_id"), 
        validationData._6.cast("Integer").alias("is_leak"),
        validationData._7.cast("Integer").alias("event_weekend"),
        validationData._8.cast("Integer").alias("user_has_already_viewed_doc"), 
        validationData._9.cast("Integer").alias("user_views"), 
        validationData._10.cast("Integer").alias("ad_views"),
        validationData._11.cast("Integer").alias("doc_views"), 
        validationData._12.cast("Integer").alias("doc_event_days_since_published"),
        validationData._13.cast("Float").alias("doc_event_hour"),
        validationData._14.cast("Float").alias("doc_ad_days_since_published"), 
        validationData._15.cast("Float").alias("pop_ad_id"), 
        validationData._16.cast("Float").alias("pop_ad_id_conf"),
        validationData._17.cast("Float").alias("pop_ad_id_conf_multipl"), 
        validationData._18.cast("Float").alias("pop_document_id"),
        validationData._19.cast("Float").alias("pop_document_id_conf"),
        validationData._20.cast("Float").alias("pop_document_id_conf_multipl"), 
        validationData._21.cast("Float").alias("pop_publisher_id"), 
        validationData._22.cast("Float").alias("pop_publisher_id_conf"),
        validationData._23.cast("Float").alias("pop_publisher_id_conf_multipl"), 
        validationData._24.cast("Float").alias("pop_advertiser_id"),
        validationData._25.cast("Float").alias("pop_advertiser_id_conf"),
        validationData._26.cast("Float").alias("pop_advertiser_id_conf_multipl"), 
        validationData._27.cast("Float").alias("pop_campain_id"), 
        validationData._28.cast("Float").alias("pop_campain_id_conf"),
        validationData._29.cast("Float").alias("pop_campain_id_conf_multipl"), 
        validationData._30.cast("Float").alias("pop_doc_event_doc_ad"),
        validationData._31.cast("Float").alias("pop_doc_event_doc_ad_conf"),
        validationData._32.cast("Float").alias("pop_doc_event_doc_ad_conf_multipl"), 
        validationData._33.cast("Float").alias("pop_source_id"), 
        validationData._34.cast("Float").alias("pop_source_id_conf"),
        validationData._35.cast("Float").alias("pop_source_id_conf_multipl"), 
        validationData._36.cast("Float").alias("pop_source_id_country"),
        validationData._37.cast("Float").alias("pop_source_id_country_conf"),
        validationData._38.cast("Float").alias("pop_source_id_country_conf_multipl"),
        validationData._39.cast("Float").alias("pop_entity_id"),
        validationData._40.cast("Float").alias("pop_entity_id_conf"),
        validationData._41.cast("Float").alias("pop_entity_id_conf_multipl"),
        validationData._42.cast("Float").alias("pop_entity_id_country"),
        validationData._43.cast("Float").alias("pop_entity_id_country_conf"),
        validationData._44.cast("Float").alias("pop_entity_id_country_conf_multipl"),
        validationData._45.cast("Float").alias("pop_topic_id"),
        validationData._46.cast("Float").alias("pop_topic_id_conf"),
        validationData._47.cast("Float").alias("pop_topic_id_conf_multipl"),
        validationData._48.cast("Float").alias("pop_topic_id_country"),
        validationData._49.cast("Float").alias("pop_topic_id_country_conf"),
        validationData._50.cast("Float").alias("pop_topic_id_country_conf_multipl"),
        validationData._51.cast("Float").alias("pop_category_id"),
        validationData._52.cast("Float").alias("pop_category_id_conf"),
        validationData._53.cast("Float").alias("pop_category_id_conf_multipl"),
        validationData._54.cast("Float").alias("pop_category_id_country"),
        validationData._55.cast("Float").alias("pop_category_id_country_conf"),
        validationData._56.cast("Float").alias("pop_category_id_country_conf_multipl"),
        validationData._57.cast("Float").alias("user_doc_ad_sim_categories"),
        validationData._58.cast("Float").alias("user_doc_ad_sim_categories_conf"),
        validationData._59.cast("Float").alias("user_doc_ad_sim_categories_conf_multipl"),
        validationData._60.cast("Float").alias("user_doc_ad_sim_topics"),
        validationData._61.cast("Float").alias("user_doc_ad_sim_topics_conf"),
        validationData._62.cast("Float").alias("user_doc_ad_sim_topics_conf_multipl"),
        validationData._63.cast("Float").alias("user_doc_ad_sim_entities"),
        validationData._64.cast("Float").alias("user_doc_ad_sim_entities_conf"),
        validationData._65.cast("Float").alias("user_doc_ad_sim_entities_conf_multipl"),
        validationData._66.cast("Float").alias("doc_event_doc_ad_sim_categories"),
        validationData._67.cast("Float").alias("doc_event_doc_ad_sim_categories_conf"),
        validationData._68.cast("Float").alias("doc_event_doc_ad_sim_categories_conf_multipl"),
        validationData._69.cast("Float").alias("doc_event_doc_ad_sim_topics"),
        validationData._70.cast("Float").alias("doc_event_doc_ad_sim_topics_conf"),
        validationData._71.cast("Float").alias("doc_event_doc_ad_sim_topics_conf_multipl"),
        validationData._72.cast("Float").alias("doc_event_doc_ad_sim_entities"),
        validationData._73.cast("Float").alias("doc_event_doc_ad_sim_entities_conf"),
        validationData._74.cast("Float").alias("doc_event_doc_ad_sim_entities_conf_multipl"),
        validationData._75.cast("Integer").alias("ad_advertiser"),
        validationData._76.cast("Integer").alias("doc_ad_category_id_1"),
        validationData._77.cast("Integer").alias("doc_ad_category_id_2"),
        validationData._78.cast("Integer").alias("doc_ad_category_id_3"),
        validationData._79.cast("Integer").alias("doc_ad_topic_id_1"),
        validationData._80.cast("Integer").alias("doc_ad_topic_id_2"),
        validationData._81.cast("Integer").alias("doc_ad_topic_id_3"),
        validationData._82.cast("Integer").alias("doc_ad_entity_id_1"),
        validationData._83.cast("Integer").alias("doc_ad_entity_id_2"),
        validationData._84.cast("Integer").alias("doc_ad_entity_id_3"),
        validationData._85.cast("Integer").alias("doc_ad_entity_id_4"),
        validationData._86.cast("Integer").alias("doc_ad_entity_id_5"),
        validationData._87.cast("Integer").alias("doc_ad_entity_id_6"),
        validationData._88.cast("Integer").alias("doc_ad_publisher_id"),
        validationData._89.cast("Integer").alias("doc_ad_source_id"),
        validationData._90.cast("Integer").alias("doc_event_category_id_1"),
        validationData._91.cast("Integer").alias("doc_event_category_id_2"),
        validationData._92.cast("Integer").alias("doc_event_category_id_3"),
        validationData._93.cast("Integer").alias("doc_event_topic_id_1"),
        validationData._94.cast("Integer").alias("doc_event_topic_id_2"),
        validationData._95.cast("Integer").alias("doc_event_topic_id_3"),
        validationData._96.cast("Integer").alias("doc_event_entity_id_1"),
        validationData._97.cast("Integer").alias("doc_event_entity_id_2"),
        validationData._98.cast("Integer").alias("doc_event_entity_id_3"),
        validationData._99.cast("Integer").alias("doc_event_entity_id_4"),
        validationData._100.cast("Integer").alias("doc_event_entity_id_5"),
        validationData._101.cast("Integer").alias("doc_event_entity_id_6"),
        validationData._102.cast("Integer").alias("doc_event_publisher_id"),
        validationData._103.cast("Integer").alias("doc_event_source_id"),
        validationData._104.cast("Integer").alias("event_country"),
        validationData._105.cast("Integer").alias("event_country_state"),
        validationData._106.cast("Integer").alias("event_geo_location"),
        validationData._107.cast("Integer").alias("event_hour"),
        validationData._108.cast("Integer").alias("event_platform"),
        validationData._109.cast("Integer").alias("traffic_source")
    )

In [None]:
trainingData_final_dropped = trainingData_final.drop('doc_event_entity_id_1', 'doc_event_entity_id_2', 'doc_event_entity_id_3', 'doc_event_entity_id_4', 'doc_event_entity_id_5', 'doc_event_entity_id_6', 'pop_entity_id', 'pop_entity_id_conf', 'pop_entity_id_conf_multipl', 'doc_ad_entity_id_1', 'doc_ad_entity_id_2', 'doc_ad_entity_id_3', 'doc_ad_entity_id_4', 'doc_ad_entity_id_5', 'doc_ad_entity_id_6', 'doc_ad_days_since_published', 'doc_event_days_since_published', 'user_has_already_viewed_doc', 'pop_entity_id', 'pop_entity_id_conf', 'pop_entity_id_conf_multipl', 'pop_entity_id_country', 'pop_entity_id_country_conf', 'pop_entity_id_country_conf_multipl', 'user_doc_ad_sim_categories', 'user_doc_ad_sim_categories_conf', 'user_doc_ad_sim_categories_conf_multipl', 'user_doc_ad_sim_topics', 'user_doc_ad_sim_topics_conf', 'user_doc_ad_sim_topics_conf_multipl', 'user_doc_ad_sim_entities', 'user_doc_ad_sim_entities_conf', 'user_doc_ad_sim_entities_conf_multipl', 'doc_event_doc_ad_sim_categories', 'doc_event_doc_ad_sim_categories_conf', 'doc_event_doc_ad_sim_categories_conf_multipl', 'doc_event_doc_ad_sim_topics', 'doc_event_doc_ad_sim_topics_conf', 'doc_event_doc_ad_sim_topics_conf_multipl', 'doc_event_doc_ad_sim_entities', 'doc_event_doc_ad_sim_entities_conf', 'doc_event_doc_ad_sim_entities_conf_multipl')

In [None]:
# drop some columns
validationData_final_dropped = validationData_final.drop('doc_event_entity_id_1', 'doc_event_entity_id_2', 'doc_event_entity_id_3', 'doc_event_entity_id_4', 'doc_event_entity_id_5', 'doc_event_entity_id_6', 'pop_entity_id', 'pop_entity_id_conf', 'pop_entity_id_conf_multipl', 'doc_ad_entity_id_1', 'doc_ad_entity_id_2', 'doc_ad_entity_id_3', 'doc_ad_entity_id_4', 'doc_ad_entity_id_5', 'doc_ad_entity_id_6', 'doc_ad_days_since_published', 'doc_event_days_since_published', 'user_has_already_viewed_doc', 'pop_entity_id', 'pop_entity_id_conf', 'pop_entity_id_conf_multipl', 'pop_entity_id_country', 'pop_entity_id_country_conf', 'pop_entity_id_country_conf_multipl', 'user_doc_ad_sim_categories', 'user_doc_ad_sim_categories_conf', 'user_doc_ad_sim_categories_conf_multipl', 'user_doc_ad_sim_topics', 'user_doc_ad_sim_topics_conf', 'user_doc_ad_sim_topics_conf_multipl', 'user_doc_ad_sim_entities', 'user_doc_ad_sim_entities_conf', 'user_doc_ad_sim_entities_conf_multipl', 'doc_event_doc_ad_sim_categories', 'doc_event_doc_ad_sim_categories_conf', 'doc_event_doc_ad_sim_categories_conf_multipl', 'doc_event_doc_ad_sim_topics', 'doc_event_doc_ad_sim_topics_conf', 'doc_event_doc_ad_sim_topics_conf_multipl', 'doc_event_doc_ad_sim_entities', 'doc_event_doc_ad_sim_entities_conf', 'doc_event_doc_ad_sim_entities_conf_multipl')

In [None]:
# 최종 train dataframe을 train이라 한다
train = trainingData_final_dropped
validation = validationData_final_dropped
train = train.cache()
validation = validation.cache()

In [None]:
from pyspark.sql.functions import lit

train = train.withColumn("is_train", lit(1))
validation = validation.withColumn('is_train', lit(0))

In [None]:
train.printSchema()

In [None]:
validation.printSchema()

In [None]:
# train_validation 합치기

train_valid_merged = train.union(validation).cache()

In [78]:
train_valid_merged.count()

87141731

`train_valid_merged`이 잘 생성된 것을 확인할 수 있다.

In [79]:
train_valid_merged = train_valid_merged.drop('doc_ad_category_id_1', 'doc_ad_category_id_2', 'doc_ad_category_id_3', 
                          'doc_ad_topic_id_1', 'doc_ad_topic_id_2', 'doc_ad_topic_id_3',
                          'doc_event_category_id_1', 'doc_event_category_id_2', 'doc_event_category_id_3',
                           'doc_event_topic_id_1', 'doc_event_topic_id_2', 'doc_event_topic_id_3',
                          'event_hour', 'event_geo_location', 'event_country_state',
                          'event_country', 'ad_advertiser', 'doc_ad_publisher_id', 'doc_ad_source_id',
                          'doc_ad_category')

`display_id`와 `ad_id`를 사용하여 `uuid`를 가져오자.

In [102]:
documents_categories_schema = StructType(
                    [StructField("document_id_cat", IntegerType(), True),
                    StructField("category_id", IntegerType(), True),                    
                    StructField("confidence_level_cat", FloatType(), True)]
                    )

documents_categories_df = spark.read.schema(documents_categories_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER+"documents_categories.csv") \
                .alias('documents_categories').cache()
    
documents_categories_grouped_df = documents_categories_df.groupBy('document_id_cat') \
                                            .agg(F.collect_list('category_id').alias('category_id_list'),
                                                 F.collect_list('confidence_level_cat').alias('confidence_level_cat_list')) \
                                            .withColumn('dummyDocumentsCategory', F.lit(1)) \
                                            .alias('documents_categories_grouped')

In [103]:
documents_topics_schema = StructType(
                    [StructField("document_id_top", IntegerType(), True),
                    StructField("topic_id", IntegerType(), True),                    
                    StructField("confidence_level_top", FloatType(), True)]
                    )

documents_topics_df = spark.read.schema(documents_topics_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER+"documents_topics.csv")  \
                .alias('documents_topics').cache()
    
documents_topics_grouped_df = documents_topics_df.groupBy('document_id_top') \
                                            .agg(F.collect_list('topic_id').alias('topic_id_list'),
                                                 F.collect_list('confidence_level_top').alias('confidence_level_top_list')) \
                                            .withColumn('dummyDocumentsTopics', F.lit(1)) \
                                            .alias('documents_topics_grouped')

In [111]:
clicks_train_df = spark.read.schema(clicks_train_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER+"clicks_train.csv") \
                .withColumn('dummyClicksTrain', F.lit(1)).alias('clicks_train').cache()

In [112]:
clicks_joined_df = clicks_train_df \
        .join(events_df, on = 'display_id', how = 'left').cache()

clicks_joined_df.count()

87141731

In [113]:
clicks_joined_df.show(1)

+----------+-----+-------+----------------+--------------+-----------------+---------------+--------------+------------------+-----------+-------------+
|display_id|ad_id|clicked|dummyClicksTrain|    uuid_event|document_id_event|timestamp_event|platform_event|geo_location_event|dummyEvents|event_country|
+----------+-----+-------+----------------+--------------+-----------------+---------------+--------------+------------------+-----------+-------------+
|       148|89351|      1|               1|9adce6a5363308|          1205772|          11202|             2|         US>LA>612|          1|           US|
+----------+-----+-------+----------------+--------------+-----------------+---------------+--------------+------------------+-----------+-------------+
only showing top 1 row



In [114]:
train_valid_merged_df = train_valid_merged.join(clicks_joined_df, ['display_id','ad_id']).cache()
train_valid_merged_df.count()

87141731

`ad_id`를 사용하여 `campaign_id`와 `advertiser_id`를 가져오자.

In [115]:
train_valid_ad_merged_df = train_valid_merged_df.join(promoted_content_df, 'ad_id').cache()
train_valid_ad_merged_df.count()

87141731

In [116]:
train_valid_ad_merged_df.printSchema()

root
 |-- ad_id: integer (nullable = true)
 |-- display_id: integer (nullable = true)
 |-- label: integer (nullable = true)
 |-- doc_id: integer (nullable = true)
 |-- doc_event_id: integer (nullable = true)
 |-- is_leak: integer (nullable = true)
 |-- event_weekend: integer (nullable = true)
 |-- user_views: integer (nullable = true)
 |-- ad_views: integer (nullable = true)
 |-- doc_views: integer (nullable = true)
 |-- doc_event_hour: float (nullable = true)
 |-- pop_ad_id: float (nullable = true)
 |-- pop_ad_id_conf: float (nullable = true)
 |-- pop_ad_id_conf_multipl: float (nullable = true)
 |-- pop_document_id: float (nullable = true)
 |-- pop_document_id_conf: float (nullable = true)
 |-- pop_document_id_conf_multipl: float (nullable = true)
 |-- pop_publisher_id: float (nullable = true)
 |-- pop_publisher_id_conf: float (nullable = true)
 |-- pop_publisher_id_conf_multipl: float (nullable = true)
 |-- pop_advertiser_id: float (nullable = true)
 |-- pop_advertiser_id_conf: float

In [123]:
train_subset = train_valid_ad_merged_df.drop('dummyEvents', 'dummyPromotedContent', 'geo_location_event', 'timestamp_event',
                                            'dummyClicksTrain', 'clicked').cache()

In [124]:
train_subset.count()

87141731

In [126]:
# train_subset.write.csv("gs://cap-18/output/train_subset.csv")

In [130]:
train_subset.write.parquet(OUTPUT_BUCKET_FOLDER + 'train_subset', mode='overwrite')

**주의) overwrite을 조심하자...**

`document_id`를 사용하여 `category_id`, `topic_id`, `publisher_id`를 가져오자.

페이지별로 신뢰도가 높은 하나의 카테고리, 토픽만 뽑아보자.

In [120]:
documents_categories_grouped_df.show(10)

# 신뢰도 순으로 정렬되어 있음을 알 수 있음.

+---------------+----------------+-------------------------+----------------------+
|document_id_cat|category_id_list|confidence_level_cat_list|dummyDocumentsCategory|
+---------------+----------------+-------------------------+----------------------+
|            148|    [1403, 1702]|             [0.92, 0.07]|                     1|
|            463|    [1513, 1808]|     [0.8932095, 0.067...|                     1|
|            471|    [1504, 1609]|             [0.92, 0.07]|                     1|
|            496|    [1210, 1203]|             [0.92, 0.07]|                     1|
|            833|    [1305, 2004]|             [0.92, 0.07]|                     1|
|           1088|    [2006, 1210]|     [0.8364613, 0.063...|                     1|
|           1238|    [1100, 1407]|     [0.34836665, 0.02...|                     1|
|           1342|    [1408, 2004]|     [0.42835742, 0.03...|                     1|
|           1580|    [1403, 1402]|     [0.65625566, 0.04...|                

In [142]:
documents_categories_grouped_df.count()

2828649

In [154]:
documents_categories_grouped_df_pandas = documents_categories_grouped_df.toPandas()
documents_categories_grouped_df_pandas.head(1)
documents_category_new = pd.DataFrame(documents_categories_grouped_df_pandas.category_id_list.tolist(), columns=['category_id', 'category_id2'])
documents_category_new = documents_category_new['category_id']
documents_categories_grouped_df_pandas = pd.concat([documents_categories_grouped_df_pandas, documents_category_new], axis = 1)
documents_categories_grouped_df_pandas = documents_categories_grouped_df_pandas[['document_id_cat', 'category_id']]
documents_categories_grouped_df_pandas.count()

Unnamed: 0,document_id_cat,category_id_list,confidence_level_cat_list,dummyDocumentsCategory
0,148,"[1403, 1702]","[0.920000016689, 0.070000000298]",1


document_id_cat    2828649
category_id        2828649
dtype: int64

In [121]:
documents_topics_grouped_df.show(10)

# 신뢰도 순으로 정렬되어 있음을 알 수 있음.

+---------------+--------------------+-------------------------+--------------------+
|document_id_top|       topic_id_list|confidence_level_top_list|dummyDocumentsTopics|
+---------------+--------------------+-------------------------+--------------------+
|            148|[153, 140, 8, 172...|     [0.07523697, 0.07...|                   1|
|            463|[181, 292, 24, 25...|     [0.11870128, 0.05...|                   1|
|            471|[285, 238, 153, 193]|     [0.15588789, 0.04...|                   1|
|            496|[244, 294, 196, 1...|     [0.18284231, 0.11...|                   1|
|            833|[294, 89, 174, 86...|     [0.11430275, 0.04...|                   1|
|           1088|[107, 75, 153, 64...|     [0.10822894, 0.06...|                   1|
|           1238| [89, 221, 192, 236]|     [0.023348164, 0.0...|                   1|
|           1342|[271, 283, 181, 2...|     [0.0457309, 0.025...|                   1|
|           1580|[8, 37, 136, 12, ...|     [0.08965496

In [155]:
documents_topics_grouped_df_pandas = documents_topics_grouped_df.toPandas()
documents_topics_grouped_df_pandas.head(1)

Unnamed: 0,document_id_top,topic_id_list,confidence_level_top_list,dummyDocumentsTopics
0,148,"[153, 140, 8, 172, 244, 179, 36, 2, 64, 10, 216]","[0.0752369686961, 0.0719832107425, 0.061084270...",1


In [161]:
documents_topics_new = pd.DataFrame(documents_topics_grouped_df_pandas.topic_id_list.tolist(), columns=['topic_id', '','','','','','','','','','','','','','','','','','','','','','','','','','','',
                                                                                                       '','','','','','','','','','',''])
documents_topics_new = documents_topics_new['topic_id']
documents_topics_new.head(1)

0    153
Name: topic_id, dtype: int64

In [162]:
documents_topics_grouped_df_pandas = pd.concat([documents_topics_grouped_df_pandas, documents_topics_new], axis = 1)
documents_topics_grouped_df_pandas = documents_topics_grouped_df_pandas[['document_id_top', 'topic_id']]

In [163]:
documents_topics_grouped_df_pandas.count()
documents_topics_grouped_df_pandas.head(1)

document_id_top    2495423
topic_id           2495423
dtype: int64

Unnamed: 0,document_id_top,topic_id
0,148,153


생성한 페이지별 카테고리, 토픽 정보를 기존 파일에 합치자.

In [164]:
#Create Spark DataFrame from Pandas

categories_schema = StructType(
                    [StructField("document_id_cat", StringType(), True),
                    StructField("category_id", StringType(), True)]
                    )

topic_schema = StructType(
                    [StructField("document_id_top", StringType(), True),
                    StructField("topic_id", StringType(), True)]
                    )

documents_categories_table = sqlContext.createDataFrame(documents_topics_grouped_df_pandas, categories_schema)
documents_topics_table = sqlContext.createDataFrame(documents_categories_grouped_df_pandas, topic_schema)

In [165]:
documents_categories_table.show(1)
documents_topics_table.show(1)

+---------------+-----------+
|document_id_cat|category_id|
+---------------+-----------+
|            148|        153|
+---------------+-----------+
only showing top 1 row

+---------------+--------+
|document_id_top|topic_id|
+---------------+--------+
|            148|    1403|
+---------------+--------+
only showing top 1 row



In [167]:
train_subset.select('event_platform').distinct().show()

+--------------+
|event_platform|
+--------------+
|          null|
|             1|
|             6|
|             3|
|             5|
|             4|
|             2|
+--------------+



## OHE(One-Hot Encoding)

In [85]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler

`train`에서 세 개의 컬럼만 빼서 logistic regression으로 적합시켜보자.

In [303]:
train_light = train.select('label', 'event_weekend', 'doc_event_hour', 'pop_advertiser_id')

결측치가 포함되어 있으면 VectorAssembler가 작동하지 않는다.

In [313]:
train_light = train_light.dropna() ### 테스트 용으로 생성하였음. 

In [314]:
train_light.show(1)

+-----+-------------+--------------+-----------------+
|label|event_weekend|doc_event_hour|pop_advertiser_id|
+-----+-------------+--------------+-----------------+
|    1|            1|           3.0|          0.39017|
+-----+-------------+--------------+-----------------+
only showing top 1 row



In [315]:
train_light.printSchema()

root
 |-- label: integer (nullable = true)
 |-- event_weekend: integer (nullable = true)
 |-- doc_event_hour: float (nullable = true)
 |-- pop_advertiser_id: float (nullable = true)



In [316]:
categorical_columns= ['event_weekend', 'doc_event_hour']

indexers = [
    StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
    for c in categorical_columns
]

encoders = [OneHotEncoder(dropLast=False,inputCol=indexer.getOutputCol(),
            outputCol="{0}_encoded".format(indexer.getOutputCol())) 
    for indexer in indexers
]

numericCols = ["pop_advertiser_id"]

assemblerInputs = [encoder.getOutputCol() for encoder in encoders] + numericCols
assembler = VectorAssembler(inputCols= assemblerInputs, outputCol="features")

In [317]:
pipeline = Pipeline(stages=indexers + encoders+[assembler])
model=pipeline.fit(train_light)

In [318]:
transformed = model.transform(train_light)
transformed.show(5)

+-----+-------------+--------------+-----------------+---------------------+----------------------+-----------------------------+------------------------------+--------------------+
|label|event_weekend|doc_event_hour|pop_advertiser_id|event_weekend_indexed|doc_event_hour_indexed|event_weekend_indexed_encoded|doc_event_hour_indexed_encoded|            features|
+-----+-------------+--------------+-----------------+---------------------+----------------------+-----------------------------+------------------------------+--------------------+
|    1|            1|           3.0|          0.39017|                  1.0|                   3.0|                (2,[1],[1.0])|                 (6,[3],[1.0])|(9,[1,5,8],[1.0,1...|
|    0|            1|           3.0|         0.045525|                  1.0|                   3.0|                (2,[1],[1.0])|                 (6,[3],[1.0])|(9,[1,5,8],[1.0,1...|
|    0|            0|           5.0|          0.12022|                  0.0|              

In [301]:
lrModel = LogisticRegression().fit(transformed)

In [302]:
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

Coefficients: [-0.794120161470076,-0.7801337170118473,-0.2914448004121033,-0.30109970451540263,-0.30425818741879407,-0.2616615092582643,-0.21434825317394268,-0.2341588436782957,6.0348883381458505]
Intercept: -1.64561707644


변수별 결측치를 세보자.

In [324]:
train.select([F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c) for c in train.columns]).show()

+-----+----------+-----+------+------------+-------+-------------+----------+--------+---------+--------------+---------+--------------+----------------------+---------------+--------------------+----------------------------+----------------+---------------------+-----------------------------+-----------------+----------------------+------------------------------+--------------+-------------------+---------------------------+--------------------+-------------------------+---------------------------------+-------------+------------------+--------------------------+---------------------+--------------------------+----------------------------------+------------+-----------------+-------------------------+--------------------+-------------------------+---------------------------------+---------------+--------------------+----------------------------+-----------------------+----------------------------+------------------------------------+-------------+--------------------+-----------------

### Feature 별 처리 방안

`user_views`: 전체 유저의 뷰 수의 중앙값으로 대체 + 각각의 4분위 구해서 4가지 수준으로 생성 + OHE 

`ad_views`: 전체 유저의 광고 뷰 수의 중앙값으로 대체 + 각각의 4분위 구해서 4가지 수준으로 생성  + OHE 

`doc_views`: 중앙값으로 대체 + 각각의 4분위 구해서 4가지 수준으로 생성  + OHE 

`pop` 시리즈: 각 수준별 중앙값으로 대체 + 각각의 4분위 구해서 4가지 수준으로 생성 + OHE

`traffic_source`: 최빈값 1로 대체 + OHE
 
`event_platform`: 최빈값 1로 대체 + OHE

`campaign_id`: 추가하고, 원래 값 가져와서 OHE로 처리할 것.



--- 

**삭제된 Features**

`event_geo_location`: 삭제

`event_country_state`: 삭제

`event_hour`: 삭제

---

**지우긴 했으나 이후 과정 해야 하는 Features**

`doc_ad_category`: 해시된 것 지우고, 원래 값 가져와서 OHE로 처리할 것.

`ad_advertiser`: 해시된 것 지우고, 원래 값 가져와서 OHE로 처리할 것.

`doc_ad_publisher_id`: 해시된 것 지우고, 가장 많이 광고 내는 publisher_id로 대체 + OHE

`doc_ad_source_id`: 해시된 것 지우고, 최빈값으로 대체 + OHE

`event_country`: 해시된 것 지우고, 최빈값 US로 대체

In [327]:
train_dropped = train.drop('doc_ad_category_id_1', 'doc_ad_category_id_2', 'doc_ad_category_id_3', 
                          'doc_ad_topic_id_1', 'doc_ad_topic_id_2', 'doc_ad_topic_id_3',
                          'doc_event_category_id_1', 'doc_event_category_id_2', 'doc_event_category_id_3',
                           'doc_event_topic_id_1', 'doc_event_topic_id_2', 'doc_event_topic_id_3',
                          'event_hour', 'event_geo_location', 'event_country_state',
                          'event_country', 'ad_advertiser', 'doc_ad_publisher_id', 'doc_ad_source_id',
                          'doc_ad_category')

**user_views** : uuid를 join해서 유저별 views로 정리

In [330]:
train.select('user_views').describe().show()

+-------+-----------------+
|summary|       user_views|
+-------+-----------------+
|  count|          4740153|
|   mean|34.11778543857129|
| stddev|36.59554434203119|
|    min|                1|
|    max|              660|
+-------+-----------------+



**ad_views** : ad_id별 최소값으로 정리

**doc_views** : doc_id별 최소값으로 정리

### 어