# Creating a Data Table

As of Dec 04, 2018

## Part A: 분석을 위한 환경 설정

### 모듈/패키지 로드

In [1]:
# Run-mode flags; the code that reads them is not part of this section.
evaluation = True
evaluation_verbose = False

# Google Cloud Storage prefixes. DATA_BUCKET_FOLDER is the root of the input
# CSV files read below; OUTPUT_BUCKET_FOLDER is not referenced in this section.
OUTPUT_BUCKET_FOLDER = "gs://cap-18/output/"
DATA_BUCKET_FOLDER = "gs://cap-18/data/"

In [2]:
from IPython.display import display

In [3]:
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.ml.linalg import Vectors, SparseVector, VectorUDT

In [4]:
from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all" # Display the result of every expression in a cell

In [5]:
import numpy as np
import scipy.sparse

In [6]:
import warnings 
warnings.filterwarnings('ignore') # Suppress warning messages

In [7]:
import math
import datetime
import time
import itertools

In [8]:
import pickle

In [9]:
import random
# Fix the seed of Python's `random` module.
random.seed(42)

In [10]:
import pandas as pd
%matplotlib inline

### 데이터 로드

In [11]:
# UDF returning the first two characters of the whitespace-stripped geo
# location string, or '' when the value is null.
# NOTE(review): presumably those two characters are the country code - confirm
# against the geo_location format.
extract_country_udf = F.udf(lambda geo: geo.strip()[:2] if geo is not None else '', StringType())

In [12]:
# UDF turning a timestamp into a whole-day index by dividing by
# 1000 * 60 * 60 * 24, i.e. it treats the input as milliseconds.
# The CSV readers below map '\N' to null, so a null timestamp is passed through
# as null instead of raising TypeError inside the UDF.
truncate_day_from_timestamp_udf = F.udf(
    lambda ts: int(ts / 1000 / 60 / 60 / 24) if ts is not None else None,
    IntegerType())

In [13]:
# Schema of events.csv; every column is declared nullable.
events_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("display_id", IntegerType()),
        ("uuid_event", StringType()),
        ("document_id_event", IntegerType()),
        ("timestamp_event", IntegerType()),
        ("platform_event", IntegerType()),
        ("geo_location_event", StringType()),
    )
])

# Read the CSV with the explicit schema: the first line is a header and the
# two-character token \N stands for null.
_events_csv_df = spark.read.schema(events_schema) \
    .options(header='true', inferschema='false', nullValue='\\N') \
    .csv(DATA_BUCKET_FOLDER + "events.csv")

# Add a constant marker column, the two-character prefix of the geo location
# and the day index of the event timestamp, then alias the frame as 'events'.
events_df = (
    _events_csv_df
    .withColumn('dummyEvents', F.lit(1))
    .withColumn('event_country', extract_country_udf('geo_location_event'))
    .withColumn('day_event', truncate_day_from_timestamp_udf('timestamp_event'))
    .alias('events')
)

In [14]:
# Schema of promoted_content.csv; every column is a nullable integer.
promoted_content_schema = StructType([
    StructField(column_name, IntegerType(), True)
    for column_name in ("ad_id", "document_id_promo", "campaign_id", "advertiser_id")
])

# Read the CSV (header line present, \N stands for null).
_promoted_content_csv_df = spark.read.schema(promoted_content_schema) \
    .options(header='true', inferschema='false', nullValue='\\N') \
    .csv(DATA_BUCKET_FOLDER+"promoted_content.csv")

# Add a constant marker column, alias the frame and cache it.
promoted_content_df = (
    _promoted_content_csv_df
    .withColumn('dummyPromotedContent', F.lit(1))
    .alias('promoted_content')
    .cache()
)

In [15]:
# Schema of documents_meta.csv; every column is a nullable integer.
documents_meta_schema = StructType([
    StructField(column_name, IntegerType(), True)
    for column_name in ("document_id_doc", "source_id", "publisher_id")
])

# Read the CSV (header line present, \N stands for null).
_documents_meta_csv_df = spark.read.schema(documents_meta_schema) \
    .options(header='true', inferschema='false', nullValue='\\N') \
    .csv(DATA_BUCKET_FOLDER+"documents_meta.csv")

# Add a constant marker column, alias the frame and cache it.
documents_meta_df = (
    _documents_meta_csv_df
    .withColumn('dummyDocumentsMeta', F.lit(1))
    .alias('documents_meta')
    .cache()
)

In [16]:
# Schema of page_views.csv; every column is declared nullable.
page_views_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("uuid_pv", StringType()),
        ("document_id_pv", IntegerType()),
        ("timestamp_pv", IntegerType()),
        ("platform_pv", IntegerType()),
        ("geo_location_pv", StringType()),
        ("traffic_source_pv", IntegerType()),
    )
])

# Read the CSV (header line present, \N stands for null) and alias the frame.
_page_views_reader = spark.read.schema(page_views_schema) \
    .options(header='true', inferschema='false', nullValue='\\N')
page_views_df = _page_views_reader.csv(DATA_BUCKET_FOLDER+"page_views.csv").alias('page_views')

# Register the frame as the temporary view 'page_views'.
page_views_df.createOrReplaceTempView('page_views')

In [17]:
# Document metadata with source/publisher renamed so they are identifiable as
# belonging to the event's document after the join.
_event_documents_meta_df = documents_meta_df \
    .withColumnRenamed('source_id', 'source_id_doc_event') \
    .withColumnRenamed('publisher_id', 'publisher_doc_event')

# A page view belongs to an event when user, document, platform and geo
# location are all equal.
_event_page_view_match = [
    F.col('uuid_event') == F.col('uuid_pv'),
    F.col('document_id_event') == F.col('document_id_pv'),
    F.col('platform_event') == F.col('platform_pv'),
    F.col('geo_location_event') == F.col('geo_location_pv'),
]

_events_with_documents_df = events_df.join(
    _event_documents_meta_df,
    on=F.col("document_id_event") == F.col("document_id_doc"),
    how='left')

# Left-join page views to obtain traffic_source_pv, then alias and cache.
events_joined_df = (
    _events_with_documents_df
    .join(page_views_df, on=_event_page_view_match, how='left')
    .alias('events')
    .cache()
)

In [18]:
# Schema of clicks_train.csv; every column is a nullable integer.
clicks_train_schema = StructType([
    StructField(column_name, IntegerType(), True)
    for column_name in ("display_id", "ad_id", "clicked")
])

# Read the CSV (header line present, \N stands for null).
_clicks_train_csv_df = spark.read.schema(clicks_train_schema) \
    .options(header='true', inferschema='false', nullValue='\\N') \
    .csv(DATA_BUCKET_FOLDER+"clicks_train.csv")

# Add a constant marker column, alias the frame and cache it.
clicks_train_df = (
    _clicks_train_csv_df
    .withColumn('dummyClicksTrain', F.lit(1))
    .alias('clicks_train')
    .cache()
)

In [19]:
# Join condition between an ad's promoted document and the document metadata.
_promoted_document_match = (
    F.col("promoted_content.document_id_promo") == F.col("documents_meta.document_id_doc")
)

# Left-join, in this order: promoted content (by ad_id), document metadata of
# the promoted document, and the joined events (by display_id); then cache.
clicks_train_joined_df = (
    clicks_train_df
    .join(promoted_content_df, on='ad_id', how='left')
    .join(documents_meta_df, on=_promoted_document_match, how='left')
    .join(events_joined_df, on='display_id', how='left')
    .cache()
)

# Register the result as the temporary view 'clicks_train_joined'.
clicks_train_joined_df.createOrReplaceTempView('clicks_train_joined')

In [20]:
# clicks_train_joined_df.show(1)

In [21]:
# Load the previously exported feature-vector tables (parquet) for the
# training and validation sets.
# NOTE(review): these paths point at gs://capstone-01/output/, not at
# OUTPUT_BUCKET_FOLDER (gs://cap-18/output/) - confirm this is intentional.
train_feature_vectors_exported_df = spark.read.parquet("gs://capstone-01/output/train_feature_vectors_integral_eval")
validation_feature_vectors_exported_df = spark.read.parquet("gs://capstone-01/output/validation_feature_vectors_integral")

# train_feature_vectors_exported_df.take(1)
# validation_feature_vectors_exported_df.take(1)

In [22]:
# Labels of the boolean features; they come first in feature_vector_labels_integral.
bool_feature_names = ['event_weekend', 'user_has_already_viewed_doc']

In [23]:
# Labels of the integer features, in feature-vector order.
int_feature_names = [
    'user_views', 'ad_views', 'doc_views',
    'doc_event_days_since_published', 'doc_event_hour', 'doc_ad_days_since_published',
]

In [24]:
# Every float feature comes as a triple: the value itself, its confidence
# ('_conf') and their combination ('_conf_multipl'). The list is generated from
# the base names so the three variants always stay adjacent and in this order.
_float_feature_bases = [
    'pop_ad_id',
    'pop_document_id',
    'pop_publisher_id',
    'pop_advertiser_id',
    'pop_campaign_id',
    'pop_doc_event_doc_ad',
    'pop_source_id',
    'pop_source_id_country',
    'pop_entity_id',
    'pop_entity_id_country',
    'pop_topic_id',
    'pop_topic_id_country',
    'pop_category_id',
    'pop_category_id_country',
    'user_doc_ad_sim_categories',
    'user_doc_ad_sim_topics',
    'user_doc_ad_sim_entities',
    'doc_event_doc_ad_sim_categories',
    'doc_event_doc_ad_sim_topics',
    'doc_event_doc_ad_sim_entities',
]

float_feature_names = [
    base_name + suffix
    for base_name in _float_feature_bases
    for suffix in ('', '_conf', '_conf_multipl')
]

In [25]:
# Names of the categorical feature groups.

# Event-level features.
TRAFFIC_SOURCE_FV = 'traffic_source'
EVENT_HOUR_FV = 'event_hour'
EVENT_COUNTRY_FV = 'event_country'
EVENT_COUNTRY_STATE_FV = 'event_country_state'
EVENT_GEO_LOCATION_FV = 'event_geo_location'
EVENT_PLATFORM_FV = 'event_platform'

# Advertiser, source and publisher features.
AD_ADVERTISER_FV = 'ad_advertiser'
DOC_AD_SOURCE_ID_FV = 'doc_ad_source_id'
DOC_AD_PUBLISHER_ID_FV = 'doc_ad_publisher_id'
DOC_EVENT_SOURCE_ID_FV = 'doc_event_source_id'
DOC_EVENT_PUBLISHER_ID_FV = 'doc_event_publisher_id'

# Category / topic / entity features of the ad document and the event document.
DOC_AD_CATEGORY_ID_FV = 'doc_ad_category_id'
DOC_AD_TOPIC_ID_FV = 'doc_ad_topic_id'
DOC_AD_ENTITY_ID_FV = 'doc_ad_entity_id'
DOC_EVENT_CATEGORY_ID_FV = 'doc_event_category_id'
DOC_EVENT_TOPIC_ID_FV = 'doc_event_topic_id'
DOC_EVENT_ENTITY_ID_FV = 'doc_event_entity_id'

In [26]:
def _numbered(prefix, count):
    """Return ['<prefix>_1', ..., '<prefix>_<count>']."""
    return ['{}_{}'.format(prefix, number) for number in range(1, count + 1)]


# Labels of the categorical features, in feature-vector order. Multi-valued
# groups are expanded into numbered slots: 3 categories, 3 topics and
# 6 entities, for both the ad document and the event document.
category_feature_names_integral = (
    ['ad_advertiser']
    + _numbered('doc_ad_category_id', 3)
    + _numbered('doc_ad_topic_id', 3)
    + _numbered('doc_ad_entity_id', 6)
    + ['doc_ad_publisher_id', 'doc_ad_source_id']
    + _numbered('doc_event_category_id', 3)
    + _numbered('doc_event_topic_id', 3)
    + _numbered('doc_event_entity_id', 6)
    + ['doc_event_publisher_id',
       'doc_event_source_id',
       'event_country',
       'event_country_state',
       'event_geo_location',
       'event_hour',
       'event_platform',
       'traffic_source']
)

In [27]:
# All feature labels in feature-vector order: boolean, integer, float, then
# categorical.
feature_vector_labels_integral = []
for _label_group in (bool_feature_names,
                     int_feature_names,
                     float_feature_names,
                     category_feature_names_integral):
    feature_vector_labels_integral.extend(_label_group)

In [28]:
# Column names of the exported rows: six label/identifier columns followed by
# every feature label.
integral_headers = ['label', 'display_id', 'ad_id', 'doc_id', 'doc_event_id', 'is_leak'] + feature_vector_labels_integral

# Write the column names, one per line, to a local ".header" file.
# write() is used because the payload is one pre-joined string; writelines()
# would iterate over it character by character. The file content is unchanged.
with open("train_feature_vectors_integral_eval.csv"+".header", 'w') as output:
    output.write('\n'.join(integral_headers))

In [29]:
def sparse_vector_to_csv_with_nulls_row(additional_column_values, vec, num_columns):
    """Render one CSV line: the leading values, then the entries of a sparse vector.

    Args:
        additional_column_values: values written first, each converted with ``str()``.
        vec: sparse vector exposing ``size``, ``indices`` and integer indexing
            (for example ``pyspark.ml.linalg.SparseVector``).
        num_columns: maximum number of vector positions to emit. The limit
            applies to the vector part only, not to the leading values.

    Returns:
        A comma-joined string. Positions listed in ``vec.indices`` are formatted
        with ``'{:.5}'``; all other positions become empty fields. A trailing
        ``.0`` is removed from every field, including the last one.
    """
    # Set membership is O(1); testing against vec.indices directly scans the
    # whole index array for every position.
    stored_positions = set(int(index) for index in vec.indices)

    fields = [str(value) for value in additional_column_values]
    # Slice the positions up front so entries beyond num_columns are never formatted.
    fields.extend('{:.5}'.format(vec[position]) if position in stored_positions else ''
                  for position in range(vec.size)[:num_columns])

    # Strip ".0" per field. Doing it on the joined string with
    # replace('.0,', ',') misses the final field, which has no trailing comma.
    return ','.join(field[:-2] if field.endswith('.0') else field for field in fields)

In [30]:
def _training_row_to_csv_line(row):
    """Format one exported training row as a CSV line (leading columns, then features)."""
    leading_values = [row['label'], row['display_id'], row['ad_id'],
                      row['document_id'], row['document_id_event'], row['is_leak']]
    return sparse_vector_to_csv_with_nulls_row(leading_values, row['feature_vector'], len(integral_headers))


# is_leak is not read from the training export; it is set to the constant -1
# so every row still carries the six leading columns.
train_feature_vectors_integral_csv_rdd = (
    train_feature_vectors_exported_df
    .select('label', 'display_id', 'ad_id', 'document_id', 'document_id_event', 'feature_vector')
    .withColumn('is_leak', F.lit(-1))
    .rdd
    .map(_training_row_to_csv_line)
)

In [31]:
# Convert the RDD of CSV lines into a DataFrame -> trainingData.
# Each line is split on "," so every field becomes one string column.
trainingData = train_feature_vectors_integral_csv_rdd.map(lambda x : x.split(",")).toDF()

In [32]:
# Confirm that the conversion produced a Spark DataFrame
type(trainingData)

pyspark.sql.dataframe.DataFrame

In [33]:
# Now declare the name and data type of every column.
#
# trainingData has positional columns _1 .. _109. Their names are taken from
# integral_headers, which lists the same 109 names in the same order, so the
# header file and this schema cannot drift apart.
# Positions 13-74 are cast to Float: doc_event_hour, doc_ad_days_since_published
# and the 60 float features. Every other position is cast to Integer.
trainingData_final = trainingData.select(*[
    trainingData["_{}".format(position)]
        .cast("Float" if 13 <= position <= 74 else "Integer")
        .alias(column_name)
    for position, column_name in enumerate(integral_headers, start=1)
]).cache()

In [34]:
# trainingData_final.groupBy("event_platform").count().show()

---

In [34]:
# Base name used for the validation ".header" file written below.
validation_feature_vector_integral_csv_folder_name = 'validation_feature_vectors_integral.csv'

# Column names of the exported rows: six label/identifier columns followed by
# every feature label.
integral_headers = ['label', 'display_id', 'ad_id', 'doc_id', 'doc_event_id', 'is_leak'] + feature_vector_labels_integral

# Write the column names, one per line, to a local ".header" file.
# write() is used because the payload is one pre-joined string; writelines()
# would iterate over it character by character. The file content is unchanged.
with open(validation_feature_vector_integral_csv_folder_name+".header", 'w') as output:
    output.write('\n'.join(integral_headers))

In [35]:
def _validation_row_to_csv_line(row):
    """Format one exported validation row as a CSV line (leading columns, then features)."""
    leading_values = [row['label'], row['display_id'], row['ad_id'],
                      row['document_id'], row['document_id_event'], row['is_leak']]
    return sparse_vector_to_csv_with_nulls_row(leading_values, row['feature_vector'], len(integral_headers))


# Here is_leak is read from the validation export itself.
validation_feature_vectors_integral_csv_rdd = (
    validation_feature_vectors_exported_df
    .select('label', 'display_id', 'ad_id', 'document_id', 'document_id_event', 'is_leak', 'feature_vector')
    .rdd
    .map(_validation_row_to_csv_line)
)

In [36]:
# Convert the RDD of CSV lines into a DataFrame -> validationData.
# Each line is split on "," so every field becomes one string column.
validationData = validation_feature_vectors_integral_csv_rdd.map(lambda x : x.split(",")).toDF()

In [37]:
# Confirm that the conversion produced a Spark DataFrame
type(validationData)

pyspark.sql.dataframe.DataFrame

In [38]:
# Now declare the name and data type of every column.
#
# validationData has positional columns _1 .. _109. Their names are taken from
# integral_headers, which lists the same 109 names in the same order, so the
# header file and this schema cannot drift apart.
# Positions 13-74 are cast to Float: doc_event_hour, doc_ad_days_since_published
# and the 60 float features. Every other position is cast to Integer.
validationData_final = validationData.select(*[
    validationData["_{}".format(position)]
        .cast("Float" if 13 <= position <= 74 else "Integer")
        .alias(column_name)
    for position, column_name in enumerate(integral_headers, start=1)
])

In [39]:
# Columns removed from the training table: all entity features, the two
# "days since published" counters, user_has_already_viewed_doc and every
# similarity feature. The pop_entity_id trio appears twice, as in the original call.
_training_columns_to_drop = [
    'doc_event_entity_id_1', 'doc_event_entity_id_2', 'doc_event_entity_id_3',
    'doc_event_entity_id_4', 'doc_event_entity_id_5', 'doc_event_entity_id_6',
    'pop_entity_id', 'pop_entity_id_conf', 'pop_entity_id_conf_multipl',
    'doc_ad_entity_id_1', 'doc_ad_entity_id_2', 'doc_ad_entity_id_3',
    'doc_ad_entity_id_4', 'doc_ad_entity_id_5', 'doc_ad_entity_id_6',
    'doc_ad_days_since_published', 'doc_event_days_since_published', 'user_has_already_viewed_doc',
    'pop_entity_id', 'pop_entity_id_conf', 'pop_entity_id_conf_multipl',
    'pop_entity_id_country', 'pop_entity_id_country_conf', 'pop_entity_id_country_conf_multipl',
    'user_doc_ad_sim_categories', 'user_doc_ad_sim_categories_conf', 'user_doc_ad_sim_categories_conf_multipl',
    'user_doc_ad_sim_topics', 'user_doc_ad_sim_topics_conf', 'user_doc_ad_sim_topics_conf_multipl',
    'user_doc_ad_sim_entities', 'user_doc_ad_sim_entities_conf', 'user_doc_ad_sim_entities_conf_multipl',
    'doc_event_doc_ad_sim_categories', 'doc_event_doc_ad_sim_categories_conf', 'doc_event_doc_ad_sim_categories_conf_multipl',
    'doc_event_doc_ad_sim_topics', 'doc_event_doc_ad_sim_topics_conf', 'doc_event_doc_ad_sim_topics_conf_multipl',
    'doc_event_doc_ad_sim_entities', 'doc_event_doc_ad_sim_entities_conf', 'doc_event_doc_ad_sim_entities_conf_multipl',
]
trainingData_final_dropped = trainingData_final.drop(*_training_columns_to_drop)

In [40]:
# Columns removed from the validation table: all entity features, the two
# "days since published" counters, user_has_already_viewed_doc and every
# similarity feature. The pop_entity_id trio appears twice, as in the original call.
_validation_columns_to_drop = [
    'doc_event_entity_id_1', 'doc_event_entity_id_2', 'doc_event_entity_id_3',
    'doc_event_entity_id_4', 'doc_event_entity_id_5', 'doc_event_entity_id_6',
    'pop_entity_id', 'pop_entity_id_conf', 'pop_entity_id_conf_multipl',
    'doc_ad_entity_id_1', 'doc_ad_entity_id_2', 'doc_ad_entity_id_3',
    'doc_ad_entity_id_4', 'doc_ad_entity_id_5', 'doc_ad_entity_id_6',
    'doc_ad_days_since_published', 'doc_event_days_since_published', 'user_has_already_viewed_doc',
    'pop_entity_id', 'pop_entity_id_conf', 'pop_entity_id_conf_multipl',
    'pop_entity_id_country', 'pop_entity_id_country_conf', 'pop_entity_id_country_conf_multipl',
    'user_doc_ad_sim_categories', 'user_doc_ad_sim_categories_conf', 'user_doc_ad_sim_categories_conf_multipl',
    'user_doc_ad_sim_topics', 'user_doc_ad_sim_topics_conf', 'user_doc_ad_sim_topics_conf_multipl',
    'user_doc_ad_sim_entities', 'user_doc_ad_sim_entities_conf', 'user_doc_ad_sim_entities_conf_multipl',
    'doc_event_doc_ad_sim_categories', 'doc_event_doc_ad_sim_categories_conf', 'doc_event_doc_ad_sim_categories_conf_multipl',
    'doc_event_doc_ad_sim_topics', 'doc_event_doc_ad_sim_topics_conf', 'doc_event_doc_ad_sim_topics_conf_multipl',
    'doc_event_doc_ad_sim_entities', 'doc_event_doc_ad_sim_entities_conf', 'doc_event_doc_ad_sim_entities_conf_multipl',
]
validationData_final_dropped = validationData_final.drop(*_validation_columns_to_drop)

In [41]:
# Bind the reduced tables to shorter names and mark them for caching.
train = trainingData_final_dropped.cache()
validation = validationData_final_dropped.cache()

In [42]:
from pyspark.sql.functions import lit

# Flag every row with its split of origin (1 = training, 0 = validation) so
# the two sets can still be told apart once they are unioned.
train, validation = (
    train.withColumn("is_train", lit(1)),
    validation.withColumn("is_train", lit(0)),
)

In [None]:
# train.printSchema()

In [None]:
# validation.printSchema()

In [43]:
# Stack the training rows and the validation rows into one DataFrame.
# NOTE(review): union() matches columns by position, not by name, so both
# inputs must list their columns in the same order — confirm upstream.
train_valid_merged = train.union(validation)
train_valid_merged = train_valid_merged.cache()

In [78]:
# train_valid_merged.count()

87141731

In [44]:
# Remove the per-slot category/topic ids of the ad and event documents and
# the geo / advertiser / publisher / source columns from the merged set.
_merged_drop_cols = [
    'doc_ad_category_id_1', 'doc_ad_category_id_2', 'doc_ad_category_id_3',
    'doc_ad_topic_id_1', 'doc_ad_topic_id_2', 'doc_ad_topic_id_3',
    'doc_event_category_id_1', 'doc_event_category_id_2', 'doc_event_category_id_3',
    'doc_event_topic_id_1', 'doc_event_topic_id_2', 'doc_event_topic_id_3',
    'event_geo_location', 'event_country_state',
    'event_country', 'ad_advertiser', 'doc_ad_publisher_id', 'doc_ad_source_id',
    'doc_ad_category',
]
train_valid_merged = train_valid_merged.drop(*_merged_drop_cols)

`display_id`와 `ad_id`를 사용하여 `uuid`를 가져오자.

In [45]:
# Schema of documents_categories.csv: one (document, category, confidence)
# triple per row.
documents_categories_schema = StructType(
                    [StructField("document_id_cat", IntegerType(), True),
                    StructField("category_id", IntegerType(), True),
                    StructField("confidence_level_cat", FloatType(), True)]
                    )

documents_categories_df = spark.read.schema(documents_categories_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER+"documents_categories.csv") \
                .alias('documents_categories').cache()

# One row per document with two parallel lists: category ids and their
# confidence levels, ordered by descending confidence.
# collect_list() makes no promise about element order after the shuffle, and
# later cells read position 0 as "the most confident category". The
# (confidence, id) pairs are therefore collected as structs and sorted
# explicitly; the struct also keeps each id aligned with its own confidence.
documents_categories_grouped_df = documents_categories_df.groupBy('document_id_cat') \
                                            .agg(F.sort_array(F.collect_list(F.struct('confidence_level_cat', 'category_id')),
                                                              asc=False).alias('cat_pairs')) \
                                            .select('document_id_cat',
                                                    F.col('cat_pairs.category_id').alias('category_id_list'),
                                                    F.col('cat_pairs.confidence_level_cat').alias('confidence_level_cat_list')) \
                                            .withColumn('dummyDocumentsCategory', F.lit(1)) \
                                            .alias('documents_categories_grouped')

In [46]:
# Schema of documents_topics.csv: one (document, topic, confidence) triple
# per row.
documents_topics_schema = StructType(
                    [StructField("document_id_top", IntegerType(), True),
                    StructField("topic_id", IntegerType(), True),
                    StructField("confidence_level_top", FloatType(), True)]
                    )

documents_topics_df = spark.read.schema(documents_topics_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER+"documents_topics.csv")  \
                .alias('documents_topics').cache()

# One row per document with two parallel lists: topic ids and their
# confidence levels, ordered by descending confidence.
# collect_list() makes no promise about element order after the shuffle, and
# later cells read position 0 as "the most confident topic". The
# (confidence, id) pairs are therefore collected as structs and sorted
# explicitly; the struct also keeps each id aligned with its own confidence.
documents_topics_grouped_df = documents_topics_df.groupBy('document_id_top') \
                                            .agg(F.sort_array(F.collect_list(F.struct('confidence_level_top', 'topic_id')),
                                                              asc=False).alias('top_pairs')) \
                                            .select('document_id_top',
                                                    F.col('top_pairs.topic_id').alias('topic_id_list'),
                                                    F.col('top_pairs.confidence_level_top').alias('confidence_level_top_list')) \
                                            .withColumn('dummyDocumentsTopics', F.lit(1)) \
                                            .alias('documents_topics_grouped')

In [47]:
# Read clicks_train.csv with the explicit schema ('\N' is parsed as null),
# add a constant marker column and cache the result.
clicks_train_df = (
    spark.read
    .schema(clicks_train_schema)
    .options(header='true', inferschema='false', nullValue='\\N')
    .csv(DATA_BUCKET_FOLDER + "clicks_train.csv")
    .withColumn('dummyClicksTrain', F.lit(1))
    .alias('clicks_train')
    .cache()
)

In [48]:
# Attach the event columns to each click row via display_id; the left join
# keeps click rows that have no matching event.
clicks_joined_df = clicks_train_df.join(events_df, on='display_id', how='left')
clicks_joined_df = clicks_joined_df.cache()

# clicks_joined_df.count()

In [47]:
# clicks_joined_df.show(1)

In [49]:
# Bring the click/event columns onto the merged set; rows are matched on the
# (display_id, ad_id) pair with an inner join.
train_valid_merged_df = train_valid_merged.join(
    clicks_joined_df,
    on=['display_id', 'ad_id'],
    how='inner'
).cache()
# train_valid_merged_df.count() : 87141731 rows

`ad_id`를 사용하여 `campaign_id`와 `advertiser_id`를 가져오자.

In [50]:
# Inner-join the promoted-content columns onto every row by ad_id.
train_valid_ad_merged_df = train_valid_merged_df.join(
    promoted_content_df,
    on='ad_id',
    how='inner'
).cache()
# train_valid_ad_merged_df.count()

In [54]:
# train_valid_ad_merged_df.printSchema()

root
 |-- ad_id: integer (nullable = true)
 |-- display_id: integer (nullable = true)
 |-- label: integer (nullable = true)
 |-- doc_id: integer (nullable = true)
 |-- doc_event_id: integer (nullable = true)
 |-- is_leak: integer (nullable = true)
 |-- event_weekend: integer (nullable = true)
 |-- user_views: integer (nullable = true)
 |-- ad_views: integer (nullable = true)
 |-- doc_views: integer (nullable = true)
 |-- doc_event_hour: float (nullable = true)
 |-- pop_ad_id: float (nullable = true)
 |-- pop_ad_id_conf: float (nullable = true)
 |-- pop_ad_id_conf_multipl: float (nullable = true)
 |-- pop_document_id: float (nullable = true)
 |-- pop_document_id_conf: float (nullable = true)
 |-- pop_document_id_conf_multipl: float (nullable = true)
 |-- pop_publisher_id: float (nullable = true)
 |-- pop_publisher_id_conf: float (nullable = true)
 |-- pop_publisher_id_conf_multipl: float (nullable = true)
 |-- pop_advertiser_id: float (nullable = true)
 |-- pop_advertiser_id_conf: float

In [51]:
# Strip the constant marker columns added at load time plus the raw geo
# string, the raw timestamp and the `clicked` column.
_subset_drop_cols = [
    'dummyEvents', 'dummyPromotedContent', 'geo_location_event',
    'timestamp_event', 'dummyClicksTrain', 'clicked',
]
train_subset = train_valid_ad_merged_df.drop(*_subset_drop_cols).cache()

In [124]:
# train_subset.count()

87141731

In [126]:
# train_subset.write.csv("gs://cap-18/output/train_subset.csv")

In [97]:
# train_subset.groupBy("event_platform").count().show()

+--------------+--------+
|event_platform|   count|
+--------------+--------+
|          null|      30|
|             1|37519782|
|             3|12785378|
|             2|36836541|
+--------------+--------+



**주의) overwrite를 조심하자...**

`document_id`를 사용하여 `category_id`, `topic_id`, `publisher_id`를 가져오자.

페이지별로 신뢰도가 가장 높은 카테고리와 토픽을 하나씩만 뽑아보자.

In [165]:
# documents_categories_grouped_df.show(10)

# In the sample below the lists appear ordered by descending confidence.
# NOTE(review): collect_list itself does not guarantee element order — confirm
# before treating position 0 as the most confident category.

+---------------+----------------+-------------------------+----------------------+
|document_id_cat|category_id_list|confidence_level_cat_list|dummyDocumentsCategory|
+---------------+----------------+-------------------------+----------------------+
|            148|    [1403, 1702]|             [0.92, 0.07]|                     1|
|            463|    [1513, 1808]|     [0.8932095, 0.067...|                     1|
|            471|    [1504, 1609]|             [0.92, 0.07]|                     1|
|            496|    [1210, 1203]|             [0.92, 0.07]|                     1|
|            833|    [1305, 2004]|             [0.92, 0.07]|                     1|
|           1088|    [2006, 1210]|     [0.8364613, 0.063...|                     1|
|           1238|    [1100, 1407]|     [0.34836665, 0.02...|                     1|
|           1342|    [1408, 2004]|     [0.42835742, 0.03...|                     1|
|           1580|    [1403, 1402]|     [0.65625566, 0.04...|                

In [142]:
# documents_categories_grouped_df.count()

2828649

In [52]:
# Pull the grouped category lists to the driver and spread each list into
# columns: ids into one frame, confidences into another.
# NOTE(review): exactly two column names are supplied, so this presumably
# relies on the longest category list having two entries — confirm.
documents_categories_grouped_df_pandas = documents_categories_grouped_df.toPandas()
documents_category_new = pd.DataFrame(
    documents_categories_grouped_df_pandas['category_id_list'].tolist(),
    columns=['category_id', 'category_id2'],
)
documents_category_new2 = pd.DataFrame(
    documents_categories_grouped_df_pandas['confidence_level_cat_list'].tolist(),
    columns=['category_conf', 'category2_conf'],
)

In [56]:
# documents_categories_grouped_df_pandas.head(1)
# documents_category_new.head(1)
# documents_category_new2.head(1)

Unnamed: 0,document_id_cat,category_id_list,confidence_level_cat_list,dummyDocumentsCategory
0,148,"[1403, 1702]","[0.920000016689, 0.070000000298]",1


Unnamed: 0,category_id,category_id2
0,1403,1702.0


Unnamed: 0,category_conf,category2_conf
0,0.92,0.07


In [53]:
# Keep only the first category and its confidence (as Series).
documents_category_new = documents_category_new.loc[:, 'category_id']
documents_category_new2 = documents_category_new2.loc[:, 'category_conf']

In [54]:
# Put the first-category id and confidence next to the document id and keep
# just those three columns.
_category_parts = [documents_categories_grouped_df_pandas, documents_category_new, documents_category_new2]
documents_categories_grouped_df_pandas = pd.concat(_category_parts, axis=1)[
    ['document_id_cat', 'category_id', 'category_conf']
]

In [65]:
# documents_categories_grouped_df_pandas.isnull().sum()

document_id_cat    0
category_id        0
category_conf      0
dtype: int64

In [66]:
# documents_topics_grouped_df.show(10)

# The topic lists in the sample below likewise appear ordered by descending confidence.
# NOTE(review): collect_list itself does not guarantee element order — confirm
# before treating position 0 as the most confident topic.

+---------------+--------------------+-------------------------+--------------------+
|document_id_top|       topic_id_list|confidence_level_top_list|dummyDocumentsTopics|
+---------------+--------------------+-------------------------+--------------------+
|            148|[153, 140, 8, 172...|     [0.07523697, 0.07...|                   1|
|            463|[181, 292, 24, 25...|     [0.11870128, 0.05...|                   1|
|            471|[285, 238, 153, 193]|     [0.15588789, 0.04...|                   1|
|            496|[244, 294, 196, 1...|     [0.18284231, 0.11...|                   1|
|            833|[294, 89, 174, 86...|     [0.11430275, 0.04...|                   1|
|           1088|[107, 75, 153, 64...|     [0.10822894, 0.06...|                   1|
|           1238| [89, 221, 192, 236]|     [0.023348164, 0.0...|                   1|
|           1342|[271, 283, 181, 2...|     [0.0457309, 0.025...|                   1|
|           1580|[8, 37, 136, 12, ...|     [0.08965496

In [55]:
# Collect the per-document topic lists to the driver as a pandas DataFrame.
documents_topics_grouped_df_pandas = documents_topics_grouped_df.toPandas()
# documents_topics_grouped_df_pandas.head(1)

In [56]:
# Keep only the first topic id and its confidence for every document.
# Reading element 0 of each list directly avoids spreading every list into a
# wide frame with a fixed number of placeholder column names, which fails as
# soon as the longest topic list has a different length and allocates a large
# padded table only to read its first column.
documents_topics_new = pd.Series(
    [topic_ids[0] for topic_ids in documents_topics_grouped_df_pandas['topic_id_list']],
    name='topic_id',
)
documents_topics_new2 = pd.Series(
    [confidences[0] for confidences in documents_topics_grouped_df_pandas['confidence_level_top_list']],
    name='topic_conf',
)

# documents_topics_new.head(1)
# documents_topics_new2.head(1)

In [57]:
# Put the first-topic id and confidence next to the document id and keep
# just those three columns.
_topic_parts = [documents_topics_grouped_df_pandas, documents_topics_new, documents_topics_new2]
documents_topics_grouped_df_pandas = pd.concat(_topic_parts, axis=1)[
    ['document_id_top', 'topic_id', 'topic_conf']
]

In [72]:
# documents_topics_grouped_df_pandas.count()
# documents_topics_grouped_df_pandas.head(10)
# documents_topics_grouped_df_pandas.isnull().sum()

document_id_top    2495423
topic_id           2495423
topic_conf         2495423
dtype: int64

Unnamed: 0,document_id_top,topic_id,topic_conf
0,148,153,0.075237
1,463,181,0.118701
2,471,285,0.155888
3,496,244,0.182842
4,833,294,0.114303
5,1088,107,0.108229
6,1238,89,0.023348
7,1342,271,0.045731
8,1580,8,0.089655
9,1591,260,0.06284


document_id_top    0
topic_id           0
topic_conf         0
dtype: int64

생성한 페이지별 카테고리, 토픽 정보를 기존 파일에 합치자.

In [58]:
# Build categories_schema and topic_schema, then turn the two pandas frames
# back into Spark DataFrames.
# NOTE(review): the id columns are declared as strings here, whereas the CSV
# schemas read the same ids as integers — confirm this is intended.

categories_schema = StructType() \
    .add("document_id_cat", StringType(), True) \
    .add("category_id", StringType(), True) \
    .add("category_conf", FloatType(), True)

topic_schema = StructType() \
    .add("document_id_top", StringType(), True) \
    .add("topic_id", StringType(), True) \
    .add("topic_conf", FloatType(), True)

documents_categories_table = sqlContext.createDataFrame(
    documents_categories_grouped_df_pandas, schema=categories_schema)
documents_topics_table = sqlContext.createDataFrame(
    documents_topics_grouped_df_pandas, schema=topic_schema)

In [99]:
# documents_categories_table.show(1)
# documents_topics_table.show(1)

+---------------+-----------+-------------+
|document_id_cat|category_id|category_conf|
+---------------+-----------+-------------+
|            148|        153|   0.07523697|
+---------------+-----------+-------------+
only showing top 1 row

+---------------+--------+----------+
|document_id_top|topic_id|topic_conf|
+---------------+--------+----------+
|            148|    1403|      0.92|
+---------------+--------+----------+
only showing top 1 row



In [59]:
# Look up the category of the document in `doc_id`; the left join keeps rows
# whose document has no category entry.
train_subset_joined_df = train_subset.join(
    documents_categories_table,
    how='left',
    on=(F.col("doc_id") == F.col("document_id_cat")),
)

In [60]:
# Look up the topic of the document in `doc_id`; the left join keeps rows
# whose document has no topic entry.
train_subset_joined_df = train_subset_joined_df.join(
    documents_topics_table,
    how='left',
    on=(F.col('doc_id') == F.col('document_id_top')),
)

In [116]:
# train_subset_joined_df.count()

87141731

In [192]:
# train_subset_joined_df.printSchema()

root
 |-- ad_id: integer (nullable = true)
 |-- display_id: integer (nullable = true)
 |-- label: integer (nullable = true)
 |-- doc_id: integer (nullable = true)
 |-- doc_event_id: integer (nullable = true)
 |-- is_leak: integer (nullable = true)
 |-- event_weekend: integer (nullable = true)
 |-- user_views: integer (nullable = true)
 |-- ad_views: integer (nullable = true)
 |-- doc_views: integer (nullable = true)
 |-- doc_event_hour: float (nullable = true)
 |-- pop_ad_id: float (nullable = true)
 |-- pop_ad_id_conf: float (nullable = true)
 |-- pop_ad_id_conf_multipl: float (nullable = true)
 |-- pop_document_id: float (nullable = true)
 |-- pop_document_id_conf: float (nullable = true)
 |-- pop_document_id_conf_multipl: float (nullable = true)
 |-- pop_publisher_id: float (nullable = true)
 |-- pop_publisher_id_conf: float (nullable = true)
 |-- pop_publisher_id_conf_multipl: float (nullable = true)
 |-- pop_advertiser_id: float (nullable = true)
 |-- pop_advertiser_id_conf: float

In [61]:
# Prefix the freshly joined category/topic columns with `ad_` so the same
# lookup tables can be joined again without name clashes.
for _old_name, _new_name in (('topic_id', 'ad_topic_id'),
                             ('category_id', 'ad_category_id'),
                             ('topic_conf', 'ad_topic_conf'),
                             ('category_conf', 'ad_category_conf')):
    train_subset_joined_df = train_subset_joined_df.withColumnRenamed(_old_name, _new_name)

In [62]:
# Remove the join keys left over from the two lookups, the `doc_event_id`
# column and the topic-by-country popularity columns.
_post_lookup_drop_cols = [
    'document_id_top', 'document_id_cat', 'doc_event_id',
    'pop_topic_id_country', 'pop_topic_id_country_conf', 'pop_topic_id_country_conf_multipl',
]
train_subset_joined_df = train_subset_joined_df.drop(*_post_lookup_drop_cols)

In [63]:
# Second category lookup, this time keyed on `document_id_event`.
train_subset_joined_df = train_subset_joined_df.join(
    documents_categories_table,
    how='left',
    on=(F.col("document_id_event") == F.col("document_id_cat")),
)

In [64]:
# Second topic lookup, this time keyed on `document_id_event`.
train_subset_joined_df = train_subset_joined_df.join(
    documents_topics_table,
    how='left',
    on=(F.col("document_id_event") == F.col("document_id_top")),
)

In [65]:
# Prefix the category/topic columns from the `document_id_event` lookup with
# `view_`.
for _old_name, _new_name in (('topic_id', 'view_topic_id'),
                             ('category_id', 'view_category_id'),
                             ('topic_conf', 'view_topic_conf'),
                             ('category_conf', 'view_category_conf')):
    train_subset_joined_df = train_subset_joined_df.withColumnRenamed(_old_name, _new_name)

In [81]:
# checking_null = train_subset_joined_df.select('view_topic_id', 'view_category_id', 'view_topic_conf', 'view_category_conf',
#                               'ad_topic_id', 'ad_category_id', 'ad_topic_conf', 'ad_category_conf')

In [None]:
# 변수별 NA 개수 체크
# from pyspark.sql.functions import isnan, when, count, col
# checking_null.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in checking_null.columns]).show()

In [66]:
# Prepare the event time information to attach: only the display id and the
# raw event timestamp are selected here.
events_time = events_df.select(['display_id', 'timestamp_event'])

In [66]:
# events_time.show(1)

In [67]:
# Re-attach the event timestamp to every row through display_id.
train_subset_joined_df = train_subset_joined_df.join(
    events_time,
    how='left',
    on='display_id',
)

In [199]:
# train_subset_joined_df.count()

87141731

In [68]:
# Continue under a new name; this is a plain alias of the joined DataFrame.
train_validation_checking = train_subset_joined_df

In [69]:
# List the column names currently present.
train_validation_checking.columns

['display_id',
 'ad_id',
 'label',
 'doc_id',
 'is_leak',
 'event_weekend',
 'user_views',
 'ad_views',
 'doc_views',
 'doc_event_hour',
 'pop_ad_id',
 'pop_ad_id_conf',
 'pop_ad_id_conf_multipl',
 'pop_document_id',
 'pop_document_id_conf',
 'pop_document_id_conf_multipl',
 'pop_publisher_id',
 'pop_publisher_id_conf',
 'pop_publisher_id_conf_multipl',
 'pop_advertiser_id',
 'pop_advertiser_id_conf',
 'pop_advertiser_id_conf_multipl',
 'pop_campaign_id',
 'pop_campaign_id_conf',
 'pop_campaign_id_conf_multipl',
 'pop_doc_event_doc_ad',
 'pop_doc_event_doc_ad_conf',
 'pop_doc_event_doc_ad_conf_multipl',
 'pop_source_id',
 'pop_source_id_conf',
 'pop_source_id_conf_multipl',
 'pop_source_id_country',
 'pop_source_id_country_conf',
 'pop_source_id_country_conf_multipl',
 'pop_topic_id',
 'pop_topic_id_conf',
 'pop_topic_id_conf_multipl',
 'pop_category_id',
 'pop_category_id_conf',
 'pop_category_id_conf_multipl',
 'pop_category_id_country',
 'pop_category_id_country_conf',
 'pop_categor

In [202]:
# train_validation_checking.select('document_id_cat', 'document_id_top', 'document_id_event', 'document_id_promo').show(3)

+---------------+---------------+-----------------+-----------------+
|document_id_cat|document_id_top|document_id_event|document_id_promo|
+---------------+---------------+-----------------+-----------------+
|        1205772|        1205772|          1205772|          1060089|
|        1205772|        1205772|          1205772|          1379561|
|        1205772|        1205772|          1205772|           990613|
+---------------+---------------+-----------------+-----------------+
only showing top 3 rows



In [70]:
# Remove duplicated and unnecessary columns.
_final_drop_cols = [
    'doc_id', 'platform_event',
    'pop_category_id_country',
    'pop_category_id_country_conf',
    'pop_category_id_country_conf_multipl',
    'pop_source_id_country',
    'pop_source_id_country_conf',
    'pop_source_id_country_conf_multipl',
    'document_id_cat', 'document_id_top',
    'doc_event_hour',
    'pop_topic_id_country',
    'pop_topic_id_country_conf',
    'pop_topic_id_country_conf_multipl',
]
train_validation_checking = train_validation_checking.drop(*_final_drop_cols)

In [70]:
# Attach the document metadata of the promoted document; the left join keeps
# rows without a metadata entry.
train_validation_checking = train_validation_checking.join(
    documents_meta_df,
    how='left',
    on=(F.col('document_id_promo') == F.col('document_id_doc')),
)

In [71]:
# List the column names after the metadata join.
train_validation_checking.columns

['display_id',
 'ad_id',
 'label',
 'doc_id',
 'is_leak',
 'event_weekend',
 'user_views',
 'ad_views',
 'doc_views',
 'doc_event_hour',
 'pop_ad_id',
 'pop_ad_id_conf',
 'pop_ad_id_conf_multipl',
 'pop_document_id',
 'pop_document_id_conf',
 'pop_document_id_conf_multipl',
 'pop_publisher_id',
 'pop_publisher_id_conf',
 'pop_publisher_id_conf_multipl',
 'pop_advertiser_id',
 'pop_advertiser_id_conf',
 'pop_advertiser_id_conf_multipl',
 'pop_campaign_id',
 'pop_campaign_id_conf',
 'pop_campaign_id_conf_multipl',
 'pop_doc_event_doc_ad',
 'pop_doc_event_doc_ad_conf',
 'pop_doc_event_doc_ad_conf_multipl',
 'pop_source_id',
 'pop_source_id_conf',
 'pop_source_id_conf_multipl',
 'pop_source_id_country',
 'pop_source_id_country_conf',
 'pop_source_id_country_conf_multipl',
 'pop_topic_id',
 'pop_topic_id_conf',
 'pop_topic_id_conf_multipl',
 'pop_category_id',
 'pop_category_id_conf',
 'pop_category_id_conf_multipl',
 'pop_category_id_country',
 'pop_category_id_country_conf',
 'pop_categor

In [72]:
# Drop the metadata marker and join key, then give the remaining id columns
# their final names.
train_validation_checking = train_validation_checking.drop('dummyDocumentsMeta', 'document_id_doc')
for _old_name, _new_name in (('source_id', 'ad_source_id'),
                             ('publisher_id', 'ad_publisher_id'),
                             ('uuid_event', 'uuid'),
                             ('document_id_event', 'view_doc_id'),
                             ('document_id_promo', 'ad_doc_id')):
    train_validation_checking = train_validation_checking.withColumnRenamed(_old_name, _new_name)

In [73]:
train_validation_checking.columns # check the final column names

['display_id',
 'ad_id',
 'label',
 'doc_id',
 'is_leak',
 'event_weekend',
 'user_views',
 'ad_views',
 'doc_views',
 'doc_event_hour',
 'pop_ad_id',
 'pop_ad_id_conf',
 'pop_ad_id_conf_multipl',
 'pop_document_id',
 'pop_document_id_conf',
 'pop_document_id_conf_multipl',
 'pop_publisher_id',
 'pop_publisher_id_conf',
 'pop_publisher_id_conf_multipl',
 'pop_advertiser_id',
 'pop_advertiser_id_conf',
 'pop_advertiser_id_conf_multipl',
 'pop_campaign_id',
 'pop_campaign_id_conf',
 'pop_campaign_id_conf_multipl',
 'pop_doc_event_doc_ad',
 'pop_doc_event_doc_ad_conf',
 'pop_doc_event_doc_ad_conf_multipl',
 'pop_source_id',
 'pop_source_id_conf',
 'pop_source_id_conf_multipl',
 'pop_source_id_country',
 'pop_source_id_country_conf',
 'pop_source_id_country_conf_multipl',
 'pop_topic_id',
 'pop_topic_id_conf',
 'pop_topic_id_conf_multipl',
 'pop_category_id',
 'pop_category_id_conf',
 'pop_category_id_conf_multipl',
 'pop_category_id_country',
 'pop_category_id_country_conf',
 'pop_categor

최종적으로 사용할 Data Table을 파케이, csv 형식으로 생성하였다.

In [74]:
# Export the table in Parquet format. Always double-check the path before
# running: mode='overwrite' replaces whatever already exists there.
_parquet_target = OUTPUT_BUCKET_FOLDER + 'train_subset_final'
train_validation_checking.write.parquet(_parquet_target, mode='overwrite')

In [75]:
# Export the table in CSV format, with a header row.
_csv_target = OUTPUT_BUCKET_FOLDER + 'train_validation_final.csv'
train_validation_checking.write.csv(_csv_target, header=True)