# Creating user_views, ad_views & Train-Test Set Split

As of Dec 13, 2018

## Part A: 데이터 로드 및 환경 설정

In [1]:
from pyspark.sql import Window

In [2]:
from IPython.display import display
from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all" # 한 셀(cell)에서의 코드 실행 결과가 다 보이도록 설정
import numpy as np
import scipy.sparse
import math
import datetime
import time
import itertools
import random
random.seed(42)
import pandas as pd
%matplotlib inline

In [3]:
from pyspark.sql.types import *
import pyspark.sql.functions as F

In [4]:
# GCS folder prefix used below when writing derived tables (parquet / csv).
OUTPUT_BUCKET_FOLDER = "gs://line-2018-2/output/"
# GCS folder prefix used below when reading the raw input CSV files.
DATA_BUCKET_FOLDER = "gs://upload-bigquery180927/data/"

In [22]:
# Column layout of page_views.csv; every field is declared nullable.
page_views_schema = (
    StructType()
    .add("uuid", StringType(), True)
    .add("document_id", IntegerType(), True)
    .add("timestamp_pv", IntegerType(), True)
    .add("platform", IntegerType(), True)
    .add("geo_location", StringType(), True)
    .add("traffic_source", IntegerType(), True)
)
# The file is read from Google Cloud Storage. To read from HDFS instead, use an
# absolute 'hdfs:' path, or a relative path when HDFS is the cluster default filesystem.
# The literal string \N in the file is read as null; the schema is given, not inferred.
page_views_df = (
    spark.read
    .schema(page_views_schema)
    .options(header='true', inferschema='false', nullValue='\\N')
    .csv(DATA_BUCKET_FOLDER + "page_views.csv")
)

In [23]:
# Column layout of events.csv; every field is declared nullable.
events_schema = (
    StructType()
    .add("display_id", IntegerType(), True)
    .add("uuid", StringType(), True)
    .add("document_id", IntegerType(), True)
    .add("timestamp_event", IntegerType(), True)
    .add("platform", IntegerType(), True)
    .add("geo_location", StringType(), True)
)

# Read with the explicit schema (\N is treated as null) and cache for reuse in later joins.
events_df = (
    spark.read
    .schema(events_schema)
    .options(header='true', inferschema='false', nullValue='\\N')
    .csv(DATA_BUCKET_FOLDER + "events.csv")
    .cache()
)

In [24]:
# Add a constant marker column (value 1) to each source table.
# Presumably used to tell matched from unmatched rows after left joins -- TODO confirm.
page_views_df, events_df = (
    page_views_df.withColumn('dummyPageView', F.lit(1)),
    events_df.withColumn('dummyEvent', F.lit(1)),
)

In [25]:
# Column layout of clicks_train.csv; every field is declared nullable.
clicks_train_schema = (
    StructType()
    .add("display_id", IntegerType(), True)
    .add("ad_id", IntegerType(), True)
    .add("clicked", IntegerType(), True)
)

# Read the click log, tag it with a constant marker column, alias it and cache it.
clicks_train_df = (
    spark.read
    .schema(clicks_train_schema)
    .options(header='true', inferschema='false', nullValue='\\N')
    .csv(DATA_BUCKET_FOLDER+"clicks_train.csv")
    .withColumn('dummyClicksTrain', F.lit(1))
    .alias('clicks_train')
    .cache()
)

In [26]:
# Column layout of promoted_content.csv; every field is declared nullable.
promoted_content_schema = (
    StructType()
    .add("ad_id", IntegerType(), True)
    .add("document_id_promo", IntegerType(), True)
    .add("campaign_id", IntegerType(), True)
    .add("advertiser_id", IntegerType(), True)
)

# Read the ad metadata, tag it with a constant marker column, alias it and cache it.
promoted_content_df = (
    spark.read
    .schema(promoted_content_schema)
    .options(header='true', inferschema='false', nullValue='\\N')
    .csv(DATA_BUCKET_FOLDER+"promoted_content.csv")
    .withColumn('dummyPromotedContent', F.lit(1))
    .alias('promoted_content')
    .cache()
)

In [27]:
# Enrich each click row with ad metadata (by ad_id) and then with its display
# event (by display_id). Left joins keep every click row.
clicks_train_joined_df = (
    clicks_train_df
    .join(promoted_content_df, on='ad_id', how='left')
    .join(events_df, on='display_id', how='left')
    .cache()
)

In [28]:
clicks_train_joined_df.columns

['display_id',
 'ad_id',
 'clicked',
 'dummyClicksTrain',
 'document_id_promo',
 'campaign_id',
 'advertiser_id',
 'dummyPromotedContent',
 'uuid',
 'document_id',
 'timestamp_event',
 'platform',
 'geo_location',
 'dummyEvent']

## Part B: user_views(유저의 누적 페이지 뷰 수 카운팅) 생성하기

In [29]:
def _truncate_day_from_timestamp(ts):
    """Convert a millisecond offset into a whole-day index.

    Returns None for a null input so that rows with a missing timestamp
    (the timestamp columns are declared nullable) yield a null day instead
    of failing the whole job with a TypeError inside the UDF.
    """
    if ts is None:
        return None
    return int(ts / 1000 / 60 / 60 / 24)


# Spark UDF wrapper; call it with a column name, e.g. truncate_day_from_timestamp_udf('timestamp_pv').
truncate_day_from_timestamp_udf = F.udf(_truncate_day_from_timestamp, IntegerType())

In [30]:
# Derive the day index of each page view and (re)set the constant marker column.
page_views_with_day_df = (
    page_views_df
    .withColumn('day', truncate_day_from_timestamp_udf('timestamp_pv'))
    .withColumn('dummyPageView', F.lit(1))
)

In [33]:
page_views_with_day_df.columns

['uuid_pv',
 'document_id',
 'timestamp_pv',
 'platform',
 'geo_location',
 'traffic_source',
 'dummyPageView',
 'day']

In [31]:
# Derive the day index of each display event and (re)set the constant marker column.
events_with_day_df = (
    events_df
    .withColumn('day', truncate_day_from_timestamp_udf('timestamp_event'))
    .withColumn('dummyEvent', F.lit(1))
)

In [35]:
# Distinct user ids that appear in the joined click data.
# NOTE(review): this selects 'uuid_event', but the events schema defined earlier in
# this file names the column 'uuid'. A rename step that is not shown here
# (uuid -> uuid_event) appears to have been run beforehand -- confirm before re-running.
unique_users_df = clicks_train_joined_df.select('uuid_event').distinct()

In [None]:
unique_users_df.count() # clicks_train에 포함된 고유한 uuid 수: 14814344

In [36]:
# Keep only the page views of users present in unique_users_df (inner join), then cache.
# NOTE(review): the join condition uses 'uuid_pv', while the page_views schema defined
# earlier in this file names the column 'uuid'; a rename step that is not shown here
# appears to be required -- confirm before re-running.
page_views_unique_df = page_views_with_day_df \
                    .join(unique_users_df, on=[F.col('uuid_pv') == F.col('uuid_event')], how = 'inner').cache()

In [None]:
page_views_unique_df.count() # 154011175

In [37]:
page_views_unique_df.columns

['uuid_pv',
 'document_id',
 'timestamp_pv',
 'platform',
 'geo_location',
 'traffic_source',
 'dummyPageView',
 'day',
 'uuid_event']

In [38]:
events_with_day_df.columns

['display_id',
 'uuid_event',
 'document_id',
 'timestamp_event',
 'platform',
 'geo_location',
 'dummyEvent',
 'day']

In [39]:
# Attach display events to page views that share the same user, document, platform,
# geo location and day. The left join keeps every page view; event columns
# (display_id, timestamp_event, dummyEvent) are null where no event matches.
# A page view that matches several events is repeated once per matching event.
page_views_left_join_events_df = page_views_unique_df \
                                        .join(events_with_day_df, 
                                              on=['uuid_event','document_id','platform','geo_location','day'], 
                                              how='left')

In [41]:
# page_views_left_join_events_df.count() # 154,382,412

In [42]:
page_views_left_join_events_df.columns

['uuid_event',
 'document_id',
 'platform',
 'geo_location',
 'day',
 'uuid_pv',
 'timestamp_pv',
 'traffic_source',
 'dummyPageView',
 'display_id',
 'timestamp_event',
 'dummyEvent']

In [None]:
# page_views_left_join_events_df.select('uuid_event').distinct().count() # 14,814,344명

In [43]:
# No global sort here. The window used for the cumulative count
# (partitionBy('uuid_event').orderBy('timestamp_pv')) repartitions and orders the
# rows itself, so sorting the whole table by timestamp_pv first only adds a
# full shuffle without affecting the windowed result.
page_views_events_df = page_views_left_join_events_df

In [None]:
# page_views_events_df.show(3)

In [44]:
# Per-user running frame: all rows of the same user whose timestamp_pv is
# less than or equal to the current row's timestamp_pv.
windowval = (
    Window
    .partitionBy('uuid_event')
    .orderBy('timestamp_pv')
    .rangeBetween(Window.unboundedPreceding, Window.currentRow)
)

In [45]:
# Cumulative number of page-view rows per user up to (and including) the current
# page-view timestamp, stored as 'user_views'.
# NOTE(review): the input comes from a left join with events, so a page view that
# matched several events is counted once per match -- verify this is intended.
user_views_df = page_views_events_df.withColumn('user_views', F.count('uuid_event').over(windowval))

In [None]:
# user_views_df.select('user_views').describe('user_views').show()

In [47]:
# user_views_df.filter((user_views_df["user_views"] == "") | user_views_df["user_views"].isNull() | F.isnan(user_views_df["user_views"])).count()

In [48]:
train_merged_df = spark.read.parquet("gs://line_2018/outputtrain_final_3")

In [60]:
train_merged_df.columns

['display_id',
 'ad_id',
 'label',
 'is_leak',
 'weekend',
 'pop_ad_id',
 'pop_ad_doc_id',
 'pop_ad_publisher_id',
 'pop_advertiser_id',
 'pop_campaign_id',
 'pop_view_doc_ad_doc',
 'pop_ad_source_id',
 'pop_ad_topic_id',
 'pop_ad_category_id',
 'view_publisher_id',
 'view_source_id',
 'hour',
 'platform',
 'traffic_source',
 'is_train',
 'country',
 'day_event',
 'campaign_id',
 'advertiser_id',
 'ad_category_id',
 'ad_topic_id',
 'view_category_id',
 'view_topic_id',
 'timestamp_event',
 'ad_source_id',
 'ad_publisher_id',
 'uuid',
 'document_id',
 'timestamp_event',
 'platform',
 'geo_location',
 'dummyEvent']

In [50]:
user_views_df = user_views_df.select('uuid_event', 'user_views', 'timestamp_event', 'display_id')

In [57]:
# Attach user_views to the training table by display and event timestamp.
# The left join keeps every training row; user_views is null when user_views_df has
# no row with the same (display_id, timestamp_event).
# NOTE(review): user_views_df can hold several rows per display_id (one per matching
# page view), which would duplicate training rows here -- verify the row count.
train_user_added_df = train_merged_df \
                    .join(user_views_df, on = ["display_id", "timestamp_event"], how = 'left')

In [53]:
train_user_added_df.filter((train_user_added_df["user_views"] == "") | train_user_added_df["user_views"].isNull() | F.isnan(train_user_added_df["user_views"])).count()

195463

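cf) 결측치가 생기는 원인 파악 필요. (추정) user_views_df의 display_id는 페이지 뷰가 이벤트와 (uuid, document_id, platform, geo_location, day) 기준으로 매칭된 경우에만 채워지므로, 매칭되는 페이지 뷰가 없는 display_id는 left join 후 user_views가 null이 되는 것으로 보임.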

In [58]:
# Rows without a matched view count get the value 1.
train_user_added_df = train_user_added_df.na.fill(1, subset=["user_views"])

In [59]:
train_user_added_df.filter((train_user_added_df["user_views"] == "") | train_user_added_df["user_views"].isNull() | F.isnan(train_user_added_df["user_views"])).count()

0

In [60]:
train_user_added_df.columns

['display_id',
 'timestamp_event',
 'ad_id',
 'label',
 'is_leak',
 'weekend',
 'pop_ad_id',
 'pop_ad_doc_id',
 'pop_ad_publisher_id',
 'pop_advertiser_id',
 'pop_campaign_id',
 'pop_view_doc_ad_doc',
 'pop_ad_source_id',
 'pop_ad_topic_id',
 'pop_ad_category_id',
 'view_publisher_id',
 'view_source_id',
 'hour',
 'platform',
 'traffic_source',
 'is_train',
 'country',
 'day_event',
 'campaign_id',
 'advertiser_id',
 'ad_category_id',
 'ad_topic_id',
 'view_category_id',
 'view_topic_id',
 'ad_source_id',
 'ad_publisher_id',
 'uuid_event',
 'user_views']

## Part C: ad_views(광고 누적 뷰 수 카운팅) 생성하기

In [162]:
# Per-ad running frame: all rows of the same ad whose timestamp_event is
# less than or equal to the current row's timestamp_event.
windowval2 = (
    Window
    .partitionBy("ad_id")
    .orderBy("timestamp_event")
    .rangeBetween(Window.unboundedPreceding, Window.currentRow)
)

In [164]:
# Cumulative number of rows per ad up to the current event timestamp ('ad_views').
# It is built on train_user_added_df so the result also carries user_views, and it is
# bound to train_user_ad_added_df, the name the null-check cell that follows reads.
train_user_ad_added_df = train_user_added_df.withColumn("ad_views", F.count("ad_id").over(windowval2))
# Keep the previous variable name available for any cell that still refers to it.
train_ad_added_df = train_user_ad_added_df

In [63]:
train_user_ad_added_df.filter((train_user_ad_added_df["ad_views"] == "") | train_user_ad_added_df["ad_views"].isNull() | F.isnan(train_user_ad_added_df["ad_views"])).count()

0

In [None]:
# train_user_added_df.count() # 87141731

## Part D: ad_doc_view(랜딩 페이지의 누적 페이지 뷰 수 카운팅) 생성하기

In [32]:
# Keep only page views of documents that are the promoted document of some ad
# (inner join on document_id == document_id_promo); one output row per matching ad.
page_views_ad_doc_unique_df = (
    page_views_with_day_df
    .join(promoted_content_df,
          on=[F.col("document_id") == F.col("document_id_promo")],
          how = 'inner')
    .cache()
)

In [33]:
page_views_ad_doc_unique_df.columns

['uuid',
 'document_id',
 'timestamp_pv',
 'platform',
 'geo_location',
 'traffic_source',
 'dummyPageView',
 'day',
 'ad_id',
 'document_id_promo',
 'campaign_id',
 'advertiser_id',
 'dummyPromotedContent']

In [94]:
# Attach display events to the promoted-document page views that share the same
# user, document, day, platform and geo location. The left join keeps every page view;
# display_id / timestamp_event / dummyEvent are null where no event matches.
# NOTE(review): document_id here is the promoted document that was viewed, so an event
# only matches when it was raised on that same document -- confirm this is the intent.
page_views_event_joined_df = page_views_ad_doc_unique_df \
                                        .join(events_with_day_df, 
                                              on=["uuid", "document_id", "day", "platform", "geo_location"],
                                              how='left')

In [95]:
# Per-document running frame: all rows of the same document whose timestamp_pv is
# less than or equal to the current row's timestamp_pv.
windowval3 = (
    Window
    .partitionBy('document_id')
    .orderBy('timestamp_pv')
    .rangeBetween(Window.unboundedPreceding, Window.currentRow)
)

In [96]:
page_views_event_joined_df = page_views_event_joined_df.withColumn('ad_doc_views', F.count('document_id').over(windowval3)).cache()

In [97]:
page_views_event_joined_df.columns

['uuid',
 'document_id',
 'day',
 'platform',
 'geo_location',
 'timestamp_pv',
 'traffic_source',
 'dummyPageView',
 'ad_id',
 'document_id_promo',
 'campaign_id',
 'advertiser_id',
 'dummyPromotedContent',
 'display_id',
 'timestamp_event',
 'dummyEvent',
 'ad_doc_views']

In [106]:
page_views_event_joined_df.show(2)

+--------------+-----------+---+--------+------------+------------+--------------+-------------+------+-----------------+-----------+-------------+--------------------+----------+---------------+----------+------------+
|          uuid|document_id|day|platform|geo_location|timestamp_pv|traffic_source|dummyPageView| ad_id|document_id_promo|campaign_id|advertiser_id|dummyPromotedContent|display_id|timestamp_event|dummyEvent|ad_doc_views|
+--------------+-----------+---+--------+------------+------------+--------------+-------------+------+-----------------+-----------+-------------+--------------------+----------+---------------+----------+------------+
|3c889cceac8ce6|      90461|  0|       1|          PK|     2801334|             2|            1|240962|            90461|      25864|          128|                   1|      null|           null|      null|           1|
|382a5a300415e3|      90461|  0|       1|   US>NV>839|     2887274|             2|            1|240962|            90461

In [117]:
promoted_content_df.columns

['ad_id',
 'document_id_promo',
 'campaign_id',
 'advertiser_id',
 'dummyPromotedContent']

In [101]:
train_merged_df = spark.read.parquet("gs://line_2018/outputtrain_final_3")

In [102]:
# Bring the remaining event columns onto the training table; rows are matched on
# display, event timestamp and platform, and every training row is kept.
train_merged_df = (
    train_merged_df
    .join(events_df, on=['display_id', 'timestamp_event', 'platform'], how='left')
    .cache()
)

In [103]:
train_merged_df.columns

['display_id',
 'timestamp_event',
 'platform',
 'ad_id',
 'label',
 'is_leak',
 'weekend',
 'pop_ad_id',
 'pop_ad_doc_id',
 'pop_ad_publisher_id',
 'pop_advertiser_id',
 'pop_campaign_id',
 'pop_view_doc_ad_doc',
 'pop_ad_source_id',
 'pop_ad_topic_id',
 'pop_ad_category_id',
 'view_publisher_id',
 'view_source_id',
 'hour',
 'traffic_source',
 'is_train',
 'country',
 'day_event',
 'campaign_id',
 'advertiser_id',
 'ad_category_id',
 'ad_topic_id',
 'view_category_id',
 'view_topic_id',
 'ad_source_id',
 'ad_publisher_id',
 'uuid',
 'document_id',
 'geo_location',
 'dummyEvent']

In [104]:
# Add the ad's promoted document (document_id_promo) to each training row,
# matching on ad, campaign and advertiser; every training row is kept.
train_merged_ad_doc_df = (
    train_merged_df
    .join(promoted_content_df, on= ["ad_id", "campaign_id", "advertiser_id"], how = 'left')
    .cache()
)

In [105]:
train_merged_ad_doc_df.columns

['ad_id',
 'campaign_id',
 'advertiser_id',
 'display_id',
 'timestamp_event',
 'platform',
 'label',
 'is_leak',
 'weekend',
 'pop_ad_id',
 'pop_ad_doc_id',
 'pop_ad_publisher_id',
 'pop_advertiser_id',
 'pop_campaign_id',
 'pop_view_doc_ad_doc',
 'pop_ad_source_id',
 'pop_ad_topic_id',
 'pop_ad_category_id',
 'view_publisher_id',
 'view_source_id',
 'hour',
 'traffic_source',
 'is_train',
 'country',
 'day_event',
 'ad_category_id',
 'ad_topic_id',
 'view_category_id',
 'view_topic_id',
 'ad_source_id',
 'ad_publisher_id',
 'uuid',
 'document_id',
 'geo_location',
 'dummyEvent',
 'document_id_promo',
 'dummyPromotedContent']

In [75]:
# train_merged_ad_doc_df3 = train_merged_ad_doc_df.drop('document_id')

In [107]:
pv_joined_df = page_views_event_joined_df.select('document_id', 'platform', 'ad_id', 'traffic_source', 'display_id', 'timestamp_event', 'ad_doc_views')

In [108]:
# Attach ad_doc_views from the page-view table; the left join keeps every training row.
# NOTE(review): the null check recorded further down reports 87141731 rows with a null
# ad_doc_views, which appears to be every row, i.e. this join matched nothing.
# Possible causes to verify: (1) 'document_id' on the training side is the document the
# event occurred on, while on the page-view side it is the ad's promoted document
# (document_id_promo); (2) 'platform' / 'traffic_source' may be stored with a different
# type or encoding in the parquet table than in the raw CSV.
train_merged_ad_doc_df = train_merged_ad_doc_df \
                        .join(pv_joined_df, on= ['document_id', 'platform', 'traffic_source', 'display_id', 'ad_id'], how = 'left')

In [109]:
train_merged_ad_doc_df.columns

['document_id',
 'platform',
 'traffic_source',
 'display_id',
 'ad_id',
 'campaign_id',
 'advertiser_id',
 'timestamp_event',
 'label',
 'is_leak',
 'weekend',
 'pop_ad_id',
 'pop_ad_doc_id',
 'pop_ad_publisher_id',
 'pop_advertiser_id',
 'pop_campaign_id',
 'pop_view_doc_ad_doc',
 'pop_ad_source_id',
 'pop_ad_topic_id',
 'pop_ad_category_id',
 'view_publisher_id',
 'view_source_id',
 'hour',
 'is_train',
 'country',
 'day_event',
 'ad_category_id',
 'ad_topic_id',
 'view_category_id',
 'view_topic_id',
 'ad_source_id',
 'ad_publisher_id',
 'uuid',
 'geo_location',
 'dummyEvent',
 'document_id_promo',
 'dummyPromotedContent',
 'timestamp_event',
 'ad_doc_views']

In [110]:
train_merged_ad_doc_df2 = train_merged_ad_doc_df.select('display_id', 'ad_id', 'ad_doc_views', 'document_id').cache()

In [113]:
train_merged_ad_doc_df2.filter((train_merged_ad_doc_df2["ad_doc_views"] == "") | train_merged_ad_doc_df2["ad_doc_views"].isNull() | F.isnan(train_merged_ad_doc_df2["ad_doc_views"])).count()

87141731

In [None]:
# csv 파일로 쓰기: binning3.csv
# train_merged_ad_doc_df2.repartition(1).write.csv(OUTPUT_BUCKET_FOLDER + 'binning3.csv', header = True)

## Part E: 테이블 합치기

In [160]:
# train_user_ad_doc_added_df = train_merged_ad_doc_df.drop('document_id_promo', 'dummyPromotedContent', 'timestamp_event_pv', 'platform_pv',
#                                                          , 'document_id_promo')

In [132]:
# # binning:
# table = train_user_ad_doc_added_df.select('display_id', 'ad_id', 'ad_views', 'ad_doc_views', 'user_views')

In [161]:
# table2 = train_user_ad_doc_added_df.select('ad_id', 'ad_doc_views')

In [133]:
# # csv 파일로 쓰기: binning.csv
# table.repartition(1).write.csv(OUTPUT_BUCKET_FOLDER + 'binning.csv', header = True)

In [162]:
# # csv 파일로 쓰기: binning2.csv
# table2.repartition(1).write.csv(OUTPUT_BUCKET_FOLDER + 'binning2.csv', header = True)

In [114]:
train_merged_ad_doc_df.columns

['document_id',
 'platform',
 'traffic_source',
 'display_id',
 'ad_id',
 'campaign_id',
 'advertiser_id',
 'timestamp_event',
 'label',
 'is_leak',
 'weekend',
 'pop_ad_id',
 'pop_ad_doc_id',
 'pop_ad_publisher_id',
 'pop_advertiser_id',
 'pop_campaign_id',
 'pop_view_doc_ad_doc',
 'pop_ad_source_id',
 'pop_ad_topic_id',
 'pop_ad_category_id',
 'view_publisher_id',
 'view_source_id',
 'hour',
 'is_train',
 'country',
 'day_event',
 'ad_category_id',
 'ad_topic_id',
 'view_category_id',
 'view_topic_id',
 'ad_source_id',
 'ad_publisher_id',
 'uuid',
 'geo_location',
 'dummyEvent',
 'document_id_promo',
 'dummyPromotedContent',
 'timestamp_event',
 'ad_doc_views']

In [115]:
# Remove the raw continuous features; their binned versions are joined back in later cells.
# drop() silently ignores names that are not present in the table.
train_table = train_merged_ad_doc_df.drop(*[
    'pop_ad_id',
    'pop_ad_doc_id',
    'pop_ad_publisher_id',
    'pop_advertiser_id',
    'pop_campaign_id',
    'pop_view_doc_ad_doc',
    'pop_ad_source_id',
    'pop_ad_topic_id',
    'pop_ad_category_id',
    'user_views',
    'ad_doc_views',
    'ad_views',
])

In [137]:
# # column별 수준 수 세기
# for col in train_valid_merged_df.columns:
#   print(col, train_valid_merged_df.select(col).distinct().count())

In [None]:
# # column별 NA 세기
# train_valid_merged_df.select([F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c) for c in train_valid_merged_df.columns]).show()

In [119]:
# Keep US, CA and GB as they are; every other (non-null) country code becomes 'U'.
# A null country stays null because the condition evaluates to null.
train_table = train_table.withColumn(
    "country",
    F.when(~F.col("country").isin('US', 'CA', 'GB'), 'U').otherwise(F.col("country")))

In [120]:
# All binned popularity features are read as nullable strings (bin labels).
cont_binned_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in [
        "pop_ad_id",
        "pop_ad_doc_id",
        "pop_ad_publisher_id",
        "pop_advertiser_id",
        "pop_campaign_id",
        "pop_view_doc_ad_doc",
        "pop_ad_source_id",
        "pop_ad_topic_id",
        "pop_ad_category_id",
    ]
])

# Read the pre-binned features with the explicit schema; \N is treated as null.
cont_binned_df = (
    spark.read
    .schema(cont_binned_schema)
    .options(header='true', inferschema='false', nullValue='\\N')
    .csv(DATA_BUCKET_FOLDER + "train_bin.csv")
)

In [None]:
# # column별 수준 수 세기
# for col in cont_binned_df.columns:
#   print(col, cont_binned_df.select(col).distinct().count())

In [None]:
# # column별 NA 세기
# cont_binned_df.select([F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c) for c in cont_binned_df.columns]).show()

In [121]:
# Replace values that are exactly 'NA' with 'U' in every column.
# The pattern is anchored (^NA$) so a bin label that merely contains the letters "NA"
# is left untouched. A single select is used instead of one withColumn per column,
# which also avoids a loop variable named `col`.
cont_binned_df = cont_binned_df.select(
    [F.regexp_replace(F.col(name), '^NA$', 'U').alias(name) for name in cont_binned_df.columns]
)

In [122]:
# Number the rows 1..N to use as a positional join key.
# NOTE(review): the window orders by a constant, so Spark is free to number the rows in
# any order; the numbering is not guaranteed to follow the CSV file order. A window
# without partitionBy also moves all rows to a single partition.
cont_binned_df = cont_binned_df.withColumn('index', F.row_number().over(Window.orderBy(F.lit(1))))

In [123]:
# Number the training rows 1..N to use as a positional join key.
# NOTE(review): the window orders by a constant, and train_table is the result of several
# joins, so the row order behind this numbering is not defined and may change between runs.
train_table2 = train_table.withColumn('index', F.row_number().over(Window.orderBy(F.lit(1))))

In [124]:
# Attach the binned popularity features by row number.
# NOTE(review): both 'index' columns come from row_number over a constant ordering, so row
# i of one table is not guaranteed to describe the same (display_id, ad_id) as row i of the
# other. The binned file should carry display_id/ad_id and be joined on those keys instead.
train_table2 = train_table2.join(cont_binned_df, 'index', how = 'left')

In [129]:
# Keys plus binned view counts, all read as nullable strings.
cont2_binned_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ["display_id", "ad_id", "ad_views", "user_views"]
])

# Read the binned ad_views / user_views table; \N is treated as null.
cont2_binned_df = (
    spark.read
    .schema(cont2_binned_schema)
    .options(header='true', inferschema='false', nullValue='\\N')
    .csv("gs://line-2018-2/data/f.csv")
)

In [130]:
cont2_binned_df.columns

['display_id', 'ad_id', 'ad_views', 'user_views']

In [131]:
all_data = train_table2.join(cont2_binned_df, on = ['display_id', 'ad_id'], how = 'left')

In [133]:
all_data = all_data.drop('index', 'document_id', 'day_event', 'uuid', 'geo_location', 'dummyEvent', 'document_id_promo',
                        'dummyPromotedContent', 'timestamp_event')

In [135]:
events_df.columns

['display_id',
 'uuid',
 'document_id',
 'timestamp_event',
 'platform',
 'geo_location',
 'dummyEvent']

In [137]:
all_data = all_data.join(events_df, on = ['display_id', 'platform'], how = 'left')

In [139]:
all_data.columns

all_data = all_data.drop('geo_location', 'uuid', 'document_id', 'dummyEvent')

['display_id',
 'platform',
 'ad_id',
 'traffic_source',
 'campaign_id',
 'advertiser_id',
 'label',
 'is_leak',
 'weekend',
 'view_publisher_id',
 'view_source_id',
 'hour',
 'is_train',
 'country',
 'ad_category_id',
 'ad_topic_id',
 'view_category_id',
 'view_topic_id',
 'ad_source_id',
 'ad_publisher_id',
 'pop_ad_id',
 'pop_ad_doc_id',
 'pop_ad_publisher_id',
 'pop_advertiser_id',
 'pop_campaign_id',
 'pop_view_doc_ad_doc',
 'pop_ad_source_id',
 'pop_ad_topic_id',
 'pop_ad_category_id',
 'ad_views',
 'user_views',
 'uuid',
 'document_id',
 'timestamp_event',
 'geo_location',
 'dummyEvent']

In [141]:
# # column별 NA 세기
# all_data.select([F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c) for c in all_data.columns]).show()

# # 0일 것으로 기대...

+----------+--------+-----+--------------+-----------+-------------+-----+-------+-------+-----------------+--------------+----+--------+-------+--------------+-----------+----------------+-------------+------------+---------------+---------+-------------+-------------------+-----------------+---------------+-------------------+----------------+---------------+------------------+--------+----------+---------------+
|display_id|platform|ad_id|traffic_source|campaign_id|advertiser_id|label|is_leak|weekend|view_publisher_id|view_source_id|hour|is_train|country|ad_category_id|ad_topic_id|view_category_id|view_topic_id|ad_source_id|ad_publisher_id|pop_ad_id|pop_ad_doc_id|pop_ad_publisher_id|pop_advertiser_id|pop_campaign_id|pop_view_doc_ad_doc|pop_ad_source_id|pop_ad_topic_id|pop_ad_category_id|ad_views|user_views|timestamp_event|
+----------+--------+-----+--------------+-----------+-------------+-----+-------+-------+-----------------+--------------+----+--------+-------+--------------+--

In [None]:
# # column별 수준 수 세기
# for col in all_data.columns:
#   print(col, all_data.select(col).distinct().count())
  
# # 결과는 아래 참고
# # ('display_id', 16874593)
# # ('platform', 3)
# # ('ad_id', 478950)
# # ('traffic_source', 3)
# # ('campaign_id', 32676)
# # ('advertiser_id', 4174)
# # ('label', 2)
# # ('is_leak', 3)
# # ('weekend', 2)
# # ('view_publisher_id', 485)
# # ('view_source_id', 4034)
# # ('hour', 6)
# # ('is_train', 2)
# # ('country', 4)
# # ('ad_category_id', 95)
# # ('ad_topic_id', 301)
# # # ('view_category_id', 91)
# # ('ad_source_id', 6991)
# # ('ad_publisher_id', 883)
# # ('pop_ad_id', 5)
# # ('pop_ad_doc_id', 5)
# # ('pop_ad_publisher_id', 5)

In [144]:
all_data = all_data.sort('timestamp_event', ascending = True)

In [145]:
# 파케이로 쓰기: all_data_parquet
all_data.write.parquet(OUTPUT_BUCKET_FOLDER + 'all_data_parquet', mode='overwrite')

In [None]:
# # csv로 쓰기: all_data.csv 
# all_data.repartition(1).write.csv(OUTPUT_BUCKET_FOLDER + 'all_data.csv', header = True)

In [148]:
all_data = all_data.drop('is_train')

In [156]:
# all_data.count() # 87141731 아니니...?

In [149]:
# Number the rows chronologically so the index-based train/test split below is a
# time-based split. Ordering the window by a constant (as before) lets Spark number
# rows in any order, regardless of an earlier sort(). display_id and ad_id break ties
# between rows with the same timestamp so the numbering is reproducible.
# A window without partitionBy still processes all rows in a single partition.
all_data = all_data.withColumn(
    'index',
    F.row_number().over(Window.orderBy('timestamp_event', 'display_id', 'ad_id')))

In [157]:
# First 60,000,000 rows by index form the training split.
train = all_data.where(F.col('index') <= 60000000)

In [158]:
# All rows after index 60,000,000 form the test split.
test = all_data.where(F.col('index') > 60000000)

In [159]:
train.count() # 60000000

60000000

In [160]:
test.count() # 27141731

27141731

In [168]:
from pyspark.sql.types import *
from pyspark.ml.linalg import Vectors, SparseVector, VectorUDT

In [170]:
# Cast every column to string, keeping names and column order.
train = train.select([F.col(name).cast('string').alias(name) for name in train.columns])

In [171]:
# Cast every column to string, using train's column list so both splits share the
# same column order (needed for the positional union below).
test = test.select([F.col(name).cast('string').alias(name) for name in train.columns])

In [172]:
train = train.withColumn('is_train', F.lit(1))
test = test.withColumn('is_train', F.lit(0))

In [173]:
union = train.union(test)

In [174]:
union.columns

['display_id',
 'platform',
 'ad_id',
 'traffic_source',
 'campaign_id',
 'advertiser_id',
 'label',
 'is_leak',
 'weekend',
 'view_publisher_id',
 'view_source_id',
 'hour',
 'country',
 'ad_category_id',
 'ad_topic_id',
 'view_category_id',
 'view_topic_id',
 'ad_source_id',
 'ad_publisher_id',
 'pop_ad_id',
 'pop_ad_doc_id',
 'pop_ad_publisher_id',
 'pop_advertiser_id',
 'pop_campaign_id',
 'pop_view_doc_ad_doc',
 'pop_ad_source_id',
 'pop_ad_topic_id',
 'pop_ad_category_id',
 'ad_views',
 'user_views',
 'timestamp_event',
 'index',
 'is_train']

In [175]:
union = union.drop('index', 'timestamp_event')

In [177]:
union.printSchema()
union = union.withColumnRenamed('label', 'ClickOrNot')

root
 |-- display_id: string (nullable = true)
 |-- platform: string (nullable = true)
 |-- ad_id: string (nullable = true)
 |-- traffic_source: string (nullable = true)
 |-- campaign_id: string (nullable = true)
 |-- advertiser_id: string (nullable = true)
 |-- label: string (nullable = true)
 |-- is_leak: string (nullable = true)
 |-- weekend: string (nullable = true)
 |-- view_publisher_id: string (nullable = true)
 |-- view_source_id: string (nullable = true)
 |-- hour: string (nullable = true)
 |-- country: string (nullable = true)
 |-- ad_category_id: string (nullable = true)
 |-- ad_topic_id: string (nullable = true)
 |-- view_category_id: string (nullable = true)
 |-- view_topic_id: string (nullable = true)
 |-- ad_source_id: string (nullable = true)
 |-- ad_publisher_id: string (nullable = true)
 |-- pop_ad_id: string (nullable = true)
 |-- pop_ad_doc_id: string (nullable = true)
 |-- pop_ad_publisher_id: string (nullable = true)
 |-- pop_advertiser_id: string (nullable = true

In [178]:
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler

In [179]:
# Columns that are string-indexed and one-hot encoded by the pipeline stages built below.
categoricalColumns = [
    'weekend',
    'view_publisher_id',
    'view_source_id',
    'hour',
    'platform',
    'traffic_source',
    'country',
    'campaign_id',
    'advertiser_id',
    'ad_category_id',
    'ad_topic_id',
    'view_category_id',
    'view_topic_id',
    'ad_source_id',
    'ad_publisher_id',
    'pop_ad_id',
    'pop_ad_doc_id',
    'pop_ad_publisher_id',
    'pop_advertiser_id',
    'pop_campaign_id',
    'pop_view_doc_ad_doc',
    'pop_ad_source_id',
    'pop_ad_topic_id',
    'pop_ad_category_id',
    'ad_views',
    'user_views',
]

In [180]:
stages = []

In [181]:
# For each categorical column append two stages: a StringIndexer writing '<col>Index'
# and a one-hot encoder turning that index into '<col>classVec'.
for column_name in categoricalColumns:
    indexed_name = column_name + 'Index'
    stages.append(StringIndexer(inputCol = column_name, outputCol = indexed_name))
    stages.append(OneHotEncoderEstimator(inputCols=[indexed_name],
                                         outputCols=[column_name + "classVec"]))

In [182]:
# Index the target column 'ClickOrNot' into a numeric 'label' column.
label_stringIdx = StringIndexer(outputCol = 'label', inputCol = 'ClickOrNot')
stages.append(label_stringIdx)

In [183]:
# Concatenate all one-hot vectors into a single 'features' vector column.
assemblerInputs = []
for column_name in categoricalColumns:
    assemblerInputs.append(column_name + "classVec")
assembler = VectorAssembler(outputCol="features", inputCols=assemblerInputs)
stages.append(assembler)

In [None]:
from pyspark.ml import Pipeline

# Output layout: label and features first, followed by every original column.
cols = union.columns
selectedCols = ['label', 'features']
selectedCols.extend(cols)

# Fit all stages on the combined train+test table, then transform that same table.
pipeline = Pipeline(stages = stages)
pipelineModel = pipeline.fit(union)
transform_df = pipelineModel.transform(union)
selected_df = transform_df.select(*selectedCols)

In [None]:
# Recover the two splits from the is_train flag set before the union.
train = selected_df.where(F.col('is_train') == 1)
test = selected_df.where(F.col('is_train') == 0)

In [None]:
# Import the module that is actually used here; this cell previously imported
# `datetime` and only worked because `time` had been imported in an earlier cell.
import time
# Wall-clock start, read by the training cell to report elapsed seconds.
start_time = time.time()

In [None]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression on the assembled features, capped at 10 iterations.
lr = LogisticRegression(maxIter = 10, labelCol = 'label', featuresCol = 'features')
lrModel = lr.fit(train)

# Report the seconds elapsed since start_time was taken.
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

In [None]:
lrPredictions = lrModel.transform(test)

In [None]:
logistic_final_df = lrPredictions.select('display_id', 'ad_id', 'probability')

In [None]:
logistic_final_pd = logistic_final_df.toPandas()

In [None]:
logistic_final_pd['clickprob'] = logistic_final_pd.probability

In [None]:
# Take element 1 of each probability vector (the probability of label 1) in one pass.
# The previous per-row loop wrote through chained indexing (df.clickprob[i] = ...),
# which is extremely slow on tens of millions of rows and is not guaranteed to update
# the frame at all when pandas returns a copy.
logistic_final_pd['clickprob'] = [float(p[1]) for p in logistic_final_pd['probability']]

In [None]:
logistic_final_pd = logistic_final_pd[['display_id', 'ad_id', 'clickprob']]

from io import BytesIO
logistic_final_pd.to_csv('logistic.csv', index = False)
!gsutil cp 'logistic.csv' 'gs://line-2018-2/output/logistic.csv'

In [None]:
test_label = test.select('display_id', 'ad_id', 'label').toPandas()

In [None]:
# Attach the predicted click probability to each labelled (display_id, ad_id) test row.
pd_merge = test_label.merge(logistic_final_pd, how = 'left', on = ['display_id', 'ad_id'])

In [None]:
# Within each display, order ads from most to least likely to be clicked.
pd_merge = pd_merge.sort_values(by = ['display_id', 'clickprob'], ascending = [True, False])

In [None]:
from ml_metrics import mapk

# Ranked ad ids per display, in order of first appearance of each display_id.
ranked_by_display = pd_merge.groupby(by = 'display_id', sort = False).ad_id.apply(lambda x: x.values)

# Clicked ad ids keyed by display_id.
clicked_rows = pd_merge[pd_merge.label == 1]
clicked_by_display = {}
for display, ad in zip(clicked_rows.display_id, clicked_rows.ad_id):
    clicked_by_display.setdefault(display, []).append(ad)

# Build both sequences from the same display order so the i-th "actual" always belongs
# to the i-th "predicted". Previously y_ads was taken from the clicked rows alone, so a
# display with no clicked row in the test split (e.g. one cut by the train/test boundary)
# shifted every later pair and corrupted the MAP score. Such displays get an empty list.
y_ads = [clicked_by_display.get(display, []) for display in ranked_by_display.index]
p_ads = ranked_by_display.values

In [None]:
score = mapk(y_ads, p_ads, 12)
print('MAP: %.12f' % score)

In [None]:
# # 파케이로 쓰기: train_parquet
# train.write.parquet(OUTPUT_BUCKET_FOLDER + 'train_parquet', mode='overwrite')

In [None]:
# # 파케이로 쓰기: test_parquet
# test.write.parquet(OUTPUT_BUCKET_FOLDER + 'test_parquet', mode='overwrite')

In [161]:
# Write the training split as a single CSV part file (repartition(1)) with a header row.
# NOTE(review): judging by the execution counter, this cell was run right after the
# index-based split, i.e. on the plain `train` table. If run after the modelling cells,
# `train` would contain the vector column 'features' -- confirm which `train` is
# intended before re-running.
train.repartition(1).write.csv(OUTPUT_BUCKET_FOLDER + 'train_final.csv', header = True)

In [166]:
# # csv로 쓰기: test_csv
# test.repartition(1).write.csv(OUTPUT_BUCKET_FOLDER + 'test_final.csv', header = True)

KeyboardInterrupt: 