# Creating user_views, page_views, ad_views & Train-Test Set Split

As of Dec 13, 2018

## Part A: 데이터 로드 및 환경 설정

In [2]:
from pyspark.sql import Window

In [3]:
from IPython.display import display
from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all" # 한 셀(cell)에서의 코드 실행 결과가 다 보이도록 설정
import numpy as np
import scipy.sparse
import math
import datetime
import time
import itertools
import random
random.seed(42)
import pandas as pd
%matplotlib inline

In [4]:
from pyspark.sql.types import *
import pyspark.sql.functions as F

In [5]:
OUTPUT_BUCKET_FOLDER = "gs://line-2018-2/output/"
DATA_BUCKET_FOLDER = "gs://upload-bigquery180927/data/"

In [26]:
page_views_schema = StructType(
                    [StructField("uuid_pv", StringType(), True),
                    StructField("document_id", IntegerType(), True),
                    StructField("timestamp_pv", IntegerType(), True),
                    StructField("platform", IntegerType(), True),
                    StructField("geo_location", StringType(), True),
                    StructField("traffic_source", IntegerType(), True)]
                    )
#Data is read from Google Cloud Storage, a distributed filesystem like HDFS stored externally to the cluster
#If you wanna use your own HDFS, just prefix the file path with the absolute path 'hdfs:' or 
#with the relative path if HDFS is your cluster default filesystem
page_views_df = spark.read.schema(page_views_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER + "page_views.csv")

In [18]:
events_schema = StructType(
                    [StructField("display_id", IntegerType(), True),
                    StructField("uuid_event", StringType(), True),
                    StructField("document_id", IntegerType(), True),
                    StructField("timestamp_event", IntegerType(), True),
                    StructField("platform", IntegerType(), True),
                    StructField("geo_location", StringType(), True)]
                    )

events_df = spark.read \
            .schema(events_schema) \
            .options(header='true', inferschema='false', nullValue='\\N') \
            .csv(DATA_BUCKET_FOLDER + "events.csv").cache()

In [27]:
page_views_df = page_views_df.withColumn('dummyPageView', F.lit(1))
events_df = events_df.withColumn('dummyEvent', F.lit(1))

In [28]:
clicks_train_schema = StructType(
                    [StructField("display_id", IntegerType(), True),
                    StructField("ad_id", IntegerType(), True),                    
                    StructField("clicked", IntegerType(), True)]
                    )

clicks_train_df = spark.read.schema(clicks_train_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER+"clicks_train.csv") \
                .withColumn('dummyClicksTrain', F.lit(1)).alias('clicks_train').cache()

In [29]:
promoted_content_schema = StructType(
                    [StructField("ad_id", IntegerType(), True),
                    StructField("document_id_promo", IntegerType(), True),                    
                    StructField("campaign_id", IntegerType(), True),
                    StructField("advertiser_id", IntegerType(), True)]
                    )

promoted_content_df = spark.read.schema(promoted_content_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER+"promoted_content.csv") \
                .withColumn('dummyPromotedContent', F.lit(1)).alias('promoted_content').cache()

In [30]:
clicks_train_joined_df = clicks_train_df \
                         .join(promoted_content_df, on='ad_id', how='left') \
                         .join(events_df, on='display_id', how='left').cache()                      

In [31]:
clicks_train_joined_df.columns

['display_id',
 'ad_id',
 'clicked',
 'dummyClicksTrain',
 'document_id_promo',
 'campaign_id',
 'advertiser_id',
 'dummyPromotedContent',
 'uuid_event',
 'document_id',
 'timestamp_event',
 'platform',
 'geo_location',
 'dummyEvent']

## Part B: user_views(유저의 누적 페이지 뷰 수 카운팅) 생성하기

In [13]:
truncate_day_from_timestamp_udf = F.udf(lambda ts: int(ts / 1000 / 60 / 60 / 24), IntegerType())

In [32]:
page_views_with_day_df = page_views_df.withColumn('day', truncate_day_from_timestamp_udf('timestamp_pv')).withColumn('dummyPageView', F.lit(1))

In [33]:
page_views_with_day_df.columns

['uuid_pv',
 'document_id',
 'timestamp_pv',
 'platform',
 'geo_location',
 'traffic_source',
 'dummyPageView',
 'day']

In [34]:
events_with_day_df = events_df.withColumn('day', truncate_day_from_timestamp_udf('timestamp_event')).withColumn('dummyEvent', F.lit(1))

In [35]:
unique_users_df = clicks_train_joined_df.select('uuid_event').distinct()

In [None]:
unique_users_df.count() # clicks_train에 포함된 고유한 uuid 수: 14814344

In [36]:
page_views_unique_df = page_views_with_day_df \
                    .join(unique_users_df, on=[F.col('uuid_pv') == F.col('uuid_event')], how = 'inner').cache()

In [None]:
page_views_unique_df.count() # 154011175

In [37]:
page_views_unique_df.columns

['uuid_pv',
 'document_id',
 'timestamp_pv',
 'platform',
 'geo_location',
 'traffic_source',
 'dummyPageView',
 'day',
 'uuid_event']

In [38]:
events_with_day_df.columns

['display_id',
 'uuid_event',
 'document_id',
 'timestamp_event',
 'platform',
 'geo_location',
 'dummyEvent',
 'day']

In [39]:
page_views_left_join_events_df = page_views_unique_df \
                                        .join(events_with_day_df, 
                                              on=['uuid_event','document_id','platform','geo_location','day'], 
                                              how='left')

In [41]:
# page_views_left_join_events_df.count() # 154,382,412

In [42]:
page_views_left_join_events_df.columns

['uuid_event',
 'document_id',
 'platform',
 'geo_location',
 'day',
 'uuid_pv',
 'timestamp_pv',
 'traffic_source',
 'dummyPageView',
 'display_id',
 'timestamp_event',
 'dummyEvent']

In [None]:
# page_views_left_join_events_df.select('uuid_event').distinct().count() # 14,814,344명

In [43]:
page_views_events_df = page_views_left_join_events_df.sort('timestamp_pv', ascending = True)

In [None]:
# page_views_events_df.show(3)

In [44]:
windowval = (Window.partitionBy('uuid_event').orderBy('timestamp_pv')
             .rangeBetween(Window.unboundedPreceding, 0))

In [45]:
user_views_df = page_views_events_df.withColumn('user_views', F.count('uuid_event').over(windowval))

In [None]:
# user_views_df.select('user_views').describe('user_views').show()

In [47]:
# user_views_df.filter((user_views_df["user_views"] == "") | user_views_df["user_views"].isNull() | F.isnan(user_views_df["user_views"])).count()

In [48]:
train_merged_df = spark.read.parquet("gs://line_2018/outputtrain_final_3")

In [49]:
train_merged_df.columns

['display_id',
 'ad_id',
 'label',
 'is_leak',
 'weekend',
 'pop_ad_id',
 'pop_ad_doc_id',
 'pop_ad_publisher_id',
 'pop_advertiser_id',
 'pop_campaign_id',
 'pop_view_doc_ad_doc',
 'pop_ad_source_id',
 'pop_ad_topic_id',
 'pop_ad_category_id',
 'view_publisher_id',
 'view_source_id',
 'hour',
 'platform',
 'traffic_source',
 'is_train',
 'country',
 'day_event',
 'campaign_id',
 'advertiser_id',
 'ad_category_id',
 'ad_topic_id',
 'view_category_id',
 'view_topic_id',
 'timestamp_event',
 'ad_source_id',
 'ad_publisher_id']

In [50]:
user_views_df = user_views_df.select('uuid_event', 'user_views', 'timestamp_event', 'display_id')

In [57]:
train_user_added_df = train_merged_df \
                    .join(user_views_df, on = ["display_id", "timestamp_event"], how = 'left')

In [53]:
train_user_added_df.filter((train_user_added_df["user_views"] == "") | train_user_added_df["user_views"].isNull() | F.isnan(train_user_added_df["user_views"])).count()

195463

cf) 결측치가 생기는 원인 파악 필요

In [58]:
train_user_added_df = train_user_added_df.fillna({"user_views":1})

In [59]:
train_user_added_df.filter((train_user_added_df["user_views"] == "") | train_user_added_df["user_views"].isNull() | F.isnan(train_user_added_df["user_views"])).count()

0

In [60]:
train_user_added_df.columns

['display_id',
 'timestamp_event',
 'ad_id',
 'label',
 'is_leak',
 'weekend',
 'pop_ad_id',
 'pop_ad_doc_id',
 'pop_ad_publisher_id',
 'pop_advertiser_id',
 'pop_campaign_id',
 'pop_view_doc_ad_doc',
 'pop_ad_source_id',
 'pop_ad_topic_id',
 'pop_ad_category_id',
 'view_publisher_id',
 'view_source_id',
 'hour',
 'platform',
 'traffic_source',
 'is_train',
 'country',
 'day_event',
 'campaign_id',
 'advertiser_id',
 'ad_category_id',
 'ad_topic_id',
 'view_category_id',
 'view_topic_id',
 'ad_source_id',
 'ad_publisher_id',
 'uuid_event',
 'user_views']

## Part C: ad_views(광고 누적 뷰 수 카운팅) 생성하기

In [61]:
windowval2 = (Window.partitionBy("ad_id").orderBy("timestamp_event")
             .rangeBetween(Window.unboundedPreceding, 0))

In [62]:
train_user_ad_added_df = train_user_added_df.withColumn("ad_views", F.count("ad_id").over(windowval2))

In [63]:
train_user_ad_added_df.filter((train_user_ad_added_df["ad_views"] == "") | train_user_ad_added_df["ad_views"].isNull() | F.isnan(train_user_ad_added_df["ad_views"])).count()

0

In [None]:
# train_user_added_df.count() # 87141731

## Part D: ad_doc_view(랜딩 페이지의 누적 페이지 뷰 수 카운팅) 생성하기

In [142]:
page_views_ad_doc_unique_df = page_views_with_day_df \
                    .join(promoted_content_df, on=[F.col("document_id") == F.col("document_id_promo")], how = 'inner').cache()

In [146]:
events_with_day_df.columns

['display_id',
 'uuid',
 'document_id',
 'timestamp_event',
 'platform',
 'geo_location',
 'dummyEvent',
 'day']

In [145]:
page_views_ad_doc_unique_df.columns

['uuid_pv',
 'document_id',
 'timestamp_pv',
 'platform',
 'geo_location',
 'traffic_source',
 'dummyPageView',
 'day',
 'ad_id',
 'document_id_promo',
 'campaign_id',
 'advertiser_id',
 'dummyPromotedContent']

In [149]:
page_views_ad_doc_unique_df = page_views_ad_doc_unique_df.withColumnRenamed("uuid_pv", "uuid")

In [150]:
page_views_event_joined_df = page_views_ad_doc_unique_df \
                                        .join(events_with_day_df, 
                                              on=["uuid", "document_id", "day", "platform", "geo_location"],
                                              how='left')

In [151]:
windowval3 = (Window.partitionBy('document_id').orderBy('timestamp_pv')
             .rangeBetween(Window.unboundedPreceding, 0))

In [152]:
page_views_event_joined_df = page_views_event_joined_df.withColumn('ad_doc_views', F.count('document_id').over(windowval3))

In [153]:
page_views_event_joined_df.columns

['uuid',
 'document_id',
 'day',
 'platform',
 'geo_location',
 'timestamp_pv',
 'traffic_source',
 'dummyPageView',
 'ad_id',
 'document_id_promo',
 'campaign_id',
 'advertiser_id',
 'dummyPromotedContent',
 'display_id',
 'timestamp_event',
 'dummyEvent',
 'ad_doc_views']

In [116]:
train_user_ad_added_df.columns

['display_id',
 'timestamp_event',
 'ad_id',
 'label',
 'is_leak',
 'weekend',
 'pop_ad_id',
 'pop_ad_doc_id',
 'pop_ad_publisher_id',
 'pop_advertiser_id',
 'pop_campaign_id',
 'pop_view_doc_ad_doc',
 'pop_ad_source_id',
 'pop_ad_topic_id',
 'pop_ad_category_id',
 'view_publisher_id',
 'view_source_id',
 'hour',
 'platform',
 'traffic_source',
 'is_train',
 'country',
 'day_event',
 'campaign_id',
 'advertiser_id',
 'ad_category_id',
 'ad_topic_id',
 'view_category_id',
 'view_topic_id',
 'ad_source_id',
 'ad_publisher_id',
 'uuid_event',
 'user_views',
 'ad_views']

In [117]:
promoted_content_df.columns

['ad_id',
 'document_id_promo',
 'campaign_id',
 'advertiser_id',
 'dummyPromotedContent']

In [155]:
train_user_ad_doc_added_df = train_user_ad_added_df \
                    .join(promoted_content_df, on= "ad_id", how = 'left').cache()

In [156]:
train_user_ad_doc_added_df.columns

['ad_id',
 'display_id',
 'timestamp_event',
 'label',
 'is_leak',
 'weekend',
 'pop_ad_id',
 'pop_ad_doc_id',
 'pop_ad_publisher_id',
 'pop_advertiser_id',
 'pop_campaign_id',
 'pop_view_doc_ad_doc',
 'pop_ad_source_id',
 'pop_ad_topic_id',
 'pop_ad_category_id',
 'view_publisher_id',
 'view_source_id',
 'hour',
 'platform',
 'traffic_source',
 'is_train',
 'country',
 'day_event',
 'campaign_id',
 'advertiser_id',
 'ad_category_id',
 'ad_topic_id',
 'view_category_id',
 'view_topic_id',
 'ad_source_id',
 'ad_publisher_id',
 'uuid_event',
 'user_views',
 'ad_views',
 'document_id_promo',
 'campaign_id',
 'advertiser_id',
 'dummyPromotedContent']

In [157]:
train_user_ad_doc_added_df = train_user_ad_doc_added_df \
                        .join(page_views_event_joined_df, on= ["document_id_promo", "timestamp_event", "ad_id", "campaign_id", "advertiser_id", "display_id"], how = "left").cache()

In [158]:
train_user_ad_doc_added_df.columns

['document_id_promo',
 'timestamp_event',
 'ad_id',
 'campaign_id',
 'advertiser_id',
 'display_id',
 'label',
 'is_leak',
 'weekend',
 'pop_ad_id',
 'pop_ad_doc_id',
 'pop_ad_publisher_id',
 'pop_advertiser_id',
 'pop_campaign_id',
 'pop_view_doc_ad_doc',
 'pop_ad_source_id',
 'pop_ad_topic_id',
 'pop_ad_category_id',
 'view_publisher_id',
 'view_source_id',
 'hour',
 'platform',
 'traffic_source',
 'is_train',
 'country',
 'day_event',
 'ad_category_id',
 'ad_topic_id',
 'view_category_id',
 'view_topic_id',
 'ad_source_id',
 'ad_publisher_id',
 'uuid_event',
 'user_views',
 'ad_views',
 'campaign_id',
 'advertiser_id',
 'dummyPromotedContent',
 'uuid',
 'document_id',
 'day',
 'platform',
 'geo_location',
 'timestamp_pv',
 'traffic_source',
 'dummyPageView',
 'dummyPromotedContent',
 'dummyEvent',
 'ad_doc_views']

In [159]:
train_user_ad_doc_added_df.filter((train_user_ad_doc_added_df["ad_doc_views"] == "") | train_user_ad_doc_added_df["ad_doc_views"].isNull() | F.isnan(train_user_ad_doc_added_df["ad_doc_views"])).count()

KeyboardInterrupt: 

## Part E: 테이블 합치기

In [160]:
train_user_ad_doc_added_df = train_user_ad_doc_added_df.drop('dummyEvent', 'dummyPromotedContent', 'dummyPageView', 'timestamp_pv', 'geo_location', 'document_id', 'day_event', 'document_id_promo')

In [132]:
# # binning:
# table = train_user_ad_doc_added_df.select('display_id', 'ad_id', 'ad_views', 'ad_doc_views', 'user_views')

In [161]:
table2 = train_user_ad_doc_added_df.select('ad_id', 'ad_doc_views')

In [133]:
# # csv 파일로 쓰기: binning.csv
# table.repartition(1).write.csv(OUTPUT_BUCKET_FOLDER + 'binning.csv', header = True)

In [162]:
# # csv 파일로 쓰기: binning.csv
# table2.repartition(1).write.csv(OUTPUT_BUCKET_FOLDER + 'binning2.csv', header = True)

In [163]:
train_user_ad_doc_added_df2 = train_user_ad_doc_added_df.drop('pop_ad_id',
 'pop_ad_doc_id',
 'pop_ad_publisher_id',
 'pop_advertiser_id',
 'pop_campaign_id',
 'pop_view_doc_ad_doc',
 'pop_ad_source_id',
 'pop_ad_topic_id',
 'pop_ad_category_id', 'user_views', 'ad_doc_views', 'ad_views')

In [137]:
# # column별 수준 수 세기
# for col in train_valid_merged_df.columns:
#   print(col, train_valid_merged_df.select(col).distinct().count())

In [None]:
# # column별 NA 세기
# train_valid_merged_df.select([F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c) for c in train_valid_merged_df.columns]).show()

In [164]:
train_valid_merged_df2 = train_user_ad_doc_added_df2

In [165]:
# 미국, 캐나다, 영국이 아닌 경우 U로 할당
train_valid_merged_df2 = train_valid_merged_df2.withColumn("country", \
              F.when((train_valid_merged_df2["country"] != 'US') &
                     (train_valid_merged_df2["country"] != 'CA') &
                     (train_valid_merged_df2["country"] != 'GB'), 'U').otherwise(train_valid_merged_df2["country"]))

In [166]:
cont_binned_schema = StructType(
                    [StructField("pop_ad_id", StringType(), True),
                    StructField("pop_ad_doc_id", StringType(), True),                    
                    StructField("pop_ad_publisher_id", StringType(), True),
                    StructField("pop_advertiser_id", StringType(), True),
                    StructField("pop_campaign_id", StringType(), True),
                    StructField("pop_view_doc_ad_doc", StringType(), True),
                     StructField("pop_ad_source_id", StringType(), True),
                     StructField("pop_ad_topic_id", StringType(), True),
                     StructField("pop_ad_category_id", StringType(), True)
                     
                    ]
                    )

cont_binned_df = spark.read.schema(cont_binned_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER + "train_bin.csv")

In [None]:
# # column별 수준 수 세기
# for col in cont_binned_df.columns:
#   print(col, cont_binned_df.select(col).distinct().count())

In [None]:
# # column별 NA 세기
# cont_binned_df.select([F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c) for c in cont_binned_df.columns]).show()

In [167]:
# NA인 값을 U로 바꾸기
for col in cont_binned_df.columns:
  cont_binned_df = cont_binned_df.withColumn(col, F.regexp_replace(col, 'NA', 'U'))

In [169]:
cont_binned_df = cont_binned_df.withColumn('index', F.row_number().over(Window.orderBy(F.lit(1))))

In [170]:
train_valid_merged_df2 = train_valid_merged_df2.withColumn('index', F.row_number().over(Window.orderBy(F.lit(1))))

In [171]:
train_final = train_valid_merged_df2.join(cont_binned_df, 'index', how = 'left')

In [175]:
train_sample.repartition(1).write.csv(OUTPUT_BUCKET_FOLDER + 'train_final.csv', header = True)

AnalysisException: u'Found duplicate column(s) when inserting into gs://line-2018-2/output/train_final(ing)_parquet: `traffic_source`, `advertiser_id`, `platform`, `campaign_id`;'

## Part F: Train - Test Set으로 구분