# User-Views 계산

In [1]:
OUTPUT_BUCKET_FOLDER = "gs://line_2018/output/"
DATA_BUCKET_FOLDER = "gs://upload-bigquery180927/data/"

In [2]:
from IPython.display import display

In [3]:
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.ml.linalg import Vectors, SparseVector, VectorUDT

In [4]:
from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all" # 한 셀(cell)에서의 코드 실행 결과가 다 보이도록 설정

In [5]:
import numpy as np
import scipy.sparse

In [6]:
import warnings 
warnings.filterwarnings('ignore') # warning 메시지 반환되지 않도록 설정

In [7]:
import math
import datetime
import time
import itertools

In [8]:
import pickle

In [9]:
import pandas as pd
%matplotlib inline

In [10]:
# clicks_train과 events 매칭하여 uuid_event 정보 빼내기

events_schema = StructType(
                    [StructField("display_id", IntegerType(), True),
                    StructField("uuid_event", StringType(), True),                    
                    StructField("document_id_event", IntegerType(), True),
                    StructField("timestamp_event", IntegerType(), True),
                    StructField("platform_event", IntegerType(), True),
                    StructField("geo_location_event", StringType(), True)]
                    )

events_df = spark.read.schema(events_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER + "events.csv") \
                .alias('events')  

In [11]:
clicks_train_schema = StructType(
                    [StructField("display_id", IntegerType(), True),
                    StructField("ad_id", IntegerType(), True),                    
                    StructField("clicked", IntegerType(), True)]
                    )

clicks_train_df = spark.read.schema(clicks_train_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER+"clicks_train.csv").cache()

In [12]:
clicks_train_merged_df = clicks_train_df \
                         .join(events_df, on='display_id', how='left')

In [13]:
clicks_train_merged_df.columns

['display_id',
 'ad_id',
 'clicked',
 'uuid_event',
 'document_id_event',
 'timestamp_event',
 'platform_event',
 'geo_location_event']

In [16]:
page_views_schema = StructType(
                    [StructField("uuid_pv", StringType(), True),
                    StructField("document_id_pv", IntegerType(), True),
                    StructField("timestamp_pv", IntegerType(), True),
                    StructField("platform_pv", IntegerType(), True),
                    StructField("geo_location_pv", StringType(), True),
                    StructField("traffic_source_pv", IntegerType(), True)]
                    )

page_views_df = spark.read.schema(page_views_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER+"page_views.csv") \
                .alias('page_views').cache()

In [None]:
page_views_subset = page_views_df.filter(F.col('uuid_pv').isin(unique_user_list)).cache() # unique_user_list에 있는 uuid의 page_view만 필터링

Exception KeyboardInterrupt in <function <lambda> at 0x7fc59499f488> ignored
Exception KeyboardInterrupt in <function <lambda> at 0x7fc59499f488> ignored
Exception KeyboardInterrupt in <function <lambda> at 0x7fc59499f488> ignored
Exception KeyboardInterrupt in <function <lambda> at 0x7fc59499f488> ignored


In [23]:
page_views_subset = page_views_df \
                          .join(clicks_train_merged_df, on=[F.col('uuid_pv') == F.col('uuid_event')], how='inner')

In [24]:
page_views_subset.select('uuid_pv').distinct().count()

14814344

In [25]:
clicks_train_merged_df.select('uuid_event').distinct().count()

14814344

In [26]:
page_views_subset.columns

['uuid_pv',
 'document_id_pv',
 'timestamp_pv',
 'platform_pv',
 'geo_location_pv',
 'traffic_source_pv',
 'display_id',
 'ad_id',
 'clicked',
 'uuid_event',
 'document_id_event',
 'timestamp_event',
 'platform_event',
 'geo_location_event']

In [27]:
page_views_subset.show(1)

+--------------+--------------+------------+-----------+---------------+-----------------+----------+-----+-------+--------------+-----------------+---------------+--------------+------------------+
|       uuid_pv|document_id_pv|timestamp_pv|platform_pv|geo_location_pv|traffic_source_pv|display_id|ad_id|clicked|    uuid_event|document_id_event|timestamp_event|platform_event|geo_location_event|
+--------------+--------------+------------+-----------+---------------+-----------------+----------+-----+-------+--------------+-----------------+---------------+--------------+------------------+
|100013af048bbf|       2441155|   896484104|          3|      US>CA>807|                1|  16757900| 7033|      0|100013af048bbf|            38915|     1116182031|             3|         US>CA>807|
+--------------+--------------+------------+-----------+---------------+-----------------+----------+-----+-------+--------------+-----------------+---------------+--------------+------------------+
only 

In [13]:
# unique_user_list = clicks_train_merged_df.select('uuid_event').distinct().toPandas()
# unique_user_list = unique_user_list['uuid_event'].tolist()

In [None]:
# unique_view_list = clicks_train_merged_df.select('document_id_event').distinct().toPandas()
# unique_view_list = unique_view_list['document_id_event'].tolist()

In [None]:
# clicks_train_joined_df = clicks_train_df \
#                          .join(promoted_content_df, on='ad_id', how='left') 

In [None]:
# unique_ad_list = clicks_train_joined_df.select('document_id_promo').distinct().toPandas()
# unique_ad_list = unique_ad_list['document_id_promo'].tolist()

### Window를 사용하여 유저의 누적 뷰 수 계산

In [28]:
from pyspark.sql import Window

In [29]:
windowval = (Window.partitionBy('uuid_pv').orderBy('timestamp_pv')
             .rangeBetween(Window.unboundedPreceding, 0))

In [30]:
df_cum_count = page_views_subset.withColumn('user_views', F.count('uuid_pv').over(windowval))

In [31]:
df_cum_count.columns

['uuid_pv',
 'document_id_pv',
 'timestamp_pv',
 'platform_pv',
 'geo_location_pv',
 'traffic_source_pv',
 'display_id',
 'ad_id',
 'clicked',
 'uuid_event',
 'document_id_event',
 'timestamp_event',
 'platform_event',
 'geo_location_event',
 'user_views']

In [33]:
df_cum_count.limit(50).toPandas()

Unnamed: 0,uuid_pv,document_id_pv,timestamp_pv,platform_pv,geo_location_pv,traffic_source_pv,display_id,ad_id,clicked,uuid_event,document_id_event,timestamp_event,platform_event,geo_location_event,user_views
0,100013af048bbf,2621739,864380892,3,US>CA>807,1,16757900,7033,0,100013af048bbf,38915,1116182031,3,US>CA>807,3
1,100013af048bbf,2621739,864380892,3,US>CA>807,1,16757900,147242,0,100013af048bbf,38915,1116182031,3,US>CA>807,3
2,100013af048bbf,2621739,864380892,3,US>CA>807,1,16757900,156270,1,100013af048bbf,38915,1116182031,3,US>CA>807,3
3,100013af048bbf,2436405,896305195,3,US>CA>807,1,16757900,7033,0,100013af048bbf,38915,1116182031,3,US>CA>807,6
4,100013af048bbf,2436405,896305195,3,US>CA>807,1,16757900,147242,0,100013af048bbf,38915,1116182031,3,US>CA>807,6
5,100013af048bbf,2436405,896305195,3,US>CA>807,1,16757900,156270,1,100013af048bbf,38915,1116182031,3,US>CA>807,6
6,100013af048bbf,2438447,896311212,3,US>CA>807,1,16757900,7033,0,100013af048bbf,38915,1116182031,3,US>CA>807,9
7,100013af048bbf,2438447,896311212,3,US>CA>807,1,16757900,147242,0,100013af048bbf,38915,1116182031,3,US>CA>807,9
8,100013af048bbf,2438447,896311212,3,US>CA>807,1,16757900,156270,1,100013af048bbf,38915,1116182031,3,US>CA>807,9
9,100013af048bbf,2434209,896334608,3,US>CA>807,1,16757900,7033,0,100013af048bbf,38915,1116182031,3,US>CA>807,12


In [41]:
page_views_df.filter(page_views_df['uuid_pv'] == '100013af048bbf').orderBy('timestamp_pv').show(100)

+--------------+--------------+------------+-----------+---------------+-----------------+
|       uuid_pv|document_id_pv|timestamp_pv|platform_pv|geo_location_pv|traffic_source_pv|
+--------------+--------------+------------+-----------+---------------+-----------------+
|100013af048bbf|       2621739|   864380892|          3|      US>CA>807|                1|
|100013af048bbf|       2436405|   896305195|          3|      US>CA>807|                1|
|100013af048bbf|       2438447|   896311212|          3|      US>CA>807|                1|
|100013af048bbf|       2434209|   896334608|          3|      US>CA>807|                1|
|100013af048bbf|       2434044|   896346355|          3|      US>CA>807|                1|
|100013af048bbf|       2438962|   896360102|          3|      US>CA>807|                1|
|100013af048bbf|       2434065|   896378254|          3|      US>CA>807|                1|
|100013af048bbf|       2442888|   896390294|          3|      US>CA>807|                1|

In [38]:
events_joined_df = events_df.join(df_cum_count,
                                           on=[F.col('events_df.uuid_event').alias('ue') == F.col('df_cum_count.uuid_pv'),
                                               F.col('events_df.document_id_event').alias('ie') == F.col('df_cum_count.document_id_pv'),
                                               F.col('events_df.platform_event').alias('pe') == F.col('df_cum_count.platform_pv'),
                                               F.col('events_df.geo_location_event').alias('le') == F.col('df_cum_count.geo_location_pv')],
                                               how='left') \
                                    .alias('events').cache()

AnalysisException: u"cannot resolve '`events_df.uuid_event`' given input columns: [page_views.traffic_source_pv, page_views.document_id_pv, events.uuid_event, page_views.platform_pv, events.geo_location_event, events.geo_location_event, clicked, events.uuid_event, display_id, user_views, page_views.timestamp_pv, events.document_id_event, ad_id, page_views.geo_location_pv, page_views.uuid_pv, events.display_id, events.timestamp_event, events.document_id_event, events.timestamp_event, events.platform_event, events.platform_event];;\n'Join LeftOuter, (((('events_df.uuid_event = 'df_cum_count.uuid_pv) && ('events_df.document_id_event = 'df_cum_count.document_id_pv)) && ('events_df.platform_event = 'df_cum_count.platform_pv)) && ('events_df.geo_location_event = 'df_cum_count.geo_location_pv))\n:- SubqueryAlias events\n:  +- Relation[display_id#0,uuid_event#1,document_id_event#2,timestamp_event#3,platform_event#4,geo_location_event#5] csv\n+- Project [uuid_pv#45, document_id_pv#46, timestamp_pv#47, platform_pv#48, geo_location_pv#49, traffic_source_pv#50, display_id#13, ad_id#14, clicked#15, uuid_event#578, document_id_event#579, timestamp_event#580, platform_event#581, geo_location_event#582, user_views#435L]\n   +- Project [uuid_pv#45, document_id_pv#46, timestamp_pv#47, platform_pv#48, geo_location_pv#49, traffic_source_pv#50, display_id#13, ad_id#14, clicked#15, uuid_event#578, document_id_event#579, timestamp_event#580, platform_event#581, geo_location_event#582, user_views#435L, user_views#435L]\n      +- Window [count(uuid_pv#45) windowspecdefinition(uuid_pv#45, timestamp_pv#47 ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS user_views#435L], [uuid_pv#45], [timestamp_pv#47 ASC NULLS FIRST]\n         +- Project [uuid_pv#45, document_id_pv#46, timestamp_pv#47, platform_pv#48, geo_location_pv#49, traffic_source_pv#50, display_id#13, ad_id#14, clicked#15, uuid_event#578, document_id_event#579, timestamp_event#580, platform_event#581, geo_location_event#582]\n            +- Join Inner, (uuid_pv#45 = uuid_event#578)\n               :- SubqueryAlias page_views\n               :  +- Relation[uuid_pv#45,document_id_pv#46,timestamp_pv#47,platform_pv#48,geo_location_pv#49,traffic_source_pv#50] csv\n               +- Project [display_id#13, ad_id#14, clicked#15, uuid_event#578, document_id_event#579, timestamp_event#580, platform_event#581, geo_location_event#582]\n                  +- Join LeftOuter, (display_id#13 = display_id#577)\n                     :- Relation[display_id#13,ad_id#14,clicked#15] csv\n                     +- SubqueryAlias events\n                        +- Relation[display_id#577,uuid_event#578,document_id_event#579,timestamp_event#580,platform_event#581,geo_location_event#582] csv\n"

In [None]:
train_pv_merged_df = clicks_train_df \
                         .join(events_joined_df, on='display_id', how='left')

In [None]:
windowval2 = (Window.partitionBy('ad_id').orderBy('timestamp')
             .rangeBetween(Window.unboundedPreceding, 0))

In [None]:
train_pv_merged_df = train_pv_merged_df.withColumn('ad_views', F.count('ad_id').over(windowval2))

In [None]:
windowval3 = (Window.partitionBy('document_id_event').orderBy('timestamp')
             .rangeBetween(Window.unboundedPreceding, 0))

In [None]:
train_pv_merged_df = train_pv_merged_df.withColumn('doc_views', F.count('document_id_event').over(windowval3))

In [None]:
train_pv_select_df = train_pv_merged_df.select('display_id', 'ad_id', 'user_views', 'ad_views', 'doc_views')

In [None]:
train_pv_select_df.repartition(1).write.csv(OUTPUT_BUCKET_FOLDER + 'views_out.csv', header = True)

In [None]:
# user_views, ad_views, doc_views를 기준으로 4등분: if there are NAs, 5등분(w/ U)