## 설정

In [53]:
# Run-mode switches.
# NOTE(review): neither flag is read in the cells visible here — presumably
# later cells branch on them; confirm before removing.
evaluation = True
evaluation_verbose = False

# Bucket folders for outputs and input data.
# NOTE(review): the parquet reads in this notebook hard-code
# "gs://capstone-01/..." paths instead of using these constants.
OUTPUT_BUCKET_FOLDER = "gs://capstone-02/output/"
DATA_BUCKET_FOLDER = "gs://capstone-02/data/"

In [54]:
from IPython.display import display

In [55]:
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.ml.linalg import Vectors, SparseVector, VectorUDT

In [56]:
import numpy as np
import scipy.sparse

In [57]:
import math
import datetime
import time
import itertools

In [58]:
import pickle

In [59]:
import random
random.seed(42)

In [60]:
import pandas as pd
%matplotlib inline

In [61]:
start_time = time.time()

In [62]:
# hashlib provides the hash functions (MD5 is used by hashstr below)
import hashlib
def hashstr(s, nr_bins):
    """Map the string `s` to a stable bucket number.

    The MD5 digest of the UTF-8 encoded string is read as a hexadecimal
    integer, reduced modulo ``nr_bins - 1`` and shifted up by one, so for
    ``nr_bins >= 2`` the result lies in ``1 .. nr_bins - 1`` (0 is never
    returned).
    """
    digest_hex = hashlib.md5(s.encode('utf8')).hexdigest()
    bucket = int(digest_hex, 16) % (nr_bins - 1)
    return bucket + 1

## train_feature_vectors_exported_df

In [63]:
# Load the exported training feature vectors (parquet) into a Spark DataFrame.
# NOTE(review): the path is hard-coded to the "capstone-01" bucket, while
# OUTPUT_BUCKET_FOLDER in the settings cell points at "capstone-02" — confirm
# which bucket is intended.
train_feature_vectors_exported_df = spark.read.parquet("gs://capstone-01/output/train_feature_vectors_integral_eval")

In [81]:
train_feature_vectors_exported_df.show(1)

+--------------+----------+-----+-----------------+-----------+-----+--------------------+
|          uuid|display_id|ad_id|document_id_event|document_id|label|      feature_vector|
+--------------+----------+-----+-----------------+-----------+-----+--------------------+
|10005a0add15f6|   5686397|61941|          2135921|    1017869|    1|(103,[0,3,4,5,6,7...|
+--------------+----------+-----+-----------------+-----------+-----+--------------------+
only showing top 1 row



In [82]:
type(train_feature_vectors_exported_df)

pyspark.sql.dataframe.DataFrame

## Feature Vector

In [110]:
# Boolean-flag feature columns (first group in the feature-vector label order).
bool_feature_names = ['event_weekend', 'user_has_already_viewed_doc']

In [111]:
# Integer feature columns (second group in the feature-vector label order).
int_feature_names = [
    'user_views', 'ad_views', 'doc_views',
    'doc_event_days_since_published', 'doc_event_hour', 'doc_ad_days_since_published',
]

In [112]:
# Float feature columns (third group in the feature-vector label order).
# Every stem contributes three consecutive columns: the stem itself,
# "<stem>_conf" and "<stem>_conf_multipl". A generator expression is used so
# that no loop variable is left behind in the notebook namespace.
float_feature_names = list(
    stem + suffix
    for stem in (
        'pop_ad_id',
        'pop_document_id',
        'pop_publisher_id',
        'pop_advertiser_id',
        'pop_campain_id',
        'pop_doc_event_doc_ad',
        'pop_source_id',
        'pop_source_id_country',
        'pop_entity_id',
        'pop_entity_id_country',
        'pop_topic_id',
        'pop_topic_id_country',
        'pop_category_id',
        'pop_category_id_country',
        'user_doc_ad_sim_categories',
        'user_doc_ad_sim_topics',
        'user_doc_ad_sim_entities',
        'doc_event_doc_ad_sim_categories',
        'doc_event_doc_ad_sim_topics',
        'doc_event_doc_ad_sim_entities',
    )
    for suffix in ('', '_conf', '_conf_multipl')
)

In [113]:
# String keys for feature groups. Each value either equals an entry of
# category_feature_names_integral or is the prefix of its numbered
# "<key>_1", "<key>_2", ... entries.
# NOTE(review): none of these *_FV constants is referenced in the cells
# visible here — presumably used elsewhere; confirm before removing.
TRAFFIC_SOURCE_FV='traffic_source'
EVENT_HOUR_FV='event_hour'
EVENT_COUNTRY_FV = 'event_country'
EVENT_COUNTRY_STATE_FV = 'event_country_state'
EVENT_GEO_LOCATION_FV = 'event_geo_location'
EVENT_PLATFORM_FV = 'event_platform'
AD_ADVERTISER_FV = 'ad_advertiser'
DOC_AD_SOURCE_ID_FV='doc_ad_source_id'
DOC_AD_PUBLISHER_ID_FV='doc_ad_publisher_id'
DOC_EVENT_SOURCE_ID_FV='doc_event_source_id'
DOC_EVENT_PUBLISHER_ID_FV='doc_event_publisher_id'
DOC_AD_CATEGORY_ID_FV='doc_ad_category_id'
DOC_AD_TOPIC_ID_FV='doc_ad_topic_id'
DOC_AD_ENTITY_ID_FV='doc_ad_entity_id'
DOC_EVENT_CATEGORY_ID_FV='doc_event_category_id'
DOC_EVENT_TOPIC_ID_FV='doc_event_topic_id'
DOC_EVENT_ENTITY_ID_FV='doc_event_entity_id'

In [114]:
# Categorical id columns (last group in the feature-vector label order),
# written as one literal list per related run of columns.
category_feature_names_integral = (
    ['ad_advertiser']
    # doc_ad_*: category (3), topic (3), entity (6), then publisher and source.
    + ['doc_ad_category_id_1', 'doc_ad_category_id_2', 'doc_ad_category_id_3']
    + ['doc_ad_topic_id_1', 'doc_ad_topic_id_2', 'doc_ad_topic_id_3']
    + ['doc_ad_entity_id_1', 'doc_ad_entity_id_2', 'doc_ad_entity_id_3',
       'doc_ad_entity_id_4', 'doc_ad_entity_id_5', 'doc_ad_entity_id_6']
    + ['doc_ad_publisher_id', 'doc_ad_source_id']
    # doc_event_*: same layout as the doc_ad_* columns.
    + ['doc_event_category_id_1', 'doc_event_category_id_2', 'doc_event_category_id_3']
    + ['doc_event_topic_id_1', 'doc_event_topic_id_2', 'doc_event_topic_id_3']
    + ['doc_event_entity_id_1', 'doc_event_entity_id_2', 'doc_event_entity_id_3',
       'doc_event_entity_id_4', 'doc_event_entity_id_5', 'doc_event_entity_id_6']
    + ['doc_event_publisher_id', 'doc_event_source_id']
    # Event-level columns.
    + ['event_country', 'event_country_state', 'event_geo_location',
       'event_hour', 'event_platform', 'traffic_source']
)

In [115]:
# All feature-vector column labels, concatenated in group order:
# boolean, integer, float, then categorical names.
feature_vector_labels_integral = (
    bool_feature_names
    + int_feature_names
    + float_feature_names
    + category_feature_names_integral
)

In [116]:
# CSV column order: six leading label/id columns followed by one column per
# feature-vector label.
integral_headers = ['label', 'display_id', 'ad_id', 'doc_id', 'doc_event_id', 'is_leak'] + feature_vector_labels_integral

# Write the header file with one column name per line (no trailing newline).
# ``write`` is used because the payload is a single string; ``writelines``
# would iterate over it character by character.
with open("train_feature_vectors_integral_eval.csv"+".header", 'w') as output:
    output.write('\n'.join(integral_headers))

In [117]:
def _format_feature_value(value):
    """Render one stored feature value as a CSV cell.

    Whole numbers are written as exact integers (``11370.0`` -> ``'11370'``,
    ``18595452.0`` -> ``'18595452'``). Formatting them with ``'{:.5}'``
    would switch to scientific notation at 1e5 and drop digits, which
    corrupts large integer ids. Every other value keeps the
    five-significant-digit ``'{:.5}'`` rendering.
    """
    number = float(value)
    if number.is_integer():  # False for inf/nan, which fall through below
        return '{:d}'.format(int(number))
    return '{:.5}'.format(value)


def sparse_vector_to_csv_with_nulls_row(additional_column_values, vec, num_columns):
    """Serialize leading values plus a sparse vector as one CSV line.

    Args:
        additional_column_values: values emitted first, each via ``str()``;
            a trailing ``'.0'`` is stripped from each of them.
        vec: sparse vector exposing ``indices``, ``size`` and ``vec[i]``.
            Positions listed in ``indices`` are written as values (explicitly
            stored zeros included); all other positions become empty cells.
        num_columns: upper bound on the number of vector-derived cells kept.
            It caps only the vector part, not the leading values.

    Returns:
        The comma-joined row as a string.
    """
    # Set membership is O(1); testing `x in vec.indices` scanned the index
    # array once per position.
    stored_positions = set(int(index) for index in vec.indices)
    feature_cells = [
        _format_feature_value(vec[position]) if position in stored_positions else ''
        for position in range(vec.size)
    ][:num_columns]

    # Strip '.0' per cell rather than replacing '.0,' on the joined string,
    # which could never match the final cell of the row.
    leading_cells = []
    for value in additional_column_values:
        text = str(value)
        leading_cells.append(text[:-2] if text.endswith('.0') else text)

    return ','.join(leading_cells + feature_cells)

In [118]:
def _feature_row_to_csv_line(row):
    """Turn one selected DataFrame row into its CSV line.

    The six leading values are emitted in header order (label, display_id,
    ad_id, document_id, document_id_event, is_leak), followed by the cells
    of the row's ``feature_vector``.
    """
    leading_values = [
        row['label'],
        row['display_id'],
        row['ad_id'],
        row['document_id'],
        row['document_id_event'],
        row['is_leak'],
    ]
    return sparse_vector_to_csv_with_nulls_row(
        leading_values, row['feature_vector'], len(integral_headers))


# Keep only the columns needed for the export and add a constant is_leak = -1.
_export_columns_df = train_feature_vectors_exported_df.select(
    'label', 'display_id', 'ad_id', 'document_id', 'document_id_event', 'feature_vector')
_export_with_leak_df = _export_columns_df.withColumn('is_leak', F.lit(-1))

# One CSV-formatted string per input row.
train_feature_vectors_integral_csv_rdd = _export_with_leak_df.rdd.map(_feature_row_to_csv_line)

In [119]:
type(train_feature_vectors_integral_csv_rdd)

pyspark.rdd.PipelinedRDD

In [98]:
# Imported here because the only other SparkSession import sits in a later
# cell, so this cell raised NameError on a fresh kernel.
from pyspark.sql import SparkSession
from pyspark.sql.types import (ByteType, ShortType, IntegerType, LongType,
                               FloatType, DoubleType, StructType)


def _cast_csv_cell(text, data_type):
    """Convert one CSV cell (a string) to the Python value `data_type` expects.

    Integer-typed fields become ``int`` and float-typed fields become
    ``float``; an empty cell in a numeric field becomes ``None`` (null).
    Cells of any other field type are returned unchanged.
    """
    if isinstance(data_type, (ByteType, ShortType, IntegerType, LongType)):
        # Go through float() so cells such as '1.0' or '1.8e+07' still parse.
        return None if text == '' else int(float(text))
    if isinstance(data_type, (FloatType, DoubleType)):
        return None if text == '' else float(text)
    return text


def _cast_csv_row(cells, data_types):
    """Apply `_cast_csv_cell` to each cell, paired with its field type."""
    return [_cast_csv_cell(text, data_type) for text, data_type in zip(cells, data_types)]


# Split each CSV line into its cells. Rows that are already lists (this cell
# was run before) pass through unchanged, so re-running the cell is safe.
train_feature_vectors_integral_csv_rdd = train_feature_vectors_integral_csv_rdd.map(
    lambda line: line.split(",") if hasattr(line, "split") else line)
SparkS = SparkSession.builder.getOrCreate()

# NOTE(review): `train_schema` is not defined in the visible cells — it must
# exist before this cell runs.
# The split cells are all strings, while the printed schema declares integer
# and float fields; Spark's schema verification rejects a str for those
# types, so the cells are cast per field type first.
if isinstance(train_schema, StructType):
    _train_column_types = [field.dataType for field in train_schema.fields]
    _train_rows_rdd = train_feature_vectors_integral_csv_rdd.map(
        lambda cells: _cast_csv_row(cells, _train_column_types))
else:
    # Schema given in another form (e.g. column names only): pass rows as-is.
    _train_rows_rdd = train_feature_vectors_integral_csv_rdd
train_df = SparkS.createDataFrame(_train_rows_rdd, train_schema)
type(train_df)

pyspark.sql.dataframe.DataFrame

In [99]:
train_df.printSchema()

root
 |-- label: string (nullable = true)
 |-- display_id: integer (nullable = true)
 |-- ad_id: integer (nullable = true)
 |-- doc_id: integer (nullable = true)
 |-- doc_event_id: integer (nullable = true)
 |-- is_leak: integer (nullable = true)
 |-- event_weekend: integer (nullable = true)
 |-- user_has_already_viewed_doc: integer (nullable = true)
 |-- user_views: integer (nullable = true)
 |-- ad_views: integer (nullable = true)
 |-- doc_views: integer (nullable = true)
 |-- doc_event_days_since_published: integer (nullable = true)
 |-- doc_event_hour: integer (nullable = true)
 |-- doc_ad_days_since_published: integer (nullable = true)
 |-- pop_ad_id: float (nullable = true)
 |-- pop_ad_id_conf: float (nullable = true)
 |-- pop_ad_id_conf_multipl: float (nullable = true)
 |-- pop_document_id: float (nullable = true)
 |-- pop_document_id_conf: float (nullable = true)
 |-- pop_document_id_conf_multipl: float (nullable = true)
 |-- pop_publisher_id: float (nullable = true)
 |-- pop_p

In [88]:
# Imports for the Spark ML feature pipeline (StringIndexer, VectorAssembler, OneHotEncoder, Pipeline)

from pyspark.ml.feature import StringIndexer,VectorAssembler,OneHotEncoder
from pyspark.ml import Pipeline

In [100]:
from pyspark.ml.feature import StringIndexer,VectorAssembler,OneHotEncoder
from pyspark.ml import Pipeline

# Index the string "label" column into a numeric "label2" column.
# NOTE(review): the name `salaryIndexer` looks like a leftover from another
# example; it is kept because later cells may reference it.
salaryIndexer = StringIndexer().setInputCol("label").setOutputCol("label2")

# Assemble the boolean, integer and float feature columns into a single
# "features" vector column. The input columns come from the feature-name
# lists defined earlier in the notebook rather than a copy-pasted literal,
# so the two cannot drift apart; the resulting order is the same as before
# (boolean, integer, then float names).
vectorAssembler = VectorAssembler().setInputCols(
    bool_feature_names + int_feature_names + float_feature_names
).setOutputCol("features")

In [106]:
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np

# Get (or reuse) a SparkSession, requesting a local master and the
# application name "Spark ML".
sparkSession = (
    SparkSession.builder
    .master("local")
    .appName("Spark ML")
    .getOrCreate()
)

# Read the exported training feature vectors (parquet) into a Spark DataFrame.
sparkDf = sparkSession.read.parquet("gs://capstone-01/output/train_feature_vectors_integral_eval")

In [107]:
type(sparkDf)

pyspark.sql.dataframe.DataFrame

In [108]:
# Peek at the first row of the loaded Spark DataFrame
sparkDf.head(1)

[Row(uuid=u'10005a0add15f6', display_id=5686397, ad_id=61941, document_id_event=2135921, document_id=1017869, label=1, feature_vector=SparseVector(103, {0: 1.0, 3: 11370.0, 4: 15083.0, 5: 0.0, 6: 3.0, 7: 173.0, 8: 0.4069, 9: 0.5623, 10: 0.2288, 11: 0.3902, 12: 0.5131, 13: 0.2002, 14: 0.2691, 15: 0.3531, 16: 0.095, 17: 0.3902, 18: 0.5131, 19: 0.2002, 20: 0.3902, 21: 0.5131, 22: 0.2002, 26: 0.3902, 27: 0.5131, 28: 0.2002, 29: 0.3902, 30: 0.5131, 31: 0.2002, 32: 0.3826, 33: 0.2833, 34: 0.1084, 35: 0.3962, 36: 0.2833, 37: 0.1122, 38: 0.2738, 39: 0.0012, 40: 0.0003, 41: 0.2801, 42: 0.0012, 43: 0.0003, 44: 0.2298, 45: 0.0622, 46: 0.0143, 47: 0.2336, 48: 0.0595, 49: 0.0139, 59: 0.0, 60: 0.0004, 61: 0.0, 62: 0.0, 63: 0.0, 64: 0.0, 68: 2413.0, 69: 1403.0, 70: 1610.0, 72: 108.0, 75: 194.0, 76: 15.0, 81: 440.0, 82: 4016.0, 83: 1702.0, 84: 1707.0, 86: 137.0, 95: 723.0, 96: 4194.0, 97: 18595452.0, 98: 745661.0, 99: 33260.0, 100: 3.0, 101: 2.0, 102: 1.0}))]

In [109]:
sparkDf.show(1)

+--------------+----------+-----+-----------------+-----------+-----+--------------------+
|          uuid|display_id|ad_id|document_id_event|document_id|label|      feature_vector|
+--------------+----------+-----+-----------------+-----------+-----+--------------------+
|10005a0add15f6|   5686397|61941|          2135921|    1017869|    1|(103,[0,3,4,5,6,7...|
+--------------+----------+-----+-----------------+-----------+-----+--------------------+
only showing top 1 row

