# Clustering publishers with cosine similarity

In [1]:
from pyspark.sql.types import *
import pyspark.sql.functions as F
import pandas as pd
import seaborn as sns
%matplotlib inline

In [2]:
import google.datalab.storage as storage
import pandas as pd
from io import BytesIO
from google.datalab import Context

In [3]:
# hide warning message
import warnings
warnings.filterwarnings('ignore')

In [4]:
# gs:// URI string of the folder holding the input CSV files.
# NOTE(review): `my_bucket` is rebound to a storage.Bucket object in the next cell,
# so this string value is not read by any of the visible cells.
my_bucket = "gs://summer-heaven-223608-bucket/data/"

In [5]:
my_bucket = storage.Bucket('summer-heaven-223608-bucket/data')

docu_meta = my_bucket.object('documents_meta.csv')
uri5 = docu_meta.uri
%gcs read --object $uri5 --variable c_docu_meta
docu_meta = pd.read_csv(BytesIO(c_docu_meta))

events = my_bucket.object('events.csv')
uri7 = events.uri
%gcs read --object $uri7 --variable c_events
events = pd.read_csv(BytesIO(c_events))

In [6]:
# Document/category table: look up the CSV object in the bucket, fetch its
# content with the %gcs magic, then parse it into a DataFrame.
documents_categories = my_bucket.object('documents_categories.csv')
uri8 = documents_categories.uri
# The fetched content is stored in the variable named by --variable.
%gcs read --object $uri8 --variable c_documents_categories
docu_cate = pd.read_csv(BytesIO(c_documents_categories))

In [7]:
# Preview the first rows of the document metadata table.
docu_meta.head()

Unnamed: 0,document_id,source_id,publisher_id,publish_time
0,1595802,1.0,603.0,2016-06-05 00:00:00
1,1524246,1.0,603.0,2016-05-26 11:00:00
2,1617787,1.0,603.0,2016-05-27 00:00:00
3,1615583,1.0,603.0,2016-06-07 00:00:00
4,1615460,1.0,603.0,2016-06-20 00:00:00


In [8]:
# Preview the first rows of the events table.
events.head()

Unnamed: 0,display_id,uuid,document_id,timestamp,platform,geo_location
0,1,cb8c55702adb93,379743,61,3,US>SC>519
1,2,79a85fa78311b9,1794259,81,2,US>CA>807
2,3,822932ce3d8757,1179111,182,2,US>MI>505
3,4,85281d0a49f7ac,1777797,234,2,US>WV>564
4,5,8d0daef4bf5b56,252458,338,2,SG>00


In [9]:
# Preview the first rows of the document/category table.
docu_cate.head()

Unnamed: 0,document_id,category_id,confidence_level
0,1595802,1611,0.92
1,1595802,1610,0.07
2,1524246,1807,0.92
3,1524246,1608,0.07
4,1617787,1807,0.92


In [10]:
# Inner-join events with docu_meta on document_id

view_page = events.merge(docu_meta, on="document_id", how="inner")

In [11]:
# (rows, columns) of the events/docu_meta join.
view_page.shape

(23120126, 9)

In [12]:
# Preview the joined frame: event columns followed by the document metadata columns.
view_page.head()

Unnamed: 0,display_id,uuid,document_id,timestamp,platform,geo_location,source_id,publisher_id,publish_time
0,1,cb8c55702adb93,379743,61,3,US>SC>519,6482.0,24.0,
1,1239772,a2eb3a4e0a4c08,379743,74476960,1,CA>BC,6482.0,24.0,
2,1306670,c9d0cf076200f3,379743,78297892,2,US,6482.0,24.0,
3,2357895,6e0aabe592ba61,379743,148985378,2,US,6482.0,24.0,
4,2785946,1eb4bef36f8623,379743,177159039,1,US>RI>521,6482.0,24.0,


In [13]:
# Inner-join view_page with the document categories (docu_cate) on document_id

view_page_conf = view_page.merge(docu_cate, on="document_id", how="inner")

In [14]:
# Preview the joined frame; category_id and confidence_level are now attached to each event row.
view_page_conf.head()

Unnamed: 0,display_id,uuid,document_id,timestamp,platform,geo_location,source_id,publisher_id,publish_time,category_id,confidence_level
0,1,cb8c55702adb93,379743,61,3,US>SC>519,6482.0,24.0,,1203,0.345952
1,1,cb8c55702adb93,379743,61,3,US>SC>519,6482.0,24.0,,1702,0.026322
2,1239772,a2eb3a4e0a4c08,379743,74476960,1,CA>BC,6482.0,24.0,,1203,0.345952
3,1239772,a2eb3a4e0a4c08,379743,74476960,1,CA>BC,6482.0,24.0,,1702,0.026322
4,1306670,c9d0cf076200f3,379743,78297892,2,US,6482.0,24.0,,1203,0.345952


In [15]:
# (rows, columns) after the category join; an event row is repeated once per
# category row of its document.
view_page_conf.shape

(43828357, 11)

In [16]:
# Drop every column except publisher_id, category_id and confidence_level.
del_cols = ['display_id', 'uuid', 'document_id', 'timestamp', 'platform', 'geo_location', 'source_id', 'publish_time']
# errors="ignore" keeps this cell re-runnable: on a second execution the columns
# are already gone and a plain drop() would raise KeyError.
view_page_conf = view_page_conf.drop(columns=del_cols, errors="ignore")

In [17]:
# Confirm that only publisher_id, category_id and confidence_level remain.
view_page_conf.head()

Unnamed: 0,publisher_id,category_id,confidence_level
0,24.0,1203,0.345952
1,24.0,1702,0.026322
2,24.0,1203,0.345952
3,24.0,1702,0.026322
4,24.0,1203,0.345952


In [20]:
# (rows, columns) after dropping the unused columns.
view_page_conf.shape

(43828357, 3)

In [27]:
# Number of distinct category_id values in the joined data.
view_page_conf['category_id'].nunique()

90

In [30]:
import numpy as np

In [32]:
# Build a pivot table: rows = publisher_id, columns = category_id,
# values = sum of confidence_level.
# Publisher/category pairs with no rows come out as NaN.

view_pivot = pd.pivot_table(view_page_conf,
                            values = "confidence_level",
                            index = 'publisher_id',
                            columns = 'category_id',
                            # Name the aggregation instead of passing np.sum: recent pandas
                            # emits a FutureWarning for the NumPy callable and will stop
                            # mapping it to the built-in grouped sum.
                            aggfunc = "sum")

In [37]:
# Fill NaN values with 0

view_pivot = view_pivot.fillna(value=0)
view_pivot.head()

category_id,1000,1100,1202,1203,1204,1205,1206,1207,1208,1209,...,1912,1913,1914,1915,2002,2003,2004,2005,2006,2100
publisher_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2.0,0.0,9.267442,0.952714,0.0,0.0,4.842871,0.0,2.76,3.68,0.99,...,0.0,0.0,0.0,0.0,1.91,1.84,4.573072,0.0,0.0,0.0
3.0,77.208282,1286.895133,3.04,256.851352,61.71,2060.901529,10.01,226.193974,0.14,817.449758,...,1847.67141,56.464953,24.531165,1.84,139.32,859.949061,7213.722404,54.05,517.089837,1700.355983
7.0,0.0,74.08,2.507,49.655835,3.74,93.343045,3.551,6.78,9.05,57.333796,...,0.0,0.0,0.0,0.0,3.45,4.45,33.32,0.21,0.0,23.72
9.0,4.63935,2039.959758,24.039716,520.571316,246.33925,2254.028291,34.759792,136.812663,28.985821,317.241752,...,1073.647467,153.702126,12.617994,5.203346,8.232909,205.615276,4399.66692,2.05,27.795822,1258.42232
10.0,8.68,955.254396,0.0,0.168587,0.0,0.0,0.0,0.0,0.0,0.0,...,0.14,218.788042,0.0,28.139334,0.0,0.0,0.0,0.0,0.35,0.21
