<a href="https://colab.research.google.com/github/lengochai97/thesis/blob/master/notebooks/feature_construction/41_News_Click_Counts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Connect to Google Drive

In [0]:
%%capture

import google.colab.drive

# Mount Google Drive at /content/gdrive; DATA_PATH further down lives under this mount point.
# NOTE(review): force_remount=True presumably re-mounts even when the path is already mounted,
# which keeps this cell re-runnable — confirm against the Colab documentation.
google.colab.drive.mount('/content/gdrive', force_remount=True)

# Install Spark and dependencies

In [0]:
import os

# HADOOP_VERSION and SPARK_VERSION are expanded by the `!wget` / `!tar` / `!ln` shell lines
# in the install cell; SPARK_HOME matches the /opt/spark symlink created there.
os.environ.update({
    'HADOOP_VERSION': '2.7',
    'JAVA_HOME': '/usr/lib/jvm/java-8-openjdk-amd64',
    'SPARK_HOME': '/opt/spark',
    'SPARK_VERSION': '2.4.3',
})

In [0]:
%%capture

# Download the Spark binary distribution selected by SPARK_VERSION / HADOOP_VERSION (exported via os.environ above).
!wget -qN https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz
# Unpack it under /opt, then delete the tarball.
!tar -xzf spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz -C /opt
!rm spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz
# Drop any previous /opt/spark entry and point it at the freshly extracted directory, so SPARK_HOME stays valid on re-runs.
!rm -rf /opt/spark
!ln -s /opt/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION /opt/spark
# NOTE(review): findspark is installed without a version pin; consider pinning it for reproducible runs.
!pip install -q findspark

# Create SparkSession

In [0]:
import findspark

# Must run before the `pyspark` import in the next cell.
# NOTE(review): findspark presumably uses the SPARK_HOME set earlier to make the bundled
# pyspark package importable — confirm against the findspark documentation.
findspark.init()

In [0]:
from pyspark.sql import SparkSession

# Local-mode session ('local[*]' master); reuses an already running session if there is one.
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

# Read files

In [0]:
import json

import pyspark.sql.functions as F
import pyspark.sql.types as T

In [0]:
# Root folder for every input and output of this notebook, located on the Google Drive mounted at /content/gdrive.
# Read below: `schema/samples.json`, `samples`; written below: `news_click_counts`, `schema/news_click_counts.json`.
DATA_PATH = '/content/gdrive/My Drive/dataset/adressa/one_week'

In [0]:
# Load the saved Spark schema of the samples so the JSON read below does not have to infer it.
sample_schema_path = os.path.join(DATA_PATH, 'schema', 'samples.json')
with open(sample_schema_path) as file:
  sample_schema = T.StructType.fromJson(json.load(file))

In [0]:
# Read the sample records as JSON using the explicit schema loaded above.
df_sample = spark.read.json(
    os.path.join(DATA_PATH, 'samples'),
    schema=sample_schema,
)

# Extract (`newsId`, `time`) pairs and bucketize by `time`

In [0]:
# Width of one time bucket, expressed in the unit of the `time` column: `time` is divided by this
# value to get `timeBucket`, and the window lengths in `time_info` are floor-divided by it.
# NOTE(review): `time_info` tags 3600 as '1H', so the unit appears to be seconds (60 = one minute) — confirm.
TIME_BUCKET_RANGE = 60

In [0]:
# Attach to every sample the integer index of the TIME_BUCKET_RANGE-wide bucket its `time`
# falls into, and keep only the columns needed for click counting.
df_sample_bucketized = (
    df_sample
    .withColumn(
        'timeBucket',
        (F.col('time') / TIME_BUCKET_RANGE).cast(T.LongType()),
    )
    .select('newsId', 'time', 'timeBucket', 'clickLabel')
)

In [0]:
# One row per (newsId, timeBucket) holding the summed `clickLabel` of that bucket.
df_news_click_counts_bucketized = (
    df_sample_bucketized
    .groupBy('newsId', 'timeBucket')
    .agg(F.sum(F.col('clickLabel')).alias('clickCount'))
)

# Calculate news click counts

In [0]:
from pyspark.sql import Window

In [0]:
def calculate_news_click_count(
    df_news_click_counts_bucketized,
    time_tag,
    time_delta,
    time_bucket_range=None,
):
  """Add a trailing-window click count for every (newsId, timeBucket) row.

  For each row, `clickCount` is summed over the rows of the same `newsId`
  whose `timeBucket` lies within the `time_delta // time_bucket_range` buckets
  immediately preceding the row's own bucket. The row's own bucket is excluded
  (the window frame ends at -1). Nulls in the resulting column are replaced
  with 0.

  Args:
    df_news_click_counts_bucketized: DataFrame with the columns `newsId`,
      `timeBucket` and `clickCount`.
    time_tag: Suffix of the output column, e.g. '1H' gives `newsClickCount1H`.
    time_delta: Window length, in the same unit as `time_bucket_range`. It is
      floor-divided by the bucket width, so a partial bucket is dropped.
    time_bucket_range: Width of one time bucket. When omitted, the
      notebook-level `TIME_BUCKET_RANGE` is used.

  Returns:
    DataFrame with the columns `newsId`, `timeBucket` and
    `newsClickCount{time_tag}`.

  Raises:
    ValueError: If the window does not span at least one whole bucket.
  """
  if time_bucket_range is None:
    time_bucket_range = TIME_BUCKET_RANGE

  bucket_count = time_delta // time_bucket_range
  if bucket_count < 1:
    # Otherwise the frame below would have a lower bound that is not before
    # its upper bound (-1); fail early with an actionable message instead.
    raise ValueError(
        f'time_delta ({time_delta}) must cover at least one time bucket '
        f'of width {time_bucket_range}'
    )

  column_name = f'newsClickCount{time_tag}'
  # Frame is expressed in `timeBucket` values: from `bucket_count` buckets back
  # up to the bucket just before the current one.
  window = (
      Window
      .partitionBy('newsId')
      .orderBy('timeBucket')
      .rangeBetween(-bucket_count, -1)
  )

  return (
      df_news_click_counts_bucketized
      .withColumn(column_name, F.sum(F.column('clickCount')).over(window))
      .fillna(0, subset=[column_name])
      .select('newsId', 'timeBucket', column_name)
  )

In [0]:
# (column tag, window length) pairs; lengths are in the unit that TIME_BUCKET_RANGE divides.
time_info = (
    ('1H', 60 * 60),
    ('6H', 6 * 60 * 60),
    ('1D', 24 * 60 * 60),
    ('1W', 7 * 24 * 60 * 60),
)

In [0]:
# Output column names, one per window tag, in the order they are written out.
news_click_counts_column = tuple(
    f'newsClickCount{time_tag}' for time_tag in ('1H', '6H', '1D', '1W')
)

In [0]:
# Start from the distinct (newsId, time) keys, keeping `timeBucket` as the join key.
df_news_click_counts = (
    df_sample_bucketized
    .select('newsId', 'time', 'timeBucket')
    .dropDuplicates(['newsId', 'time'])
)

# Attach one trailing-window count column per entry of `time_info`.
for time_tag, time_delta in time_info:
  df_windowed_click_counts = calculate_news_click_count(
      df_news_click_counts_bucketized,
      time_tag,
      time_delta,
  )
  df_news_click_counts = df_news_click_counts.join(
      df_windowed_click_counts,
      on=['newsId', 'timeBucket'],
      how='inner',
  )

# `timeBucket` was only needed for the joins; the output is keyed by (newsId, time).
df_news_click_counts = df_news_click_counts.select(
    'newsId',
    'time',
    *news_click_counts_column,
)

# Write files

In [0]:
# Co-locate all rows of a news item in one partition, then order each partition by (time, newsId)
# so the files written below are sorted the same way.
df_news_click_counts = (
    df_news_click_counts
    .repartition('newsId')
    .sortWithinPartitions('time', 'newsId')
)

In [0]:
%%time

# Persist the feature table as JSON under DATA_PATH/news_click_counts.
# NOTE(review): no save mode is set, so Spark's default applies; that presumably fails when the
# target directory already exists (e.g. on a second run) — confirm before re-running.
(
    df_news_click_counts
    .write
    .json(os.path.join(DATA_PATH, 'news_click_counts'))
)

CPU times: user 128 ms, sys: 24.6 ms, total: 152 ms
Wall time: 11min 39s


In [0]:
# Store the output schema next to the other schema files so later reads can pass it explicitly.
news_click_counts_schema_path = os.path.join(DATA_PATH, 'schema', 'news_click_counts.json')
with open(news_click_counts_schema_path, 'w+') as file:
  json.dump(df_news_click_counts.schema.jsonValue(), file)

In [0]:
# Sanity check: number of rows in the final feature table.
# NOTE(review): no cache()/persist() call is visible in this notebook, so this action presumably
# recomputes the whole pipeline rather than reusing the written output — confirm.
df_news_click_counts.count()

13630336