<a href="https://colab.research.google.com/github/lengochai97/thesis/blob/master/notebooks/feature_construction/61_All_Features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Connect to Google Drive

In [0]:
%%capture

import google.colab.drive

google.colab.drive.mount('/content/gdrive', force_remount=True)

# Install Spark and dependencies

In [0]:
import os

# SPARK_VERSION and HADOOP_VERSION are expanded by the download cell below;
# JAVA_HOME and SPARK_HOME are presumably picked up by Spark / findspark.
os.environ.update({
    'HADOOP_VERSION': '2.7',
    'JAVA_HOME': '/usr/lib/jvm/java-8-openjdk-amd64',
    'SPARK_HOME': '/opt/spark',
    'SPARK_VERSION': '2.4.3',
})

In [0]:
%%capture

# Download the Spark build selected by $SPARK_VERSION / $HADOOP_VERSION,
# unpack it under /opt and delete the downloaded archive.
!wget -qN https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz
!tar -xzf spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz -C /opt
!rm spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz
# Drop any existing /opt/spark entry, then point /opt/spark at the unpacked
# distribution so that SPARK_HOME ('/opt/spark') resolves to it.
!rm -rf /opt/spark
!ln -s /opt/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION /opt/spark
# NOTE(review): findspark is installed without a version pin; consider
# pinning it for reproducibility.
!pip install -q findspark

# Create SparkSession

In [0]:
import findspark

# Make the pyspark package of the installed Spark distribution importable.
# NOTE(review): presumably the installation is located through SPARK_HOME,
# which an earlier cell sets to '/opt/spark' — confirm.
findspark.init()

In [0]:
from pyspark.sql import SparkSession

# Local Spark session using every available core.
#  - spark.sql.autoBroadcastJoinThreshold = -1 disables automatic broadcast
#    joins.
#  - spark.sql.session.timeZone is pinned to UTC. The hour-of-day and weekday
#    context features are derived with F.from_unixtime(), which renders
#    timestamps in the session time zone; without the pin the features would
#    depend on the time zone of the machine running the notebook. UTC matches
#    the convention used for the train/test split timestamp in this notebook.
#    NOTE(review): switch to e.g. 'Europe/Oslo' if local-time hours are wanted.
spark = (
    SparkSession.builder
    .master('local[*]')
    .config('spark.sql.autoBroadcastJoinThreshold', -1)
    .config('spark.sql.session.timeZone', 'UTC')
    .getOrCreate()
)

# Read files

In [0]:
import json

import pyspark.sql.functions as F
import pyspark.sql.types as T

In [0]:
# Root folder of the dataset on the mounted Google Drive; every input, schema,
# model and output path in this notebook is built relative to it.
DATA_PATH = os.path.join('/content/gdrive', 'My Drive', 'dataset', 'adressa', 'one_week')

## Cleaned

In [0]:
# Rebuild the Spark schema of the cleaned data from its stored JSON description.
clean_schema_path = os.path.join(DATA_PATH, 'schema', 'clean.json')
with open(clean_schema_path) as schema_file:
  clean_schema = T.StructType.fromJson(json.load(schema_file))

In [0]:
# Read the cleaned data as JSON with the explicit schema (no schema inference).
df_clean = (
    spark.read
    .schema(clean_schema)
    .json(os.path.join(DATA_PATH, 'clean'))
)

## Samples

In [0]:
# Rebuild the Spark schema of the samples from its stored JSON description.
sample_schema_path = os.path.join(DATA_PATH, 'schema', 'samples.json')
with open(sample_schema_path) as schema_file:
  sample_schema = T.StructType.fromJson(json.load(schema_file))

In [0]:
# Read the samples as JSON with the explicit schema (no schema inference).
df_sample = (
    spark.read
    .schema(sample_schema)
    .json(os.path.join(DATA_PATH, 'samples'))
)

## News features

In [0]:
# Rebuild the Spark schema of the news features from its stored JSON description.
news_features_schema_path = os.path.join(DATA_PATH, 'schema', 'news_features.json')
with open(news_features_schema_path) as schema_file:
  news_features_schema = T.StructType.fromJson(json.load(schema_file))

In [0]:
# Read the news features as JSON with the explicit schema (no schema inference).
df_news_features = (
    spark.read
    .schema(news_features_schema)
    .json(os.path.join(DATA_PATH, 'news_features'))
)

## User features

In [0]:
# Rebuild the Spark schema of the user features from its stored JSON description.
user_features_schema_path = os.path.join(DATA_PATH, 'schema', 'user_features.json')
with open(user_features_schema_path) as schema_file:
  user_features_schema = T.StructType.fromJson(json.load(schema_file))

In [0]:
# Read the user features as JSON with the explicit schema (no schema inference).
df_user_features = (
    spark.read
    .schema(user_features_schema)
    .json(os.path.join(DATA_PATH, 'user_features'))
)

## News click counts

In [0]:
# Rebuild the Spark schema of the news click counts from its stored JSON description.
news_click_counts_schema_path = os.path.join(DATA_PATH, 'schema', 'news_click_counts.json')
with open(news_click_counts_schema_path) as schema_file:
  news_click_counts_schema = T.StructType.fromJson(json.load(schema_file))

In [0]:
# Read the news click counts as JSON with the explicit schema (no schema inference).
df_news_click_counts = (
    spark.read
    .schema(news_click_counts_schema)
    .json(os.path.join(DATA_PATH, 'news_click_counts'))
)

## User activeness

In [0]:
# Rebuild the Spark schema of the user activeness data from its stored JSON description.
user_activeness_schema_path = os.path.join(DATA_PATH, 'schema', 'user_activeness.json')
with open(user_activeness_schema_path) as schema_file:
  user_activeness_schema = T.StructType.fromJson(json.load(schema_file))

In [0]:
# Read the user activeness data as JSON with the explicit schema (no schema inference).
df_user_activeness = (
    spark.read
    .schema(user_activeness_schema)
    .json(os.path.join(DATA_PATH, 'user_activeness'))
)

# Construct user feature vectors

In [0]:
from pyspark.ml.feature import CountVectorizerModel, Normalizer

In [0]:
# Location of the persisted CountVectorizerModel that is loaded below to turn
# category lists into vectors.
count_vectorizer_model_path = os.path.join(DATA_PATH, 'model', 'category_count_vectorizer')

In [0]:
# Time windows for which category lists and click counts are provided.
user_feature_windows = ['1H', '6H', '1D', '1W']

# Columns taken from the user features: join keys, the full-history category
# list, then one category list and one click count per time window.
user_feature_columns = (
    ['userId', 'time', 'categoryListHistory']
    + ['categoryList' + window for window in user_feature_windows]
    + ['userClickCount' + window for window in user_feature_windows]
)

# Attach to every sample the user's features at (userId, time) and the
# category features of the sampled news article.
df_sample_user_news = (
    df_sample
    .join(
        df_user_features.select(*user_feature_columns),
        on=['userId', 'time'],
        how='inner',
    )
    .join(
        df_news_features.select('newsId', 'categoryList', 'categoryVector'),
        on='newsId',
        how='inner',
    )
)

# "Next" category lists: the article's categories prepended to each of the
# user's category lists (presumably the user state after clicking the article).
for list_suffix in ['History'] + user_feature_windows:
    df_sample_user_news = df_sample_user_news.withColumn(
        'categoryList' + list_suffix + 'Next',
        F.concat(F.column('categoryList'), F.column('categoryList' + list_suffix)),
    )

# "Next" click counts: every windowed click count incremented by one.
for window in user_feature_windows:
    df_sample_user_news = df_sample_user_news.withColumn(
        'userClickCount' + window + 'Next',
        F.column('userClickCount' + window) + 1,
    )

def _add_category_vector(df, input_col, output_col, binary):
    """Append a category vector column computed by the saved CountVectorizerModel.

    The model is loaded from ``count_vectorizer_model_path`` on every call,
    exactly as each of the original per-column stanzas did, so no two output
    columns ever share one mutable model instance.

    Args:
        df: DataFrame that contains ``input_col``.
        input_col: Name of the column holding the category list.
        output_col: Name of the vector column to append.
        binary: Value handed to ``setBinary``. Per the Spark documentation,
            True sets every non-zero count to 1; False keeps the raw counts.

    Returns:
        ``df`` with ``output_col`` appended.
    """
    return (
        CountVectorizerModel
        .load(count_vectorizer_model_path)
        .setBinary(binary)
        .setInputCol(input_col)
        .setOutputCol(output_col)
        .transform(df)
    )


# One entry per vector to build: (input column, output column, binary).
# The full-history vectors keep counts; the windowed profile vectors are binary.
category_vector_specs = [
    # Build user feature vectors for current state
    ('categoryListHistory', 'userHistoryVector', False),
    ('categoryList1H', 'userProfileVector1H', True),
    ('categoryList6H', 'userProfileVector6H', True),
    ('categoryList1D', 'userProfileVector1D', True),
    ('categoryList1W', 'userProfileVector1W', True),
    # Build user feature vectors for next state
    ('categoryListHistoryNext', 'userHistoryVectorNext', False),
    ('categoryList1HNext', 'userProfileVector1HNext', True),
    ('categoryList6HNext', 'userProfileVector6HNext', True),
    ('categoryList1DNext', 'userProfileVector1DNext', True),
    ('categoryList1WNext', 'userProfileVector1WNext', True),
]

for input_col, output_col, binary in category_vector_specs:
    df_sample_user_news = _add_category_vector(
        df_sample_user_news, input_col, output_col, binary
    )

# Normalize user history (optional)
# Both history vectors (current and next state) are scaled with the p=1.0
# norm; the result is stored in a new column carrying the 'Normalized' suffix.
for history_col in ['userHistoryVector', 'userHistoryVectorNext']:
    history_normalizer = Normalizer(
        p=1.0,
        inputCol=history_col,
        outputCol=history_col + 'Normalized',
    )
    df_sample_user_news = history_normalizer.transform(df_sample_user_news)

# Filter columns
# Keep the sample keys/label, the news category vector and the user features
# of the current and the next state. The normalized history vectors are
# exposed under the plain 'userHistoryVector' / 'userHistoryVectorNext' names.
sample_columns = ['userId', 'time', 'newsId', 'clickLabel', 'categoryVector']

current_state_columns = [
    F.column('userHistoryVectorNormalized').alias('userHistoryVector'),
    'userProfileVector1H',
    'userProfileVector6H',
    'userProfileVector1D',
    'userProfileVector1W',
    'userClickCount1H',
    'userClickCount6H',
    'userClickCount1D',
    'userClickCount1W',
]

next_state_columns = [
    F.column('userHistoryVectorNextNormalized').alias('userHistoryVectorNext'),
    'userProfileVector1HNext',
    'userProfileVector6HNext',
    'userProfileVector1DNext',
    'userProfileVector1WNext',
    'userClickCount1HNext',
    'userClickCount6HNext',
    'userClickCount1DNext',
    'userClickCount1WNext',
]

df_sample_user_news = df_sample_user_news.select(
    *(sample_columns + current_state_columns + next_state_columns)
)

# Construct context feature vector

In [0]:
from pyspark.ml.feature import OneHotEncoderEstimator

In [0]:
# Every (hour, weekday) combination: hours 0..23 crossed with weekdays 0..6.
# Used only to fit the one-hot encoder on the complete value range.
hours_of_day = spark.range(0, 24).withColumnRenamed('id', 'timeContext')
days_of_week = spark.range(0, 7).withColumnRenamed('id', 'weekdayContext')

df_time_weekday = hours_of_day.crossJoin(days_of_week)

# One-hot encoder for the hour and weekday indices, fitted on the full grid of
# values; dropLast=False keeps a vector slot for every category.
context_encoder = OneHotEncoderEstimator(
    dropLast=False,
    inputCols=['timeContext', 'weekdayContext'],
    outputCols=['timeContextVector', 'weekdayContextVector'],
)

time_weekday_context_ohe = context_encoder.fit(df_time_weekday)

In [0]:
# Timestamp rendering of the unix 'time' column, shared by the hour and
# weekday features below.
event_timestamp = F.from_unixtime(F.column('time'))

# Publication time of every article, needed for the freshness feature.
news_publish_times = df_news_features.select('newsId', 'publishtime')

# Context features per (newsId, time) row of the news click counts:
#  - newsFreshnessContext: reciprocal of (time - publishtime).
#    NOTE(review): Spark yields null for a division by zero, so rows with
#    time == publishtime would get a null here — confirm they cannot occur.
#  - timeContext: hour of the day.
#  - weekdayContext: day of week shifted down by one so that it starts at 0,
#    matching the 0..6 range the one-hot encoder was fitted on.
df_sample_context = (
    df_news_click_counts
    .join(news_publish_times, on='newsId', how='inner')
    .withColumn(
        'newsFreshnessContext',
        1. / (F.column('time') - F.column('publishtime')),
    )
    .withColumn('timeContext', F.hour(event_timestamp))
    .withColumn('weekdayContext', F.dayofweek(event_timestamp) - 1)
)

# One-hot encode hour and weekday, then keep only the join keys, the news
# click counts and the context features.
context_columns = [
    'newsId',
    'time',
    'newsClickCount1H',
    'newsClickCount6H',
    'newsClickCount1D',
    'newsClickCount1W',
    'timeContextVector',
    'weekdayContextVector',
    'newsFreshnessContext',
]

df_sample_context = (
    time_weekday_context_ohe
    .transform(df_sample_context)
    .select(*context_columns)
)

# Construct `eventId`

In [0]:
from pyspark.sql import Window

In [0]:
# Global ordering of the events by time, ties broken by userId.
# NOTE(review): a window without partitionBy is typically evaluated in a
# single partition, which can be slow on large inputs.
event_order = Window.orderBy('time', 'userId')

# Zero-based running index of every (userId, time) row of the cleaned data:
# row_number() starts at 1, hence the "- 1".
df_event_id = (
    df_clean
    .select('userId', 'time')
    .withColumn('eventId', F.row_number().over(event_order) - 1)
)

# Construct feature vectors for all features

In [0]:
from pyspark.ml.feature import VectorAssembler

In [0]:
# Combine the per-sample user/news features with the context features
# (keyed by newsId and time) ...
df_all_features = df_sample_user_news.join(
    df_sample_context,
    on=['newsId', 'time'],
    how='inner',
)

# ... then with the user activeness and the event ids (both keyed by userId
# and time). Inner joins drop samples that miss any of these inputs.
for df_by_user_time in [df_user_activeness, df_event_id]:
    df_all_features = df_all_features.join(
        df_by_user_time,
        on=['userId', 'time'],
        how='inner',
    )

# Each entry is (output vector column, input columns concatenated into it);
# the vectors are assembled in the listed order.
assembler_specs = [
    # Build news feature vectors
    ('newsClickCountVector', [
        'newsClickCount1H',
        'newsClickCount6H',
        'newsClickCount1D',
        'newsClickCount1W',
    ]),
    # Build user feature vectors for current state
    ('userProfileVector', [
        'userProfileVector1H',
        'userProfileVector6H',
        'userProfileVector1D',
        'userProfileVector1W',
    ]),
    ('userClickCountVector', [
        'userClickCount1H',
        'userClickCount6H',
        'userClickCount1D',
        'userClickCount1W',
    ]),
    # Build user feature vectors for next state
    ('userProfileVectorNext', [
        'userProfileVector1HNext',
        'userProfileVector6HNext',
        'userProfileVector1DNext',
        'userProfileVector1WNext',
    ]),
    ('userClickCountVectorNext', [
        'userClickCount1HNext',
        'userClickCount6HNext',
        'userClickCount1DNext',
        'userClickCount1WNext',
    ]),
    # Build context feature vectors
    ('contextVector', [
        'timeContextVector',
        'weekdayContextVector',
        'newsFreshnessContext',
    ]),
]

for assembled_col, source_cols in assembler_specs:
    df_all_features = (
        VectorAssembler(inputCols=source_cols, outputCol=assembled_col)
        .transform(df_all_features)
    )

# Filter columns
# Final layout: identifiers and label first, then the news and context
# vectors, then the user vectors of the current and of the next state.
all_feature_columns = [
    'eventId',
    'userId',
    'time',
    'newsId',
    'clickLabel',
    'userActiveness',
    'categoryVector',
    'newsClickCountVector',
    'contextVector',
    'userHistoryVector',
    'userProfileVector',
    'userClickCountVector',
    'userHistoryVectorNext',
    'userProfileVectorNext',
    'userClickCountVectorNext',
]

df_all_features = df_all_features.select(*all_feature_columns)

# Train test split

In [0]:
# Split boundary as a unix timestamp: rows with time strictly before it go to
# the training set, rows at or after it go to the test set.
train_test_split_time = 1483743600 # 2017/01/06 23:00:00 UTC

In [0]:
# Training set: every row strictly before the split timestamp.
is_before_split = F.col('time') < train_test_split_time
df_all_features_train = df_all_features.where(is_before_split)

In [0]:
# Test set: every row at or after the split timestamp.
is_from_split_on = F.col('time') >= train_test_split_time
df_all_features_test = df_all_features.where(is_from_split_on)

# Write files

## Write train

In [0]:
# Range-partition the training rows by eventId and sort each partition by it.
train_event_id = F.column('eventId')
df_all_features_train = df_all_features_train.repartitionByRange(train_event_id)
df_all_features_train = df_all_features_train.sortWithinPartitions(train_event_id)

In [0]:
%%time

df_all_features_train.write.json(os.path.join(DATA_PATH, 'all_features', 'train'))

CPU times: user 343 ms, sys: 86.2 ms, total: 429 ms
Wall time: 30min 50s


## Write test

In [0]:
# Range-partition the test rows by eventId and sort each partition by it.
test_event_id = F.column('eventId')
df_all_features_test = df_all_features_test.repartitionByRange(test_event_id)
df_all_features_test = df_all_features_test.sortWithinPartitions(test_event_id)

In [0]:
%%time

df_all_features_test.write.json(os.path.join(DATA_PATH, 'all_features', 'test'))

CPU times: user 212 ms, sys: 50.4 ms, total: 263 ms
Wall time: 23min 35s


## Write schema

In [0]:
# Store the schema of the assembled features as JSON next to the other schemas.
all_features_schema_path = os.path.join(DATA_PATH, 'schema', 'all_features.json')
with open(all_features_schema_path, 'w+') as schema_file:
  json.dump(df_all_features.schema.jsonValue(), schema_file)