<a href="https://colab.research.google.com/github/lengochai97/thesis/blob/master/notebooks/feature_construction/11_News_Features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Connect to Google Drive

In [0]:
%%capture

import google.colab.drive

# Mount Google Drive at /content/gdrive; force_remount=True remounts even if a
# previous mount is still present in this runtime.
google.colab.drive.mount(
    '/content/gdrive',
    force_remount=True,
)

# Install Spark and dependencies

In [0]:
import os

# Environment consumed by the install cell (shell `$VAR` expansion) and by
# findspark / pyspark when locating Java and the Spark distribution.
os.environ.update({
    'HADOOP_VERSION': '2.7',
    'JAVA_HOME': '/usr/lib/jvm/java-8-openjdk-amd64',
    'SPARK_HOME': '/opt/spark',
    'SPARK_VERSION': '2.4.3',
})

In [0]:
%%capture

# Download the prebuilt Spark tarball matching the SPARK_VERSION / HADOOP_VERSION
# environment variables set above (-q: quiet, -N: only fetch if newer than a local copy).
!wget -qN https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz
# Unpack into /opt, then delete the tarball.
!tar -xzf spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz -C /opt
!rm spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz
# Replace any existing /opt/spark with a symlink to the unpacked distribution,
# so that SPARK_HOME=/opt/spark points at it.
!rm -rf /opt/spark
!ln -s /opt/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION /opt/spark
# findspark is imported in the next section, before pyspark.
!pip install -q findspark

# Create SparkSession

In [0]:
import findspark

# Make the pyspark package importable from this kernel. No path is passed, so
# findspark presumably resolves the install from SPARK_HOME (set to /opt/spark
# above) — confirm against the findspark docs.
findspark.init()

In [0]:
from pyspark.sql import SparkSession

# Local-mode session using every available core ('local[*]'); getOrCreate()
# returns the already-running session when this cell is executed again.
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

# Read files

In [0]:
import json

import pyspark.sql.functions as F
import pyspark.sql.types as T

In [0]:
# Root directory (on the mounted Google Drive) under which the schema/, clean/,
# model/ and news_features/ paths used below are resolved with os.path.join.
DATA_PATH = '/content/gdrive/My Drive/dataset/adressa/one_week'

In [0]:
# Rebuild the Spark StructType for the cleaned events from its stored JSON form.
clean_schema_path = os.path.join(DATA_PATH, 'schema', 'clean.json')

with open(clean_schema_path) as schema_file:
  clean_schema_json = json.load(schema_file)

clean_schema = T.StructType.fromJson(clean_schema_json)

In [10]:
# Read the cleaned events with the explicit schema (no schema inference pass),
# then mark the frame for caching because several cells below reuse it.
clean_events_path = os.path.join(DATA_PATH, 'clean')
df_clean = spark.read.schema(clean_schema).json(clean_events_path)
df_clean.cache()

DataFrame[userId: string, time: bigint, newsId: string, publishtime: bigint, categoryList: array<string>]

# Construct `category` vocabulary

In [0]:
# Boundary, in Unix epoch seconds, between the `time < ...` rows used to fit the
# category vocabulary and the `time >= ...` rows inspected in the next cell.
train_test_split_time = 1483743600 # 2017/01/06 23:00:00 UTC

In [12]:
# Events at or after the split time, shown for inspection.
df = df_clean.where(F.col('time') >= train_test_split_time)

df.show(truncate=False)

+-------------------------------------------+----------+----------------------------------------+-----------+-----------------------+
|userId                                     |time      |newsId                                  |publishtime|categoryList           |
+-------------------------------------------+----------+----------------------------------------+-----------+-----------------------+
|cx:10aahg3cyumaa128zgcrqm02zi:2gi7mzuwpxq8j|1483802430|01d923a1af0487ccbf9804bea12f49c12727214a|1483797105 |[100sport, fotball]    |
|cx:10aahg3cyumaa128zgcrqm02zi:2gi7mzuwpxq8j|1483802488|7e98f8a1a50a409a25831be225e01e261dfe04fc|1483790765 |[100sport, vintersport]|
|cx:10aahg3cyumaa128zgcrqm02zi:2gi7mzuwpxq8j|1483826863|7e98f8a1a50a409a25831be225e01e261dfe04fc|1483790765 |[100sport, vintersport]|
|cx:113ysgcwozlbw1zxe59kpugoom:2dvot488xvkc0|1483773257|bb8ff8365233ea91dfcdb36fdd84f87fcc33e1a8|1483739512 |[nyheter, sortrondelag]|
|cx:113ysgcwozlbw1zxe59kpugoom:2dvot488xvkc0|1483819358|faa8cc

In [13]:
# Number of events at or after the split time (df is the
# `time >= train_test_split_time` subset built above).
df.count()

149824

In [0]:
# One row per article seen before the split time, keeping only the columns the
# vocabulary fit needs. dropDuplicates keeps a single row per newsId.
is_before_split = F.col('time') < train_test_split_time

df_news_train = (
    df_clean
    .where(is_before_split)
    .select('newsId', 'categoryList')
    .dropDuplicates(['newsId'])
)

In [0]:
from pyspark.ml.feature import CountVectorizer

In [16]:
%%time

# Learn the category vocabulary from pre-split articles only. binary=True makes
# each output entry a 0/1 presence flag instead of a term count.
category_vectorizer_estimator = CountVectorizer(
    inputCol='categoryList',
    outputCol='categoryVector',
    binary=True,
)

category_count_vectorizer = category_vectorizer_estimator.fit(df_news_train)

CPU times: user 11.6 ms, sys: 4.34 ms, total: 15.9 ms
Wall time: 9.18 s


# Construct `category` feature vector

In [0]:
# One row per article across the whole dataset (before and after the split).
df_news_unique = (
    df_clean
    .select('newsId', 'publishtime', 'categoryList')
    .dropDuplicates(['newsId'])
)

# Append the `categoryVector` column using the vocabulary fitted above.
df_news_features = category_count_vectorizer.transform(df_news_unique)

# Write files

In [0]:
# Persist the fitted vectorizer model. A plain `save(path)` raises when the
# target path already exists, which makes every run after the first fail;
# write().overwrite() replaces the previously saved model instead.
category_count_vectorizer_path = os.path.join(DATA_PATH, 'model', 'category_count_vectorizer')
category_count_vectorizer.write().overwrite().save(category_count_vectorizer_path)

In [21]:
%%time

# Collapse to a single partition so the output is written as one part file,
# ordered by publish time with newsId as the tie-breaker.
df_news_features = (
    df_news_features
    .repartition(1)
    .sortWithinPartitions(
        F.column('publishtime'),
        F.column('newsId'),
    )
)

# The default save mode raises if the output directory already exists, so a
# second run of the notebook would fail here; overwrite replaces the old output.
df_news_features.write.json(
    os.path.join(DATA_PATH, 'news_features'),
    mode='overwrite',
)

CPU times: user 4.44 ms, sys: 2.48 ms, total: 6.92 ms
Wall time: 5.04 s


In [0]:
# Store the schema of the written features as JSON next to the other schemas,
# so a reader can rebuild it with StructType.fromJson.
news_features_schema_path = os.path.join(DATA_PATH, 'schema', 'news_features.json')

with open(news_features_schema_path, 'w+') as schema_file:
  news_features_schema_json = df_news_features.schema.jsonValue()
  json.dump(news_features_schema_json, schema_file)