<a href="https://colab.research.google.com/github/lengochai97/thesis/blob/master/notebooks/feature_construction/01_Data_Clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Connect to Google Drive

In [0]:
%%capture

import google.colab.drive

google.colab.drive.mount('/content/gdrive', force_remount=True)

# Install Spark and dependencies

In [0]:
import os

# Environment variables read by the shell commands that install Spark and by
# the Spark/Java tooling started from this kernel.
os.environ.update({
    'HADOOP_VERSION': '2.7',
    'JAVA_HOME': '/usr/lib/jvm/java-8-openjdk-amd64',
    'SPARK_HOME': '/opt/spark',
    'SPARK_VERSION': '2.4.3',
})

In [0]:
%%capture

# Download the Spark release selected by the SPARK_VERSION / HADOOP_VERSION
# environment variables from the Apache archive (-q: quiet, -N: timestamping).
# NOTE(review): %%capture plus -q hides any download failure; a broken install
# only shows up later when Spark is initialised.
!wget -qN https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz
# Unpack the release into /opt, then delete the downloaded tarball.
!tar -xzf spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz -C /opt
!rm spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz
# Replace whatever is at /opt/spark with a symlink to the unpacked release;
# /opt/spark is the value assigned to SPARK_HOME.
!rm -rf /opt/spark
!ln -s /opt/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION /opt/spark
# findspark is used to initialise Spark before the SparkSession is created.
# NOTE(review): the version is not pinned.
!pip install -q findspark

# Create SparkSession

In [0]:
import findspark

# Initialise findspark so that `pyspark` can be imported in the cells below.
# NOTE(review): presumably this locates the installation through the SPARK_HOME
# environment variable set earlier in the notebook — confirm.
findspark.init()

In [0]:
from pyspark.sql import SparkSession

# Get (or create) a session whose master is 'local[*]'.
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

# Read files

In [0]:
import glob
import json

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import Window

In [0]:
# Dataset root beneath the Drive mount point (/content/gdrive). The raw event
# files, the `schema/` directory and the `clean/` output all live under it.
DATA_PATH = '/content/gdrive/My Drive/dataset/adressa/one_week'

In [0]:
# Build the Spark schema for the raw files from its JSON description stored
# at DATA_PATH/schema/raw.json.
raw_schema_path = os.path.join(DATA_PATH, 'schema', 'raw.json')
with open(raw_schema_path) as schema_file:
    raw_schema = T.StructType.fromJson(json.load(schema_file))

In [0]:
# Collect the raw event files: entries directly under DATA_PATH whose names
# start with '2017010'. glob returns matches in arbitrary order, so the list
# is sorted to make the input order reproducible between runs.
raw_filepaths = sorted(glob.glob(os.path.join(DATA_PATH, '2017010*')))

# Fail fast when nothing matches (e.g. Drive not mounted or a wrong DATA_PATH)
# instead of letting the problem surface later, far from its cause.
if not raw_filepaths:
    raise FileNotFoundError(
        'No raw files matching 2017010* found under {!r}'.format(DATA_PATH)
    )

In [0]:
# Read every raw file as JSON, applying the predefined schema rather than
# letting Spark infer one.
df_raw = spark.read.schema(raw_schema).json(raw_filepaths)

# Clean data

In [0]:
# Clean the raw events into one row per (userId, time).
#
# Steps:
#   1. Keep only the needed columns, renaming `id` -> `newsId` and
#      `category1` -> `categoryList`, and parse the `publishtime` string into
#      Unix seconds (rows that fail to parse become null and are dropped).
#   2. Drop rows with a null in any required column.
#   3. Keep events whose referrer class is 'internal', whose referrer URL is
#      exactly 'http://adressa.no', and whose event time is later than the
#      publish time.
#      NOTE(review): assumes `time` is also expressed in Unix seconds — confirm.
#   4. Turn the pipe-delimited category string into an array of distinct
#      values; a missing category is replaced by '' before splitting.
#   5. Deduplicate on (userId, time), keeping the row with the latest
#      publishtime.
#   6. Co-locate each user's rows in one partition, ordered by time.

# Ranking used for step 5. `dropDuplicates` keeps an arbitrary row per key
# even after `sortWithinPartitions`, so the row to keep is selected explicitly
# with row_number(); `newsId` breaks ties so the choice is deterministic.
dedup_window = (
    Window
    .partitionBy('userId', 'time')
    .orderBy(F.column('publishtime').desc(), F.column('newsId').asc())
)

df_clean = (
    df_raw
    .select(
        F.column('userId'),
        F.column('time'),
        F.column('id').alias('newsId'),
        F.unix_timestamp(F.column('publishtime'), "yyyy-MM-dd'T'HH:mm:ss.SSSXX").alias('publishtime'),
        F.column('category1').alias('categoryList'),
        F.column('referrerHostClass'),
        F.column('referrerUrl'),
    )
    .dropna(subset=['userId', 'time', 'newsId', 'publishtime', 'referrerHostClass', 'referrerUrl'])
    .filter(
        (F.column('referrerHostClass') == 'internal') &
        (F.column('referrerUrl') == 'http://adressa.no') &
        (F.column('time') > F.column('publishtime'))
    )
    # The referrer columns were only needed for the filter above.
    .select(
        F.column('userId'),
        F.column('time'),
        F.column('newsId'),
        F.column('publishtime'),
        F.column('categoryList'),
    )
    .fillna('', subset=['categoryList'])
    .withColumn(
        'categoryList',
        # Raw string: the pattern is a regex and '\|' is not a valid Python escape.
        F.array_distinct(F.split(F.column('categoryList'), r'\|')),
    )
    # Deterministic de-duplication: keep rank 1 of each (userId, time) group.
    .withColumn('_dedupRank', F.row_number().over(dedup_window))
    .filter(F.column('_dedupRank') == 1)
    .drop('_dedupRank')
    # (userId, time) is unique from here on, so sorting by these two columns
    # fully orders each user's rows.
    .repartition(F.column('userId'))
    .sortWithinPartitions(
        F.column('userId'),
        F.column('time'),
    )
)

# Write files

In [0]:
%%time

df_clean.write.json(os.path.join(DATA_PATH, 'clean'))

CPU times: user 83.5 ms, sys: 25.5 ms, total: 109 ms
Wall time: 8min 11s


In [0]:
# Store the cleaned DataFrame's schema as JSON at DATA_PATH/schema/clean.json,
# mirroring how schema/raw.json is used when the raw files are read.
# The file is only written, so plain 'w' is used rather than read/write 'w+'.
with open(os.path.join(DATA_PATH, 'schema', 'clean.json'), 'w') as schema_file:
    json.dump(df_clean.schema.jsonValue(), schema_file)