<a href="https://colab.research.google.com/github/lengochai97/thesis/blob/master/notebooks/feature_construction/32_Samples_Results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Connect to Google Drive

In [0]:
%%capture
# The %%capture cell magic above has to stay on the very first line of the
# cell; it hides the output produced while mounting.

import google.colab.drive

# Mount Google Drive under /content/gdrive (the dataset path used later lives
# below this mount point). force_remount=True is passed so the mount is
# presumably redone even if a previous one exists — confirm against Colab docs.
google.colab.drive.mount('/content/gdrive', force_remount=True)

# Install Spark and dependencies

In [0]:
import os

# Export the Spark/Hadoop versions and install locations as process
# environment variables. HADOOP_VERSION and SPARK_VERSION are expanded as
# $VARS by the shell commands of the install cell; JAVA_HOME and SPARK_HOME
# are presumably picked up by Spark/findspark — confirm.
os.environ.update({
    'HADOOP_VERSION': '2.7',
    'JAVA_HOME': '/usr/lib/jvm/java-8-openjdk-amd64',
    'SPARK_HOME': '/opt/spark',
    'SPARK_VERSION': '2.4.3',
})

In [0]:
%%capture

!wget -qN https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz
!tar -xzf spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz -C /opt
!rm spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz
!rm -rf /opt/spark
!ln -s /opt/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION /opt/spark
!pip install -q findspark

# Create SparkSession

In [0]:
import findspark

# Run before the first `import pyspark`: Spark was unpacked under /opt by the
# install cell rather than pip-installed. Presumably findspark locates it via
# the SPARK_HOME environment variable set earlier — confirm.
findspark.init()

In [0]:
from pyspark.sql import SparkSession

# Obtain the session used by every cell below, with 'local[*]' as master URL.
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

# Read files

In [0]:
import json

import pyspark.sql.functions as F
import pyspark.sql.types as T

In [0]:
# Dataset root below the Google Drive mount point (/content/gdrive); the
# schema and sample paths in the following cells are joined onto it.
DATA_PATH = '/content/gdrive/My Drive/dataset/adressa/one_week'

In [0]:
# Build the Spark schema for the sample files from its JSON description.
# The encoding is passed explicitly so that decoding the schema file does not
# depend on the platform's locale default.
with open(os.path.join(DATA_PATH, 'schema', 'samples.json'), encoding='utf-8') as file:
  sample_schema = T.StructType.fromJson(json.load(file))

In [0]:
# Read the JSON sample files with the explicit schema loaded above.
# NOTE(review): this reads the 'sample' directory, whereas the schema file is
# named 'samples.json' and the disk-usage cell at the end of the notebook
# inspects 'samples' — confirm which directory name is current.
sample_dir = os.path.join(DATA_PATH, 'sample')
df_sample = spark.read.schema(sample_schema).json(sample_dir)

# Check results

In [0]:
# Preview the first 20 rows (the default) without truncating long ids.
df_sample.show(n=20, truncate=False)

+-------------------------------------------------+----------+----------------------------------------+----------+
|userId                                           |time      |newsId                                  |clickLabel|
+-------------------------------------------------+----------+----------------------------------------+----------+
|cx:0317f1009ed4ba0be5703f93d490915a:1aheiz3vos0sx|1483363834|0d086df067aff22a3852ce26497e61ff621fc383|1         |
|cx:0317f1009ed4ba0be5703f93d490915a:1aheiz3vos0sx|1483363834|020541cc5842eb943d87f87e7e44aa68deb35ead|0         |
|cx:0317f1009ed4ba0be5703f93d490915a:1aheiz3vos0sx|1483363834|0348fac56fdf144682f5ffa6f6f10591963f9de3|0         |
|cx:0317f1009ed4ba0be5703f93d490915a:1aheiz3vos0sx|1483363834|0867dbb33bb90970ae48592057be34246a0124ac|0         |
|cx:0317f1009ed4ba0be5703f93d490915a:1aheiz3vos0sx|1483363834|1cabcf81fbf2248cc427811d125079f6172a8e7d|0         |
|cx:0317f1009ed4ba0be5703f93d490915a:1aheiz3vos0sx|1483363834|338d849c5c3e0a320d

## Number of samples

In [0]:
# Total number of rows in the sample set.
n_samples = df_sample.count()
n_samples

14712022

## Number of distinct (`userId`, `time`) pairs

In [0]:
# Project to (userId, time), de-duplicate, then count the remaining rows.
user_time_pairs = df_sample.select('userId', 'time').distinct()
user_time_pairs.count()

1226167

## Number of distinct (`newsId`, `time`) pairs

In [0]:
# Project to (newsId, time), de-duplicate, then count the remaining rows.
news_time_pairs = df_sample.select('newsId', 'time').distinct()
news_time_pairs.count()

13631812

## Number of click samples

In [0]:
# Rows whose clickLabel equals 1.
is_click = F.col('clickLabel') == 1
n_click_samples = df_sample.where(is_click).count()
n_click_samples

1226167

## Number of no-click samples

In [0]:
# Rows whose clickLabel equals 0.
is_no_click = F.col('clickLabel') == 0
n_no_click_samples = df_sample.where(is_no_click).count()
n_no_click_samples

13485855

## No-click / click ratio

In [0]:
# How many no-click rows there are for every click row.
no_click_per_click = n_no_click_samples / n_click_samples
no_click_per_click

10.998383580703118

## Disk usage

In [3]:
!du -sh /content/gdrive/My\ Drive/dataset/adressa/one_week/samples

1.9G	/content/gdrive/My Drive/dataset/adressa/one_week/samples
