<a href="https://colab.research.google.com/github/lengochai97/thesis/blob/master/notebooks/feature_construction/52_User_Activeness_Results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Connect to Google Drive

In [0]:
%%capture

import google.colab.drive

google.colab.drive.mount('/content/gdrive', force_remount=True)

# Install Spark and dependencies

In [0]:
import os

# Exported as environment variables so the shell commands in the install cell
# can reference $SPARK_VERSION / $HADOOP_VERSION; SPARK_HOME matches the
# /opt/spark symlink created there.
os.environ.update({
    'HADOOP_VERSION': '2.7',
    'JAVA_HOME': '/usr/lib/jvm/java-8-openjdk-amd64',
    'SPARK_HOME': '/opt/spark',
    'SPARK_VERSION': '2.4.3',
})

In [0]:
%%capture

!wget -qN https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz
!tar -xzf spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz -C /opt
!rm spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz
!rm -rf /opt/spark
!ln -s /opt/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION /opt/spark
!pip install -q findspark


# Create SparkSession

In [0]:
import findspark

# Run before the `pyspark` import in the next cell.
# NOTE(review): presumably locates the Spark install through the SPARK_HOME
# variable set earlier and makes its Python package importable — confirm.
findspark.init()

In [0]:
from pyspark.sql import SparkSession

# Get the existing session, or create one, with master 'local[*]'.
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

# Read files

In [0]:
import json

import pyspark.sql.functions as F
import pyspark.sql.types as T

In [0]:
# Root directory of the dataset; every input path in this notebook is built from it.
DATA_PATH = (
    '/content/gdrive/My Drive'    # folder under the /content/gdrive mount point
    '/dataset/adressa/one_week'   # dataset directory inside it
)

In [0]:
# Build the Spark schema for the user_activeness data from its JSON description.
# The file is decoded as UTF-8 explicitly instead of relying on the locale's
# default encoding, which open() would otherwise use.
with open(os.path.join(DATA_PATH, 'schema', 'user_activeness.json'), encoding='utf-8') as schema_file:
  user_activeness_schema = T.StructType.fromJson(json.load(schema_file))

In [0]:
# Read the user_activeness JSON files using the schema loaded from
# schema/user_activeness.json rather than letting Spark infer one.
# The frame is cached because each check below (show, count, distinct count,
# aggregates) is a separate Spark action that would otherwise re-read and
# re-parse the files from Drive.
df_user_activeness = (
    spark.read
    .json(os.path.join(DATA_PATH, 'user_activeness'), schema=user_activeness_schema)
    .cache()
)

# Check results

In [0]:
# Print the first 20 rows without truncating long column values.
df_user_activeness.show(n=20, truncate=False)

+-------------------------------------------+----------+--------------+
|userId                                     |time      |userActiveness|
+-------------------------------------------+----------+--------------+
|cx:hua808o791zl6cx9:3s9kdckn8su34          |1483225204|0.5           |
|cx:221xclq7aa1yx2slhf0xq7svbf:l1m6d375szc  |1483225209|0.5           |
|cx:2olo1siax7ic82xivrf9ygayuy:3v1wyr8b7noee|1483225209|0.5           |
|cx:ijq4o5e7yj6a8l1o:4p63qy0dhsr4           |1483225222|0.5           |
|cx:3s5uucstq9kbzra8l2zlivvng:3i8vbhs9uldg9 |1483225235|0.5           |
|cx:i946i2jhoixihtyu:1ad4eauxl3fq7          |1483225239|0.5           |
|cx:3d1vemqqmn7403lkoeyrnt0s87:1nicr05d23gcx|1483225243|0.5           |
|cx:imqsaoe34rwjbg9g:18dhkqxtl2c3x          |1483225248|0.5           |
|cx:ibtqoms9qsyvndlb:cx6zmv3ax3bm           |1483225249|0.5           |
|cx:1qqnec12fcm3ccpqwci5eu36v:11o2fc7kej3zs |1483225252|0.5           |
|cx:hxqibaks92oi1i6l:1yfkt4566v2oy          |1483225252|0.5     

## Number of events

In [0]:
# Total number of rows; the section heading treats each row as one event.
df_user_activeness.count()

1226167

## Number of users

In [0]:
# Number of distinct values in the userId column.
(
    df_user_activeness
    .select('userId')
    .distinct()
    .count()
)

252196

## Statistics

In [0]:
# One-row summary of the userActiveness column: minimum, maximum and mean,
# computed in a single aggregation over all rows.
df_user_activeness.agg(
    *(summarize('userActiveness') for summarize in (F.min, F.max, F.avg))
).show(truncate=False)

+-------------------+-------------------+-------------------+
|min(userActiveness)|max(userActiveness)|avg(userActiveness)|
+-------------------+-------------------+-------------------+
|0.3203592388104152 |1.0                |0.7504409980895755 |
+-------------------+-------------------+-------------------+



## Disk usage

In [0]:
!du -sh /content/gdrive/My\ Drive/dataset/adressa/one_week/user_activeness

117M	/content/gdrive/My Drive/dataset/adressa/one_week/user_activeness
