In [18]:
import json
import pandas as pd
import pyspark as ps

from pyspark.sql.functions import col, count, countDistinct, collect_set

In [7]:
# Load one sample event: the first line of the first part file, parsed as JSON.
# The context manager closes the handle as soon as the line has been read, and
# the explicit UTF-8 encoding keeps accented values (e.g. 'ESTÁCIO EAD') from
# depending on the platform's default locale encoding.
with open("../data/files/part-00000.json", encoding="utf-8") as file:
    item = json.loads(file.readline())
item

{'at': '2017-11-16 02:10:20',
 'browser': 'Chrome 62',
 'carrier': 'Telemar Norte Leste S.a.',
 'country': 'br',
 'custom_1': 'ESTÁCIO EAD',
 'custom_2': 'Pedagogia',
 'custom_4': 'Core User',
 'device_new': False,
 'install_uuid': 'fdfff303505f8a18b17ee40587e785f6bb9c8374',
 'language': 'pt',
 'library_ver': 'web_3.3.3',
 'model': 'Linux armv7l',
 'name': 'Page View',
 'nth': 17,
 'os_ver': '',
 'platform': 'Linux',
 'session_uuid': '188031bec37fc43b737c2c49349076700ae89128',
 'type': 'e',
 'user_type': 'known',
 'uuid': '1b3ed1360694ceae79f6361ed11b03cf245311c8',
 'studentId_clientType': '34cbeaf4a28c798de94cd9afb43d4e2e49ce80d6b52364e097371db586d4ea48@Website',
 'Page Name': '/perfil/22482764/materiais',
 'Page Category': 'perfil',
 'Page Category 1': 'perfil',
 'Page Category 2': 'Undefined',
 'Page Category 3': 'Undefined',
 'Last Accessed Url': '/'}

In [3]:
# Create (or reuse) the "sandbox" Spark session. The SQLite JDBC jar is put on
# the driver classpath so the 'org.sqlite.JDBC' driver can be loaded later.
# NOTE(review): the jar path is absolute and environment-specific.
spark = ps.sql.SparkSession.builder.appName("sandbox").config(
    'spark.driver.extraClassPath',
    '/home/jovyan/work/notebooks/sqlite-jdbc-3.32.3.2.jar',
).getOrCreate()

print(spark.version)

3.0.1


In [5]:
# JDBC connection settings for the SQLite database file data_warehouse.db.
properties = dict(
    driver='org.sqlite.JDBC',
    url='jdbc:sqlite:data_warehouse.db',
)

# Read the dim_course table through the JDBC data source.
df1 = (
    spark.read
    .format('jdbc')
    .options(
        driver=properties['driver'],
        url=properties['url'],
        dbtable='dim_course',
    )
    .load()
)

In [6]:
# Peek at the first row of the dim_course table loaded into df1.
df1.head()

Row(course_id=1199453, name='Administração')

In [37]:
# Load the fact_logged_student table over JDBC, then print its schema.
df_fact_logged_student = (
    spark.read
    .format('jdbc')
    .option('driver', properties['driver'])
    .option('url', properties['url'])
    .option('dbtable', 'fact_logged_student')
    .load()
)
df_fact_logged_student.printSchema()

root
 |-- id: integer (nullable = true)
 |-- student_id: string (nullable = true)
 |-- time_id: integer (nullable = true)
 |-- address_id: integer (nullable = true)
 |-- university_id: integer (nullable = true)
 |-- course_id: integer (nullable = true)



In [36]:
# Load the dim_course table over JDBC, then print its schema.
df_dim_course = (
    spark.read
    .format('jdbc')
    .options(
        driver=properties['driver'],
        url=properties['url'],
        dbtable='dim_course',
    )
    .load()
)
df_dim_course.printSchema()

root
 |-- course_id: integer (nullable = true)
 |-- name: string (nullable = true)



In [10]:
# Read every event part file matching the glob into one DataFrame;
# no schema is supplied, so Spark infers it from the JSON records.
df2 = spark.read.format("json").load("../data/part-*.json")

In [11]:
# Inspect the first row of the events DataFrame.
df2.head()

Row(Last Accessed Url='/', Page Category='perfil', Page Category 1='perfil', Page Category 2='Undefined', Page Category 3='Undefined', Page Name='/perfil/22482764/materiais', at='2017-11-16 02:10:20', browser='Chrome 62', carrier='Telemar Norte Leste S.a.', city_name=None, clv_total=None, country='br', custom_1='ESTÁCIO EAD', custom_2='Pedagogia', custom_3=None, custom_4='Core User', device_new=False, first-accessed-page=None, install_uuid='fdfff303505f8a18b17ee40587e785f6bb9c8374', language='pt', library_ver='web_3.3.3', marketing_campaign=None, marketing_medium=None, marketing_source=None, model='Linux armv7l', name='Page View', nth=17, os_ver='', platform='Linux', region=None, session_uuid='188031bec37fc43b737c2c49349076700ae89128', studentId_clientType='34cbeaf4a28c798de94cd9afb43d4e2e49ce80d6b52364e097371db586d4ea48@Website', type='e', user_type='known', uuid='1b3ed1360694ceae79f6361ed11b03cf245311c8')

In [28]:
# Show the column names and types Spark inferred for the events JSON.
df2.printSchema()

root
 |-- Last Accessed Url: string (nullable = true)
 |-- Page Category: string (nullable = true)
 |-- Page Category 1: string (nullable = true)
 |-- Page Category 2: string (nullable = true)
 |-- Page Category 3: string (nullable = true)
 |-- Page Name: string (nullable = true)
 |-- at: string (nullable = true)
 |-- browser: string (nullable = true)
 |-- carrier: string (nullable = true)
 |-- city_name: string (nullable = true)
 |-- clv_total: long (nullable = true)
 |-- country: string (nullable = true)
 |-- custom_1: string (nullable = true)
 |-- custom_2: string (nullable = true)
 |-- custom_3: string (nullable = true)
 |-- custom_4: string (nullable = true)
 |-- device_new: boolean (nullable = true)
 |-- first-accessed-page: string (nullable = true)
 |-- install_uuid: string (nullable = true)
 |-- language: string (nullable = true)
 |-- library_ver: string (nullable = true)
 |-- marketing_campaign: string (nullable = true)
 |-- marketing_medium: string (nullable = true)
 |-- mark

In [21]:
# Gather the distinct custom_4 values into a single array cell and print it
# without truncating the column.
df3 = df2.select(collect_set(col("custom_4")))
df3.show(truncate=False)

+-----------------------------------------------------------+
|collect_set(custom_4)                                      |
+-----------------------------------------------------------+
|[Core User, unknown, Casual User, Cold User, New User, yes]|
+-----------------------------------------------------------+



In [26]:
# Event counts per custom_2 value, most frequent first.
(
    df2.groupBy("custom_2")
    .count()
    .orderBy(col("count").desc())
    .show(truncate=False)
)

+-------------------------------------+------+
|custom_2                             |count |
+-------------------------------------+------+
|null                                 |100653|
|Direito                              |96020 |
|Administração                        |53755 |
|unknown                              |43128 |
|Engenharia Civil                     |33256 |
|Pedagogia                            |32081 |
|Contabilidade / Ciências Contábeis   |30788 |
|Fisioterapia                         |19321 |
|Psicologia                           |17719 |
|Nutrição                             |16534 |
|Enfermagem e Obstetrícia             |14978 |
|Engenharia de Produção               |14371 |
|Educação Física                      |10913 |
|Gestão de Recursos Humanos           |10249 |
|Análise e Desenvolvimento de Sistemas|10028 |
|Engenharia Elétrica                  |9603  |
|Engenharia Mecânica                  |9246  |
|Farmácia / Ciências Farmacêuticas    |8282  |
|Sistemas de 

In [35]:
# Display the dim_course schema again for reference.
df_dim_course.printSchema()

root
 |-- course_id: integer (nullable = true)
 |-- name: string (nullable = true)



In [38]:
# Look up the dim_course row whose name is 'Direito'.
df_dim_course.where(df_dim_course['name'] == 'Direito').show()

+---------+-------+
|course_id|   name|
+---------+-------+
|  1199517|Direito|
+---------+-------+



In [40]:
# Count fact_logged_student rows per course_id. DataFrame.show() only prints
# and returns None, so keep the aggregated DataFrame in the variable and
# display it as a separate step.
df_count_by_course = df_fact_logged_student.groupBy('course_id').count()
df_count_by_course.show()

+---------+-----+
|course_id|count|
+---------+-----+
+---------+-----+

