In [20]:
import json

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [2]:
spark = SparkSession.builder \
    .appName('Data Analysis with Python and PySpark - Chapter 07 Examples') \
    .getOrCreate()

23/01/05 16:52:12 WARN Utils: Your hostname, karlos-300E5M-300E5L resolves to a loopback address: 127.0.1.1; using 10.0.0.89 instead (on interface wlp2s0)
23/01/05 16:52:12 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/01/05 16:52:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Chemical elements example

### Use the saved schema from disk if it exists. Otherwhise, save the infered schema to disk.

Therefore, after the first dataset read operation, any subsequent read will not need to infer the schema, which requires Spark to read the dataset two times, one for infer the schema, and one to read the data itself, in contrast to when the schema is not infered, which requires one read operation.

In [46]:
infer_elements_schema = False
elements_schema = T.StructType()

try:
    with open('./elements_schema.json', mode='r') as f:
        elements_schema = T.StructType.fromJson(json.load(f))
except FileNotFoundError:
    infer_elements_schema = True


In [47]:
ELEMENTS_FILEPATH =  '../../data/elements/Periodic_Table_Of_Elements.csv'

if infer_elements_schema:
    elements = spark.read.csv(
        path=ELEMENTS_FILEPATH,
        header=True,
        inferSchema=True
    )
else:
    elements = spark.read.csv(
        path=ELEMENTS_FILEPATH,
        header=True,
        schema=elements_schema,
        inferSchema=False
    )


elements

DataFrame[AtomicNumber: int, Element: string, Symbol: string, AtomicMass: double, NumberofNeutrons: int, NumberofProtons: int, NumberofElectrons: int, Period: int, Group: int, Phase: string, Radioactive: string, Natural: string, Metal: string, Nonmetal: string, Metalloid: string, Type: string, AtomicRadius: double, Electronegativity: double, FirstIonization: double, Density: double, MeltingPoint: double, BoilingPoint: double, NumberOfIsotopes: int, Discoverer: string, Year: int, SpecificHeat: double, NumberofShells: int, NumberofValence: int]

In [48]:
if infer_elements_schema:
    with open('./elements_schema.json', mode='w') as f:
        json.dump(elements.schema.jsonValue(), f)

infer_elements_schema

False

In [49]:
elements.select(elements.columns[:5]).show()

+------------+----------+------+----------+----------------+
|AtomicNumber|   Element|Symbol|AtomicMass|NumberofNeutrons|
+------------+----------+------+----------+----------------+
|           1|  Hydrogen|     H|     1.007|               0|
|           2|    Helium|    He|     4.002|               2|
|           3|   Lithium|    Li|     6.941|               4|
|           4| Beryllium|    Be|     9.012|               5|
|           5|     Boron|     B|    10.811|               6|
|           6|    Carbon|     C|    12.011|               6|
|           7|  Nitrogen|     N|    14.007|               7|
|           8|    Oxygen|     O|    15.999|               8|
|           9|  Fluorine|     F|    18.998|              10|
|          10|      Neon|    Ne|     20.18|              10|
|          11|    Sodium|    Na|     22.99|              12|
|          12| Magnesium|    Mg|    24.305|              12|
|          13|  Aluminum|    Al|    26.982|              14|
|          14|   Silicon

### Query the data set to find the number of entries with a liquid state per period.

#### PySpark version

In [5]:
elements.where(F.col('Phase') == 'liq') \
    .groupby('Period') \
    .count() \
    .orderBy('Period') \
    .show()

+------+-----+
|Period|count|
+------+-----+
|     4|    1|
|     6|    1|
+------+-----+



#### SQL version

##### First register a view based on the DataFrame to be able to query it with SQL statements.

In [6]:
elements.createOrReplaceTempView('elements')

spark.sql("""
    SELECT period, COUNT(*)
    FROM elements
    WHERE phase = "liq"
    GROUP BY period
    ORDER BY period
    """
).show()

23/01/05 16:52:31 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
+------+--------+
|period|count(1)|
+------+--------+
|     4|       1|
|     6|       1|
+------+--------+



### Manage and obtain database metadata with `spark.catalog`.

In [7]:
spark.catalog.listTables()

[Table(name='elements', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

In [8]:
spark.sql("""SHOW TABLES""").show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|         | elements|       true|
+---------+---------+-----------+



In [9]:
spark.catalog.currentDatabase()

'default'

In [10]:
spark.sql("""SELECT CURRENT_DATABASE()""").show()

+------------------+
|current_database()|
+------------------+
|           default|
+------------------+



## Data set from Backblaze. 

Reference: https://www.backblaze.com/blog/backblaze-hard-drive-stats-q3-2019/

### Use the saved schema from disk if it exists. Otherwhise, save the infered schema to disk.

Therefore, after the first dataset read operation, any subsequent read will not need to infer the schema, which requires Spark to read the dataset two times, one for infer the schema, and one to read the data itself, in contrast to when the schema is not infered, which requires one read operation.

In [52]:
infer_backblaze_2019_q3_schema = False
backblaze_2019_q3_schema = T.StructType()

try:
    with open('backblaze_2019_q3', mode='r') as f:
        backblaze_2019_q3_schema = T.StructType.fromJson(json.load(f))
except FileNotFoundError:
    infer_backblaze_2019_q3_schema = True

In [53]:
BACKBLAZE_2019_Q3_FILEPATH = '../../data/backblaze/2019_q3/*.csv'

if infer_backblaze_2019_q3_schema:
    backblaze_2019_q3 = spark.read.csv(
        path=BACKBLAZE_2019_Q3_FILEPATH,
        header=True,
        inferSchema=True
    )
else:
    backblaze_2019_q3 = spark.read.csv(
        path=BACKBLAZE_2019_Q3_FILEPATH,
        header=True,
        schema=backblaze_2019_q3_schema,
        inferSchema=False
    )

backblaze_2019_q3

                                                                                

DataFrame[date: timestamp, serial_number: string, model: string, capacity_bytes: bigint, failure: int, smart_1_normalized: int, smart_1_raw: int, smart_2_normalized: int, smart_2_raw: int, smart_3_normalized: int, smart_3_raw: int, smart_4_normalized: int, smart_4_raw: int, smart_5_normalized: int, smart_5_raw: int, smart_7_normalized: int, smart_7_raw: bigint, smart_8_normalized: int, smart_8_raw: int, smart_9_normalized: int, smart_9_raw: int, smart_10_normalized: int, smart_10_raw: int, smart_11_normalized: int, smart_11_raw: int, smart_12_normalized: int, smart_12_raw: int, smart_13_normalized: string, smart_13_raw: string, smart_15_normalized: string, smart_15_raw: string, smart_16_normalized: int, smart_16_raw: int, smart_17_normalized: int, smart_17_raw: int, smart_22_normalized: int, smart_22_raw: int, smart_23_normalized: int, smart_23_raw: int, smart_24_normalized: int, smart_24_raw: int, smart_168_normalized: int, smart_168_raw: int, smart_170_normalized: int, smart_170_raw:

In [None]:
if infer_backblaze_2019_q3_schema:
    with open('backblaze_2019_q3.json', mode='w') as f:
        json.dump(backblaze_2019_q3.schema.jsonValue(), f)

#### Count the number of rows. We have a fairly big dataset, but Spark can handle it well even with only one node.

In [59]:
print(f'{backblaze_2019_q3.count():_}')



10_338_153


                                                                                

In [60]:
backblaze_2019_q3.select('date', 'serial_number', 'model', 'capacity_bytes', 'failure') \
    .show(truncate=False)

+-------------------+--------------+--------------------+--------------+-------+
|date               |serial_number |model               |capacity_bytes|failure|
+-------------------+--------------+--------------------+--------------+-------+
|2019-09-27 00:00:00|Z305B2QN      |ST4000DM000         |4000787030016 |0      |
|2019-09-27 00:00:00|ZJV0XJQ4      |ST12000NM0007       |12000138625024|0      |
|2019-09-27 00:00:00|ZJV0XJQ3      |ST12000NM0007       |12000138625024|0      |
|2019-09-27 00:00:00|ZJV0XJQ0      |ST12000NM0007       |12000138625024|0      |
|2019-09-27 00:00:00|PL1331LAHG1S4H|HGST HMS5C4040ALE640|4000787030016 |0      |
|2019-09-27 00:00:00|ZA16NQJR      |ST8000NM0055        |8001563222016 |0      |
|2019-09-27 00:00:00|ZJV02XWG      |ST12000NM0007       |12000138625024|0      |
|2019-09-27 00:00:00|ZJV1CSVX      |ST12000NM0007       |12000138625024|0      |
|2019-09-27 00:00:00|ZJV02XWA      |ST12000NM0007       |12000138625024|0      |
|2019-09-27 00:00:00|ZA18CEB

In [91]:
backblaze_2019_q3.createOrReplaceTempView('backblaze_2019_q3')

spark.catalog.listTables()

[Table(name='backblaze_2019_q3', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='elements', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

### Quick exploratory data analysis on a subset of the columns presented.

#### Check how many different models there are.

##### PySpark version

In [15]:
number_of_different_models = backblaze_2019_q3.select('model') \
    .distinct() \
    .count()

number_of_different_models

                                                                                

43

##### SQL version

In [62]:
spark.sql(
    """
    SELECT COUNT(DISTINCT model) AS number_of_different_models
    FROM backblaze_2019_q3
    """
).show()



+--------------------------+
|number_of_different_models|
+--------------------------+
|                        43|
+--------------------------+



                                                                                

#### Show a few serial numbers of HDDs which have failed at some point.

##### PySpark version

In [65]:
backblaze_2019_q3.select('serial_number') \
    .where('failure = 1') \
    .show(truncate=False)

[Stage 87:>                                                         (0 + 1) / 1]

+--------------+
|serial_number |
+--------------+
|ZA10MCJ5      |
|ZCH07T9K      |
|ZCH0CA7Z      |
|Z302F381      |
|ZCH0B3Z2      |
|PL2331LAGMTS1J|
|ZCH0BTJN      |
|ZA13QBVZ      |
|ZJV00EXD      |
|ZCH0ADRN      |
|8HJ91VRH      |
|ZDEABH54      |
|ZCH07C9X      |
|ZJV004VF      |
|S301M4YT      |
|ZCH07VQ8      |
|ZJV05KLD      |
|ZCH07HHL      |
|ZCH073TG      |
|ZCH09FCW      |
+--------------+
only showing top 20 rows



                                                                                

##### SQL version

In [68]:
spark.sql(
    """
    SELECT serial_number
    FROM backblaze_2019_q3
    WHERE failure = 1
    """
).show(truncate=False)

[Stage 93:>                                                         (0 + 1) / 1]

+--------------+
|serial_number |
+--------------+
|ZA10MCJ5      |
|ZCH07T9K      |
|ZCH0CA7Z      |
|Z302F381      |
|ZCH0B3Z2      |
|PL2331LAGMTS1J|
|ZCH0BTJN      |
|ZA13QBVZ      |
|ZJV00EXD      |
|ZCH0ADRN      |
|8HJ91VRH      |
|ZDEABH54      |
|ZCH07C9X      |
|ZJV004VF      |
|S301M4YT      |
|ZCH07VQ8      |
|ZJV05KLD      |
|ZCH07HHL      |
|ZCH073TG      |
|ZCH09FCW      |
+--------------+
only showing top 20 rows



                                                                                

#### Find the min and max capacity in GB for each model

##### PySpark version

In [69]:
backblaze_2019_q3.groupBy('model') \
    .agg(
        F.min(F.col('capacity_bytes') / F.pow(F.lit(1024), 3)).alias('min_GB'), 
        F.max(F.col('capacity_bytes') / F.pow(F.lit(1024), 3)).alias('max_GB')
    ).select('model', 'min_GB', 'max_GB') \
    .orderBy('max_GB', ascending=False) \
    .show(truncate=False)



+--------------------+----------------------+-----------------+
|model               |min_GB                |max_GB           |
+--------------------+----------------------+-----------------+
|TOSHIBA MG07ACA14TA |13039.0               |13039.0          |
|ST12000NM0117       |11176.0               |11176.0          |
|ST12000NM0007       |-9.313225746154785E-10|11176.0          |
|HGST HUH721212ALN604|-9.313225746154785E-10|11176.0          |
|HGST HUH721212ALE600|11176.0               |11176.0          |
|HGST HUH721010ALE600|-9.313225746154785E-10|9314.0           |
|ST10000NM0086       |-9.313225746154785E-10|9314.0           |
|ST8000DM004         |7452.036460876465     |7452.036460876465|
|ST8000DM002         |-9.313225746154785E-10|7452.036460876465|
|TOSHIBA HDWF180     |7452.036460876465     |7452.036460876465|
|ST8000NM0055        |-9.313225746154785E-10|7452.036460876465|
|HGST HUH728080ALE600|-9.313225746154785E-10|7452.036460876465|
|ST8000DM005         |7452.036460876465 

                                                                                

##### SQL version

In [19]:
spark.sql(
    """
    SELECT
        model,
        MIN(capacity_bytes / POW(1024, 3)) AS min_GB,
        MAX(capacity_bytes / POW(1024, 3)) AS max_GB
    FROM backblaze_2019_q3
    GROUP BY model
    ORDER BY max_GB ASC
    """
).show(truncate=False)



+-----------------------------------+----------------------+------------------+
|model                              |min_GB                |max_GB            |
+-----------------------------------+----------------------+------------------+
|Seagate BarraCuda SSD ZA250CM10002 |232.88591766357422    |232.88591766357422|
|Seagate SSD                        |232.88591766357422    |232.88591766357422|
|ST9250315AS                        |232.88591766357422    |232.88591766357422|
|ST320LT007                         |298.09114837646484    |298.09114837646484|
|DELLBOSS VD                        |447.06915283203125    |447.06915283203125|
|WDC WD5000LPVX                     |465.7617416381836     |465.7617416381836 |
|WDC WD5000BPKT                     |465.7617416381836     |465.7617416381836 |
|WDC WD5000LPCX                     |465.7617416381836     |465.7617416381836 |
|TOSHIBA MQ01ABF050                 |465.7617416381836     |465.7617416381836 |
|ST500LM021                         |465

                                                                                

#### Calculate the number of days of operation that a model has and the number of HDD failures and save to 2 different SQL views.

##### PySpark version

In [82]:
drive_days = backblaze_2019_q3.groupBy('model') \
    .agg(F.count('*').alias('drive_days')) \
    .select('model', 'drive_days') \
    .orderBy('drive_days', ascending=False)

drive_days.show(truncate=False)



+--------------------+----------+
|model               |drive_days|
+--------------------+----------+
|ST12000NM0007       |3212635   |
|ST4000DM000         |1796728   |
|ST8000NM0055        |1324122   |
|HGST HMS5C4040BLE640|1173136   |
|HGST HUH721212ALN604|946724    |
|ST8000DM002         |906588    |
|HGST HMS5C4040ALE640|245904    |
|HGST HUH721212ALE600|122200    |
|TOSHIBA MG07ACA14TA |112235    |
|ST10000NM0086       |110282    |
|HGST HUH728080ALE600|92092     |
|ST6000DX000         |81512     |
|ST500LM012 HN       |46309     |
|TOSHIBA MQ01ABF050  |44808     |
|TOSHIBA MQ01ABF050M |35351     |
|ST500LM030          |21447     |
|WDC WD5000LPVX      |19723     |
|TOSHIBA MD04ABA400V |9108      |
|WDC WD5000LPCX      |4967      |
|Seagate SSD         |4734      |
+--------------------+----------+
only showing top 20 rows



                                                                                

In [101]:
failures = backblaze_2019_q3.where(F.col('failure') == 1) \
    .groupBy('model') \
    .agg(F.count('*').alias('number_of_failures')) \
    .select('model', 'number_of_failures') \
    .orderBy('number_of_failures', ascending=False)

failures.show(truncate=False)



+----------------------------------+------------------+
|model                             |number_of_failures|
+----------------------------------+------------------+
|ST12000NM0007                     |364               |
|ST4000DM000                       |72                |
|ST8000NM0055                      |50                |
|ST8000DM002                       |36                |
|TOSHIBA MQ01ABF050                |25                |
|HGST HMS5C4040BLE640              |19                |
|HGST HUH721212ALN604              |15                |
|ST500LM030                        |9                 |
|ST500LM012 HN                     |7                 |
|HGST HMS5C4040ALE640              |6                 |
|ST12000NM0117                     |5                 |
|TOSHIBA MQ01ABF050M               |5                 |
|ST6000DX000                       |4                 |
|TOSHIBA MG07ACA14TA               |2                 |
|HGST HUH721212ALE600              |2           

                                                                                

In [102]:
drive_days.createOrReplaceTempView('drive_days')
failures.createOrReplaceTempView('failures')

spark.catalog.listTables()

[Table(name='backblaze_2019_q3', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='drive_days', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='elements', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='failures', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

##### SQL version

In [92]:
spark.sql(
    """
    SELECT model, COUNT(*) drive_days
    FROM backblaze_2019_q3
    GROUP BY model
    ORDER BY drive_days DESC
    """
).show(truncate=False)



+--------------------+----------+
|model               |drive_days|
+--------------------+----------+
|ST12000NM0007       |3212635   |
|ST4000DM000         |1796728   |
|ST8000NM0055        |1324122   |
|HGST HMS5C4040BLE640|1173136   |
|HGST HUH721212ALN604|946724    |
|ST8000DM002         |906588    |
|HGST HMS5C4040ALE640|245904    |
|HGST HUH721212ALE600|122200    |
|TOSHIBA MG07ACA14TA |112235    |
|ST10000NM0086       |110282    |
|HGST HUH728080ALE600|92092     |
|ST6000DX000         |81512     |
|ST500LM012 HN       |46309     |
|TOSHIBA MQ01ABF050  |44808     |
|TOSHIBA MQ01ABF050M |35351     |
|ST500LM030          |21447     |
|WDC WD5000LPVX      |19723     |
|TOSHIBA MD04ABA400V |9108      |
|WDC WD5000LPCX      |4967      |
|Seagate SSD         |4734      |
+--------------------+----------+
only showing top 20 rows



                                                                                

In [98]:
spark.sql("""SHOW TABLES""").show()

spark.sql("""DROP VIEW IF EXISTS drive_days""").show()

spark.sql(
    """
    CREATE OR REPLACE TEMP VIEW drive_days AS
        SELECT model, COUNT(*) drive_days
        FROM backblaze_2019_q3
        GROUP BY model
        ORDER BY drive_days DESC
    """
).show(truncate=False)


spark.sql("""SHOW TABLES""").show()

+---------+-----------------+-----------+
|namespace|        tableName|isTemporary|
+---------+-----------------+-----------+
|         |backblaze_2019_q3|       true|
|         |       drive_days|       true|
|         |         elements|       true|
+---------+-----------------+-----------+

++
||
++
++

++
||
++
++

+---------+-----------------+-----------+
|namespace|        tableName|isTemporary|
+---------+-----------------+-----------+
|         |backblaze_2019_q3|       true|
|         |       drive_days|       true|
|         |         elements|       true|
+---------+-----------------+-----------+

