In [1]:
%%capture
!pip install pyspark
!pip install wget

In [2]:
import json
import sys
import wget

import pyspark.sql.functions as f
import pyspark.sql.types as t

from functools import reduce
from pprint import pprint
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.utils import AnalysisException

from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
spark = (
    SparkSession
    .builder
    .appName("app")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("WARN")

In [4]:
DATASETS_FULL = [
    "https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q1_2019.zip",
    "https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q2_2019.zip",
    "https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q3_2019.zip",
    "https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q4_2019.zip",
]

DATASETS_MINIMAL = DATASETS_FULL[2:3]  # Slice to keep as a list. Simplifies

for dataset in DATASETS_MINIMAL:
    print("\n", dataset.split("/")[-1])
    wget.download(dataset, out="/content/")


 data_Q3_2019.zip


In [5]:
%%capture
!unzip /content/data_Q3_2019.zip -d /content/data_Q3_2019

читаем данные и меняем типы данных.  
метод createOrReplaceTempView создаем локальное временное представление таблицы, чтобы работать с ней как с sql таблицей

In [6]:
backblaze_2019 = spark.read.csv(
    '/content/data_Q3_2019', header=True, inferSchema=True
)

backblaze_2019 = backblaze_2019.select(
    [
        f.col(x).cast(t.LongType()) if x.startswith("smart") else f.col(x)
        for x in backblaze_2019.columns
    ]
)

backblaze_2019.createOrReplaceTempView("backblaze_stats_2019")

Выбираем серийные номера жестких дисков, которые сломались (failure = 1)

In [7]:
# sql
spark.sql(
    "select serial_number from backblaze_stats_2019 where failure = 1"
).show(5)

# pyspark
(
    backblaze_2019
    .select('serial_number')
    .where(f.col('failure')==1)
).show(5)

+-------------+
|serial_number|
+-------------+
|     ZA10MCJ5|
|     ZCH07T9K|
|     ZCH0CA7Z|
|     Z302F381|
|     ZCH0B3Z2|
+-------------+
only showing top 5 rows

+-------------+
|serial_number|
+-------------+
|     ZA10MCJ5|
|     ZCH07T9K|
|     ZCH0CA7Z|
|     Z302F381|
|     ZCH0B3Z2|
+-------------+
only showing top 5 rows



выводим максимальный и минимальный размер диска в гигабайтах для каждой модели, сортируем по максимальному размер

In [8]:
spark.sql(
    """
    select
    model,
    min(capacity_bytes) / pow(1024,3) as min_GB,
    max(capacity_bytes) / pow(1024,3) as max_GB
    from backblaze_stats_2019
    group by model
    order by max_GB desc
    """
).show(5)

(
    backblaze_2019.groupby(f.col("model")).agg(
        f.min(f.col("capacity_bytes") / f.pow(f.lit(1024), 3)).alias("min_GB"),
        f.max(f.col("capacity_bytes") / f.pow(f.lit(1024), 3)).alias("max_GB"),
    ).orderBy(f.col("max_GB"), ascending=False)
).show(5)

+--------------------+--------------------+-------+
|               model|              min_GB| max_GB|
+--------------------+--------------------+-------+
| TOSHIBA MG07ACA14TA|             13039.0|13039.0|
|       ST12000NM0007|-9.31322574615478...|11176.0|
|HGST HUH721212ALE600|             11176.0|11176.0|
|       ST12000NM0117|             11176.0|11176.0|
|HGST HUH721212ALN604|-9.31322574615478...|11176.0|
+--------------------+--------------------+-------+
only showing top 5 rows

+--------------------+--------------------+-------+
|               model|              min_GB| max_GB|
+--------------------+--------------------+-------+
| TOSHIBA MG07ACA14TA|             13039.0|13039.0|
|       ST12000NM0007|-9.31322574615478...|11176.0|
|HGST HUH721212ALE600|             11176.0|11176.0|
|       ST12000NM0117|             11176.0|11176.0|
|HGST HUH721212ALN604|-9.31322574615478...|11176.0|
+--------------------+--------------------+-------+
only showing top 5 rows



выводим только те объекты, у которых минимальный размер меньше 0

In [9]:
spark.sql(
    """
    select
    model,
    min(capacity_bytes) / pow(1024,3) as min_GB,
    max(capacity_bytes) / pow(1024,3) as max_GB
    from backblaze_stats_2019
    group by model
    having min_GB < 0
    order by max_GB desc
    """
).show(5)

(
    backblaze_2019
    .groupby(f.col("model"))
    .agg(
        f.min(f.col("capacity_bytes") / f.pow(f.lit(1024), 3)).alias("min_GB"),
        f.max(f.col("capacity_bytes") / f.pow(f.lit(1024), 3)).alias("max_GB"),
    )
    .orderBy(f.col("max_GB"), ascending=False)
    .filter(f.col('min_GB')<0)
).show(5)

+--------------------+--------------------+-----------------+
|               model|              min_GB|           max_GB|
+--------------------+--------------------+-----------------+
|       ST12000NM0007|-9.31322574615478...|          11176.0|
|HGST HUH721212ALN604|-9.31322574615478...|          11176.0|
|HGST HUH721010ALE600|-9.31322574615478...|           9314.0|
|       ST10000NM0086|-9.31322574615478...|           9314.0|
|        ST8000NM0055|-9.31322574615478...|7452.036460876465|
+--------------------+--------------------+-----------------+
only showing top 5 rows

+--------------------+--------------------+-----------------+
|               model|              min_GB|           max_GB|
+--------------------+--------------------+-----------------+
|       ST12000NM0007|-9.31322574615478...|          11176.0|
|HGST HUH721212ALN604|-9.31322574615478...|          11176.0|
|HGST HUH721010ALE600|-9.31322574615478...|           9314.0|
|       ST10000NM0086|-9.31322574615478...|  

в pyspark промежуточные результаты работы с данными мы можем сохранить в перменные.  
в sql требуется создать временное представление тким образом:  
CREATE OR REPLACE TEMP VIEW НАЗВАНИЕ_ТАБЛИЦЫ AS

In [10]:
spark.sql(
    """
    CREATE OR REPLACE TEMP VIEW drive_days AS
    SELECT model, count(*) AS drive_days
    FROM backblaze_stats_2019
    GROUP BY model
    """
)
spark.sql(
    """
    CREATE OR REPLACE TEMP VIEW failures AS
    SELECT model, count(*) AS failures
    FROM backblaze_stats_2019
    WHERE failure = 1
    GROUP BY model
    """
)

# талицу, полученную после sql запроса тоже можно сохранить в переменную
# без создания временного представления
drive_days = spark.sql(
    """
    SELECT model, count(*) AS drive_days
    FROM backblaze_stats_2019
    GROUP BY model
    """
)

drive_days = (
    backblaze_2019
    .groupby(f.col("model"))
    .agg(
        f.count(f.col("*")).alias("drive_days")
    )
)
failures = (
    backblaze_2019
    .where(f.col("failure") == 1)
    .groupby(f.col("model"))
    .agg(f.count(f.col("*")).alias("failures"))
)

In [17]:
a = spark.sql(
    """
    SELECT model, count(*) AS failures
    FROM backblaze_stats_2019
    WHERE failure = 1
    GROUP BY model
    """
)
a.show()

+--------------------+--------+
|               model|failures|
+--------------------+--------+
|         ST4000DM000|      72|
|       ST12000NM0007|     361|
|         ST8000DM002|      36|
|HGST HMS5C4040BLE640|      18|
|          ST500LM030|       9|
|HGST HUH721212ALN604|      15|
| TOSHIBA MQ01ABF050M|       5|
|        ST8000NM0055|      50|
|       ST12000NM0117|       5|
|HGST HMS5C4040ALE640|       6|
|  TOSHIBA MQ01ABF050|      25|
|HGST HUH721212ALE600|       2|
|       ST500LM012 HN|       7|
| TOSHIBA MG07ACA14TA|       2|
|       ST10000NM0086|       2|
|         ST6000DX000|       4|
|         ST8000DM005|       1|
|HGST HUH728080ALE600|       1|
|      WDC WD5000LPVX|       1|
+--------------------+--------+



операция объединения строк

In [11]:
col = 'model'
spark.sql(
    f"""
    select {col}
    from drive_days union all
    select {col}
    from failures
    """
).show(5)

(
    failures.select(col)
    .union(drive_days.select(col))
).show(5)

+-------------+
|        model|
+-------------+
|  ST9250315AS|
|  ST4000DM000|
|ST12000NM0007|
|  ST8000DM005|
|   ST320LT007|
+-------------+
only showing top 5 rows

+--------------------+
|               model|
+--------------------+
|         ST4000DM000|
|       ST12000NM0007|
|         ST8000DM002|
|HGST HMS5C4040BLE640|
|          ST500LM030|
+--------------------+
only showing top 5 rows



операция объединения столбцов

In [12]:
spark.sql(
    f"""
    select *
    from drive_days
    join failures on drive_days.model == failures.model
    """
).show(5)

(
    drive_days
    .join(failures, on='model', how='inner')
).show(5)

+--------------------+----------+--------------------+--------+
|               model|drive_days|               model|failures|
+--------------------+----------+--------------------+--------+
|         ST4000DM000|   1796728|         ST4000DM000|      72|
|       ST12000NM0007|   3212635|       ST12000NM0007|     361|
|         ST8000DM002|    906588|         ST8000DM002|      36|
|HGST HMS5C4040BLE640|   1173136|HGST HMS5C4040BLE640|      18|
|          ST500LM030|     21447|          ST500LM030|       9|
+--------------------+----------+--------------------+--------+
only showing top 5 rows

+--------------------+----------+--------+
|               model|drive_days|failures|
+--------------------+----------+--------+
|         ST4000DM000|   1796728|      72|
|       ST12000NM0007|   3212635|     361|
|         ST8000DM002|    906588|      36|
|HGST HMS5C4040BLE640|   1173136|      18|
|          ST500LM030|     21447|       9|
+--------------------+----------+--------+
only showing

выводим топ самых надежных дисков. надежность определяем как дни работы, деленные на количество поломок

In [13]:
def failure_rate(drive_stats):
    drive_days = (
        drive_stats
        .groupby(f.col("model"))
        .agg(f.count(f.col("*")).alias("drive_days"))
    )
    failures = (
        drive_stats
        .where(f.col("failure") == 1)
        .groupby(f.col("model"))
        .agg(f.count(f.col("*")).alias("failures"))
    )
    answer = (
        drive_days.join(failures, on="model", how="inner")
        .withColumn("failure_rate", f.col("failures") / f.col("drive_days"))
        .orderBy(f.col("failure_rate").desc())
    )
    return answer

failure_rate(backblaze_2019).show(5)

+------------------+----------+--------+--------------------+
|             model|drive_days|failures|        failure_rate|
+------------------+----------+--------+--------------------+
|     ST12000NM0117|       259|       5|0.019305019305019305|
|TOSHIBA MQ01ABF050|     44808|      25|5.579360828423496E-4|
|       ST8000DM005|      2280|       1|4.385964912280702E-4|
|        ST500LM030|     21447|       9| 4.19639110365086E-4|
|     ST500LM012 HN|     46309|       7|1.511585221015353...|
+------------------+----------+--------+--------------------+
only showing top 5 rows



#### Использование синтаксиса SQL для написания более компактного кода pysark
задача - по введенному размеру диска определить топ самых надежных

selectExpr можно использовать, чтобы при select более компактно выполнить преобразования для колонки  
метод expr можно использовать в agg дляЮ, чтобы не писать много alias

In [18]:
data = spark.read.csv(
    '/content/data_Q3_2019', header=True, inferSchema=True
)

full_data = data.selectExpr(
    "model",
    "capacity_bytes / pow(1024, 3) as capacity_GB",
    "date",
    "failure"
)

# Альтернатива без sql

# full_data = full_data.select(
#     f.col("model"),
#     (f.col("capacity_bytes") / f.pow(f.lit(1024), 3)).alias("capacity_GB"),
#     f.col("date"),
#     f.col("failure")
# )


drive_days = (
    full_data
    .groupby("model", "capacity_GB")
    .agg(
        f.count("*").alias("drive_days")
    )
)

failures = (
    full_data
    .where("failure = 1")
    .groupby("model", "capacity_GB")
    .agg(f.expr("count(*) failures"))
)

# Альтернатива без sql
# failures = (
#     full_data
#     .where("failure = 1")
#     .groupby("model", "capacity_GB")
#     .agg(f.count("*").alias("failures"))
# )

summarized_data = (
    drive_days
    .join(failures, on=["model", "capacity_GB"], how="left")
    .fillna(0.0, ["failures"])
    .selectExpr("model", "capacity_GB", "failures / drive_days as failure_rate")
    .cache()
)
summarized_data.show(5)

+--------------------+-----------------+--------------------+
|               model|      capacity_GB|        failure_rate|
+--------------------+-----------------+--------------------+
|       ST12000NM0117|          11176.0|0.019305019305019305|
|      WDC WD5000LPCX|465.7617416381836|                 0.0|
|         ST6000DM004| 5589.02986907959|                 0.0|
|         ST4000DM005|3726.023277282715|                 0.0|
|HGST HMS5C4040BLE641|3726.023277282715|                 0.0|
+--------------------+-----------------+--------------------+
only showing top 5 rows



sql синтаксис можно использовать в методах where/filter.

In [21]:
def most_reliable_drive_for_capacity(data, capacity_GB=2048, precision=0.25,top_n=3):
    """Returns the top 3 drives for a given approximate capacity.
    Given a capacity in GB and a precision as a decimal number, we keep the N
    drives where:
    - the capacity is between (capacity * 1/(1+precision)), capacity *
    (1+precision)
    - the failure rate is the lowest
    """
    capacity_min = capacity_GB / (1 + precision)
    capacity_max = capacity_GB * (1 + precision)
    answer = (
        data
        .filter(f"capacity_GB between {capacity_min} and {capacity_max}")
        .orderBy("failure_rate", "capacity_GB", ascending=[True, False])
        .limit(top_n)
    )
    # Альтернатива без sql
    # answer = (
    #     data
    #     .filter(f.col('capacity_GB').between(capacity_min, capacity_max))
    #     .orderBy("failure_rate", "capacity_GB", ascending=[True, False])
    #     .limit(top_n)
    # )

    return answer

most_reliable_drive_for_capacity(summarized_data, capacity_GB=11176.0).show()

+--------------------+-----------+--------------------+
|               model|capacity_GB|        failure_rate|
+--------------------+-----------+--------------------+
|HGST HUH721010ALE600|     9314.0|                 0.0|
|HGST HUH721212ALN604|    11176.0|1.585588480593982...|
|HGST HUH721212ALE600|    11176.0|1.636661211129296...|
+--------------------+-----------+--------------------+



#### Задание
Если вы посмотрите на данные, то увидите, что некоторые модели накопителей могут сообщать об ошибочной емкости. На этапе подготовки данных измените размер фрейма данных full_data таким образом, чтобы использовалась наиболее распространенная емкость для каждого диска.

In [32]:
full_data = (
    full_data.join(
        (
            full_data
            .groupby('model')
            .agg(
                f.mode(f.col('capacity_GB')).alias('capacity_GB_mode')
            )
        ),
        on='model',
        how='left'
    )
)

full_data.filter("capacity_GB < 0").show(5)

+-------------+--------------------+----------+-------+----------------+----------------+
|        model|         capacity_GB|      date|failure|capacity_GB_mode|capacity_GB_mode|
+-------------+--------------------+----------+-------+----------------+----------------+
|ST12000NM0007|-9.31322574615478...|2019-09-24|      0|         11176.0|         11176.0|
|ST12000NM0007|-9.31322574615478...|2019-09-24|      0|         11176.0|         11176.0|
|ST12000NM0007|-9.31322574615478...|2019-09-24|      0|         11176.0|         11176.0|
|ST12000NM0007|-9.31322574615478...|2019-09-24|      0|         11176.0|         11176.0|
|ST12000NM0007|-9.31322574615478...|2019-09-24|      0|         11176.0|         11176.0|
+-------------+--------------------+----------+-------+----------------+----------------+
only showing top 5 rows

