<a href="https://colab.research.google.com/github/kareemullah123456789/big_data_advanced/blob/main/section_6_python_spark_sql_joins.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so the data files stored under
# MyDrive can be read by the cells below (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# List the elements data folder on the mounted Drive to confirm the CSV is present
!ls /content/drive/MyDrive/cde_data/elements

Periodic_Table_Of_Elements.csv


In [None]:
# Import the SparkSession (entry point for PySpark)
from pyspark.sql import SparkSession

# Import AnalysisException for error handling
from pyspark.sql.utils import AnalysisException

# Import PySpark SQL functions (e.g., col(), when(), agg())
import pyspark.sql.functions as F

# Import PySpark SQL data types (e.g., StringType, IntegerType)
import pyspark.sql.types as T

In [None]:
# Start (or reuse) the Spark session used by every query in this notebook
spark = (
    SparkSession.builder
    .appName("ElementsAnalysis")
    .getOrCreate()
)

In [None]:
# Load the periodic table CSV into a DataFrame: the first row supplies the
# column names and Spark infers each column's data type from the data.
elements = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/drive/MyDrive/cde_data/elements/Periodic_Table_Of_Elements.csv")
)

# Keep only the elements whose phase is "liq", then count how many of them
# fall in each period and print the result.
liquid_elements = elements.filter(F.col("phase") == "liq")
liquid_elements.groupBy("period").count().show()

+------+-----+
|period|count|
+------+-----+
|     6|    1|
|     4|    1|
+------+-----+



In [None]:
# -- SQL equivalent of the DataFrame query above; assumes the data is registered as a temporary view called `elements`
# SELECT
#   period,           -- Group by the "period" column
#   COUNT(*) AS count -- Count elements in each group
# FROM elements
# WHERE phase = 'liq' -- Filter rows where phase is "liq"
# GROUP BY period;    -- Group results by period

In [None]:
# Register the `elements` DataFrame as a temporary view named "elements"
# so it can be referenced by name in spark.sql() queries.
elements.createOrReplaceTempView("elements")

In [None]:
# Count the liquid-phase elements per period, reading from the "elements" view
liquid_by_period_sql = """
        SELECT period, COUNT(*) AS count
        FROM elements
        WHERE phase = 'liq'
        GROUP BY period
        """

try:
    # Run the query and print at most 5 result rows
    spark.sql(liquid_by_period_sql).show(5)
except AnalysisException as e:
    # Raised by Spark when the query cannot be analyzed
    # (for example an unknown view or column name)
    print(f"An error occurred: {e}")

+------+-----+
|period|count|
+------+-----+
|     6|    1|
|     4|    1|
+------+-----+



In [None]:
# Mount Google Drive at /content/drive.
# When an earlier cell has already mounted it, drive.mount only reports
# "Drive already mounted" and leaves the existing mount in place.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# List everything in the course data folder on the mounted Drive
!ls /content/drive/MyDrive/cde_data/

 1342-0.txt
 broadcast_logs
 data.csv
 data_Q3_2019.zip
 elements
 gutenberg_books
 prideandprejudice.csv
'pyngrok_UI_CODE_Working_with_RDDs_in_PySpark_(2).ipynb'
 pyspark_tutorial.ipynb
 sales_data.csv
 sample.csv
 sample_data.csv
 Section_2_Resilient_Distributed_Datasets_Transformations.ipynb
 Section_3_Resilient_Distributed_Datasets_Actions.ipynb
 Section_4_Spark_DataFrames_and_Transformations.ipynb
 shows
 simple_count.csv
 simple_count_single_partition.csv
 simple_count_single_partition_final.csv
 Spark_SQL.ipynb


In [None]:
# Confirm the Q3 2019 archive exists at the path used by the extraction cell
!ls /content/drive/MyDrive/cde_data/data_Q3_2019.zip

/content/drive/MyDrive/cde_data/data_Q3_2019.zip


In [None]:


# Extract the ZIP file
import zipfile
zip_path = '/content/drive/MyDrive/cde_data/data_Q3_2019.zip'

# Create a directory to extract the files
!mkdir -p /content/extracted_data
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall('/content/extracted_data')

# Check the extracted files
!ls -la /content/extracted_data

total 3070944
drwxr-xr-x 3 root root     4096 Mar 21 18:39 .
drwxr-xr-x 1 root root     4096 Mar 21 18:36 ..
-rw-r--r-- 1 root root 33668882 Mar 21 18:38 2019-07-01.csv
-rw-r--r-- 1 root root 33668992 Mar 21 18:38 2019-07-02.csv
-rw-r--r-- 1 root root 33680258 Mar 21 18:38 2019-07-03.csv
-rw-r--r-- 1 root root 33686648 Mar 21 18:38 2019-07-04.csv
-rw-r--r-- 1 root root 33688132 Mar 21 18:38 2019-07-05.csv
-rw-r--r-- 1 root root 33671384 Mar 21 18:38 2019-07-06.csv
-rw-r--r-- 1 root root 33671774 Mar 21 18:38 2019-07-07.csv
-rw-r--r-- 1 root root 33673278 Mar 21 18:38 2019-07-08.csv
-rw-r--r-- 1 root root 33651712 Mar 21 18:38 2019-07-09.csv
-rw-r--r-- 1 root root 33659590 Mar 21 18:38 2019-07-10.csv
-rw-r--r-- 1 root root 33675871 Mar 21 18:38 2019-07-11.csv
-rw-r--r-- 1 root root 33679282 Mar 21 18:38 2019-07-12.csv
-rw-r--r-- 1 root root 33701769 Mar 21 18:38 2019-07-13.csv
-rw-r--r-- 1 root root 33703070 Mar 21 18:38 2019-07-14.csv
-rw-r--r-- 1 root root 33674989 Mar 21 18:38 2019-0

In [None]:
# Load the extracted Backblaze Q3 2019 CSV files into Spark and expose them
# to SQL as the temporary view "backblaze_stats_2019_q3".

# These imports repeat the notebook's import cell so that this section can
# also be run on its own; they are grouped here instead of mid-cell.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T

# getOrCreate() returns the already-running session when one exists,
# and only starts a new one on a fresh kernel.
spark = SparkSession.builder \
    .appName("Backblaze Data Analysis") \
    .getOrCreate()

# Directory that holds the daily CSV files extracted from data_Q3_2019.zip
file_path = "/content/extracted_data"  # Adjust if the files live elsewhere

# Read every CSV in the directory into one DataFrame; the first row of each
# file is the header and column types are inferred from the data.
backblaze_2019_q3 = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load(file_path)

# Cast every column whose name starts with "smart" to long; all other
# columns keep their inferred type.
backblaze_2019_q3 = backblaze_2019_q3.select(
    [F.col(x).cast(T.LongType()) if x.startswith("smart") else F.col(x) for x in backblaze_2019_q3.columns]
)

# Create a temp view for SQL queries
backblaze_2019_q3.createOrReplaceTempView("backblaze_stats_2019_q3")

# Sanity-check the view with a small SQL query.
result = spark.sql("SELECT * FROM backblaze_stats_2019_q3 LIMIT 10")

# The table is very wide (one column per SMART attribute), so printing every
# column produces an unreadable table. Preview only the columns that the
# queries in this notebook rely on; `result` itself still holds all columns.
result.select("serial_number", "model", "capacity_bytes", "failure").show()

+----------+--------------+--------------------+--------------+-------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+-------------------+------------+-------------------+------------+-------------------+------------+-------------------+------------+-------------------+------------+-------------------+------------+-------------------+------------+-------------------+------------+-------------------+------------+-------------------+------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+-

In [None]:
# SQL: serial numbers of the rows flagged as a failure (failure = 1)
failed_serials = spark.sql(
    "select serial_number from backblaze_stats_2019_q3 where failure = 1"
)
failed_serials.show(5)  # print the first 5 rows only

+-------------+
|serial_number|
+-------------+
|     ZA10MCJ5|
|     ZCH07T9K|
|     ZCH0CA7Z|
|     Z302F381|
|     ZCH0B3Z2|
+-------------+
only showing top 5 rows



In [None]:
# DataFrame API version of the failure query: keep rows where failure = 1,
# project the serial number, and print the first 5 rows.
(
    backblaze_2019_q3
    .filter("failure = 1")
    .select("serial_number")
    .show(5)
)

+-------------+
|serial_number|
+-------------+
|     ZA10MCJ5|
|     ZCH07T9K|
|     ZCH0CA7Z|
|     Z302F381|
|     ZCH0B3Z2|
+-------------+
only showing top 5 rows



In [None]:
# Build `backblaze_2019` from the Q3 data, casting every column whose name
# starts with "smart" to long and passing all other columns through as-is.
# (The loading cell applies the same cast to `backblaze_2019_q3`, so this
# mainly gives the DataFrame the name used by the DataFrame-API examples.)
backblaze_2019 = backblaze_2019_q3.select(
    *(
        F.col(name).cast(T.LongType()) if name.startswith("smart") else F.col(name)
        for name in backblaze_2019_q3.columns
    )
)

In [None]:
# Preview a few rows. The DataFrame has one column per SMART attribute, so a
# bare .show() prints a table far too wide to read; show only the columns the
# following queries use, and only 5 rows.
backblaze_2019.select(
    "serial_number", "model", "capacity_bytes", "failure"
).show(5)

+----------+--------------+--------------------+--------------+-------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+------------------+-----------+-------------------+------------+-------------------+------------+-------------------+------------+-------------------+------------+-------------------+------------+-------------------+------------+-------------------+------------+-------------------+------------+-------------------+------------+-------------------+------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+-

In [None]:
# Smallest and largest reported capacity per drive model, converted from
# bytes to GB (dividing by 1024^3) and sorted by the largest capacity first.
# NOTE(review): a min_GB of about -9.31e-10 is exactly -1 / 1024**3, so those
# models presumably have rows with capacity_bytes = -1 (a placeholder for a
# missing value) — confirm against the raw data before relying on min_GB.
capacity_range_sql = (
    """
    SELECT
       model,  -- Select the 'model' column for grouping
       min(capacity_bytes / pow(1024, 3)) min_GB,  -- Calculate minimum capacity in GB
       max(capacity_bytes / pow(1024, 3)) max_GB  -- Calculate maximum capacity in GB
    FROM backblaze_stats_2019_q3  -- Use the registered SQL table
    GROUP BY 1  -- Group by the first column ('model')
    ORDER BY 3 DESC  -- Order by the third column ('max_GB') in descending order
    """
)

# Run the query and print the first 5 rows
spark.sql(capacity_range_sql).show(5)



+--------------------+--------------------+-------+
|               model|              min_GB| max_GB|
+--------------------+--------------------+-------+
| TOSHIBA MG07ACA14TA|             13039.0|13039.0|
|       ST12000NM0007|-9.31322574615478...|11176.0|
|HGST HUH721212ALE600|             11176.0|11176.0|
|       ST12000NM0117|             11176.0|11176.0|
|HGST HUH721212ALN604|-9.31322574615478...|11176.0|
+--------------------+--------------------+-------+
only showing top 5 rows



In [None]:
# DataFrame API version of the per-model capacity range query.

# Capacity expressed in GB: bytes divided by 1024^3
capacity_gb = F.col("capacity_bytes") / F.pow(F.lit(1024), 3)

(
    backblaze_2019
    .groupBy("model")                       # one output row per drive model
    .agg(
        F.min(capacity_gb).alias("min_GB"),  # smallest reported capacity
        F.max(capacity_gb).alias("max_GB"),  # largest reported capacity
    )
    .orderBy(F.col("max_GB").desc())        # largest capacity first
    .show(5)                                # print the first 5 rows
)

+--------------------+--------------------+-------+
|               model|              min_GB| max_GB|
+--------------------+--------------------+-------+
| TOSHIBA MG07ACA14TA|             13039.0|13039.0|
|       ST12000NM0007|-9.31322574615478...|11176.0|
|HGST HUH721212ALE600|             11176.0|11176.0|
|       ST12000NM0117|             11176.0|11176.0|
|HGST HUH721212ALN604|-9.31322574615478...|11176.0|
+--------------------+--------------------+-------+
only showing top 5 rows



In [None]:
# Per-model minimum and maximum capacity in GB, keeping only the models whose
# minimum and maximum differ, sorted by the largest capacity first.
# NOTE(review): every min_GB printed for this query is about -9.31e-10, which
# is exactly -1 / 1024**3. Those models presumably qualify only because some
# rows carry capacity_bytes = -1 (a placeholder) — confirm before concluding
# that a model really ships in several capacities.
mixed_capacity_sql = (
    """
    SELECT
       model,  -- Select the 'model' column for grouping
       min(capacity_bytes / pow(1024, 3)) min_GB,  -- Calculate minimum capacity in GB
       max(capacity_bytes / pow(1024, 3)) max_GB  -- Calculate maximum capacity in GB
    FROM backblaze_stats_2019_q3  -- Use the registered SQL table
    GROUP BY 1  -- Group by the first column ('model')
    HAVING min_GB != max_GB  -- Filter out models where min_GB equals max_GB
    ORDER BY 3 DESC  -- Order by the third column ('max_GB') in descending order
    """
)

# Run the query and print the first 5 rows
spark.sql(mixed_capacity_sql).show(5)

+--------------------+--------------------+-----------------+
|               model|              min_GB|           max_GB|
+--------------------+--------------------+-----------------+
|       ST12000NM0007|-9.31322574615478...|          11176.0|
|HGST HUH721212ALN604|-9.31322574615478...|          11176.0|
|HGST HUH721010ALE600|-9.31322574615478...|           9314.0|
|       ST10000NM0086|-9.31322574615478...|           9314.0|
|        ST8000NM0055|-9.31322574615478...|7452.036460876465|
+--------------------+--------------------+-----------------+
only showing top 5 rows



In [None]:
# DataFrame API version of the query that keeps only models whose minimum
# and maximum reported capacity differ.

# Capacity expressed in GB: bytes divided by 1024^3
capacity_gb = F.col("capacity_bytes") / F.pow(F.lit(1024), 3)

# One row per model with its smallest and largest reported capacity
capacity_by_model = backblaze_2019.groupBy("model").agg(
    F.min(capacity_gb).alias("min_GB"),
    F.max(capacity_gb).alias("max_GB"),
)

(
    capacity_by_model
    .filter(F.col("min_GB") != F.col("max_GB"))  # drop models with a single capacity
    .orderBy(F.col("max_GB").desc())             # largest capacity first
    .show(5)                                     # print the first 5 rows
)

+--------------------+--------------------+-----------------+
|               model|              min_GB|           max_GB|
+--------------------+--------------------+-----------------+
|       ST12000NM0007|-9.31322574615478...|          11176.0|
|HGST HUH721212ALN604|-9.31322574615478...|          11176.0|
|HGST HUH721010ALE600|-9.31322574615478...|           9314.0|
|       ST10000NM0086|-9.31322574615478...|           9314.0|
|        ST8000NM0055|-9.31322574615478...|7452.036460876465|
+--------------------+--------------------+-----------------+
only showing top 5 rows



In [None]:
# # Register the `backblaze_2019_q3` DataFrame as a temporary SQL view named "drive_stats"
# backblaze_2019_q3.createOrReplaceTempView("drive_stats")

# # SQL Query to create a temporary view `drive_days` that calculates the total number of drive days per model
# spark.sql(
#     """
#     CREATE OR REPLACE TEMP VIEW drive_days AS
#     SELECT
#         model,  -- Select the 'model' column for grouping
#         count(*) AS drive_days  -- Count the total number of rows (drive days) for each model
#     FROM drive_stats  -- Use the registered SQL table
#     GROUP BY model  -- Group by the 'model' column
#     """
# )

# # SQL Query to create a temporary view `failures` that calculates the total number of failures per model
# spark.sql(
#     """
#     CREATE OR REPLACE TEMP VIEW failures AS
#     SELECT
#         model,  -- Select the 'model' column for grouping
#         count(*) AS failures  -- Count the total number of rows where 'failure = 1'
#     FROM drive_stats  -- Use the registered SQL table
#     WHERE failure = 1  -- Filter rows where 'failure' is 1
#     GROUP BY model  -- Group by the 'model' column
#     """
# )

DataFrame[]

In [None]:


# # PySpark DataFrame API equivalent of the `drive_days` SQL query
# drive_days = backblaze_2019_q3.groupby(F.col("model")).agg(
#     F.count(F.col("*")).alias("drive_days")  # Count all rows for each model and alias it as 'drive_days'
# )

# # PySpark DataFrame API equivalent of the `failures` SQL query
# failures = (
#     backblaze_2019.where(F.col("failure") == 1)  # Filter rows where 'failure' is 1
#     .groupby(F.col("model"))  # Group by the 'model' column
#     .agg(F.count(F.col("*")).alias("failures"))  # Count all rows for each model and alias it as 'failures'
# )