<a href="https://colab.research.google.com/github/kareemullah123456789/big_data_advanced/blob/main/2(full)_spark_sql_tabular_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!ls /content/drive/MyDrive/cde_data

 1342-0.txt
 broadcast_logs
 data.csv
 gutenberg_books
 prideandprejudice.csv
'pyngrok_UI_CODE_Working_with_RDDs_in_PySpark_(2).ipynb'
 pyspark_tutorial.ipynb
 sales_data.csv
 sample.csv
 sample_data.csv
 Section_2_Resilient_Distributed_Datasets_Transformations.ipynb
 Section_3_Resilient_Distributed_Datasets_Actions.ipynb
 Section_4_Spark_DataFrames_and_Transformations.ipynb
 simple_count.csv
 simple_count_single_partition.csv
 simple_count_single_partition_final.csv
 Spark_SQL.ipynb


In [4]:
import os
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

# Initialize Spark session with an application name
spark = SparkSession.builder.appName(
    "Getting the Canadian TV channels with the highest/lowest proportion of commercials."
).getOrCreate()

# Set log level to WARN to reduce unnecessary output
spark.sparkContext.setLogLevel("WARN")

# Define the directory where the data files are stored
DIRECTORY = "/content/drive/MyDrive/cde_data/broadcast_logs"

# Load broadcast logs dataset
logs = spark.read.csv(
    os.path.join(DIRECTORY, "BroadcastLogs_2018_Q3_M8_sample.CSV"),
    sep="|",  # Delimiter used in the CSV file
    header=True,  # First row contains column headers
    inferSchema=True,  # Automatically infer column data types
)

# Load log identifier dataset
log_identifier = spark.read.csv(
    os.path.join(DIRECTORY, "ReferenceTables/LogIdentifier.csv"),
    sep="|",
    header=True,
    inferSchema=True,
)

# Load category dataset and select relevant columns
cd_category = spark.read.csv(
    os.path.join(DIRECTORY, "ReferenceTables/CD_Category.csv"),
    sep="|",
    header=True,
    inferSchema=True,
).select(
    "CategoryID",  # Unique identifier for categories
    "CategoryCD",  # Code representing the category
    F.col("EnglishDescription").alias("Category_Description"),  # Rename for clarity
)

# Load program class dataset and select relevant columns
cd_program_class = spark.read.csv(
    os.path.join(DIRECTORY, "ReferenceTables/CD_ProgramClass.csv"),
    sep="|",
    header=True,
    inferSchema=True,
).select(
    "ProgramClassID",  # Unique identifier for program classes
    "ProgramClassCD",  # Code representing the program class
    F.col("EnglishDescription").alias("ProgramClass_Description"),  # Rename for clarity
)

# Data processing: Drop unnecessary columns from the logs DataFrame
logs = logs.drop("BroadcastLogID", "SequenceNO")  # Remove unused columns

# Convert Duration from HH:MM:SS format to seconds
logs = logs.withColumn(
    "duration_seconds",
    (
        F.col("Duration").substr(1, 2).cast("int") * 60 * 60  # Hours to seconds
        + F.col("Duration").substr(4, 2).cast("int") * 60     # Minutes to seconds
        + F.col("Duration").substr(7, 2).cast("int")          # Seconds
    ),
)

# Filter log_identifier to include only primary logs
log_identifier = log_identifier.where(F.col("PrimaryFG") == 1)  # Retain only primary logs

# Join logs with log_identifier to enrich the data
logs_and_channels = logs.join(log_identifier, "LogServiceID")  # Join on LogServiceID

# Perform additional joins to add category and program class metadata
full_log = logs_and_channels.join(cd_category, "CategoryID", how="left").join(
    cd_program_class, "ProgramClassID", how="left"
)

# Calculate the commercial ratio for each channel (LogIdentifierID)
answer = (
    full_log.groupby("LogIdentifierID")  # Group by LogIdentifierID (channel ID)
    .agg(
        # Calculate total duration of commercial-related broadcasts
        F.sum(
            F.when(
                F.trim(F.col("ProgramClassCD")).isin(
                    ["COM", "PRC", "PGI", "PRO", "LOC", "SPO", "MER", "SOL"]
                ),  # Check if ProgramClassCD is a commercial code
                F.col("duration_seconds"),  # Include duration_seconds if true
            ).otherwise(0)  # Otherwise, include 0
        ).alias("duration_commercial"),  # Total commercial duration
        F.sum("duration_seconds").alias("duration_total"),  # Total broadcast duration
    )
    # Calculate the commercial ratio (commercial duration / total duration)
    .withColumn(
        "commercial_ratio",
        F.col("duration_commercial") / F.col("duration_total"),
    )
    # Replace null values with 0 to handle missing or invalid data
    .fillna(0)
)

# Sort the results by commercial_ratio in descending order and display the top 1000 rows
answer.orderBy("commercial_ratio", ascending=False).show(1000, False)

+---------------+-------------------+--------------+----------------+
|LogIdentifierID|duration_commercial|duration_total|commercial_ratio|
+---------------+-------------------+--------------+----------------+
|CJCO           |0                  |0             |0.0             |
|BRAVO          |0                  |0             |0.0             |
|CFTF           |0                  |0             |0.0             |
|CKCS           |0                  |0             |0.0             |
|CJNT           |0                  |0             |0.0             |
|CKES           |0                  |0             |0.0             |
|CHBX           |0                  |0             |0.0             |
|BBCKID         |0                  |0             |0.0             |
|BOOK           |0                  |0             |0.0             |
|CHAN           |0                  |0             |0.0             |
|CEVASI         |0                  |0             |0.0             |
|CMT            |0  