In [15]:
spark.sql(f"DROP DATABASE IF EXISTS gold CASCADE")
spark.sql(f"CREATE DATABASE IF NOT EXISTS gold") 

DataFrame[]

In [17]:
update_audit_table("EV_Data", progress="Ingest_Gold", status="In_Progress", start=True)

+----------+-----------+-----------+--------------------+--------+
|Table_Name|   Progress|     Status|          Start_time|End_time|
+----------+-----------+-----------+--------------------+--------+
|   EV_Data|Ingest_Gold|In_Progress|2024-12-05 19:22:...|    null|
+----------+-----------+-----------+--------------------+--------+



In [18]:
# Define the database and table names
database_name = "silver"
table_name = "vehicle_data"

# Query the data from the managed table
df = spark.sql(f"SELECT * FROM {database_name}.{table_name}")
df.limit(10).show()

+------------------+--------------------+--------+-------------------+------------+-------------------+------------+----+----------+-----------+-----------+-----+-----------+----------+---------+-------+---------------------+-----------------------------------------------+--------------+---------+--------------------+--------------+--------------------+--------------------+-----------------+--------+-----------------------+---------------------------------------+--------+-----------+-------+
|               sid|                  id|position|         created_at|created_meta|         updated_at|updated_meta|meta|  VIN_1_10|     County|       City|State|Postal_Code|Model_year|     Make|  Model|Electric_Vehicle_Type|Clean_Alternative_Fuel_Vehicle_CAFV_Eligibility|Electric_Range|Base_MSRP|Legislative_District|DOL_Vehicle_ID|    Vehicle_Location|    Electric_Utility|2020_Census_Tract|Counties|Congressional_Districts|WAOFM_GIS_Legislative_District_Boundary|err_code|err_message|err_flg|
+-----

In [None]:
#Which one of the car make is more efficient?
gold_df = spark.sql(
    f"""
    SELECT Model_Year, County, City, State, Electric_Vehicle_type, Make, Model,
           COUNT(DOL_Vehicle_ID) ev_cnt, AVG(Electric_Range) AS Avg_ev_Range
    FROM {database_name}.{table_name}
    GROUP BY Model_Year, County, City, State, Electric_Vehicle_type, Make, Model
    """)
gold_df.limit(10).show()

In [None]:
gold_db = "gold"
gold_table = "ev_insights"

gold_df.write.mode("overwrite").saveAsTable(f"{gold_db}.{gold_table}")

In [19]:
# Is there any relationship between the choice of EV make and city?
# Group by 'City' and 'Make' to see the count of each make in different cities
df_city_make = df.groupBy("City", "Make").count().orderBy("count", ascending=False)

# Show the relationship between city and make
df_city_make.limit(10).show()

+---------+---------+-----+
|     City|     Make|count|
+---------+---------+-----+
|  Seattle|    TESLA| 1597|
| Bellevue|    TESLA|  766|
|  Redmond|    TESLA|  544|
|  Seattle|   NISSAN|  515|
|Sammamish|    TESLA|  411|
| Kirkland|    TESLA|  392|
|  Bothell|    TESLA|  383|
|  Seattle|CHEVROLET|  336|
|Vancouver|    TESLA|  295|
|   Renton|    TESLA|  289|
+---------+---------+-----+



In [20]:
# Which Plug-in Hybrid Electric Vehicle (PHEV) is preferred by buyers?
# Filter for Plug-in Hybrid Electric Vehicle (PHEV)
df_phev = df.filter(col("Electric_Vehicle_Type").contains("Plug-in Hybrid Electric Vehicle (PHEV)"))

# Group by 'Make' and 'Model' to see the count of each PHEV
df_phev_preference = df_phev.groupBy("Make", "Model").count().orderBy("count", ascending=False)

# Show the most preferred PHEV by buyers
df_phev_preference.limit(10).show()

+---------+-------------+-----+
|     Make|        Model|count|
+---------+-------------+-----+
|CHEVROLET|         VOLT|  921|
|   TOYOTA|  PRIUS PRIME|  464|
|     FORD|       FUSION|  349|
|      BMW|           X5|  313|
| CHRYSLER|     PACIFICA|  309|
|     FORD|        C-MAX|  262|
|     JEEP|     WRANGLER|  238|
|      BMW|           I3|  236|
|   TOYOTA|   RAV4 PRIME|  204|
|   TOYOTA|PRIUS PLUG-IN|  171|
+---------+-------------+-----+



In [21]:
# Based on the data, which car make and model would you recommend?
from pyspark.sql.functions import avg, count
df = df.withColumn("Base_MSRP", col("Base_MSRP").cast("int"))
df_recommend = df.groupBy("Make", "Model").agg(
    avg("Electric_Range").alias("Avg_Electric_Range"),
    avg("Base_MSRP").alias("Avg_Base_MSRP"),
    count("DOL_Vehicle_ID").alias("Popularity")
).orderBy("Avg_Electric_Range", ascending=False)

df_recommend.limit(10).show()

+----------+--------+------------------+------------------+----------+
|      Make|   Model|Avg_Electric_Range|     Avg_Base_MSRP|Popularity|
+----------+--------+------------------+------------------+----------+
|   HYUNDAI|    KONA|             258.0|               0.0|        43|
|     TESLA|ROADSTER|237.85714285714286|107521.42857142857|         7|
|    JAGUAR|  I-PACE|            204.75|               0.0|        40|
|     TESLA| MODEL S| 191.8760147601476|15836.752767527676|      1355|
| CHEVROLET| BOLT EV| 182.3937924345296|               0.0|      1031|
|     TESLA| MODEL X|164.31479289940827|               0.0|       845|
|     TESLA| MODEL 3|             142.0|               0.0|      4252|
|      AUDI|  E-TRON|129.85185185185185|               0.0|       162|
|VOLKSWAGEN|  E-GOLF|108.97938144329896|               0.0|       194|
|    TOYOTA|    RAV4|             103.0|               0.0|         8|
+----------+--------+------------------+------------------+----------+



In [22]:
update_audit_table("EV_Data", progress="Ingest_Gold", status="Completed", end=True)

+----------+-------------+-----------+--------------------+--------------------+
|Table_Name|     Progress|     Status|          Start_time|            End_time|
+----------+-------------+-----------+--------------------+--------------------+
|   EV_Data|Ingest_Bronze|  Completed|2024-12-05 19:18:...|2024-12-05 19:20:...|
|   EV_Data|Ingest_Silver|  Completed|2024-12-05 19:20:...|2024-12-05 19:21:...|
|   EV_Data|  Ingest_Gold|In_Progress|2024-12-05 19:22:...|                null|
+----------+-------------+-----------+--------------------+--------------------+

+----------+-----------+-----------+--------------------+--------+
|Table_Name|   Progress|     Status|          Start_time|End_time|
+----------+-----------+-----------+--------------------+--------+
|   EV_Data|Ingest_Gold|In_Progress|2024-12-05 19:22:...|    null|
+----------+-----------+-----------+--------------------+--------+

+----------+-------------+---------+--------------------+--------------------+
|Table_Name|     