## State Summaries From Electricity Data

### Initial Setup

In [1]:
import os, sys
import pprint as p
import pyspark.sql.functions as pysF
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

%matplotlib widget
%matplotlib inline

# Make the directory two levels above the current working directory
# importable (presumably where the project package `app` lives -- confirm).
py_file_path = os.path.join(
    os.getcwd(),
    "..",
    ".."
)

# Re-running this cell must not keep appending the same entry to sys.path.
if py_file_path not in sys.path:
    sys.path.append(py_file_path)
from app.SparkTools import MyPySpark

# Ensure only one sc and spark instance is running: reuse the wrapper that is
# already bound in the notebook namespace (e.g. when this cell is re-run) and
# only construct a new one when none exists yet.
# The former `MySpark = None` reset made the `or` guard always build a new
# instance, and a `global` statement at module level has no effect (it is even
# a SyntaxError after an assignment when the cell is compiled as one module).
MySpark = globals().get("MySpark") or MyPySpark(
    master = 'local[3]',
    logger_name = 'jupyter')

### Filter and Join Data

In [2]:
# List every distinct `value_type` found in the electricity dimension table.
# De-duplication and ordering are done by Spark, so only one row per distinct
# value is brought to the driver (instead of converting the whole column to
# pandas and de-duplicating through a Python set), and the printed order is
# stable from run to run.
value_type_rows = MySpark\
    .spark\
    .read\
    .parquet("/Processed/ElectricityDimDF")\
    .select("value_type")\
    .distinct()\
    .orderBy("value_type")\
    .collect()

p.pprint([row["value_type"] for row in value_type_rows])

['Consumption for useful thermal output (Btu)',
 'Total consumption (Btu)',
 'Average retail price of electricity',
 'Quality of fossil fuels in electricity generation',
 'Electric fuel consumption MMBtu',
 'Receipts of fossil fuels by electricity plants (Btu)',
 'Consumption for electricity generation (Btu)',
 'Electric fuel consumption quantity',
 'Revenue from retail sales of electricity',
 'Total consumption',
 'Retail sales of electricity',
 'Consumption for useful thermal output',
 'Receipts of fossil fuels by electricity plants',
 'Average cost of fossil fuels for electricity generation (per Btu)',
 'Net generation',
 'Fuel consumption MMBtu',
 'Number of customer accounts',
 'MMBtu per unit',
 'Fossil-fuel stocks for electricity generation',
 'Consumption for electricity generation',
 'Average cost of fossil fuels for electricity generation',
 'Fuel consumption quantity']


In [3]:
# Dimension rows whose value_type is 'Fuel consumption MMBtu' and whose
# engine_type is 'all primemovers'.
# The filter runs before the projection because it references `value_type`,
# which is not part of the selected columns.
electricity_dim_df = MySpark\
    .spark\
    .read\
    .parquet("/Processed/ElectricityDimDF")\
    .filter(
        (pysF.col("value_type") == 'Fuel consumption MMBtu') &
        (pysF.col("engine_type") == 'all primemovers'))\
    .select("series_id", "state", "fuel_type", "engine_type", "units")\
    .withColumn(
        "fuel_type",
        # Inner replace: strip every character that is not alphanumeric or
        # whitespace. Outer replace: turn each whitespace character into "_".
        # The cleaned values are later used as column names (pivot), e.g.
        # "all_fuels". Raw strings keep `\s` a regex token rather than an
        # invalid Python escape sequence.
        pysF.regexp_replace(
            pysF.regexp_replace(
                pysF.col("fuel_type"),
                r"[^a-zA-Z0-9\s]",
                ""),
            r"\s",
            "_"))

# Fact table read as-is; it is joined to the dimension rows on `series_id`.
electricity_fact_df = MySpark\
    .spark\
    .read\
    .parquet("/Processed/ElectricityFactDF")

# Right join: every selected dimension row is kept, fact rows without a
# matching dimension row are dropped.
# NOTE(review): for a right outer join Spark can only broadcast the left
# side, so the broadcast hint on the dimension frame is likely ignored.
# An inner join would honour it but would drop dimension rows that have no
# facts -- confirm which behaviour is intended before changing `how`.
electricity_df = electricity_fact_df.join(
    pysF.broadcast(electricity_dim_df),
    on = "series_id",
    how = "right"
)

In [4]:
# Distinct fuel types present after the join. The list may contain None when
# a row has no fuel_type, so consumers filter on truthiness.
fuel_types_l = [
    row[0]
    for row in electricity_df.select("fuel_type").distinct().collect()
]

# Non-empty fuel types become the pivot columns; the component list leaves
# out the "all_fuels" column itself.
pivot_fuel_types = [fuel for fuel in fuel_types_l if fuel]
component_fuel_types = [fuel for fuel in pivot_fuel_types if fuel != "all_fuels"]

# One row per (date, state) with one summed-value column per fuel type.
# Passing the already-collected values to pivot() spares Spark a second pass
# to discover them, and no "null" column is produced for missing fuel types
# (so there is nothing to drop afterwards).
# "other" = all_fuels minus the sum of every individual fuel column; missing
# combinations are zero-filled first so the subtraction never yields null
# because of an absent fuel.
electricity_pivoted_summary_df = electricity_df\
    .groupBy("date", "state")\
    .pivot("fuel_type", pivot_fuel_types)\
    .agg(pysF.sum("value"))\
    .fillna(
        0,
        subset=pivot_fuel_types)\
    .withColumn(
        "other",
        pysF.col("all_fuels") - sum(pysF.col(fuel) for fuel in component_fuel_types))

In [5]:
# Columns to unpivot: every non-empty fuel type except the "all_fuels" total.
# NOTE(review): the "other" column computed on the pivoted frame is not part
# of this list, so it does not reach the melted output -- confirm intent.
melt_value_columns = [
    fuel
    for fuel in fuel_types_l
    if fuel and fuel != "all_fuels"
]

# Reshape the wide per-fuel columns into long form keyed by date and state
# (presumably one row per fuel type with a "value" column -- verify against
# MySpark.melt).
electricity_formatted_df = MySpark.melt(
    df = electricity_pivoted_summary_df,
    id_vars = ["date", "state"],
    value_vars = melt_value_columns,
    var_name = 'fuel_type')

In [6]:
# make treemap of state -> fuels, and fuels -> states
# maybe group fuels to make fewer subcategories

# state_2019_df = electricity_formatted_df\
#     .filter(
#         (pysF.year("date") == "2019") & 
#         (pysF.col("value") > 0) &
#         (pysF.col("state").isin(["CO", "NJ"])))\
#     .groupBy("state")\
#     .agg(pysF.sum("value").alias("consumption"))\
#     .toPandas()


In [7]:
# Total consumption per date and state, summed over the melted fuel rows.
# (The stray trailing line-continuation after .agg() was removed; it only
# worked because a blank line happened to follow it.)
state_df = electricity_formatted_df\
    .groupBy("date","state")\
    .agg(pysF.sum("value").alias("consumption"))

# States ordered by their highest single-date consumption, largest first.
# collect() on the sorted frame keeps that order, so no RDD round trip is
# needed to extract the state column.
state_rows = state_df\
    .groupBy("state")\
    .agg(pysF.max("consumption").alias("max_consumption"))\
    .sort(pysF.col("max_consumption").desc())\
    .collect()

state_l = [row["state"] for row in state_rows]

print(state_l)

# TODO: plot the top 10 states, then the next 10, etc., to break it up into subplots

# fig, axs = plt.subplots(1, 1, figsize=(16,30))
# fig.tight_layout()
# axs = sns.lineplot(
#     x = "date",
#     y = "consumption",
#     hue = "state",
#     data = state_df
# )

['TX', 'CA', 'FL', 'PA', 'IL', 'GA', 'OH', 'NY', 'AL', 'NC', 'IN', 'MI', 'WA', 'LA', 'AZ', 'SC', 'TN', 'KY', 'VA', 'MO', 'WV', 'OK', 'WI', 'NJ', 'AR', 'MS', 'MN', 'IA', 'OR', 'MD', 'WY', 'KS', 'CO', 'MA', 'ND', 'UT', 'NV', 'NE', 'NM', 'CT', 'MT', 'ME', 'NH', 'ID', 'SD', 'DE', 'HI', 'AK', 'RI', 'VT', None, 'DC']


In [8]:
# Use electricity_formatted_df for plotting.
# Also use a window function (over()) to compute percentages by year and by state for a relative plot.