# Whats the Cause of your Power Outage?

**Name(s)**: Kaii Bijlani, Ketan Mittal

**Website Link**: https://k1mittal.github.io/Causes_of_Power_Outages/

In [4]:
import pandas as pd
import numpy as np
from pathlib import Path
import scipy    
import folium
from folium.plugins import HeatMap
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, Binarizer, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score

import plotly.express as px
pd.options.plotting.backend = 'plotly'

from dsc80_utils import * # Feel free to uncomment and use this.

## Step 1: Introduction

### Interesting Questions:
- How does the cause of the power outages indicate other factors, for example, does whether related power outages result in more people having no power? Can we predict the cause of power outages?
- Is there a correlation between the time and other factors, do power outages happen in one month specifically? Has the number of power outages decreased over time? Can we predict when the next power outage is using a time series prediction?
- Can we predict the number of people affected by a power outage given certain factors? Are the number of people affected by power outages correlated to other factors?
- Can we predict the duration of power outages given certain factors? How are the duration of power outages correlated to other factors?

### Our Choice:
We decided to answer the first bullet point, which is what aspects of power outage are related to each category of cause. 

## Step 2: Data Cleaning and Exploratory Data Analysis

In [5]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 5)

In [6]:
# Data Init and Cleanin
raw_data = pd.read_excel(Path('./outage.xlsx'))
raw_data.columns = [f'{raw_data.columns[i]}' for i in range(len(raw_data.columns))]
raw_data = raw_data.iloc[1:, 1:].loc[:, ['OBS', 'YEAR', 'MONTH', 'U.S._STATE', 'NERC.REGION', 'CLIMATE.REGION', 'ANOMALY.LEVEL', 'OUTAGE.START.DATE', 'OUTAGE.START.TIME', 'OUTAGE.RESTORATION.DATE', 'OUTAGE.RESTORATION.TIME', 'CAUSE.CATEGORY', 'CLIMATE.CATEGORY', 'CAUSE.CATEGORY.DETAIL', 'OUTAGE.DURATION', 'DEMAND.LOSS.MW', 'CUSTOMERS.AFFECTED', 'TOTAL.PRICE', 'TOTAL.SALES', 'TOTAL.CUSTOMERS', 'TOTAL.REALGSP']]


Below we display the only missing values for outage start dates. Note how there are only 9 entries, and each of these entries also have missing relevant features. Therefore, imputation is not something of interest in this case and we can simply drop these values

In [7]:
raw_data = raw_data.dropna(subset = ['OUTAGE.START.DATE', 'OUTAGE.START.TIME', 'OUTAGE.RESTORATION.DATE', 'OUTAGE.RESTORATION.TIME'], how = 'any')

raw_data['OUTAGE.START'] = pd.to_datetime(
    raw_data['OUTAGE.START.DATE'].astype(str) + ' ' + raw_data['OUTAGE.START.TIME'].astype(str)
)

raw_data['OUTAGE.END'] = pd.to_datetime(
    raw_data['OUTAGE.RESTORATION.DATE'].astype(str) + ' ' + raw_data['OUTAGE.RESTORATION.TIME'].astype(str)
)
raw_data = raw_data.drop(columns = ['OUTAGE.START.DATE', 'OUTAGE.START.TIME', 'OUTAGE.RESTORATION.DATE', 'OUTAGE.RESTORATION.TIME'])

raw_data['OUTAGE.START'].isna().sum()


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



np.int64(0)

#### Adding Seasons using Binning

In [8]:
seasons = {'(0, 1]': 'Winter', '(1, 4]': 'Spring', '(4, 7]': 'Summer', '(7, 10]': 'Fall', '(10, 12]': 'Winter'}

raw_data['SEASONAL.BINS'] = pd.cut(raw_data['MONTH'], bins = [0, 1, 4, 7, 10, 12])
raw_data['SEASONAL.BINS'] = raw_data['SEASONAL.BINS'].astype(str).map(seasons)
raw_data

Unnamed: 0,OBS,YEAR,MONTH,U.S._STATE,NERC.REGION,CLIMATE.REGION,ANOMALY.LEVEL,CAUSE.CATEGORY,CLIMATE.CATEGORY,CAUSE.CATEGORY.DETAIL,OUTAGE.DURATION,DEMAND.LOSS.MW,CUSTOMERS.AFFECTED,TOTAL.PRICE,TOTAL.SALES,TOTAL.CUSTOMERS,TOTAL.REALGSP,OUTAGE.START,OUTAGE.END,SEASONAL.BINS
1,1.0,2011.0,7.0,Minnesota,MRO,East North Central,-0.3,severe weather,normal,,3060,,70000.0,9.28,6562520,2.60e+06,274182,2011-07-01 17:00:00,2011-07-03 20:00:00,Summer
2,2.0,2014.0,5.0,Minnesota,MRO,East North Central,-0.1,intentional attack,normal,vandalism,1,,,9.28,5284231,2.64e+06,291955,2014-05-11 18:38:00,2014-05-11 18:39:00,Summer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1532,1532.0,2009.0,8.0,South Dakota,RFC,West North Central,0.5,islanding,warm,,59,84,,7.67,924051,4.36e+05,36504,2009-08-29 22:54:00,2009-08-29 23:53:00,Fall
1533,1533.0,2009.0,8.0,South Dakota,MRO,West North Central,0.5,islanding,warm,,181,373,,7.67,924051,4.36e+05,36504,2009-08-29 11:00:00,2009-08-29 14:01:00,Fall


#### Simple Handling of NaN Values

In [9]:
raw_data.loc[raw_data['CUSTOMERS.AFFECTED'] == 0, 'CUSTOMERS.AFFECTED'] = np.nan
raw_data.loc[raw_data['OUTAGE.DURATION'] == 0, 'OUTAGE.DURATION'] = np.nan
print(raw_data.head().to_markdown(index=False))

|   OBS |   YEAR |   MONTH | U.S._STATE   | NERC.REGION   | CLIMATE.REGION     |   ANOMALY.LEVEL | CAUSE.CATEGORY     | CLIMATE.CATEGORY   | CAUSE.CATEGORY.DETAIL   |   OUTAGE.DURATION |   DEMAND.LOSS.MW |   CUSTOMERS.AFFECTED |   TOTAL.PRICE |   TOTAL.SALES |   TOTAL.CUSTOMERS |   TOTAL.REALGSP | OUTAGE.START        | OUTAGE.END          | SEASONAL.BINS   |
|------:|-------:|--------:|:-------------|:--------------|:-------------------|----------------:|:-------------------|:-------------------|:------------------------|------------------:|-----------------:|---------------------:|--------------:|--------------:|------------------:|----------------:|:--------------------|:--------------------|:----------------|
|     1 |   2011 |       7 | Minnesota    | MRO           | East North Central |            -0.3 | severe weather     | normal             | nan                     |              3060 |              nan |                70000 |          9.28 |       6562520 |       2.5957e+06 

### EDA

In [10]:
cause_pdf = px.histogram(raw_data, x = 'CAUSE.CATEGORY', title = 'Distributions of Observations by Cause', histnorm = 'probability density')
cause_pdf.update_layout(xaxis_title = 'Cause', yaxis_title = 'Frequency', legend_title_text = 'Climate Category', showlegend = True)
cause_pdf.show()

In [11]:
states_url = f'https://raw.githubusercontent.com/python-visualization/folium/main/examples/data/us-states.json'

states_data = raw_data['U.S._STATE'].value_counts().reset_index(drop = False)
states_data

Unnamed: 0,U.S._STATE,count
0,California,198
1,Texas,122
...,...,...
47,South Dakota,2
48,North Dakota,1


In [12]:

m = folium.Map(location=[37.0902, -95.7129], zoom_start=4) # Centered on the US
folium.Choropleth(
    geo_data=states_url, 
    name='State Frequency',
    data=states_data, 
    columns=['U.S._STATE', 'count'],  # Replace 'state' and 'frequency' with your column names
    key_on='feature.properties.name', 
    fill_color='YlGn', 
    fill_opacity=0.7, 
    line_opacity=0.2, 
    legend_name='Frequency'
).add_to(m)
m

In [13]:
raw_data

Unnamed: 0,OBS,YEAR,MONTH,U.S._STATE,NERC.REGION,CLIMATE.REGION,ANOMALY.LEVEL,CAUSE.CATEGORY,CLIMATE.CATEGORY,CAUSE.CATEGORY.DETAIL,OUTAGE.DURATION,DEMAND.LOSS.MW,CUSTOMERS.AFFECTED,TOTAL.PRICE,TOTAL.SALES,TOTAL.CUSTOMERS,TOTAL.REALGSP,OUTAGE.START,OUTAGE.END,SEASONAL.BINS
1,1.0,2011.0,7.0,Minnesota,MRO,East North Central,-0.3,severe weather,normal,,3060,,70000.0,9.28,6562520,2.60e+06,274182,2011-07-01 17:00:00,2011-07-03 20:00:00,Summer
2,2.0,2014.0,5.0,Minnesota,MRO,East North Central,-0.1,intentional attack,normal,vandalism,1,,,9.28,5284231,2.64e+06,291955,2014-05-11 18:38:00,2014-05-11 18:39:00,Summer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1532,1532.0,2009.0,8.0,South Dakota,RFC,West North Central,0.5,islanding,warm,,59,84,,7.67,924051,4.36e+05,36504,2009-08-29 22:54:00,2009-08-29 23:53:00,Fall
1533,1533.0,2009.0,8.0,South Dakota,MRO,West North Central,0.5,islanding,warm,,181,373,,7.67,924051,4.36e+05,36504,2009-08-29 11:00:00,2009-08-29 14:01:00,Fall


### Bivariate Analysis

In [14]:
duration_customers_plt = px.violin(raw_data, x = 'SEASONAL.BINS', y = 'ANOMALY.LEVEL')
duration_customers_plt.show()

In [15]:
px.scatter(raw_data, x = 'CLIMATE.REGION', y = 'OUTAGE.DURATION')

In [16]:
price_climate_cause = raw_data.pivot_table(index = 'CAUSE.CATEGORY', columns = 'CLIMATE.REGION', values = 'TOTAL.PRICE', aggfunc = 'mean')
price_climate_cause

CLIMATE.REGION,Central,East North Central,Northeast,Northwest,South,Southeast,Southwest,West,West North Central
CAUSE.CATEGORY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
equipment failure,7.65,7.92,12.35,6.9,8.24,10.15,8.07,12.74,6.21
fuel supply emergency,7.44,10.51,16.03,7.29,9.17,,8.65,13.82,
...,...,...,...,...,...,...,...,...,...
severe weather,8.27,9.34,12.45,6.83,8.63,8.36,8.13,12.79,6.37
system operability disruption,8.4,8.24,13.71,6.32,8.66,9.11,8.68,12.37,


## Step 3: Assessment of Missingness

### Part 3.1: Missingness Mechanism Analysis
If we look at our feature of interest, we can see that approximately 30 percent of values are missing. This is most likely not MCAR and requires more analysis to see if its missingness depends on other features.

In [17]:
raw_data[['CAUSE.CATEGORY.DETAIL']]

Unnamed: 0,CAUSE.CATEGORY.DETAIL
1,
2,vandalism
...,...
1532,
1533,


In [18]:
raw_data['CAUSE.CATEGORY.DETAIL'].isna().mean()

np.float64(0.3035230352303523)

In [19]:
raw_data[['CAUSE.CATEGORY.DETAIL', 'CAUSE.CATEGORY']]

Unnamed: 0,CAUSE.CATEGORY.DETAIL,CAUSE.CATEGORY
1,,severe weather
2,vandalism,intentional attack
...,...,...
1532,,islanding
1533,,islanding


In [84]:
''' This cell defines valid, relevant test statistics for permutation and 
hypothesis tests.
'''
def tvd(dist1: pd.Series, dist2: pd.Series):
    return (dist1.value_counts(normalize = True) - dist2.value_counts(normalize = True)).abs().sum() / 2

def ks(dist1: pd.Series, dist2: pd.Series):
    return scipy.stats.ks_2samp(dist1, dist2).statistic

In [88]:
''' This function takes in a dataframe, a column with missing values and a 
column to analyze the type of missingness mechanism with. It will return
a p-value and an associative True or False indicating if missing_col is MAR 
withrespect to col. To conduct the permutation test, it will use the given 
test_stat.

The function will also graph the distribution of simulated test statistics 
with a line indicating where the observed lies.
'''
    
def identify_mar(df, missing_col, col, test_stat, N = 1000, alpha = 0.05):
    missing_dist = df[[col]].assign(is_missing = df[missing_col].isna()).dropna(subset = [col])
    observed = test_stat(missing_dist[missing_dist['is_missing']][col], missing_dist[~missing_dist['is_missing']][col])
    simulations = np.array([])
    for _ in range(N):
        missing_dist['is_missing'] = np.random.permutation(missing_dist['is_missing'])
        simulated = test_stat(missing_dist[missing_dist['is_missing']][col], missing_dist[~missing_dist['is_missing']][col])
        simulations = np.append(simulations, simulated)
    
    simulations = simulations[~np.isnan(simulations)]
    fig = px.histogram(x = simulations, title = f'MAR Test of {missing_col} against {col}', labels={'x': 'Simulated Test Statistics'}, histnorm = 'probability')
    fig.add_vline(x=observed, line_color='red', line_width=2, annotation_text='Observed', annotation_position='top right')
    fig.show()    
    p_value = (simulations > observed).mean()
    return p_value, p_value < alpha

In [22]:
p_val, is_mar = identify_mar(raw_data, 'CAUSE.CATEGORY.DETAIL', 'U.S._STATE', tvd, 1000, 0.05)
p_val, is_mar

(np.float64(0.0), np.True_)

In [23]:
p_val2, is_mar2 = identify_mar(raw_data, 'OUTAGE.DURATION', 'MONTH', tvd, 1000, 0.05)
p_val2, is_mar2

(np.float64(0.583), np.False_)

Clearly, the `CAUSE.CATEGORY.DETAIL_` column is **MAR** with respect to `CAUSE.CATEGORY_`. In other words, the missingness for cause category details are *highly* dependent on what the actual cause category is, which makes a lot of sense intuitively.

## Step 4: Hypothesis Testing

$H_0$: The proportion of each cause category is uniformly distributed across each season, for each cause category.

$H_a$: The proportion of each cause category is not uniformly distributed across each season, for each cause category.

In [24]:
''' Calculates the TVD for 2D distributions across each column (axis = 0). 
The resulting TVD's will be aggregated (sum or mean) to represent the TVD of 
the whole distributions. Assumes the probability distributions are already
calculated and provided.
'''
def tvd_2d(dist1: pd.DataFrame, dist2: pd.DataFrame, aggfunc):
    return (np.sum(np.abs(dist1 - dist2), axis = 0) / 2).agg(aggfunc)

In [25]:
seasonal_counts = raw_data.pivot_table(values = 'OBS', columns = 'SEASONAL.BINS', index = 'CAUSE.CATEGORY', aggfunc = 'size')

cause_totals = seasonal_counts.sum(axis = 0)
seasonal_totals = seasonal_counts.sum(axis = 1)
expected_proportions = seasonal_totals / seasonal_counts.sum().sum()

observed_dist = seasonal_counts / cause_totals
expected_dist = pd.DataFrame(data = {col: expected_proportions for col in observed_dist.columns})
observed_tvd = tvd_2d(expected_dist, observed_dist, 'sum')
observed_dist

SEASONAL.BINS,Fall,Spring,Summer,Winter
CAUSE.CATEGORY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
equipment failure,0.02,0.05,0.05,0.03
fuel supply emergency,0.02,0.04,0.02,0.03
...,...,...,...,...
severe weather,0.57,0.40,0.51,0.53
system operability disruption,0.07,0.11,0.09,0.06


In [26]:
NUM_SIMULATIONS = 1000
sim_season_df = raw_data[['SEASONAL.BINS', 'CAUSE.CATEGORY', 'OBS']]
simulations = []
for _ in range(NUM_SIMULATIONS):
    sim_season_df['SEASONAL.BINS'] = np.random.permutation(sim_season_df['SEASONAL.BINS'])
    sim_counts = sim_season_df.pivot_table(values = 'OBS', columns = 'SEASONAL.BINS', index = 'CAUSE.CATEGORY', aggfunc = 'size')

    sim_cause_totals = sim_counts.sum(axis = 0)
    sim_seasonal_totals = sim_counts.sum(axis = 1)
    sim_expected_proportions = sim_seasonal_totals / sim_counts.sum().sum()

    sim_observed_dist = sim_counts / sim_cause_totals
    sim_expected_dist = pd.DataFrame(data = {col: sim_expected_proportions for col in sim_observed_dist.columns})
    sim_tvd = tvd_2d(sim_expected_dist, sim_observed_dist, 'sum')
    
    simulations.append(sim_tvd)
simulations

[np.float64(0.16026714502152867),
 np.float64(0.1527005035911017),
 np.float64(0.13768125701699752),
 np.float64(0.18453378264811734),
 np.float64(0.19074008671607195),
 np.float64(0.1271664617430849),
 np.float64(0.15071501303794624),
 np.float64(0.21244514440204446),
 np.float64(0.14420014683048013),
 np.float64(0.12646071304281425),
 np.float64(0.14554195938883396),
 np.float64(0.17085666685272974),
 np.float64(0.12045052673384661),
 np.float64(0.1873007994356884),
 np.float64(0.09773351326250175),
 np.float64(0.14164007611761692),
 np.float64(0.1133351925280717),
 np.float64(0.1729430429766036),
 np.float64(0.10066528181156773),
 np.float64(0.1840117507530693),
 np.float64(0.11715843939743827),
 np.float64(0.08648152579075728),
 np.float64(0.10114462176402486),
 np.float64(0.16597041144837343),
 np.float64(0.18646941990537227),
 np.float64(0.14668112158342667),
 np.float64(0.13129896924553336),
 np.float64(0.17293619293074092),
 np.float64(0.13679345764987483),
 np.float64(0.171970

In [27]:
fig_hyp1 = px.histogram(simulations, histnorm = 'probability', title = 'Cause Category by Season Distribution of TVD')
fig_hyp1.add_vline(x=observed_tvd, line_color='red', line_width=2, annotation_text='Observed', annotation_position='top right')
fig_hyp1.show()

In [28]:
p_val_hyp1 = (observed_tvd < simulations).mean()
p_val_hyp1

np.float64(0.0)

### Test Number 2

$H_0$: The distributions of mean affected customers for each state is the same for observations from 2005 and 2006

$H_a$: The distributions of mean affected customers for each state is different for observations from 2005 and 2006

In [29]:
raw_data['YEAR'] = raw_data['YEAR'].astype(float)
customers_dist_2005 = raw_data[raw_data['YEAR'] == 2005.0][['U.S._STATE', 'CUSTOMERS.AFFECTED']].groupby('U.S._STATE').mean()
customers_dist_2005 /= customers_dist_2005.sum(axis = 0)
customers_dist_2005 = customers_dist_2005['CUSTOMERS.AFFECTED'].dropna()

customers_dist_2006 = raw_data[raw_data['YEAR'] == 2006.0][['U.S._STATE', 'CUSTOMERS.AFFECTED']].groupby('U.S._STATE').mean()
customers_dist_2006 /= customers_dist_2006.sum(axis = 0)
customers_dist_2006 = customers_dist_2006['CUSTOMERS.AFFECTED'].dropna()

observed_tvd_customers = np.abs(customers_dist_2006 - customers_dist_2005).sum() / 2
observed_tvd_customers


np.float64(0.1931225644342009)

In [30]:
N_CUSTOMERS = 1000
sim_customers_2006 = np.random.multinomial(N_CUSTOMERS, pvals = customers_dist_2005, size = 100_000) / N_CUSTOMERS
sim_tvds_customers = np.sum(np.abs(sim_customers_2006 - customers_dist_2005.to_numpy()), axis = 1) / 2
hist = px.histogram(sim_tvds_customers).add_vline(x=observed_tvd_customers, line_color='red', line_width=2, annotation_text='Observed', annotation_position='top right')
hist.show()
sim_tvds_customers.mean(), sim_tvds_customers.std()

(np.float64(0.04384504889426157), np.float64(0.009288708952943455))

In [31]:
p_val_hyp2 = (np.array(sim_tvds_customers) >= observed_tvd_customers).mean()
p_val_hyp2

np.float64(0.0)

## Step 5: Framing a Prediction Problem

We plan to predict the cause column, which represents the general cause of power outages. Since this column contains categorical data, it is a classification problem. We will use features such as the number of people affected, outage duration, demand loss, and climatic conditions to build a predictive model. We aim to uncover patterns that help identify the causes of power outages based on their associated impacts and conditions. This will provide valuable insights for improving outage management and preparedness.

## Step 6: Baseline Model

In [32]:
mdl_data = raw_data.drop(columns = ['CAUSE.CATEGORY.DETAIL', 'OUTAGE.START', 'OUTAGE.END', 'SEASONAL.BINS', 'NERC.REGION', 'OBS'])
mdl_data.isna().sum(axis = 0).sort_values(ascending=False)

DEMAND.LOSS.MW        672
CUSTOMERS.AFFECTED    622
                     ... 
TOTAL.CUSTOMERS         0
TOTAL.REALGSP           0
Length: 14, dtype: int64

In [33]:
mdl_data[['DEMAND.LOSS.MW', 'CUSTOMERS.AFFECTED', 'OUTAGE.DURATION']] = mdl_data[['DEMAND.LOSS.MW', 'CUSTOMERS.AFFECTED', 'OUTAGE.DURATION']].fillna(0)
mdl_data.dropna(subset = ['TOTAL.PRICE', 'TOTAL.SALES', 'CLIMATE.REGION'], inplace = True)


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [34]:

mdl_base = Pipeline(steps = [
    ('data_encoding', ColumnTransformer(transformers = [
        ('one_hot', OneHotEncoder(handle_unknown = 'ignore'), ['U.S._STATE', 'CLIMATE.REGION', 'CLIMATE.CATEGORY']),
        ('ordinal', OrdinalEncoder(), ['MONTH']),
    ], remainder = 'passthrough')),
    ('random_forest', RandomForestClassifier())
])

In [35]:
X = mdl_data.drop(columns = ['CAUSE.CATEGORY'])
y = mdl_data['CAUSE.CATEGORY']
X_train, X_test, y_train, y_test = train_test_split(X, y)

mdl_data.isna().sum(axis = 0).sort_values(ascending=False)

YEAR               0
MONTH              0
                  ..
TOTAL.CUSTOMERS    0
TOTAL.REALGSP      0
Length: 14, dtype: int64

In [36]:
mdl_base.fit(X_train, y_train)



The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).




In [37]:
mdl_base.score(X_train, y_train)

1.0

In [38]:
y_base_preds = mdl_base.predict(X_test)
mdl_base.score(X_test, y_test), float(f1_score(y_test, y_base_preds, average = 'macro'))

(0.8246575342465754, 0.6457706380306999)

In [44]:
from sklearn.metrics import precision_score, recall_score
precision_score(y_test, y_base_preds, average = 'macro'), recall_score(y_test, y_base_preds, average = 'macro')

(np.float64(0.7017526078646041), np.float64(0.6072619208412817))

## Step 7: Final Model

In [50]:
from sklearn.model_selection import GridSearchCV

params_1 = {'criterion': ['gini', 'entropy', 'log_loss'], 'max_depth': np.arange(5, 25), 'n_estimators': [5, 10, 20, 50, 100, 200]}

mdl_1 = Pipeline(steps = [
    ('data_encoding', ColumnTransformer(transformers = [
        ('one_hot', OneHotEncoder(handle_unknown = 'ignore'), ['U.S._STATE', 'CLIMATE.REGION', 'CLIMATE.CATEGORY']),
        ('ordinal', OrdinalEncoder(), ['MONTH']),
    ], remainder = 'passthrough')),
    ('gscv', GridSearchCV(RandomForestClassifier(), param_grid = params_1, cv = 5))
])

In [55]:
mdl_1.fit(X_train, y_train)


invalid value encountered in cast



The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).




In [56]:
y_1_preds = mdl_1.predict(X_test)
mdl_1.score(X_test, y_test), f1_score(y_test, y_1_preds, average = 'macro') 

(0.8273972602739726, np.float64(0.6395189726930023))

In [57]:
precision_score(y_test, y_1_preds, average = 'macro'), recall_score(y_test, y_1_preds, average = 'macro')

(np.float64(0.7132221198582754), np.float64(0.6015470595398071))

Still could use improvement, so we look at the feature space $\mathbb(F)$ now

In [106]:
mdl_2_data = raw_data.copy().drop(columns = ['OBS', 'CAUSE.CATEGORY.DETAIL', 'OUTAGE.START', 'OUTAGE.END', 'SEASONAL.BINS'])
mdl_2_data.isna().sum(axis = 0).sort_values(ascending = False).head(5)

DEMAND.LOSS.MW        672
CUSTOMERS.AFFECTED    622
OUTAGE.DURATION        78
TOTAL.PRICE            12
TOTAL.SALES            12
dtype: int64

In [None]:
from collections import defaultdict
num_columns = ['YEAR', 'MONTH', 'ANOMALY.LEVEL', 'OUTAGE.DURATION', 'DEMAND.LOSS.MW', 'CUSTOMERS.AFFECTED', 'TOTAL.PRICE', 'TOTAL.SALES', 'TOTAL.CUSTOMERS', 'TOTAL.REALGSP']
missing_cols = ['DEMAND.LOSS.MW', 'CUSTOMERS.AFFECTED']
mar_status = defaultdict(lambda: [])
for missing_col in missing_cols:
    for col in num_columns:
        p_customers, is_mar_customers = identify_mar(mdl_2_data, missing_col, col, ks, alpha = 0.01)
        mar_status[missing_col].append((col, p_customers, is_mar_customers))

mar_status

Since both `CUSTOMERS.AFFECTED` and `DEMAND.LOSS.MW` are shown to be **MAR** with respect to almost every other feature, we can impute these using techniques like K-Nearest Neighbors or Random Forest Regression

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import GridSearchCV

params_1 = {
    'criterion': ['gini', 'entropy', 'log_loss'], 
    'max_depth': np.arange(5, 25), 
    'n_estimators': [5, 10, 20, 50, 100, 200]
}

mdl_2 = Pipeline(steps=[
    ('data_encoding', ColumnTransformer(
        transformers=[
            ('one_hot', OneHotEncoder(handle_unknown='ignore', sparse_output = False), ['U.S._STATE', 'CLIMATE.REGION', 'CLIMATE.CATEGORY', 'NERC.REGION']),
            ('ordinal', OrdinalEncoder(), ['MONTH']),
            ('standard', StandardScaler(), ['CUSTOMERS.AFFECTED'])
        ], remainder='passthrough')
    ),
    ('imputer', IterativeImputer(
        estimator=RandomForestRegressor(), 
        max_iter=10, random_state=0 
    )),
    ('gscv', GridSearchCV(RandomForestClassifier(), param_grid=params_1, cv=5))
])

In [119]:
y_2 = mdl_2_data['CAUSE.CATEGORY']
X_2 = mdl_2_data.drop(columns = ['CAUSE.CATEGORY' ])
X_2_train, X_2_test, y_2_train, y_2_test = train_test_split(X_2, y_2)

In [None]:
mdl_2.fit(X_2_train, y_2_train)


[IterativeImputer] Early stopping criterion not reached.


invalid value encountered in cast



The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).




In [118]:
mdl_2.score(X_2_train, y_2_train)

0.991869918699187

In [121]:
y_2_preds = mdl_2.predict(X_2_test)
mdl_2.score(X_2_test, y_2_test), f1_score(y_2_test, y_2_preds, average = 'macro')


(0.94579945799458, np.float64(0.9239085687864604))

In [122]:
precision_score(y_2_test, y_2_preds, average = 'macro'), recall_score(y_2_test, y_2_preds, average = 'macro')

(np.float64(0.9569334325595594), np.float64(0.8977236888639247))

## Step 8: Fairness Analysis

In [None]:
# TODO