# Whats the Cause of your Power Outage?

**Name(s)**: Kaii Bijlani, Ketan Mittal

**Website Link**: https://k1mittal.github.io/Causes_of_Power_Outages/

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import scipy    
import folium
from folium.plugins import HeatMap
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, Binarizer, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score

import plotly.express as px
pd.options.plotting.backend = 'plotly'

from dsc80_utils import * # Feel free to uncomment and use this.

## Step 1: Introduction

### Interesting Questions:
- How does the cause of the power outages indicate other factors, for example, does whether related power outages result in more people having no power? Can we predict the cause of power outages?
- Is there a correlation between the time and other factors, do power outages happen in one month specifically? Has the number of power outages decreased over time? Can we predict when the next power outage is using a time series prediction?
- Can we predict the number of people affected by a power outage given certain factors? Are the number of people affected by power outages correlated to other factors?
- Can we predict the duration of power outages given certain factors? How are the duration of power outages correlated to other factors?

### Our Choice:
We decided to answer the first bullet point, which is what aspects of power outage are related to each category of cause. 

## Step 2: Data Cleaning and Exploratory Data Analysis

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 5)

In [3]:
# Data Init and Cleanin
raw_data = pd.read_excel(Path('./outage.xlsx'))
raw_data.columns = [f'{raw_data.columns[i]}' for i in range(len(raw_data.columns))]
raw_data = raw_data.iloc[1:, 1:].loc[:, ['OBS', 'YEAR', 'MONTH', 'U.S._STATE', 'NERC.REGION', 'CLIMATE.REGION', 'ANOMALY.LEVEL', 'OUTAGE.START.DATE', 'OUTAGE.START.TIME', 'OUTAGE.RESTORATION.DATE', 'OUTAGE.RESTORATION.TIME', 'CAUSE.CATEGORY', 'CLIMATE.CATEGORY', 'CAUSE.CATEGORY.DETAIL', 'OUTAGE.DURATION', 'DEMAND.LOSS.MW', 'CUSTOMERS.AFFECTED', 'TOTAL.PRICE', 'TOTAL.SALES', 'TOTAL.CUSTOMERS', 'TOTAL.REALGSP']]


Below we display the only missing values for outage start dates. Note how there are only 9 entries, and each of these entries also have missing relevant features. Therefore, imputation is not something of interest in this case and we can simply drop these values

In [4]:
raw_data = raw_data.dropna(subset = ['OUTAGE.START.DATE', 'OUTAGE.START.TIME', 'OUTAGE.RESTORATION.DATE', 'OUTAGE.RESTORATION.TIME'], how = 'any')

raw_data['OUTAGE.START'] = pd.to_datetime(
    raw_data['OUTAGE.START.DATE'].astype(str) + ' ' + raw_data['OUTAGE.START.TIME'].astype(str)
)

raw_data['OUTAGE.END'] = pd.to_datetime(
    raw_data['OUTAGE.RESTORATION.DATE'].astype(str) + ' ' + raw_data['OUTAGE.RESTORATION.TIME'].astype(str)
)
raw_data = raw_data.drop(columns = ['OUTAGE.START.DATE', 'OUTAGE.START.TIME', 'OUTAGE.RESTORATION.DATE', 'OUTAGE.RESTORATION.TIME'])

raw_data['OUTAGE.START'].isna().sum()


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



np.int64(0)

#### Adding Seasons using Binning

In [5]:
seasons = {'(0, 1]': 'Winter', '(1, 4]': 'Spring', '(4, 7]': 'Summer', '(7, 10]': 'Fall', '(10, 12]': 'Winter'}

raw_data['SEASONAL.BINS'] = pd.cut(raw_data['MONTH'], bins = [0, 1, 4, 7, 10, 12])
raw_data['SEASONAL.BINS'] = raw_data['SEASONAL.BINS'].astype(str).map(seasons)
raw_data

Unnamed: 0,OBS,YEAR,MONTH,U.S._STATE,NERC.REGION,CLIMATE.REGION,ANOMALY.LEVEL,CAUSE.CATEGORY,CLIMATE.CATEGORY,CAUSE.CATEGORY.DETAIL,OUTAGE.DURATION,DEMAND.LOSS.MW,CUSTOMERS.AFFECTED,TOTAL.PRICE,TOTAL.SALES,TOTAL.CUSTOMERS,TOTAL.REALGSP,OUTAGE.START,OUTAGE.END,SEASONAL.BINS
1,1.0,2011.0,7.0,Minnesota,MRO,East North Central,-0.3,severe weather,normal,,3060,,70000.0,9.28,6562520,2.60e+06,274182,2011-07-01 17:00:00,2011-07-03 20:00:00,Summer
2,2.0,2014.0,5.0,Minnesota,MRO,East North Central,-0.1,intentional attack,normal,vandalism,1,,,9.28,5284231,2.64e+06,291955,2014-05-11 18:38:00,2014-05-11 18:39:00,Summer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1532,1532.0,2009.0,8.0,South Dakota,RFC,West North Central,0.5,islanding,warm,,59,84,,7.67,924051,4.36e+05,36504,2009-08-29 22:54:00,2009-08-29 23:53:00,Fall
1533,1533.0,2009.0,8.0,South Dakota,MRO,West North Central,0.5,islanding,warm,,181,373,,7.67,924051,4.36e+05,36504,2009-08-29 11:00:00,2009-08-29 14:01:00,Fall


#### Simple Handling of NaN Values

In [6]:
raw_data.loc[raw_data['CUSTOMERS.AFFECTED'] == 0, 'CUSTOMERS.AFFECTED'] = np.nan
raw_data.loc[raw_data['OUTAGE.DURATION'] == 0, 'OUTAGE.DURATION'] = np.nan


### EDA

In [7]:
cause_pdf = px.histogram(raw_data, x = 'CAUSE.CATEGORY', title = 'Distributions of Observations by Cause', histnorm = 'probability density')
cause_pdf.update_layout(xaxis_title = 'Cause', yaxis_title = 'Frequency', legend_title_text = 'Climate Category', showlegend = True)
cause_pdf.show()

In [8]:
states_url = f'https://raw.githubusercontent.com/python-visualization/folium/main/examples/data/us-states.json'

states_data = raw_data['U.S._STATE'].value_counts().reset_index(drop = False)
states_data

Unnamed: 0,U.S._STATE,count
0,California,198
1,Texas,122
...,...,...
47,South Dakota,2
48,North Dakota,1


In [None]:

m = folium.Map(location=[37.0902, -95.7129], zoom_start=4) # Centered on the US
folium.Choropleth(
    geo_data=states_url, 
    name='State Frequency',
    data=states_data, 
    columns=['U.S._STATE', 'count'],  # Replace 'state' and 'frequency' with your column names
    key_on='feature.properties.name', 
    fill_color='YlGn', 
    fill_opacity=0.7, 
    line_opacity=0.2, 
    legend_name='Frequency'
).add_to(m)
m

In [None]:
raw_data

Unnamed: 0,OBS,YEAR,MONTH,U.S._STATE,NERC.REGION,CLIMATE.REGION,ANOMALY.LEVEL,CAUSE.CATEGORY,CLIMATE.CATEGORY,CAUSE.CATEGORY.DETAIL,OUTAGE.DURATION,DEMAND.LOSS.MW,CUSTOMERS.AFFECTED,TOTAL.PRICE,TOTAL.SALES,TOTAL.CUSTOMERS,TOTAL.REALGSP,OUTAGE.START,OUTAGE.END,SEASONAL.BINS
1,1.0,2011.0,7.0,Minnesota,MRO,East North Central,-0.3,severe weather,normal,,3060.0,,70000.0,9.28,6562520.0,2600000.0,274182,2011-07-01 17:00:00,2011-07-03 20:00:00,Summer
2,2.0,2014.0,5.0,Minnesota,MRO,East North Central,-0.1,intentional attack,normal,vandalism,1.0,,,9.28,5284231.0,2640000.0,291955,2014-05-11 18:38:00,2014-05-11 18:39:00,Summer
3,3.0,2010.0,10.0,Minnesota,MRO,East North Central,-1.5,severe weather,cold,heavy wind,3000.0,,70000.0,8.15,5222116.0,2590000.0,267895,2010-10-26 20:00:00,2010-10-28 22:00:00,Fall
4,4.0,2012.0,6.0,Minnesota,MRO,East North Central,-0.1,severe weather,normal,thunderstorm,2550.0,,68200.0,9.19,5787064.0,2610000.0,277627,2012-06-19 04:30:00,2012-06-20 23:00:00,Summer
5,5.0,2015.0,7.0,Minnesota,MRO,East North Central,1.2,severe weather,warm,,1740.0,250.0,250000.0,10.43,5970339.0,2670000.0,292023,2015-07-18 02:00:00,2015-07-19 07:00:00,Summer
6,6.0,2010.0,11.0,Minnesota,MRO,East North Central,-1.4,severe weather,cold,winter storm,1860.0,,60000.0,8.28,5374150.0,2590000.0,267895,2010-11-13 15:00:00,2010-11-14 22:00:00,Winter
7,7.0,2010.0,7.0,Minnesota,MRO,East North Central,-0.9,severe weather,cold,tornadoes,2970.0,,63000.0,9.12,6374935.0,2590000.0,267895,2010-07-17 20:30:00,2010-07-19 22:00:00,Summer
8,8.0,2005.0,6.0,Minnesota,MRO,East North Central,0.2,severe weather,normal,thunderstorm,3960.0,75.0,300000.0,7.36,5607498.0,2470000.0,268496,2005-06-08 04:00:00,2005-06-10 22:00:00,Summer
9,9.0,2015.0,3.0,Minnesota,MRO,East North Central,0.6,intentional attack,warm,sabotage,155.0,20.0,5940.0,9.03,5599486.0,2670000.0,292023,2015-03-16 07:31:00,2015-03-16 10:06:00,Spring
10,10.0,2013.0,6.0,Minnesota,MRO,East North Central,-0.2,severe weather,normal,hailstorm,3621.0,,400000.0,10.0,5490631.0,2620000.0,284542,2013-06-21 17:39:00,2013-06-24 06:00:00,Summer


### Bivariate Analysis

In [None]:
duration_customers_plt = px.violin(raw_data, x = 'SEASONAL.BINS', y = 'ANOMALY.LEVEL')
duration_customers_plt.show()

In [None]:
px.scatter(raw_data, x = 'CLIMATE.REGION', y = 'OUTAGE.DURATION')

In [None]:
price_climate_cause = raw_data.pivot_table(index = 'CAUSE.CATEGORY', columns = 'CLIMATE.REGION', values = 'TOTAL.PRICE', aggfunc = 'mean')
price_climate_cause

CLIMATE.REGION,Central,East North Central,Northeast,Northwest,South,Southeast,Southwest,West,West North Central
CAUSE.CATEGORY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
equipment failure,7.65,7.92,12.35,6.9,8.24,10.15,8.07,12.74,6.21
fuel supply emergency,7.44,10.51,16.03,7.29,9.17,,8.65,13.82,
intentional attack,8.95,9.27,12.83,7.24,8.64,9.23,8.57,12.91,7.42
islanding,6.91,10.56,12.82,7.16,7.64,,9.61,13.74,7.84
public appeal,9.49,10.61,13.09,5.71,8.41,8.34,7.73,12.89,8.02
severe weather,8.27,9.34,12.45,6.83,8.63,8.36,8.13,12.79,6.37
system operability disruption,8.4,8.24,13.71,6.32,8.66,9.11,8.68,12.37,


## Step 3: Assessment of Missingness

### Part 3.1: Missingness Mechanism Analysis
If we look at our feature of interest, we can see that approximately 30 percent of values are missing. This is most likely not MCAR and requires more analysis to see if its missingness depends on other features.

In [None]:
raw_data[['CAUSE.CATEGORY.DETAIL']]

Unnamed: 0,CAUSE.CATEGORY.DETAIL
1,
2,vandalism
3,heavy wind
4,thunderstorm
5,
6,winter storm
7,tornadoes
8,thunderstorm
9,sabotage
10,hailstorm


In [None]:
raw_data['CAUSE.CATEGORY.DETAIL'].isna().mean()

np.float64(0.3035230352303523)

In [None]:
raw_data[['CAUSE.CATEGORY.DETAIL', 'CAUSE.CATEGORY']]

Unnamed: 0,CAUSE.CATEGORY.DETAIL,CAUSE.CATEGORY
1,,severe weather
2,vandalism,intentional attack
3,heavy wind,severe weather
4,thunderstorm,severe weather
5,,severe weather
6,winter storm,severe weather
7,tornadoes,severe weather
8,thunderstorm,severe weather
9,sabotage,intentional attack
10,hailstorm,severe weather


In [None]:
''' This cell defines valid, relevant test statistics for permutation and 
hypothesis tests.
'''
def tvd(dist1: pd.Series, dist2: pd.Series):
    return (dist1.value_counts(normalize = True) - dist2.value_counts(normalize = True)).abs().sum() / 2

def ks(dist1: pd.Series, dist2: pd.Series):
    return scipy.stats.ks_2samp(dist1, dist2)

In [None]:
''' This function takes in a dataframe, a column with missing values and a 
column to analyze the type of missingness mechanism with. It will return
a p-value and an associative True or False indicating if missing_col is MAR 
withrespect to col. To conduct the permutation test, it will use the given 
test_stat.

The function will also graph the distribution of simulated test statistics 
with a line indicating where the observed lies.
'''
    
def identify_mar(df, missing_col, col, test_stat, N, alpha):
    missing_dist = df[[col]].assign(is_missing = df[missing_col].isna())
    observed = test_stat(missing_dist[missing_dist['is_missing']][col], missing_dist[~missing_dist['is_missing']][col])
    simulations = np.array([])
    for _ in range(N):
        missing_dist['is_missing'] = np.random.permutation(missing_dist['is_missing'])
        simulated = test_stat(missing_dist[missing_dist['is_missing']][col], missing_dist[~missing_dist['is_missing']][col])
        simulations = np.append(simulations, simulated)
    
    fig = px.histogram(x = simulations, title = f'Permutation Test Distribution', labels={'x': 'Simulated Test Statistics'}, histnorm = 'probability')
    fig.add_vline(x=observed, line_color='red', line_width=2, annotation_text='Observed', annotation_position='top right')
    fig.show()    
    
    p_value = (simulations > observed).mean()
    return p_value, p_value < alpha

In [None]:
p_val, is_mar = identify_mar(raw_data, 'CAUSE.CATEGORY.DETAIL', 'U.S._STATE', tvd, 1000, 0.05)
p_val, is_mar

(np.float64(0.0), np.True_)

In [None]:
p_val2, is_mar2 = identify_mar(raw_data, 'OUTAGE.DURATION', 'MONTH', tvd, 1000, 0.05)
p_val2, is_mar2

(np.float64(0.608), np.False_)

Clearly, the `CAUSE.CATEGORY.DETAIL_` column is **MAR** with respect to `CAUSE.CATEGORY_`. In other words, the missingness for cause category details are *highly* dependent on what the actual cause category is, which makes a lot of sense intuitively.

## Step 4: Hypothesis Testing

$H_0$: The proportion of each cause category is uniformly distributed across each season, for each cause category.

$H_a$: The proportion of each cause category is not uniformly distributed across each season, for each cause category.

In [None]:
''' Calculates the TVD for 2D distributions across each column (axis = 0). 
The resulting TVD's will be aggregated (sum or mean) to represent the TVD of 
the whole distributions. Assumes the probability distributions are already
calculated and provided.
'''
def tvd_2d(dist1: pd.DataFrame, dist2: pd.DataFrame, aggfunc):
    return (np.sum(np.abs(dist1 - dist2), axis = 0) / 2).agg(aggfunc)

In [None]:
seasonal_counts = raw_data.pivot_table(values = 'OBS', columns = 'SEASONAL.BINS', index = 'CAUSE.CATEGORY', aggfunc = 'size')

cause_totals = seasonal_counts.sum(axis = 0)
seasonal_totals = seasonal_counts.sum(axis = 1)
expected_proportions = seasonal_totals / seasonal_counts.sum().sum()

observed_dist = seasonal_counts / cause_totals
expected_dist = pd.DataFrame(data = {col: expected_proportions for col in observed_dist.columns})
observed_tvd = tvd_2d(expected_dist, observed_dist, 'sum')
observed_dist

SEASONAL.BINS,Fall,Spring,Summer,Winter
CAUSE.CATEGORY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
equipment failure,0.02,0.05,0.05,0.03
fuel supply emergency,0.02,0.04,0.02,0.03
intentional attack,0.23,0.35,0.23,0.3
islanding,0.03,0.03,0.04,0.02
public appeal,0.06,0.02,0.07,0.02
severe weather,0.57,0.4,0.51,0.53
system operability disruption,0.07,0.11,0.09,0.06


In [None]:
NUM_SIMULATIONS = 1000
sim_season_df = raw_data[['SEASONAL.BINS', 'CAUSE.CATEGORY', 'OBS']]
simulations = []
for _ in range(NUM_SIMULATIONS):
    sim_season_df['SEASONAL.BINS'] = np.random.permutation(sim_season_df['SEASONAL.BINS'])
    sim_counts = sim_season_df.pivot_table(values = 'OBS', columns = 'SEASONAL.BINS', index = 'CAUSE.CATEGORY', aggfunc = 'size')

    sim_cause_totals = sim_counts.sum(axis = 0)
    sim_seasonal_totals = sim_counts.sum(axis = 1)
    sim_expected_proportions = sim_seasonal_totals / sim_counts.sum().sum()

    sim_observed_dist = sim_counts / sim_cause_totals
    sim_expected_dist = pd.DataFrame(data = {col: sim_expected_proportions for col in sim_observed_dist.columns})
    sim_tvd = tvd_2d(sim_expected_dist, sim_observed_dist, 'sum')
    
    simulations.append(sim_tvd)
simulations

[np.float64(0.11738710444551893),
 np.float64(0.12331989636477558),
 np.float64(0.10590555675338789),
 np.float64(0.1750509678401592),
 np.float64(0.10982639976424069),
 np.float64(0.128656106048201),
 np.float64(0.13921177994407488),
 np.float64(0.11255076089574358),
 np.float64(0.17795937651590896),
 np.float64(0.13693987041385605),
 np.float64(0.13785550063907687),
 np.float64(0.13682048267567562),
 np.float64(0.10713448509381396),
 np.float64(0.14366526051953604),
 np.float64(0.20329231998744357),
 np.float64(0.13652886780035323),
 np.float64(0.12701579078643566),
 np.float64(0.10530673879264038),
 np.float64(0.1433147109515245),
 np.float64(0.20232841966227316),
 np.float64(0.10345012991212449),
 np.float64(0.18988305486390772),
 np.float64(0.09899530960103381),
 np.float64(0.14501961213914094),
 np.float64(0.09980055397376351),
 np.float64(0.11275934458990239),
 np.float64(0.10024100529387639),
 np.float64(0.08618897422792812),
 np.float64(0.11487342245117836),
 np.float64(0.1378

In [None]:
fig_hyp1 = px.histogram(simulations, histnorm = 'probability', title = 'Cause Category by Season Distribution of TVD')
fig_hyp1.add_vline(x=observed_tvd, line_color='red', line_width=2, annotation_text='Observed', annotation_position='top right')
fig_hyp1.show()

In [None]:
p_val_hyp1 = (observed_tvd < simulations).mean()
p_val_hyp1
np.array(simulations).mean(), np.array(simulations).std()

(np.float64(0.1431508193776529), np.float64(0.029407952597958427))

### Test Number 2

$H_0$: The distributions of mean affected customers for each state is the same for observations from 2000-2008 and 2008-2016.

$H_a$: The distributions of mean affected customers for each state is different for observations from 2000-2008 and 2008-2016.

In [None]:
raw_data['YEAR'] = raw_data['YEAR'].astype(float)
customers_dist_2005 = raw_data[raw_data['YEAR'] == 2005.0][['U.S._STATE', 'CUSTOMERS.AFFECTED']].groupby('U.S._STATE').mean()
customers_dist_2005 /= customers_dist_2005.sum(axis = 0)
customers_dist_2005 = customers_dist_2005['CUSTOMERS.AFFECTED'].dropna()

customers_dist_2006 = raw_data[raw_data['YEAR'] == 2006.0][['U.S._STATE', 'CUSTOMERS.AFFECTED']].groupby('U.S._STATE').mean()
customers_dist_2006 /= customers_dist_2006.sum(axis = 0)
customers_dist_2006 = customers_dist_2006['CUSTOMERS.AFFECTED'].dropna()

observed_tvd_customers = np.abs(customers_dist_2006 - customers_dist_2005).sum() / 2
observed_tvd_customers


np.float64(0.1931225644342009)

In [None]:
N_CUSTOMERS = 1000
sim_customers_2006 = np.random.multinomial(N_CUSTOMERS, pvals = customers_dist_2005, size = 100_000) / N_CUSTOMERS
sim_tvds_customers = np.sum(np.abs(sim_customers_2006 - customers_dist_2005.to_numpy()), axis = 1) / 2
px.histogram(sim_tvds_customers).add_vline(x=observed_tvd_customers, line_color='red', line_width=2, annotation_text='Observed', annotation_position='top right').show()
sim_tvds_customers.mean(), sim_tvds_customers.std()

(np.float64(0.04386220649403443), np.float64(0.009299661560213073))

In [None]:
p_val_hyp2 = (np.array(sim_tvds_customers) >= observed_tvd_customers).mean()
p_val_hyp2

np.float64(0.0)

## Step 5: Framing a Prediction Problem

We plan to predict the cause column, which represents the general cause of power outages. Since this column contains categorical data, it is a classification problem. We will use features such as the number of people affected, outage duration, demand loss, and climatic conditions to build a predictive model. We aim to uncover patterns that help identify the causes of power outages based on their associated impacts and conditions. This will provide valuable insights for improving outage management and preparedness.

## Step 6: Baseline Model

In [None]:
mdl_data = raw_data.drop(columns = ['CAUSE.CATEGORY.DETAIL', 'OUTAGE.START', 'OUTAGE.END', 'SEASONAL.BINS', 'NERC.REGION', 'OBS'])
mdl_data.isna().sum(axis = 0).sort_values(ascending=False)

DEMAND.LOSS.MW        672
CUSTOMERS.AFFECTED    622
OUTAGE.DURATION        78
TOTAL.PRICE            12
TOTAL.SALES            12
CLIMATE.REGION          5
CAUSE.CATEGORY          0
ANOMALY.LEVEL           0
U.S._STATE              0
MONTH                   0
YEAR                    0
CLIMATE.CATEGORY        0
TOTAL.CUSTOMERS         0
TOTAL.REALGSP           0
dtype: int64

In [None]:
mdl_data[['DEMAND.LOSS.MW', 'CUSTOMERS.AFFECTED', 'OUTAGE.DURATION']] = mdl_data[['DEMAND.LOSS.MW', 'CUSTOMERS.AFFECTED', 'OUTAGE.DURATION']].fillna(0)
mdl_data.dropna(subset = ['TOTAL.PRICE', 'TOTAL.SALES', 'CLIMATE.REGION'], inplace = True)


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [None]:

mdl_base = Pipeline(steps = [
    ('data_encoding', ColumnTransformer(transformers = [
        ('one_hot', OneHotEncoder(handle_unknown = 'ignore'), ['U.S._STATE', 'CLIMATE.REGION', 'CLIMATE.CATEGORY']),
        ('ordinal', OrdinalEncoder(), ['MONTH']),
    ], remainder = 'passthrough')),
    ('random_forest', RandomForestClassifier())
])

In [None]:
X = mdl_data.drop(columns = ['CAUSE.CATEGORY'])
y = mdl_data['CAUSE.CATEGORY']
X_train, X_test, y_train, y_test = train_test_split(X, y)

mdl_data.isna().sum(axis = 0).sort_values(ascending=False)

YEAR                  0
MONTH                 0
U.S._STATE            0
CLIMATE.REGION        0
ANOMALY.LEVEL         0
CAUSE.CATEGORY        0
CLIMATE.CATEGORY      0
OUTAGE.DURATION       0
DEMAND.LOSS.MW        0
CUSTOMERS.AFFECTED    0
TOTAL.PRICE           0
TOTAL.SALES           0
TOTAL.CUSTOMERS       0
TOTAL.REALGSP         0
dtype: int64

In [None]:
mdl_base.fit(X_train, y_train)



The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).




In [None]:
mdl_base.score(X_train, y_train)

1.0

In [None]:
mdl_base.score(X_test, y_test), float(f1_score(y_test, mdl_base.predict(X_test), average = 'macro'))

(0.821917808219178, 0.5554021601788702)

## Step 7: Final Model

In [None]:
# TODO

## Step 8: Fairness Analysis

In [None]:
# TODO