# Whats the Cause of your Power Outage?

**Name(s)**: Kaii Bijlani, Ketan Mittal

**Website Link**: https://k1mittal.github.io/Causes_of_Power_Outages/

In [40]:
import pandas as pd
import numpy as np
from pathlib import Path
import scipy    
import folium
from folium.plugins import HeatMap

import plotly.express as px
pd.options.plotting.backend = 'plotly'

from dsc80_utils import * # Feel free to uncomment and use this.

## Step 1: Introduction

### Interesting Questions:
- How does the cause of the power outages indicate other factors, for example, does whether related power outages result in more people having no power? Can we predict the cause of power outages?
- Is there a correlation between the time and other factors, do power outages happen in one month specifically? Has the number of power outages decreased over time? Can we predict when the next power outage is using a time series prediction?
- Can we predict the number of people affected by a power outage given certain factors? Are the number of people affected by power outages correlated to other factors?
- Can we predict the duration of power outages given certain factors? How are the duration of power outages correlated to other factors?

### Our Choice:
We decided to answer the first bullet point, which is what aspects of power outage are related to each category of cause. 

## Step 2: Data Cleaning and Exploratory Data Analysis

In [None]:
pd.set_option('display.max_columns', None)

Unnamed: 0,U.S._STATE,NERC.REGION,CLIMATE.REGION,ANOMALY.LEVEL,OUTAGE.START.DATE,OUTAGE.START.TIME,OUTAGE.RESTORATION.DATE,OUTAGE.RESTORATION.TIME,CAUSE.CATEGORY,CAUSE.CATEGORY.DETAIL,OUTAGE.DURATION,DEMAND.LOSS.MW,CUSTOMERS.AFFECTED,TOTAL.PRICE,TOTAL.SALES,TOTAL.CUSTOMERS,TOTAL.REALGSP
240,Texas,FRCC,South,,,,,,equipment failure,transformer outage,,46,43000.0,,,9.30e+06,944631
340,Alabama,SERC,Southeast,,,,,,severe weather,thunderstorm,,,160000.0,,,2.26e+06,150090
366,Illinois,SERC,Central,,,,,,severe weather,wildfire,,,11000.0,,,5.28e+06,612709
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1507,Kansas,SPP,South,,,,,,severe weather,winter storm,,550,270000.0,,,1.38e+06,109966
1531,North Dakota,MRO,West North Central,,,,,,fuel supply emergency,Coal,,1650,,,,3.66e+05,27868
1534,Alaska,ASCC,,,,,,,equipment failure,failure,,35,14273.0,,,2.74e+05,36046


In [115]:
# Data Init and Cleanin
raw_data = pd.read_excel(Path('.\outage.xlsx'))
raw_data.columns = [f'{raw_data.columns[i]}' for i in range(len(raw_data.columns))]
raw_data = raw_data.iloc[1:, 1:].loc[:, ['U.S._STATE', 'NERC.REGION', 'CLIMATE.REGION', 'ANOMALY.LEVEL', 'OUTAGE.START.DATE', 'OUTAGE.START.TIME', 'OUTAGE.RESTORATION.DATE', 'OUTAGE.RESTORATION.TIME', 'CAUSE.CATEGORY', 'CLIMATE.CATEGORY', 'CAUSE.CATEGORY.DETAIL', 'OUTAGE.DURATION', 'DEMAND.LOSS.MW', 'CUSTOMERS.AFFECTED', 'TOTAL.PRICE', 'TOTAL.SALES', 'TOTAL.CUSTOMERS', 'TOTAL.REALGSP']]



invalid escape sequence '\o'


invalid escape sequence '\o'


invalid escape sequence '\o'



Below we display the only missing values for outage start dates. Note how there are only 9 entries, and each of these entries also have missing relevant features. Therefore, imputation is not something of interest in this case and we can simply drop these values

In [116]:
display(raw_data[raw_data['OUTAGE.START.DATE'].isna()])

raw_data['OUTAGE.START'] = pd.to_datetime(
    raw_data['OUTAGE.START.DATE'].dropna().astype(str) + ' ' + raw_data['OUTAGE.START.TIME'].dropna().astype(str)
)

raw_data['OUTAGE.END'] = pd.to_datetime(
    raw_data['OUTAGE.RESTORATION.DATE'].dropna().astype(str) + ' ' + raw_data['OUTAGE.RESTORATION.TIME'].dropna().astype(str)
)
raw_data = raw_data.drop(columns = ['OUTAGE.START.DATE', 'OUTAGE.START.TIME', 'OUTAGE.RESTORATION.DATE', 'OUTAGE.RESTORATION.TIME'])
raw_data[['OUTAGE.START', 'OUTAGE.END']]

Unnamed: 0,U.S._STATE,NERC.REGION,CLIMATE.REGION,ANOMALY.LEVEL,OUTAGE.START.DATE,OUTAGE.START.TIME,OUTAGE.RESTORATION.DATE,OUTAGE.RESTORATION.TIME,CAUSE.CATEGORY,CLIMATE.CATEGORY,CAUSE.CATEGORY.DETAIL,OUTAGE.DURATION,DEMAND.LOSS.MW,CUSTOMERS.AFFECTED,TOTAL.PRICE,TOTAL.SALES,TOTAL.CUSTOMERS,TOTAL.REALGSP
240,Texas,FRCC,South,,,,,,equipment failure,,transformer outage,,46,43000.0,,,9.30e+06,944631
340,Alabama,SERC,Southeast,,,,,,severe weather,,thunderstorm,,,160000.0,,,2.26e+06,150090
366,Illinois,SERC,Central,,,,,,severe weather,,wildfire,,,11000.0,,,5.28e+06,612709
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1507,Kansas,SPP,South,,,,,,severe weather,,winter storm,,550,270000.0,,,1.38e+06,109966
1531,North Dakota,MRO,West North Central,,,,,,fuel supply emergency,,Coal,,1650,,,,3.66e+05,27868
1534,Alaska,ASCC,,,,,,,equipment failure,,failure,,35,14273.0,,,2.74e+05,36046



Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



Unnamed: 0,OUTAGE.START,OUTAGE.END
1,2011-07-01 17:00:00,2011-07-03 20:00:00
2,2014-05-11 18:38:00,2014-05-11 18:39:00
3,2010-10-26 20:00:00,2010-10-28 22:00:00
...,...,...
1532,2009-08-29 22:54:00,2009-08-29 23:53:00
1533,2009-08-29 11:00:00,2009-08-29 14:01:00
1534,NaT,NaT


### Simple Handling of NaN Values

In [141]:
raw_data[raw_data['CUSTOMERS.AFFECTED'] == 0]['CUSTOMERS.AFFECTED'] = 0
raw_data['CUSTOMERS.AFFECTED'].isna().sum()
#raw_data[raw_data['CUSTOMERS.AFFECTED'] == 0].shape


np.int64(443)

In [118]:
cause_pdf = px.histogram(raw_data, x = 'CAUSE.CATEGORY', title = 'Distributions of Observations by Cause', histnorm = 'probability density')
cause_pdf.update_layout(xaxis_title = 'Cause', yaxis_title = 'Frequency', legend_title_text = 'Climate Category', showlegend = True)
cause_pdf.show()

In [119]:
states_url = f'https://raw.githubusercontent.com/python-visualization/folium/main/examples/data/us-states.json'

states_data = raw_data['U.S._STATE'].value_counts().reset_index(drop = False)
states_data

Unnamed: 0,U.S._STATE,count
0,California,210
1,Texas,127
2,Washington,97
...,...,...
47,North Dakota,2
48,South Dakota,2
49,Alaska,1


In [120]:

m = folium.Map(location=[37.0902, -95.7129], zoom_start=4) # Centered on the US
folium.Choropleth(
    geo_data=states_url, 
    name='State Frequency',
    data=states_data, 
    columns=['U.S._STATE', 'count'],  # Replace 'state' and 'frequency' with your column names
    key_on='feature.properties.name', 
    fill_color='YlGn', 
    fill_opacity=0.7, 
    line_opacity=0.2, 
    legend_name='Frequency'
).add_to(m)
m

In [121]:
raw_data

Unnamed: 0,U.S._STATE,NERC.REGION,CLIMATE.REGION,ANOMALY.LEVEL,CAUSE.CATEGORY,CLIMATE.CATEGORY,CAUSE.CATEGORY.DETAIL,OUTAGE.DURATION,DEMAND.LOSS.MW,CUSTOMERS.AFFECTED,TOTAL.PRICE,TOTAL.SALES,TOTAL.CUSTOMERS,TOTAL.REALGSP,OUTAGE.START,OUTAGE.END
1,Minnesota,MRO,East North Central,-0.3,severe weather,normal,,3060,,70000.0,9.28,6562520,2.60e+06,274182,2011-07-01 17:00:00,2011-07-03 20:00:00
2,Minnesota,MRO,East North Central,-0.1,intentional attack,normal,vandalism,1,,,9.28,5284231,2.64e+06,291955,2014-05-11 18:38:00,2014-05-11 18:39:00
3,Minnesota,MRO,East North Central,-1.5,severe weather,cold,heavy wind,3000,,70000.0,8.15,5222116,2.59e+06,267895,2010-10-26 20:00:00,2010-10-28 22:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1532,South Dakota,RFC,West North Central,0.5,islanding,warm,,59,84,,7.67,924051,4.36e+05,36504,2009-08-29 22:54:00,2009-08-29 23:53:00
1533,South Dakota,MRO,West North Central,0.5,islanding,warm,,181,373,,7.67,924051,4.36e+05,36504,2009-08-29 11:00:00,2009-08-29 14:01:00
1534,Alaska,ASCC,,,equipment failure,,failure,,35,14273.0,,,2.74e+05,36046,NaT,NaT


## Step 3: Assessment of Missingness

### Part 3.1: Missingness Mechanism Analysis
If we look at our feature of interest, we can see that approximately 30 percent of values are missing. This is most likely not MCAR and requires more analysis to see if its missingness depends on other features.

In [122]:
raw_data[['CAUSE.CATEGORY.DETAIL']]

Unnamed: 0,CAUSE.CATEGORY.DETAIL
1,
2,vandalism
3,heavy wind
...,...
1532,
1533,
1534,failure


In [123]:
raw_data['CAUSE.CATEGORY.DETAIL'].isna().mean()

np.float64(0.3070404172099087)

In [124]:
raw_data[['CAUSE.CATEGORY.DETAIL', 'CAUSE.CATEGORY']]

Unnamed: 0,CAUSE.CATEGORY.DETAIL,CAUSE.CATEGORY
1,,severe weather
2,vandalism,intentional attack
3,heavy wind,severe weather
...,...,...
1532,,islanding
1533,,islanding
1534,failure,equipment failure


In [125]:
''' This cell defines valid, relevant test statistics for permutation and 
hypothesis tests.
'''
def tvd(dist1: pd.Series, dist2: pd.Series):
    return (dist1.value_counts(normalize = True) - dist2.value_counts(normalize = True)).abs().sum() / 2

def ks(dist1: pd.Series, dist2: pd.Series):
    return scipy.stats.ks_2samp(dist1, dist2)

In [126]:
''' This function takes in a dataframe, a column with missing values and a 
column to analyze the type of missingness mechanism with. It will return
a p-value and an associative True or False indicating if missing_col is MAR 
withrespect to col. To conduct the permutation test, it will use the given 
test_stat.

The function will also graph the distribution of simulated test statistics 
with a line indicating where the observed lies.
'''
    
def identify_mar(df, missing_col, col, test_stat, N, alpha):
    missing_dist = df[[col]].assign(is_missing = df[missing_col].isna())
    observed = test_stat(missing_dist[missing_dist['is_missing']][col], missing_dist[~missing_dist['is_missing']][col])
    simulations = np.array([])
    for _ in range(N):
        missing_dist['is_missing'] = np.random.permutation(missing_dist['is_missing'])
        simulated = test_stat(missing_dist[missing_dist['is_missing']][col], missing_dist[~missing_dist['is_missing']][col])
        simulations = np.append(simulations, simulated)
    
    fig = px.histogram(x = simulations, title = f'Permutation Test Distribution', labels={'x': 'Simulated Test Statistics'}, histnorm = 'probability')
    fig.add_vline(x = observed)
    fig.show()    
    
    p_value = (simulations > observed).mean()
    return p_value, p_value < alpha

In [130]:
p_val, is_mar = identify_mar(raw_data, 'CAUSE.CATEGORY.DETAIL', 'CAUSE.CATEGORY', tvd, 1000, 0.05)
p_val, is_mar

(np.float64(0.0), np.True_)

In [None]:
p_val2, is_mar2 = identify_mar(raw_data, '', 'CLIMATE.CATEGORY', tvd, 1000, 0.05)
p_val2, is_mar2

(np.float64(0.0), np.True_)

Clearly, the `CAUSE.CATEGORY.DETAIL_` column is **MAR** with respect to `CAUSE.CATEGORY_`. In other words, the missingness for cause category details are *highly* dependent on what the actual cause category is, which makes a lot of sense intuitively.

Now, we will repeat this process for all columns of interest in `raw_data`. We will create a dictionary mapping column names to whether or not they are **MAR** with respect to `CAUSE.CATEGORY.DETAIL_`. Based on this distribution, the next step will be to impute accordingly.

### Part 3.2: Imputation

## Step 4: Hypothesis Testing

$H_0$: The proportion of each cause category is uniformly distributed across each season, for each cause category.

$H_a$: The proportion of each cause category is not uniformly distributed across each season, for each cause category.

In [15]:
seasons = {'(0, 1]': 'Winter', '(1, 4]': 'Spring', '(4, 7]': 'Summer', '(7, 10]': 'Fall', '(10, 12]': 'Winter'}

data = raw_data.copy()
data['SEASONAL.BINS_'] = pd.cut(data['MONTH_'], bins = [0, 1, 4, 7, 10, 12])
data['SEASONAL.BINS_'] = data['SEASONAL.BINS_'].astype(str).map(seasons)
data

Unnamed: 0,OBS_,YEAR_,MONTH_,U.S._STATE_,POSTAL.CODE_,NERC.REGION_,CLIMATE.REGION_,ANOMALY.LEVEL_numeric,CLIMATE.CATEGORY_,"OUTAGE.START.DATE_Day of the week, Month Day, Year",OUTAGE.START.TIME_Hour:Minute:Second (AM / PM),"OUTAGE.RESTORATION.DATE_Day of the week, Month Day, Year",OUTAGE.RESTORATION.TIME_Hour:Minute:Second (AM / PM),CAUSE.CATEGORY_,CAUSE.CATEGORY.DETAIL_,HURRICANE.NAMES_,OUTAGE.DURATION_mins,DEMAND.LOSS.MW_Megawatt,CUSTOMERS.AFFECTED_,RES.PRICE_cents / kilowatt-hour,COM.PRICE_cents / kilowatt-hour,IND.PRICE_cents / kilowatt-hour,TOTAL.PRICE_cents / kilowatt-hour,RES.SALES_Megawatt-hour,COM.SALES_Megawatt-hour,IND.SALES_Megawatt-hour,TOTAL.SALES_Megawatt-hour,RES.PERCEN_%,COM.PERCEN_%,IND.PERCEN_%,RES.CUSTOMERS_,COM.CUSTOMERS_,IND.CUSTOMERS_,TOTAL.CUSTOMERS_,RES.CUST.PCT_%,COM.CUST.PCT_%,IND.CUST.PCT_%,PC.REALGSP.STATE_USD,PC.REALGSP.USA_USD,PC.REALGSP.REL_fraction,PC.REALGSP.CHANGE_%,UTIL.REALGSP_USD,TOTAL.REALGSP_USD,UTIL.CONTRI_%,PI.UTIL.OFUSA_%,POPULATION_,POPPCT_URBAN_%,POPPCT_UC_%,POPDEN_URBAN_persons per square mile,POPDEN_UC_persons per square mile,POPDEN_RURAL_persons per square mile,AREAPCT_URBAN_%,AREAPCT_UC_%,PCT_LAND_%,PCT_WATER_TOT_%,PCT_WATER_INLAND_%,SEASONAL.BINS_
1,1.0,2011.0,7.0,Minnesota,MN,MRO,East North Central,-0.3,normal,2011-07-01 00:00:00,17:00:00,2011-07-03 00:00:00,20:00:00,severe weather,,,3060,,70000.0,11.6,9.18,6.81,9.28,2332915,2114774,2113291,6562520,35.55,32.23,32.2,2.31e+06,276286.0,10673.0,2.60e+06,88.94,10.64,0.41,51268,47586,1.08,1.6,4802,274182,1.75,2.2,5.35e+06,73.27,15.28,2279,1700.5,18.2,2.14,0.6,91.59,8.41,5.48,Summer
2,2.0,2014.0,5.0,Minnesota,MN,MRO,East North Central,-0.1,normal,2014-05-11 00:00:00,18:38:00,2014-05-11 00:00:00,18:39:00,intentional attack,vandalism,,1,,,12.12,9.71,6.49,9.28,1586986,1807756,1887927,5284231,30.03,34.21,35.73,2.35e+06,284978.0,9898.0,2.64e+06,88.83,10.79,0.37,53499,49091,1.09,1.9,5226,291955,1.79,2.2,5.46e+06,73.27,15.28,2279,1700.5,18.2,2.14,0.6,91.59,8.41,5.48,Summer
3,3.0,2010.0,10.0,Minnesota,MN,MRO,East North Central,-1.5,cold,2010-10-26 00:00:00,20:00:00,2010-10-28 00:00:00,22:00:00,severe weather,heavy wind,,3000,,70000.0,10.87,8.19,6.07,8.15,1467293,1801683,1951295,5222116,28.1,34.5,37.37,2.30e+06,276463.0,10150.0,2.59e+06,88.92,10.69,0.39,50447,47287,1.07,2.7,4571,267895,1.71,2.1,5.31e+06,73.27,15.28,2279,1700.5,18.2,2.14,0.6,91.59,8.41,5.48,Fall
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1532,1532.0,2009.0,8.0,South Dakota,SD,RFC,West North Central,0.5,warm,2009-08-29 00:00:00,22:54:00,2009-08-29 00:00:00,23:53:00,islanding,,,59,84,,9.25,7.47,5.53,7.67,337874,370771,215406,924051,36.56,40.12,23.31,3.67e+05,65971.0,3052.0,4.36e+05,84.18,15.12,0.7,45230,46680,0.97,0,606,36504,1.66,0.3,8.07e+05,56.65,26.73,2038.3,1905.4,4.7,0.3,0.15,98.31,1.69,1.69,Fall
1533,1533.0,2009.0,8.0,South Dakota,SD,MRO,West North Central,0.5,warm,2009-08-29 00:00:00,11:00:00,2009-08-29 00:00:00,14:01:00,islanding,,,181,373,,9.25,7.47,5.53,7.67,337874,370771,215406,924051,36.56,40.12,23.31,3.67e+05,65971.0,3052.0,4.36e+05,84.18,15.12,0.7,45230,46680,0.97,0,606,36504,1.66,0.3,8.07e+05,56.65,26.73,2038.3,1905.4,4.7,0.3,0.15,98.31,1.69,1.69,Fall
1534,1534.0,2000.0,,Alaska,AK,ASCC,,,,,,,,equipment failure,failure,,,35,14273.0,,,,,,,,,,,,2.31e+05,38074.0,854.0,2.74e+05,84.28,13.92,0.31,57401,44745,1.28,-2.2,724,36046,2.01,0.2,6.28e+05,66.02,21.56,1802.6,1276,0.4,0.05,0.02,85.76,14.24,2.9,


In [16]:
''' Calculates the TVD for 2D distributions across each column (axis = 0). 
The resulting TVD's will be aggregated (sum or mean) to represent the TVD of 
the whole distributions. Assumes the probability distributions are already
calculated and provided.
'''
def tvd_2d(dist1: pd.DataFrame, dist2: pd.DataFrame, aggfunc):
    return (np.sum(np.abs(dist1 - dist2), axis = 0) / 2).agg(aggfunc)

In [17]:
seasonal_counts = data.pivot_table(values = 'OBS_', columns = 'SEASONAL.BINS_', index = 'CAUSE.CATEGORY_', aggfunc = 'size')

cause_totals = seasonal_counts.sum(axis = 0)
seasonal_totals = seasonal_counts.sum(axis = 1)
expected_proportions = seasonal_totals / seasonal_counts.sum().sum()

observed_dist = seasonal_counts / cause_totals
expected_dist = pd.DataFrame(data = {col: expected_proportions for col in observed_dist.columns})
observed_tvd = tvd_2d(expected_dist, observed_dist, 'sum')
observed_dist

SEASONAL.BINS_,Fall,Spring,Summer,Winter
CAUSE.CATEGORY_,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
equipment failure,0.02,0.04,0.05,0.03
fuel supply emergency,0.02,0.05,0.03,0.03
intentional attack,0.23,0.36,0.23,0.31
islanding,0.03,0.03,0.04,0.02
public appeal,0.06,0.02,0.07,0.02
severe weather,0.57,0.39,0.5,0.53
system operability disruption,0.07,0.11,0.09,0.06


In [18]:
NUM_SIMULATIONS = 1000
sim_season_df = data[['SEASONAL.BINS_', 'CAUSE.CATEGORY_', 'OBS_']]
simulations = []
for _ in range(NUM_SIMULATIONS):
    sim_season_df['SEASONAL.BINS_'] = np.random.permutation(sim_season_df['SEASONAL.BINS_'])
    sim_counts = sim_season_df.pivot_table(values = 'OBS_', columns = 'SEASONAL.BINS_', index = 'CAUSE.CATEGORY_', aggfunc = 'size')

    sim_cause_totals = sim_counts.sum(axis = 0)
    sim_seasonal_totals = sim_counts.sum(axis = 1)
    sim_expected_proportions = sim_seasonal_totals / sim_counts.sum().sum()

    sim_observed_dist = sim_counts / sim_cause_totals
    sim_expected_dist = pd.DataFrame(data = {col: sim_expected_proportions for col in sim_observed_dist.columns})
    sim_tvd = tvd_2d(sim_expected_dist, sim_observed_dist, 'sum')
    
    simulations.append(sim_tvd)
simulations

[np.float64(0.15764905650330968),
 np.float64(0.1723999388336336),
 np.float64(0.14915688963760373),
 np.float64(0.21967500263231016),
 np.float64(0.13310541159289507),
 np.float64(0.13995027810178037),
 np.float64(0.0990274389253076),
 np.float64(0.15768429345873838),
 np.float64(0.14498178741382362),
 np.float64(0.18182111860808178),
 np.float64(0.10106728884155451),
 np.float64(0.13902029703486382),
 np.float64(0.09878519824853525),
 np.float64(0.11169870158251913),
 np.float64(0.11645959644979081),
 np.float64(0.11074874877581145),
 np.float64(0.11704346529907361),
 np.float64(0.1831279802972653),
 np.float64(0.19012058134352683),
 np.float64(0.15533149097495338),
 np.float64(0.15914458295300551),
 np.float64(0.16811506762521275),
 np.float64(0.12381350238114919),
 np.float64(0.13541480055666724),
 np.float64(0.12815415700816446),
 np.float64(0.148648699816477),
 np.float64(0.09281189740455362),
 np.float64(0.14805226500385485),
 np.float64(0.18557763183959508),
 np.float64(0.12102

In [19]:
fig_hyp1 = px.histogram(simulations, histnorm = 'probability', title = 'Cause Category by Season Distribution of TVD')
fig_hyp1.add_vline(x = observed_tvd)
fig_hyp1.show()

In [23]:
p_val_hyp1 = (observed_tvd < simulations).mean()
p_val_hyp1
np.array(simulations).mean(), np.array(simulations).std()

(np.float64(0.1417829945966173), np.float64(0.029523449599504648))

### Test Number 2

$H_0$: The distributions of mean affected customers for each state is the same for observations from 2000-2008 and 2008-2016.

$H_a$: The distributions of mean affected customers for each state is different for observations from 2000-2008 and 2008-2016.

In [24]:
data['YEAR_'] = data['YEAR_'].astype(float)
customers_dist_2008 = data[data['YEAR_'] <= 2008.0][['U.S._STATE_', 'CUSTOMERS.AFFECTED_']].groupby('U.S._STATE_').mean()
customers_dist_2008 /= customers_dist_2008.sum(axis = 0)
customers_dist_2008 = customers_dist_2008['CUSTOMERS.AFFECTED_'].dropna()

customers_dist_2016 = data[data['YEAR_'] > 2008.0][['U.S._STATE_', 'CUSTOMERS.AFFECTED_']].groupby('U.S._STATE_').mean()
customers_dist_2016 /= customers_dist_2016.sum(axis = 0)
customers_dist_2016 = customers_dist_2016['CUSTOMERS.AFFECTED_'].dropna()

observed_tvd_customers = np.abs(customers_dist_2008 - customers_dist_2016).sum() / 2
observed_tvd_customers


np.float64(0.36851805695787)

In [25]:
N_CUSTOMERS = 1000
sim_customers_2016 = np.random.multinomial(N_CUSTOMERS, pvals = customers_dist_2008, size = 100_000) / N_CUSTOMERS
sim_tvds_customers = np.sum(np.abs(sim_customers_2016 - customers_dist_2008.to_numpy()), axis = 1) / 2
px.histogram(sim_tvds_customers).show()
sim_tvds_customers.mean(), sim_tvds_customers.std()

(np.float64(0.0735027904740965), np.float64(0.009427212517951413))

In [26]:
p_val_hyp2 = (np.array(sim_tvds_customers) >= observed_tvd_customers).mean()
p_val_hyp2

np.float64(0.0)

## Step 5: Framing a Prediction Problem

## Step 6: Baseline Model

In [27]:
# TODO

## Step 7: Final Model

In [None]:
# TODO

## Step 8: Fairness Analysis

In [None]:
# TODO