# Power Outages

**Name(s)**: Karsin Dass & Nicole Doyle

**Website Link**: (your website link)

In [2]:
import pandas as pd
import numpy as np

import plotly.express as px
pd.options.plotting.backend = 'plotly'

# from lec_utils import * # Feel free to uncomment and use this. It'll make your plotly graphs look like ours in lecture!

## Step 1: Introduction

In [3]:
#Is there a specific day of the week and time of day where intentional attacks occur?  Do they occur more frequently in urban areas where more people are affected?

## Step 2: Data Cleaning and Exploratory Data Analysis

In [4]:
outageFull = pd.read_csv('outage.csv', skiprows=5) #with units
print(outageFull.head(1))
outageClean = outageFull.dropna(how='all', axis=1)
outageClean = outageClean.dropna(how='all')
outageClean = outageClean.drop(0)
print(outageClean.head())



  variables  OBS  YEAR  MONTH U.S._STATE POSTAL.CODE NERC.REGION  \
0     Units  NaN   NaN    NaN        NaN         NaN         NaN   

  CLIMATE.REGION ANOMALY.LEVEL CLIMATE.CATEGORY  ... POPPCT_URBAN POPPCT_UC  \
0            NaN       numeric              NaN  ...            %         %   

              POPDEN_URBAN                POPDEN_UC             POPDEN_RURAL  \
0  persons per square mile  persons per square mile  persons per square mile   

  AREAPCT_URBAN AREAPCT_UC PCT_LAND PCT_WATER_TOT  PCT_WATER_INLAND  
0             %          %        %             %                 %  

[1 rows x 57 columns]
  variables  OBS    YEAR  MONTH U.S._STATE POSTAL.CODE NERC.REGION  \
1       NaN  1.0  2011.0    7.0  Minnesota          MN         MRO   
2       NaN  2.0  2014.0    5.0  Minnesota          MN         MRO   
3       NaN  3.0  2010.0   10.0  Minnesota          MN         MRO   
4       NaN  4.0  2012.0    6.0  Minnesota          MN         MRO   
5       NaN  5.0  2015.0    7.

In [5]:
# Create visualization that shows where in the US intentional attacks occur
def plot_intentional_outages(data):
    # data by state, quantify by number of customers affected
    outage_summary = (
        data.groupby("POSTAL.CODE")
        .agg({"CUSTOMERS.AFFECTED": "sum"})
        .reset_index()
    )

    fig = px.choropleth(
        outage_summary,
        locations="POSTAL.CODE",  
        locationmode="USA-states",  
        color="CUSTOMERS.AFFECTED",  # depper red = more customers affected
        scope="usa", 
        title="Intentional Outages in the US by Affected Customers",
        color_continuous_scale="Reds",
        labels={"CUSTOMERS.AFFECTED": "Affected Customers"},
    )

    return fig

intentional = outageClean[outageClean['CAUSE.CATEGORY'] == 'intentional attack']
plot_intentional_outages(intentional).show()


In [6]:
'''I want to predict whether an outage is intentional or not based on the data provided.  
This is a classification problem, not a regression problem because the target variable can be classified in two categories: intentional or not intentional.
'''

print(outageClean.columns)

Index(['variables', 'OBS', 'YEAR', 'MONTH', 'U.S._STATE', 'POSTAL.CODE',
       'NERC.REGION', 'CLIMATE.REGION', 'ANOMALY.LEVEL', 'CLIMATE.CATEGORY',
       'OUTAGE.START.DATE', 'OUTAGE.START.TIME', 'OUTAGE.RESTORATION.DATE',
       'OUTAGE.RESTORATION.TIME', 'CAUSE.CATEGORY', 'CAUSE.CATEGORY.DETAIL',
       'HURRICANE.NAMES', 'OUTAGE.DURATION', 'DEMAND.LOSS.MW',
       'CUSTOMERS.AFFECTED', 'RES.PRICE', 'COM.PRICE', 'IND.PRICE',
       'TOTAL.PRICE', 'RES.SALES', 'COM.SALES', 'IND.SALES', 'TOTAL.SALES',
       'RES.PERCEN', 'COM.PERCEN', 'IND.PERCEN', 'RES.CUSTOMERS',
       'COM.CUSTOMERS', 'IND.CUSTOMERS', 'TOTAL.CUSTOMERS', 'RES.CUST.PCT',
       'COM.CUST.PCT', 'IND.CUST.PCT', 'PC.REALGSP.STATE', 'PC.REALGSP.USA',
       'PC.REALGSP.REL', 'PC.REALGSP.CHANGE', 'UTIL.REALGSP', 'TOTAL.REALGSP',
       'UTIL.CONTRI', 'PI.UTIL.OFUSA', 'POPULATION', 'POPPCT_URBAN',
       'POPPCT_UC', 'POPDEN_URBAN', 'POPDEN_UC', 'POPDEN_RURAL',
       'AREAPCT_URBAN', 'AREAPCT_UC', 'PCT_LAND', 'PCT_WAT

## Step 3: Framing a Prediction Problem

In [7]:
#Can we predict the outage duration based on a variety of factors?
#we will need to one hot encode the categorical variables

## Step 4: Baseline Model

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

X = outageClean.drop(columns=['OUTAGE.DURATION'])  # Omit the target variable
y = outageClean['OUTAGE.DURATION']

# Handle NaNs in duration col
y = y.fillna(0) 
#categorical data below
non_numeric_cols = X.select_dtypes(include=['object']).columns
X_encoded = pd.get_dummies(X, columns=non_numeric_cols, drop_first=True)
X_encoded = X_encoded.apply(pd.to_numeric, errors='coerce').fillna(0)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


linreg = LinearRegression()
linreg.fit(X_train, y_train)
y_pred = linreg.predict(X_test)


rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (RMSE): {rmse}")

#cross validation works quickly here, but not always for other models
y_cv_pred = cross_val_predict(linreg, X_scaled, y, cv=5)
cv_rmse = np.sqrt(mean_squared_error(y, y_cv_pred))
print(f"Cross-validated RMSE: {cv_rmse}")


Root Mean Squared Error (RMSE): 4222.567549191013
Cross-validated RMSE: 5740.366607103173


In [9]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

#copy + paste
X = outageClean.drop(columns=['OUTAGE.DURATION'])  
y = outageClean['OUTAGE.DURATION']  
y = y.fillna(0)  
non_numeric_cols = X.select_dtypes(include=['object']).columns
X_encoded = pd.get_dummies(X, columns=non_numeric_cols, drop_first=True)
X_encoded = X_encoded.apply(pd.to_numeric, errors='coerce').fillna(0)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


lasso = Lasso(alpha=0.1, max_iter=10000, random_state=42) 
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")


selected_features = X_encoded.columns[np.abs(lasso.coef_) > 1e-5]
print("Selected features by Lasso:")
print(selected_features.tolist())



Root Mean Squared Error (RMSE): 4146.4878
Selected features by Lasso:
['YEAR', 'CUSTOMERS.AFFECTED', 'U.S._STATE_Alaska', 'U.S._STATE_Delaware', 'U.S._STATE_Illinois', 'U.S._STATE_Kentucky', 'U.S._STATE_Michigan', 'U.S._STATE_Minnesota', 'U.S._STATE_New York', 'U.S._STATE_Virginia', 'U.S._STATE_West Virginia', 'U.S._STATE_Wisconsin', 'POSTAL.CODE_DE', 'POSTAL.CODE_GA', 'POSTAL.CODE_IL', 'POSTAL.CODE_MI', 'POSTAL.CODE_NY', 'POSTAL.CODE_TN', 'POSTAL.CODE_VA', 'POSTAL.CODE_VT', 'POSTAL.CODE_WI', 'NERC.REGION_HI', 'NERC.REGION_PR', 'NERC.REGION_RFC', 'CLIMATE.REGION_East North Central', 'CLIMATE.REGION_West North Central', 'ANOMALY.LEVEL_-0.6', 'ANOMALY.LEVEL_-1', 'ANOMALY.LEVEL_0.1', 'ANOMALY.LEVEL_0.2', 'ANOMALY.LEVEL_0.3', 'ANOMALY.LEVEL_0.6', 'ANOMALY.LEVEL_0.7', 'ANOMALY.LEVEL_1.1', 'ANOMALY.LEVEL_1.3', 'ANOMALY.LEVEL_2.2', 'ANOMALY.LEVEL_2.3', 'OUTAGE.START.DATE_Friday, April 17, 2015', 'OUTAGE.START.DATE_Friday, April 21, 2006', 'OUTAGE.START.DATE_Friday, April 22, 2005', 'OUTAGE.ST

In [10]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

X = outageClean.drop(columns=['OUTAGE.DURATION'])  
y = outageClean['OUTAGE.DURATION']  
y = y.fillna(0)  
non_numeric_cols = X.select_dtypes(include=['object']).columns
X_encoded = pd.get_dummies(X, columns=non_numeric_cols, drop_first=True)
X_encoded = X_encoded.apply(pd.to_numeric, errors='coerce').fillna(0)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

ridge = Ridge(alpha=1.0, max_iter=10000, random_state=42)  # Adjust `alpha` as needed
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

# Feature selection based on non-zero coefficients
selected_features = X_encoded.columns[np.abs(ridge.coef_) > 1e-5]
print("Selected features by Ridge:")
print(selected_features.tolist())




Root Mean Squared Error (RMSE): 4222.5424
Selected features by Ridge:
['OBS', 'YEAR', 'MONTH', 'CUSTOMERS.AFFECTED', 'RES.CUSTOMERS', 'COM.CUSTOMERS', 'IND.CUSTOMERS', 'TOTAL.CUSTOMERS', 'POPULATION', 'U.S._STATE_Alaska', 'U.S._STATE_Arizona', 'U.S._STATE_Arkansas', 'U.S._STATE_California', 'U.S._STATE_Colorado', 'U.S._STATE_Connecticut', 'U.S._STATE_Delaware', 'U.S._STATE_District of Columbia', 'U.S._STATE_Florida', 'U.S._STATE_Georgia', 'U.S._STATE_Hawaii', 'U.S._STATE_Idaho', 'U.S._STATE_Illinois', 'U.S._STATE_Indiana', 'U.S._STATE_Iowa', 'U.S._STATE_Kansas', 'U.S._STATE_Kentucky', 'U.S._STATE_Louisiana', 'U.S._STATE_Maine', 'U.S._STATE_Maryland', 'U.S._STATE_Massachusetts', 'U.S._STATE_Michigan', 'U.S._STATE_Minnesota', 'U.S._STATE_Mississippi', 'U.S._STATE_Missouri', 'U.S._STATE_Montana', 'U.S._STATE_Nebraska', 'U.S._STATE_Nevada', 'U.S._STATE_New Hampshire', 'U.S._STATE_New Jersey', 'U.S._STATE_New Mexico', 'U.S._STATE_New York', 'U.S._STATE_North Carolina', 'U.S._STATE_North Dak

In [13]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

X = outageClean.drop(columns=['OUTAGE.DURATION'])  
y = outageClean['OUTAGE.DURATION']  
y = y.fillna(0) 

non_numeric_cols = X.select_dtypes(include=['object']).columns
X_encoded = pd.get_dummies(X, columns=non_numeric_cols, drop_first=True)
X_encoded = X_encoded.apply(pd.to_numeric, errors='coerce').fillna(0)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)


X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
decision_tree = DecisionTreeRegressor(max_depth=10, random_state=42)  
decision_tree.fit(X_train, y_train)


y_pred = decision_tree.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")


feature_importances = pd.DataFrame({
    'Feature': X_encoded.columns,
    'Importance': decision_tree.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("Top Features by Importance:")
print(feature_importances.head(10))

#plot most important features
import plotly.express as px
fig = px.bar(
    feature_importances.head(10),
    x='Feature',
    y='Importance',
    title="Top Features by Importance",
    labels={"Importance": "Feature Importance", "Feature": "Feature Name"},
)
fig.show()


Root Mean Squared Error (RMSE): 4586.9902
Top Features by Importance:
                                            Feature  Importance
3000            OUTAGE.RESTORATION.TIME_11:53:00 AM    0.171704
3703                             DEMAND.LOSS.MW_378    0.157552
7475                              COM.SALES_2089309    0.136957
4268                                RES.PRICE_21.69    0.092246
3348             OUTAGE.RESTORATION.TIME_7:47:00 PM    0.064316
3472                  CAUSE.CATEGORY_severe weather    0.061177
13418                          RES.CUST.PCT_89.4420    0.057060
7617                              COM.SALES_2789455    0.049423
299    OUTAGE.START.DATE_Friday, September 12, 2008    0.032087
3468           CAUSE.CATEGORY_fuel supply emergency    0.028483


Selected features: Index(['OBS', 'intentional', 'CLIMATE.REGION_South', 'CLIMATE.REGION_West',
       'CAUSE.CATEGORY_intentional attack', 'CAUSE.CATEGORY_severe weather',
       'CAUSE.CATEGORY.DETAIL_thunderstorm', 'CAUSE.CATEGORY.DETAIL_vandalism',
       'PC.REALGSP.USA_47586', 'PC.REALGSP.USA_48156'],
      dtype='object')
Root Mean Squared Error (RMSE) with selected features: 16379.377009984186
Cross-validated R^2 score with selected features: -4.559138629675185


## Step 5: Final Model

In [None]:
# TODO