# Predicting Power Outage Duration

**Name**: Kiran Chandrasekaran

**Website Link**: (your website link)

In [2]:
import pandas as pd
import numpy as np

import plotly.express as px
pd.options.plotting.backend = 'plotly'  # make DataFrame/Series .plot(...) use plotly

from lec_utils import * # Course helper module (star import). Per the course template, it makes plotly graphs look like the ones in lecture.

## Step 1: Introduction

In [7]:
# Read the raw outage records; the path is relative to the notebook's working directory.
outages = pd.read_csv(filepath_or_buffer='data/outage_data.csv')

In [8]:
# Summary statistics (count, mean, quartiles, ...) of the TOTAL.PRICE column.
outages.loc[:, 'TOTAL.PRICE'].describe()

In [9]:
# Show every column name available in the loaded frame.
outages.columns

## Step 2: Data Cleaning and Exploratory Data Analysis

In [10]:
# Average TOTAL.PRICE per state (one row per distinct U.S._STATE value).
outages.groupby('U.S._STATE')['TOTAL.PRICE'].agg('mean')

In [1]:

# `df` is a second name for the same DataFrame object as `outages` (no copy is made),
# so columns added through `df` also appear on `outages`.
df = outages

# Outage duration against number of customers affected, with an OLS trendline.
fig1 = px.scatter(
    data_frame=df,
    x='CUSTOMERS.AFFECTED',
    y='OUTAGE.DURATION',
    title='Outage Duration vs Customers Affected',
    labels={
        'CUSTOMERS.AFFECTED': 'Customers Affected',
        'OUTAGE.DURATION': 'Outage Duration (minutes)',
    },
    trendline='ols',
)
fig1.show()



In [12]:
# Distribution of outage duration within each cause category (one coloured box per category).
fig2 = px.box(
    data_frame=df,
    x='CAUSE.CATEGORY',
    y='OUTAGE.DURATION',
    color='CAUSE.CATEGORY',
    title='Outage Duration by Cause Category',
    labels={
        'CAUSE.CATEGORY': 'Cause Category',
        'OUTAGE.DURATION': 'Outage Duration (minutes)',
    },
)
fig2.show()

In [13]:
# Outage duration against the El Niño/La Niña anomaly level, with an OLS trendline.
fig3 = px.scatter(
    data_frame=df,
    x='ANOMALY.LEVEL',
    y='OUTAGE.DURATION',
    title='Outage Duration vs Anomaly Level (El Niño/La Niña)',
    labels={
        'ANOMALY.LEVEL': 'Anomaly Level',
        'OUTAGE.DURATION': 'Outage Duration (minutes)',
    },
    trendline='ols',
)
fig3.show()

In [14]:
# Outage duration against total electricity price, with an OLS trendline.
fig4 = px.scatter(
    data_frame=df,
    x='TOTAL.PRICE',
    y='OUTAGE.DURATION',
    title='Outage Duration vs Electricity Price',
    labels={
        'TOTAL.PRICE': 'Electricity Price (cents/kWh)',
        'OUTAGE.DURATION': 'Outage Duration (minutes)',
    },
    trendline='ols',
)
fig4.show()

In [15]:
# Bucket TOTAL.PRICE into five 5-cent-wide ranges. pd.cut uses right-closed
# intervals by default, so prices outside (0, 25] get NaN in PRICE.BIN.
price_bin_edges = [0, 5, 10, 15, 20, 25]
price_bin_labels = ['0-5', '5-10', '10-15', '15-20', '20-25']
df['PRICE.BIN'] = pd.cut(df['TOTAL.PRICE'], bins=price_bin_edges, labels=price_bin_labels)

# Outage duration distribution within each price range.
fig5 = px.box(
    data_frame=df,
    x='PRICE.BIN',
    y='OUTAGE.DURATION',
    color='PRICE.BIN',
    title='Outage Duration by Electricity Price Range',
    labels={
        'PRICE.BIN': 'Electricity Price Range (cents/kWh)',
        'OUTAGE.DURATION': 'Outage Duration (minutes)',
    },
)
fig5.show()

In [16]:
# Turn empty-string cells into NaN, modifying the existing frame object in place.
outages.replace(to_replace='', value=np.nan, inplace=True)

In [17]:
# Join each date column with its time column (space-separated) and parse the result;
# anything that cannot be parsed becomes NaT because of errors='coerce'.
start_text = outages['OUTAGE.START.DATE'] + ' ' + outages['OUTAGE.START.TIME']
end_text = outages['OUTAGE.RESTORATION.DATE'] + ' ' + outages['OUTAGE.RESTORATION.TIME']

outages['OUTAGE.START.DT'] = pd.to_datetime(start_text, errors='coerce')
outages['OUTAGE.END.DT'] = pd.to_datetime(end_text, errors='coerce')

In [18]:
# Split the parsed start timestamp into calendar month and year columns.
start_dt = outages['OUTAGE.START.DT']
outages['OUTAGE.MONTH'] = start_dt.dt.month
outages['OUTAGE.YEAR'] = start_dt.dt.year

## Step 3: Framing a Prediction Problem

In [19]:
# Histogram of the raw OUTAGE.DURATION column (100 bins).
# The title states minutes so it agrees with the axis labels used for this
# column in the other plots of this notebook; it previously said "Hours".
fig1 = px.histogram(
    outages,
    x='OUTAGE.DURATION',
    nbins=100,
    title='Distribution of Outage Durations (Minutes)',
)
fig1.show()

In [20]:
# Number of outage records per cause category.
fig2 = px.histogram(
    data_frame=outages,
    x='CAUSE.CATEGORY',
    title='Count of Outages by Cause Category',
)
fig2.show()

In [21]:
# Number of outage records per NERC region.
fig3 = px.histogram(
    data_frame=outages,
    x='NERC.REGION',
    title='Outages by NERC Region',
)
fig3.show()

In [22]:
# Same duration-vs-customers view, but with a logarithmic x axis because
# customer counts can be huge.
fig4 = px.scatter(
    data_frame=outages,
    x='CUSTOMERS.AFFECTED',
    y='OUTAGE.DURATION',
    log_x=True,
    title='Outage Duration vs. Customers Affected',
)
fig4.show()

In [23]:
# Outage duration distribution per cause category.
fig5 = px.box(
    data_frame=outages,
    x='CAUSE.CATEGORY',
    y='OUTAGE.DURATION',
    title='Outage Duration by Cause Category',
)
fig5.show()

In [24]:
# Outage duration distribution per climate category.
fig6 = px.box(
    data_frame=outages,
    x='CLIMATE.CATEGORY',
    y='OUTAGE.DURATION',
    title='Outage Duration by Climate Category',
)
fig6.show()

In [25]:
# Per-state summary: mean and count of OUTAGE.DURATION plus mean CUSTOMERS.AFFECTED.
# Named aggregation yields the flat output column names directly.
agg_state = (
    outages
    .groupby('U.S._STATE')
    .agg(
        Avg_Outage_Duration=('OUTAGE.DURATION', 'mean'),
        Outage_Count=('OUTAGE.DURATION', 'count'),
        Avg_Customers_Affected=('CUSTOMERS.AFFECTED', 'mean'),
    )
    .reset_index()
)


In [26]:
import plotly.graph_objects as go
# Render the per-state aggregates as a plotly table: one header cell and one
# column of values for every column of `agg_state`.
state_table = go.Table(
    header={
        'values': list(agg_state.columns),
        'fill_color': 'paleturquoise',
        'align': 'left',
    },
    cells={
        'values': [agg_state[column] for column in agg_state.columns],
        'fill_color': 'lavender',
        'align': 'left',
    },
)
fig7 = go.Figure(data=[state_table])
fig7.update_layout(title="Aggregate Outage Statistics by State")
fig7.show()

## Step 4: Baseline Model

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [28]:
# Baseline predictors: customers affected, the three price columns, and population.
price_columns = ['RES.PRICE', 'COM.PRICE', 'IND.PRICE']
features = ['CUSTOMERS.AFFECTED', *price_columns, 'POPULATION']

In [29]:
# Independent copy holding only the predictor columns and the OUTAGE.DURATION target.
model_columns = features + ['OUTAGE.DURATION']
df_model = outages[model_columns].copy()


In [30]:
# Remove, in place, every row that has a missing value in any modelling column.
df_model.dropna(axis='index', how='any', inplace=True)

In [31]:
# Design matrix (predictor columns) and target vector for the baseline model.
X, y = df_model[features], df_model['OUTAGE.DURATION']

In [32]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [33]:
# Ordinary least-squares baseline, fitted on the training portion only.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [34]:
# Baseline predictions for the held-out rows.
y_pred = lr.predict(X=X_test)


In [35]:
# Test-set R² and RMSE (square root of the mean squared error) for the baseline.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

In [36]:
# Report the baseline test-set metrics.
# RMSE is in the same unit as the target, OUTAGE.DURATION. The plots in this
# notebook label that column in minutes, so the unit printed here is minutes
# (it previously said "hours").
print(f"R² Score: {r2:.3f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f} minutes")

In [37]:
# Pair each held-out actual value with the baseline prediction for plotting.
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

fig = px.scatter(
    data_frame=results,
    x='Actual',
    y='Predicted',
    title='Baseline Model: Actual vs Predicted Outage Duration',
    labels={
        'Actual': 'Actual Outage Duration',
        'Predicted': 'Predicted Outage Duration',
    },
)

# Dashed y = x reference line spanning the range of actual values;
# points on this line are perfect predictions.
actual_min = results['Actual'].min()
actual_max = results['Actual'].max()
fig.add_shape(
    type="line",
    line=dict(dash='dash'),
    x0=actual_min, y0=actual_min,
    x1=actual_max, y1=actual_max,
)

fig.show()

In [39]:
'''
Model Choice: We built a baseline linear regression model that predicts outage duration from the number of customers affected, electricity prices (residential, commercial, and industrial), and population.

Data Preparation: We dropped rows with missing values in any of these columns and used a simple train/test split (80% train, 20% test).

Performance: The R² and RMSE values are printed in the cells above. (TODO: report the actual numbers here and interpret them.)

Scatter Plot: The closer the points are to the dashed diagonal line, the better the predictions. (TODO: describe how close the points actually are.)
'''

## Step 5: Final Model

In [40]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer


In [41]:
# Two derived predictors appended after the existing columns:
#  - log(1 + customers affected), which stays finite when the count is 0
#  - customers affected divided by POPULATION
df_model = df_model.assign(
    LOG_CUSTOMERS_AFFECTED=np.log1p(df_model['CUSTOMERS.AFFECTED']),
    CUST_AFFECTED_PCT_POP=df_model['CUSTOMERS.AFFECTED'] / df_model['POPULATION'],
)


In [42]:
# Predictors for the final model: the two engineered columns first, then the
# price columns and population.
engineered_columns = ['LOG_CUSTOMERS_AFFECTED', 'CUST_AFFECTED_PCT_POP']
final_features = engineered_columns + ['RES.PRICE', 'COM.PRICE', 'IND.PRICE', 'POPULATION']

In [43]:
# Design matrix and target for the final model (rebinds the baseline X and y).
X, y = df_model[final_features], df_model['OUTAGE.DURATION']

In [44]:
# Same 80/20 split settings and seed as the baseline, applied to the new feature set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [45]:
# All final features are handled as numeric columns: median-impute any missing
# values, then standardise.
numeric_features = final_features
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [46]:
# Apply the numeric sub-pipeline to the numeric feature columns.
numeric_step = ('num', numeric_transformer, numeric_features)
preprocessor = ColumnTransformer([numeric_step])

In [47]:
# Preprocessing followed by a random forest regressor with a fixed seed.
forest = RandomForestRegressor(random_state=42)
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', forest),
])

In [48]:
# Hyperparameter grid; the `regressor__` prefix targets the pipeline step named
# 'regressor'. 2 x 3 x 2 = 12 candidate combinations.
param_grid = dict(
    regressor__n_estimators=[50, 100],
    regressor__max_depth=[5, 10, None],
    regressor__min_samples_split=[2, 5],
)

In [49]:
# 3-fold cross-validated grid search over the pipeline, scored by R², using all
# available cores. Fitted on the training split only.
grid_search = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid,
    scoring='r2',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [50]:
# Show the hyperparameter combination selected by the grid search.
print(f"Best parameters found: {grid_search.best_params_}")


In [51]:
# Evaluate the best pipeline found by the grid search on the held-out test rows.
best_model = grid_search.best_estimator_

y_pred_final = best_model.predict(X_test)

r2_final = r2_score(y_test, y_pred_final)
rmse_final = np.sqrt(mean_squared_error(y_test, y_pred_final))

# RMSE is in the same unit as the target, OUTAGE.DURATION. The plots in this
# notebook label that column in minutes, so the unit printed here is minutes
# (it previously said "hours").
print(f"Final Model R² Score: {r2_final:.3f}")
print(f"Final Model RMSE: {rmse_final:.2f} minutes")

In [52]:
# Pair each held-out actual value with the final model's prediction for plotting.
results_final = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_final})

fig = px.scatter(
    data_frame=results_final,
    x='Actual',
    y='Predicted',
    title='Final Model (Random Forest): Actual vs Predicted Outage Duration',
    labels={
        'Actual': 'Actual Outage Duration',
        'Predicted': 'Predicted Outage Duration',
    },
)

# Dashed y = x reference line spanning the range of actual values;
# points on this line are perfect predictions.
final_actual_min = results_final['Actual'].min()
final_actual_max = results_final['Actual'].max()
fig.add_shape(
    type="line",
    line=dict(dash='dash'),
    x0=final_actual_min, y0=final_actual_min,
    x1=final_actual_max, y1=final_actual_max,
)

fig.show()