# Data Processing

In [None]:
import pandas as pd

from src import data_processing

In [None]:
# Inclusive bounds of the calendar years covered by this notebook: scram events
# outside [MIN_YEAR, MAX_YEAR] are filtered out, and one record per plant is
# generated for every year in range(MIN_YEAR, MAX_YEAR + 1).
MIN_YEAR = 2011
MAX_YEAR = 2021

## Load Data Sets

In [None]:
# Reactors workbook for operating plants; the preview shows the first five rows.
reactors_df = pd.read_excel(io='./data/raw/reactors-operating.xlsx')
display(reactors_df.head(n=5))

In [None]:
# Reactors workbook for plants in decommissioning; the preview shows the first five rows.
decom_reactors_df = pd.read_excel(io='./data/raw/reactors-decommissioning.xlsx')
display(decom_reactors_df.head(n=5))

In [None]:
# Scram event notifications workbook; the preview shows the first five rows.
scram_df = pd.read_excel(io='./data/raw/scram-event-notifications.xlsx')
display(scram_df.head(n=5))

## Aggregate Scrams

In [None]:
# Count scram events per (year, plant).
#
# Plant names are stripped *before* grouping. Stripping after the groupby
# would leave names that differ only by surrounding whitespace in separate
# groups, which then collapse to duplicate (year, plant) rows and duplicate
# records in the later left merge on ['plant', 'year'].
scram_events = scram_df[['Year ', 'Plant Name', 'Scram #']].rename(
    # Note: the source column really is 'Year ' with a trailing space.
    columns={'Year ': 'year', 'Plant Name': 'plant', 'Scram #': 'scrams'}
)
scram_events['plant'] = scram_events['plant'].str.strip()

# Keep only events inside the inclusive [MIN_YEAR, MAX_YEAR] window.
in_window = (scram_events['year'] >= MIN_YEAR) & (scram_events['year'] <= MAX_YEAR)
scram_events = scram_events[in_window]

# count() tallies the non-null 'scrams' entries in each group.
scram_agg = scram_events.groupby(['year', 'plant'], as_index=False).count()

display(scram_agg.head())

## Create Final Count DataFrame

In [None]:
# Build one record per (year, plant) for every year in the analysis window.
# Each plant is described once as (name, start operation, end operation) and
# then expanded across the years.
analysis_years = range(MIN_YEAR, MAX_YEAR + 1)

# Plants in operation (no end of operation)
operating_plants = [
    (reactor['Plant Name'].strip(), reactor['Commercial Operation'], None)
    for _, reactor in reactors_df.iterrows()
]

# Decommissioned plants
# Note: We use OL issue date to approximate operation start, as we lack precise information on
# operation start date for decommissioned plants
decommissioned_plants = [
    (reactor['Unit'].strip(), reactor['Operating License (OL) Issued'], reactor['Shut Down'])
    for _, reactor in decom_reactors_df.iterrows()
]
# Pilgrim is added by hand with fixed dates
# See: https://en.wikipedia.org/wiki/Pilgrim_Nuclear_Power_Station
decommissioned_plants.append(
    ('Pilgrim Nuclear Power Station', pd.Timestamp(1972, 12, 1), pd.Timestamp(2019, 5, 31))
)

# Record order: all operating plants year by year, then all decommissioned
# plants (Pilgrim last within each year) year by year.
scram_count_records = []
for plant_group in (operating_plants, decommissioned_plants):
    for year in analysis_years:
        scram_count_records.extend(
            {
                'year': year,
                'plant': plant_name,
                'start operation': operation_start,
                'end operation': operation_end,
            }
            for plant_name, operation_start, operation_end in plant_group
        )

scram_count_df = pd.DataFrame(scram_count_records)
display(scram_count_df.head())

In [None]:
# Row by row, compute the share of the year the plant was in operation
# (delegated to data_processing.percent_of_year_operational).
operational_share = scram_count_df.apply(
    data_processing.percent_of_year_operational,
    axis=1,
)
scram_count_df['percent of year operational'] = operational_share

# Drop plant-years with no operational time, since no scrams can happen
# during non-operational years.
was_operational = scram_count_df['percent of year operational'] > 0
scram_count_df = scram_count_df.loc[was_operational]
display(scram_count_df.head())

In [None]:
# Attach the yearly scram counts to every plant-year record.
#
# Some plants are spelled differently in the two sources, so they are renamed
# first. Each pair is (name to use for the join, name currently in
# scram_count_df); the first spelling is presumably the one used in the scram
# notifications -- verify against the raw data when adding entries.
name_changes = [
    ('Duane Arnold Energy Center', 'Duane Arnold'),
    ('Fort Calhoun Station', 'Fort Calhoun'),
    ('Indian Point Nuclear Generating, Unit 2', 'Indian Point 2'),
    ('Indian Point Nuclear Generating, Unit 3', 'Indian Point 3'),
    ('Oyster Creek Nuclear Generating Station', 'Oyster Creek'),
    ('Turkey Point Nuclear Generating Station, Unit 3', 'Turkey Point Nuclear Generating Unit No. 3'),
    ('Turkey Point Nuclear Generating Station, Unit 4', 'Turkey Point Nuclear Generating Unit No. 4'),
    ('Three Mile Island Nuclear Station, Unit 1', 'Three Mile Island 1'),
    ('San Onofre Nuclear Generating Station, Unit 2', 'San Onofre 2'),
    ('San Onofre Nuclear Generating Station, Unit 3', 'San Onofre 3'),
]
join_name_by_current_name = {
    current_name: join_name for join_name, current_name in name_changes
}
# Names without an entry in the mapping are kept unchanged.
scram_count_df = scram_count_df.assign(
    plant=scram_count_df['plant'].map(
        lambda current_name: join_name_by_current_name.get(current_name, current_name)
    )
)

# Left join keeps every plant-year; those without reported scrams get 0.
scram_count_df = scram_count_df.merge(scram_agg, how='left', on=['plant', 'year'])
scram_count_df['scrams'] = scram_count_df['scrams'].fillna(0).astype(int)

display(scram_count_df.head())

In [None]:
# Final tidy-up: keep only the output columns, order the rows by plant and
# year, and renumber the index from zero without keeping the old one.
output_columns = ['plant', 'year', 'percent of year operational', 'scrams']
scram_count_df = (
    scram_count_df[output_columns]
    .sort_values(['plant', 'year'])
    .reset_index(drop=True)
)
display(scram_count_df.head())

In [None]:
# Persist the final plant-year count table as a pickle file.
scram_count_df.to_pickle(path='./data/processed/count_df.pkl')