In [2]:
!pip install us
import us
import pandas as pd
from google.colab import drive

In [3]:
drive.mount('/content/drive/')

Mounted at /content/drive/


In [4]:
%cd /content/drive/Shareddrives/Data606_Energy/data/weather

/content/drive/Shareddrives/Data606_Energy/data/weather


In [5]:
temp_df = pd.read_csv('temperatures_1981-01-01-2021-10-01.csv')
pdsi_df = pd.read_csv('pdsi_1981-01-01-2021-10-01.csv')

## Get convert data from 2000 onward into standard deviations above average

In [6]:
def standardize_data(data,date_start):
  # first, I need to create an array that holds the average as time goes by during the time range of interest
  # I need to do the same thing with the standard deviations
  cols = data.columns[1:-3] # excluding 1st column and last 3
  cols = pd.to_datetime(cols,format='%Y-%m-%d')
  data.columns = ['Untitled']+list(cols)+['y','x','State']
  cols_for_average = cols[cols<date_start]
  df_for_average = data.copy()[cols_for_average]
  # specify which columns are to be standardized
  cols_for_analysis = cols[cols>=date_start]
  data_for_analysis = data.copy()[cols_for_analysis]
  # grab the averages and standard deviations by coordinate and by month
  month_summaries = {}
  for m in range(1,13):
    month_columns = cols_for_average[[col.month == m for col in cols_for_average]]
    month_avgs = df_for_average[month_columns].mean(axis=1)
    month_stds = df_for_average[month_columns].std(axis=1)
    month_summaries[f'Averages {m}'] = month_avgs
    month_summaries[f'Standard Deviations {m}'] = month_stds
    month_columns_standardize = cols_for_analysis[[col.month == m for col in cols_for_analysis]]
    # now standardize the data used for analysis using these averages and standard deviations
    for col in month_columns_standardize:
      data_for_analysis[col] = (data_for_analysis[col]-month_avgs)/month_stds
  data_for_analysis[['y','x','State']] = data[['y','x','State']]
  month_summaries['x']=data_for_analysis['x']
  month_summaries['y']=data_for_analysis['y']
  month_summaries['State']=data_for_analysis['State']
  return data_for_analysis, pd.DataFrame(month_summaries)

In [7]:
standardized_temperatures, temp_summary = standardize_data(temp_df,'2000-01-01')
standardized_droughts, drought_summary = standardize_data(pdsi_df,'2000-01-01')

In [8]:
# first I need to make the column names into datetimes so that I can subset stuff on months
date_cols = standardized_temperatures.columns[:-3] # emit the last 3 cause they're coordinates and States
date_cols = pd.to_datetime(date_cols,format='%Y-%m-%d')
standardized_temperatures.columns = list(date_cols)+['y','x','State']
# standardize each of the dataframes
date_cols = standardized_droughts.columns[:-3] # emit the last 3 cause they're coordinates and States
date_cols = pd.to_datetime(date_cols,format='%Y-%m-%d')
standardized_droughts.columns = list(date_cols)+['y','x','State']

In [39]:
# a simple metric could be the average value for each state at each month - this should probably be revised after some EDA/research
temperature_mean_grouped = standardized_temperatures.groupby('State').mean()
drought_mean_grouped = standardized_droughts.groupby('State').mean()

In [None]:
# save the standardized values not aggregated by state yet in case we want to do further analysis at location level
standardized_temperatures.to_csv('standardized_temperatures.csv')
standardized_droughts.to_csv('standardized_droughts.csv')
# save the averages by state
temperature_mean_grouped.to_csv('standardized_temperatures_state.csv')
drought_mean_grouped.to_csv('standardized_droughts_state.csv')
# save the means and standard deviations by location
temp_summary.to_csv('temperature_summary.csv')
drought_summary.to_csv('drought_summary.csv')

In [55]:
#(temp_summary[['x','y']] == standardized_temperatures[['x','y']]).sum()/standardized_temperatures.shape[0]
# since this is now getting aggregated at the annual level, I need to somehow capture monthly granularity
# I will create a summer dataset which has the number of months with summer temperatures above 1 std above the mean
# likewise for winter but with 1 std below the mean
# do the same thing for droughts but only for the 1 standard deviation below since low values correspond to droughts
temps_1_above = temperature_mean_grouped.copy()#standardized_temperatures.copy()
temps_1_below = temperature_mean_grouped.copy()#standardized_temperatures.copy()
droughts_1_below = drought_mean_grouped.copy()#standardized_droughts.copy()
for date_col in date_cols:
  month = date_col.month
  temps_1_above[date_col] = temps_1_above[date_col] >= 1#(temp_summary['Averages '+str(month)]+temp_summary['Standard Deviations '+str(month)])
  temps_1_below[date_col] = temps_1_above[date_col] < 1#(temp_summary['Averages '+str(month)]-temp_summary['Standard Deviations '+str(month)])
  droughts_1_below[date_col] = droughts_1_below[date_col] <= 1#drought_summary['Averages '+str(month)]-drought_summary['Standard Deviations '+str(month)] 

In [56]:
summer_months = [6,7,8,9]
winter_months = [12,1,2,3]
years = []
states = []
temps_above = []
temps_below = []
droughts_below = []
# go through and filter by year and state and for each gather a count for the summer months 1 std above average and the winter months 1 std below average
for y in list(date_cols.year.unique()):
  for s in list(standardized_temperatures['State'].unique()):
    years.append(y)
    states.append(us.states.lookup(s).abbr)
    temps_above.append(temps_1_above[date_cols[(date_cols.year==y)&(date_cols.month.isin(summer_months))]][temps_1_above.index==s].sum().sum())
    temps_below.append(temps_1_below[date_cols[(date_cols.year==y)&(date_cols.month.isin(winter_months))]][temps_1_below.index==s].sum().sum())
    droughts_below.append(droughts_1_below[date_cols[(date_cols.year==y)&(date_cols.month.isin(summer_months))]][droughts_1_below.index==s].sum().sum())
annual_summary = pd.DataFrame({'State':states,'Year':years,'Hot Summer Months':temps_above,'Cold Winter Months':temps_below,'Dry Summer Months':droughts_below})

In [57]:
annual_summary.head()

Unnamed: 0,State,Year,Hot Summer Months,Cold Winter Months,Dry Summer Months
0,AK,2000,0,2,4
1,MN,2000,0,3,4
2,WA,2000,0,4,4
3,ID,2000,1,4,4
4,MT,2000,0,4,4


In [58]:
annual_summary.to_csv('weather_data_annual.csv')