In [1]:
import pandas as pd
from google.colab import drive

In [2]:
# Mount Google Drive inside the Colab runtime at /content/drive/.
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
# Switch the working directory to the shared weather-data folder; the CSV reads and writes in later cells use bare file names.
%cd /content/drive/Shareddrives/Data606_Energy/data/weather

/content/drive/Shareddrives/Data606_Energy/data/weather


In [5]:
# Load the temperature and PDSI CSVs from the current working directory.
TEMPERATURE_CSV = 'temperatures_1981-01-01-2021-10-01.csv'
PDSI_CSV = 'pdsi_1981-01-01-2021-10-01.csv'
temp_df = pd.read_csv(TEMPERATURE_CSV)
pdsi_df = pd.read_csv(PDSI_CSV)

## Convert data from 2000 onward into standard deviations above the pre-2000 monthly average

In [6]:
def standardize_data(data,date_start):
  """Express each value on or after `date_start` as a z-score against its monthly baseline.

  The frame is read positionally: one leading column (ignored), then one
  column per date whose label parses as '%Y-%m-%d', then three trailing
  columns that are treated as y, x and State.

  For every row and every calendar month, the mean and sample standard
  deviation are computed over the date columns strictly before `date_start`.
  Each date column on or after `date_start` is then replaced by
  (value - mean) / std, using the statistics of its own calendar month.
  There is no guard for a zero standard deviation: such rows come out as
  inf/NaN.

  Parameters
  ----------
  data : pandas.DataFrame
      Wide table in the layout described above. It is left unmodified.
  date_start : str or datetime-like
      First date of the analysis period; anything `pd.Timestamp` accepts.

  Returns
  -------
  (pandas.DataFrame, pandas.DataFrame)
      First, the standardized analysis-period columns (labelled with
      Timestamps) followed by 'y', 'x', 'State'. Second, a per-row summary
      with 'Averages m' and 'Standard Deviations m' columns for m in 1..12,
      followed by 'x', 'y', 'State'.
  """
  date_cols = pd.to_datetime(data.columns[1:-3],format='%Y-%m-%d') # excluding 1st column and last 3
  # Relabel a copy instead of the caller's frame so the input keeps its original column names.
  frame = data.copy()
  frame.columns = ['Untitled']+list(date_cols)+['y','x','State']
  # Compare against an explicit Timestamp rather than relying on string coercion.
  cutoff = pd.Timestamp(date_start)
  baseline_cols = date_cols[date_cols<cutoff]
  analysis_cols = date_cols[date_cols>=cutoff]
  baseline = frame[baseline_cols]
  # Copy only the analysis slice; its columns are overwritten below.
  standardized = frame[analysis_cols].copy()
  # grab the averages and standard deviations by coordinate and by month
  month_summaries = {}
  for m in range(1,13):
    month_baseline = baseline[baseline_cols[baseline_cols.month == m]]
    month_avgs = month_baseline.mean(axis=1)
    month_stds = month_baseline.std(axis=1)
    month_summaries[f'Averages {m}'] = month_avgs
    month_summaries[f'Standard Deviations {m}'] = month_stds
    # standardize every analysis column of this calendar month with the matching baseline statistics
    for col in analysis_cols[analysis_cols.month == m]:
      standardized[col] = (standardized[col]-month_avgs)/month_stds
  for name in ('y','x','State'):
    standardized[name] = frame[name]
  # The summary keeps the original key order: x, then y, then State.
  for name in ('x','y','State'):
    month_summaries[name] = standardized[name]
  return standardized, pd.DataFrame(month_summaries)

In [7]:
standardized_temperatures, temp_summary = standardize_data(temp_df,'2000-01-01')
standardized_droughts, drought_summary = standardize_data(pdsi_df,'2000-01-01')

In [8]:
# The date column labels need to be datetimes so the frames can be subset by month.
# The last 3 columns are omitted from the conversion because they hold the coordinates and State.
# NOTE(review): standardize_data already appears to return datetime-labelled date columns,
# so this may be a no-op re-conversion; confirm before removing.
for standardized_frame in (standardized_temperatures, standardized_droughts):
  date_cols = pd.to_datetime(standardized_frame.columns[:-3],format='%Y-%m-%d')
  standardized_frame.columns = list(date_cols)+['y','x','State']

In [9]:
# First-pass metric: the mean standardized value per State for each month.
# This should probably be revised after some EDA/research.
# Note: the 'y' and 'x' coordinate columns are averaged along with the date columns.
temperatures_by_state = standardized_temperatures.groupby(by='State')
droughts_by_state = standardized_droughts.groupby(by='State')
temperature_mean_grouped = temperatures_by_state.mean()
drought_mean_grouped = droughts_by_state.mean()

In [10]:
# Write every result to CSV in the working directory, in this order:
#   1. standardized values per location (not aggregated by State, in case we want location-level analysis)
#   2. the averages by State
#   3. the means and standard deviations by location
csv_outputs = (
  ('standardized_temperatures.csv', standardized_temperatures),
  ('standardized_droughts.csv', standardized_droughts),
  ('standardized_temperatures_state.csv', temperature_mean_grouped),
  ('standardized_droughts_state.csv', drought_mean_grouped),
  ('temperature_summary.csv', temp_summary),
  ('drought_summary.csv', drought_summary),
)
for csv_name, output_frame in csv_outputs:
  output_frame.to_csv(csv_name)