## First, get the data into a useful format

In [172]:
from math import sqrt

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [128]:
# Load the GFS forecast variables and the KCMI daily observations, both indexed by date
gfs = pd.read_csv(
    './raw_data/GFS_Daily_PredictVars_2010thru2019.csv',
    usecols = ['Date', 'TMAX', 'TMIN', 'WMAX', 'RTOT'],
    index_col = 'Date',
    parse_dates = True,
)
obs = pd.read_csv(
    './raw_data/KCMI_daily_tidy.csv',
    index_col = 'Date',
    parse_dates = True,
)

# Shift the observation timestamps forward by 12 hours to align them with the forecast times
half_day = pd.DateOffset(hours=12)
obs.index = obs.index + half_day

In [129]:
# Compare the table sizes: the GFS forecasts are missing some days
print(gfs.shape, obs.shape, sep = '\n')

(3614, 4)
(3652, 4)


In [130]:
# Join observations and forecasts on their shared dates, then discard every
# row that still holds a NaN (the observations contain some)
features = (
    obs.reset_index()
    .merge(gfs.reset_index(), on = 'Date', how = 'inner')
    .dropna()
)

In [131]:
# Display summary statistics of the merged observation/forecast table
features.describe()

Unnamed: 0,Max Hourly Temp (C),Min Hourly Temp (C),Max Wind Speed (m/s),Daily Precip (mm),TMAX,TMIN,WMAX,RTOT
count,3576.0,3576.0,3576.0,3576.0,3576.0,3576.0,3576.0,3576.0
mean,16.826995,6.048813,7.674937,2.11443,15.537847,6.678395,5.840101,3.280696
std,11.966492,10.739529,2.937644,6.24831,11.243844,10.023166,2.365245,7.183832
min,-21.111111,-26.666667,2.2352,0.0,-21.56,-29.36,1.431782,0.0
25%,7.222222,-1.666667,5.81152,0.0,6.54,-0.96,4.049691,0.0
50%,18.333333,6.666667,7.15264,0.0,17.14,7.54,5.458022,0.27
75%,27.222222,15.555556,9.38784,0.6,25.54,15.24,7.244998,3.125
max,37.777778,25.555556,23.24608,89.0,37.14,25.04,16.413714,94.69


In [132]:
# Break the timestamp into integer year / month / day columns
date_parts = features['Date'].dt
for part in ('year', 'month', 'day'):
    features[part] = getattr(date_parts, part)

In [133]:
# Display the merged table, which now also carries the year/month/day columns
features

Unnamed: 0,Date,Max Hourly Temp (C),Min Hourly Temp (C),Max Wind Speed (m/s),Daily Precip (mm),TMAX,TMIN,WMAX,RTOT,year,month,day
0,2010-01-01 12:00:00,-11.111111,-16.666667,6.70560,0.0,-11.26,-16.46,7.323933,0.05,2010,1,1
1,2010-01-02 12:00:00,-14.444444,-20.555556,6.70560,0.0,-10.16,-15.96,4.687217,0.00,2010,1,2
2,2010-01-03 12:00:00,-12.222222,-21.111111,4.91744,0.0,-9.96,-14.36,5.941380,0.06,2010,1,3
3,2010-01-04 12:00:00,-11.666667,-18.888889,8.94080,0.0,-8.46,-12.86,5.685068,0.17,2010,1,4
4,2010-01-05 12:00:00,-12.222222,-18.333333,7.15264,0.0,-7.16,-11.06,3.794733,0.77,2010,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...
3609,2019-12-27 12:00:00,6.111111,0.000000,6.70560,0.0,13.94,3.54,8.819297,2.73,2019,12,27
3610,2019-12-28 12:00:00,13.333333,0.000000,10.28192,2.7,15.94,9.34,9.701546,11.58,2019,12,28
3611,2019-12-29 12:00:00,15.000000,5.000000,10.28192,4.0,7.24,-1.66,11.111256,2.57,2019,12,29
3612,2019-12-30 12:00:00,3.333333,-0.555556,11.62304,0.4,-0.36,-2.26,8.769265,0.04,2019,12,30


In [175]:
# Predictors only: strip the timestamp and the four observed (label) columns
label_columns = ['Max Hourly Temp (C)', 'Min Hourly Temp (C)', 'Max Wind Speed (m/s)', 'Daily Precip (mm)']
new_features = features.drop(columns = ['Date'] + label_columns)

In [176]:
# Display the predictor-only table that the models are trained on
new_features

Unnamed: 0,TMAX,TMIN,WMAX,RTOT,year,month,day
0,-11.26,-16.46,7.323933,0.05,2010,1,1
1,-10.16,-15.96,4.687217,0.00,2010,1,2
2,-9.96,-14.36,5.941380,0.06,2010,1,3
3,-8.46,-12.86,5.685068,0.17,2010,1,4
4,-7.16,-11.06,3.794733,0.77,2010,1,5
...,...,...,...,...,...,...,...
3609,13.94,3.54,8.819297,2.73,2019,12,27
3610,15.94,9.34,9.701546,11.58,2019,12,28
3611,7.24,-1.66,11.111256,2.57,2019,12,29
3612,-0.36,-2.26,8.769265,0.04,2019,12,30


## Now run the model!

### Maximum Temperature

In [178]:
# Hold out 2019 for testing; every earlier year is used for training
train_frame = new_features.query('year < 2019')
test_frame = new_features.query('year == 2019')
train_features = np.array(train_frame)
test_features = np.array(test_frame)

# Labels for this section: the observed maximum hourly temperature
target = 'Max Hourly Temp (C)'
train_labels = np.array(features.query('year < 2019')[target])
test_labels = np.array(features.query('year == 2019')[target])

In [179]:
# Report the dimensions of each train/test array
for caption, array in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
):
    print(caption, array.shape)

Training Features Shape: (3213, 7)
Training Labels Shape: (3213,)
Testing Features Shape: (363, 7)
Testing Labels Shape: (363,)


In [180]:
# Random forest for the maximum temperature labels.
# Instantiate model with 1000 decision trees; the fixed random_state keeps
# the fit repeatable from run to run.
rfmax = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# Train the model on training data (trailing ';' suppresses the estimator repr)
rfmax.fit(train_features, train_labels);

In [184]:
# Predict the test-set days with the trained forest
predictions = rfmax.predict(test_features)

# Report the root mean square error (rmse), rounded to two decimals
rmse = sqrt(mean_squared_error(test_labels, predictions))
print('Root Mean Square Error:', round(rmse, 2), 'degrees.')

Root Mean Square Error: 4.05 degrees.


In [185]:
# Feature names, taken from the predictor table itself so they are always
# defined and always in the same column order as the arrays the model was
# trained on (train_features is built from new_features).
feature_list = list(new_features.columns)

# Get numerical feature importances
importances = list(rfmax.feature_importances_)

# List of tuples with variable and importance (rounded to two decimals)
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

Variable: TMIN                 Importance: 0.75
Variable: TMAX                 Importance: 0.17
Variable: WMAX                 Importance: 0.02
Variable: RTOT                 Importance: 0.01
Variable: year                 Importance: 0.01
Variable: month                Importance: 0.01
Variable: day                  Importance: 0.01


### Minimum Temperature

In [186]:
# Swap in the minimum-temperature labels; the feature arrays are reused as-is
target = 'Min Hourly Temp (C)'
train_labels = np.array(features.query('year < 2019')[target])
test_labels = np.array(features.query('year == 2019')[target])

In [187]:
# Build a 1000-tree forest with a fixed seed and train it on the training
# data in one step (fit returns the estimator itself)
rfmin = RandomForestRegressor(n_estimators = 1000, random_state = 42).fit(train_features, train_labels)

In [189]:
# Predict the test-set days with the trained forest
predictions = rfmin.predict(test_features)

# Report the root mean square error (rmse), rounded to two decimals
rmse = sqrt(mean_squared_error(test_labels, predictions))
print('Root Mean Square Error:', round(rmse, 2), 'degrees.')

Root Mean Square Error: 3.78 degrees.


In [190]:
# Feature names, taken from the predictor table itself so they are always
# defined and always in the same column order as the arrays the model was
# trained on (train_features is built from new_features).
feature_list = list(new_features.columns)

# Get numerical feature importances
importances = list(rfmin.feature_importances_)

# List of tuples with variable and importance (rounded to two decimals)
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

Variable: TMIN                 Importance: 0.88
Variable: TMAX                 Importance: 0.03
Variable: WMAX                 Importance: 0.03
Variable: RTOT                 Importance: 0.02
Variable: month                Importance: 0.02
Variable: day                  Importance: 0.02
Variable: year                 Importance: 0.01


### Max Wind Speed

In [192]:
# Swap in the maximum-wind-speed labels; the feature arrays are reused as-is
target = 'Max Wind Speed (m/s)'
train_labels = np.array(features.query('year < 2019')[target])
test_labels = np.array(features.query('year == 2019')[target])

In [193]:
# Build a 1000-tree forest with a fixed seed and train it on the training
# data in one step (fit returns the estimator itself)
rfwind = RandomForestRegressor(n_estimators = 1000, random_state = 42).fit(train_features, train_labels)

In [195]:
# Predict the test-set days with the trained forest
predictions = rfwind.predict(test_features)

# Report the root mean square error (rmse), rounded to two decimals
rmse = sqrt(mean_squared_error(test_labels, predictions))
print('Root Mean Square Error:', round(rmse, 2), 'm/s.')

Root Mean Square Error: 2.57 m/s.


In [196]:
# Feature names, taken from the predictor table itself so they are always
# defined and always in the same column order as the arrays the model was
# trained on (train_features is built from new_features).
feature_list = list(new_features.columns)

# Get numerical feature importances
importances = list(rfwind.feature_importances_)

# List of tuples with variable and importance (rounded to two decimals)
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

Variable: WMAX                 Importance: 0.27
Variable: TMAX                 Importance: 0.24
Variable: TMIN                 Importance: 0.12
Variable: RTOT                 Importance: 0.1
Variable: month                Importance: 0.1
Variable: day                  Importance: 0.1
Variable: year                 Importance: 0.07


### Total Precipitation

In [198]:
# Swap in the daily-precipitation labels; the feature arrays are reused as-is
target = 'Daily Precip (mm)'
train_labels = np.array(features.query('year < 2019')[target])
test_labels = np.array(features.query('year == 2019')[target])

In [199]:
# Build a 1000-tree forest with a fixed seed and train it on the training
# data in one step (fit returns the estimator itself)
rfprecip = RandomForestRegressor(n_estimators = 1000, random_state = 42).fit(train_features, train_labels)

In [200]:
# Predict the test-set days with the trained forest
predictions = rfprecip.predict(test_features)

# Report the root mean square error (rmse), rounded to two decimals
rmse = sqrt(mean_squared_error(test_labels, predictions))
print('Root Mean Square Error:', round(rmse, 2), 'mm.')

Root Mean Square Error: 5.36 mm.


In [201]:
# Feature names, taken from the predictor table itself so they are always
# defined and always in the same column order as the arrays the model was
# trained on (train_features is built from new_features).
feature_list = list(new_features.columns)

# Get numerical feature importances
importances = list(rfprecip.feature_importances_)

# List of tuples with variable and importance (rounded to two decimals)
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

Variable: WMAX                 Importance: 0.21
Variable: TMAX                 Importance: 0.18
Variable: RTOT                 Importance: 0.18
Variable: TMIN                 Importance: 0.17
Variable: day                  Importance: 0.11
Variable: year                 Importance: 0.07
Variable: month                Importance: 0.07
