## Temperature Prediction
- read csv data
- preprocess data
- train model and do prediction
- source: https://towardsdatascience.com/random-forest-in-python-24d0893d51c0

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [3]:
#read data
data = pd.read_csv('data/temps.csv')
data.head()

Unnamed: 0,year,month,day,week,temp_2,temp_1,average,actual,forecast_noaa,forecast_acc,forecast_under,friend
0,2016,1,1,Fri,45,45,45.6,45,43,50,44,29
1,2016,1,2,Sat,44,45,45.7,44,41,50,44,61
2,2016,1,3,Sun,45,44,45.8,41,43,46,47,56
3,2016,1,4,Mon,44,41,45.9,40,44,48,46,53
4,2016,1,5,Tues,41,40,46.0,44,46,46,46,41


In [4]:
data.shape

(348, 12)

In [5]:
data.describe()

Unnamed: 0,year,month,day,temp_2,temp_1,average,actual,forecast_noaa,forecast_acc,forecast_under,friend
count,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0
mean,2016.0,6.477011,15.514368,62.652299,62.701149,59.760632,62.543103,57.238506,62.373563,59.772989,60.034483
std,0.0,3.49838,8.772982,12.165398,12.120542,10.527306,11.794146,10.605746,10.549381,10.705256,15.626179
min,2016.0,1.0,1.0,35.0,35.0,45.1,35.0,41.0,46.0,44.0,28.0
25%,2016.0,3.0,8.0,54.0,54.0,49.975,54.0,48.0,53.0,50.0,47.75
50%,2016.0,6.0,15.0,62.5,62.5,58.2,62.5,56.0,61.0,58.0,60.0
75%,2016.0,10.0,23.0,71.0,71.0,69.025,71.0,66.0,72.0,69.0,71.0
max,2016.0,12.0,31.0,117.0,117.0,77.4,92.0,77.0,82.0,79.0,95.0


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 348 entries, 0 to 347
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   year            348 non-null    int64  
 1   month           348 non-null    int64  
 2   day             348 non-null    int64  
 3   week            348 non-null    object 
 4   temp_2          348 non-null    int64  
 5   temp_1          348 non-null    int64  
 6   average         348 non-null    float64
 7   actual          348 non-null    int64  
 8   forecast_noaa   348 non-null    int64  
 9   forecast_acc    348 non-null    int64  
 10  forecast_under  348 non-null    int64  
 11  friend          348 non-null    int64  
dtypes: float64(1), int64(10), object(1)
memory usage: 32.8+ KB


In [7]:
#one hot encoding for the week data
data = pd.get_dummies(data)
data.head()

Unnamed: 0,year,month,day,temp_2,temp_1,average,actual,forecast_noaa,forecast_acc,forecast_under,friend,week_Fri,week_Mon,week_Sat,week_Sun,week_Thurs,week_Tues,week_Wed
0,2016,1,1,45,45,45.6,45,43,50,44,29,1,0,0,0,0,0,0
1,2016,1,2,44,45,45.7,44,41,50,44,61,0,0,1,0,0,0,0
2,2016,1,3,45,44,45.8,41,43,46,47,56,0,0,0,1,0,0,0
3,2016,1,4,44,41,45.9,40,44,48,46,53,0,1,0,0,0,0,0
4,2016,1,5,41,40,46.0,44,46,46,46,41,0,0,0,0,0,1,0


In [8]:
#separate features and targets
#transform pd dataframe to numpy array for the algo
target_df = data['actual']
features_df = data.drop('actual', axis = 1)
target_array = np.array(target_df)
features_array = np.array(features_df)

#saving feature names for later use
feature_list = list(features_df.columns)

In [9]:
#splitting data into train and test set
train_features, test_features, train_target, test_target = train_test_split(features_array, target_array, 
                                                                           test_size = 0.25, random_state = 42)

In [11]:
#checking array dimensions
print('train_features '+ str(train_features.shape))
print('test_features '+ str(test_features.shape))
print('train_target '+ str(train_target.shape))
print('test_target '+ str(test_target.shape))

train_features (261, 17)
test_features (87, 17)
train_target (261,)
test_target (87,)


In [106]:
#setup the baseline case for comparison
baseline_pred = np.array(data['average'])
baseline_actual = np.array(data['actual'])
error = (baseline_pred-baseline_actual)*100/baseline_actual
mean_error = round(np.mean(error),2)
print ("Percentage error: "+str(mean_error)+'%')

Percentage error: -3.68%


In [107]:
#train the model
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(train_features, train_target);

In [108]:
#predict the data
predictions = rf.predict(test_features)
#calculate the error
pred_error = (predictions-test_target)*100/test_target
mean_pred_error = round(np.mean(pred_error), 2)
print('Mean prediction error: '+ str(mean_pred_error) + '%')

Mean prediction error: -1.6%


In [109]:
#visualization
from sklearn.tree import export_graphviz
import pydot 

#samle tree
tree_sample = rf.estimators_[2]
#export image to a dot file
tree_dot = export_graphviz(tree_sample, out_file = 'tree.dot', feature_names = feature_list, rounded = True, precision = 1)
#use the dot file to create the graph
(graph,) = pydot.graph_from_dot_file('tree.dot')
#save the graph in a png file
graph.write_png('tree.png')

In [110]:
#make a smaller trained model for easier visualization
small_rf = RandomForestRegressor(n_estimators = 50, max_depth = 3)
small_rf.fit(train_features,train_target)

#samle tree
tree_sample_small = small_rf.estimators_[2]
#export image to a dot file
tree_dot_small = export_graphviz(tree_sample_small, out_file = 'tree_small.dot', feature_names = feature_list, rounded = True, precision = 1)
#use the dot file to create the graph
(graph_small,) = pydot.graph_from_dot_file('tree_small.dot')
#save the graph in a png file
graph_small.write_png('tree_small.png')

In [112]:
#variable importance
#get numerical feature importances, returns numpy ndarray
importance = rf.feature_importances_
#creating a list of tuples of features and importances
feature_importance = list([feature,importance] for feature, importance in zip(feature_list, importance))
#sort the list of tuples by importance
feature_importance_sorted = sorted(feature_importance, key = lambda x : x[1],  reverse = True)
[print('{}: +{}'.format(*pair)) for pair in feature_importance_sorted]; #the end semicolon removes excess values! weird!

temp_1: +0.655553271473427
average: +0.1503298292111351
forecast_noaa: +0.045382293248822446
forecast_acc: +0.03485937239968542
forecast_under: +0.023190478556827192
day: +0.02111933068039649
temp_2: +0.02099328360465877
friend: +0.02068466932581951
month: +0.010329714202200255
week_Sat: +0.0036125047603753953
week_Fri: +0.003524921225987202
week_Mon: +0.0025881281964305588
week_Tues: +0.002302658049751997
week_Sun: +0.0022890741053675223
week_Wed: +0.0019740255721335514
week_Thurs: +0.0012664453869816038
year: +0.0


In [137]:
#training the model only with importance features #could also do pd modification on data and then split
#creating a smaller training and test data set
max_imp_features = 2
features_imp = [feature_importance_sorted[i][0] for i in range(max_imp_features)]
print(features_imp[0])
#get the indices based on the important features
indices_imp = [feature_list.index(str) for str in features_imp]
print(indices_imp)
train_imp = train_features[:,indices_imp] #pd df won't work. these are numpy ndarrays!
test_imp = test_features[:, indices_imp]

#now train the data
rf_imp = RandomForestRegressor(n_estimators = 100, random_state = 42)
rf_imp.fit(train_imp, train_target)

#validate the data
predict_imp = rf_imp.predict(test_imp)

#error quantification
error_imp = (predict_imp - test_target)*100/test_target
error_imp_mean = round(np.mean(error_imp))
print("Mean error imp: "+ str(error_imp_mean)+"%")

temp_1
[4, 5]
Mean error imp: -2.0%
