In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import warnings
warnings.simplefilter('ignore')
sns.set()

In [2]:
input_file = 'data/temps.csv'
input_dataset  =  pd.read_csv(input_file)

In [3]:
input_dataset.head()

Unnamed: 0,year,month,day,week,temp_2,temp_1,average,actual,forecast_noaa,forecast_acc,forecast_under,friend
0,2016,1,1,Fri,45,45,45.6,45,43,50,44,29
1,2016,1,2,Sat,44,45,45.7,44,41,50,44,61
2,2016,1,3,Sun,45,44,45.8,41,43,46,47,56
3,2016,1,4,Mon,44,41,45.9,40,44,48,46,53
4,2016,1,5,Tues,41,40,46.0,44,46,46,46,41


In [4]:
y = input_dataset['actual']
X = input_dataset.drop('actual',axis=1)

In [5]:
X.head()

Unnamed: 0,year,month,day,week,temp_2,temp_1,average,forecast_noaa,forecast_acc,forecast_under,friend
0,2016,1,1,Fri,45,45,45.6,43,50,44,29
1,2016,1,2,Sat,44,45,45.7,41,50,44,61
2,2016,1,3,Sun,45,44,45.8,43,46,47,56
3,2016,1,4,Mon,44,41,45.9,44,48,46,53
4,2016,1,5,Tues,41,40,46.0,46,46,46,41


In [6]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size =0.25,random_state =42)

In [7]:
baseline_preds = X_train['average']
baseline_errors = abs(baseline_preds - y_train)
baseline_errors.mean()

4.890804597701151

In [8]:
X_train['week'].unique()

array(['Mon', 'Thurs', 'Sat', 'Sun', 'Fri', 'Wed', 'Tues'], dtype=object)

In [9]:
X_test['week'].unique()

array(['Thurs', 'Wed', 'Mon', 'Sun', 'Sat', 'Fri', 'Tues'], dtype=object)

In [10]:
from sklearn.ensemble import RandomForestRegressor

In [11]:
from datatools.customtransformers import DummyTransformer
dt = DummyTransformer(['week'])
X_train = dt.fit_transform(X_train)
X_test = dt.transform(X_test)

In [12]:
X_test.head()

Unnamed: 0,year,month,day,temp_2,temp_1,average,forecast_noaa,forecast_acc,forecast_under,friend,week_Fri,week_Mon,week_Sat,week_Sun,week_Thurs,week_Tues,week_Wed
255,2016,9,29,69,68,66.1,63,71,68,57,0,0,0,0,1,0,0
114,2016,4,27,59,60,60.7,59,65,60,50,0,0,0,0,0,0,1
314,2016,11,28,53,48,48.0,46,48,49,44,0,1,0,0,0,0,0
268,2016,10,12,60,62,61.0,60,63,63,52,0,0,0,0,0,0,1
167,2016,6,19,67,65,70.4,69,73,70,58,0,0,0,1,0,0,0


In [13]:
rf = RandomForestRegressor(n_estimators =1000,random_state =42)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_train)
errors  = abs(y_pred -y_train)
print('Average model error:', round(np.mean(errors), 2), 'degrees.')
improvement_baseline = 100 * abs(np.mean(errors) - np.mean(baseline_errors)) / np.mean(baseline_errors)
print('Improvement over baseline:', round(improvement_baseline, 2), '%.')

Average model error: 1.42 degrees.
Improvement over baseline: 70.89 %.


In [14]:
baseline_test = X_test['average']
baseline_testerrors = abs(baseline_test - y_test)
baseline_testerrors.mean()

5.059770114942527

In [15]:
y_pred_test = rf.predict(X_test)
errors = abs(y_pred_test - baseline_test)

print('Average model error:', round(np.mean(errors), 2), 'degrees.')
improvement_baseline = 100 * abs(np.mean(errors) - np.mean(baseline_testerrors)) / np.mean(baseline_testerrors)
print('Improvement over baseline:', round(improvement_baseline, 2), '%.')

In [17]:
from sklearn.model_selection import cross_val_score
scoring ='r2'

In [18]:
lr_scores  = np.mean(cross_val_score(rf, X_train, y_train, cv=10,scoring =scoring))
print("Regression scores:{0}".format(lr_scores))

Regression scores:0.8027509076533468


In [19]:
from sklearn.metrics import r2_score
adv_r2 =r2_score(y_pred_test, y_test)  
print("R2 value :{0}".format(adv_r2))

R2 value :0.7641121764351968


### Pipeline

In [21]:
from sklearn.pipeline import Pipeline
pipe = Pipeline([("rf",RandomForestRegressor(n_estimators =1000,random_state =42))])
pipe.fit(X_train,y_train)


Cross-validation accuracy (pipeline): 0.81


In [22]:
print("Cross-validation accuracy (pipeline): {:.2f}".format(np.mean(cross_val_score(pipe, X_train, y_train, cv=10))))

Cross-validation accuracy (pipeline): 0.80


In [23]:
y_pred_test = pipe.predict(X_test)
errors = abs(y_pred_test - baseline_test)

print('Average model error:', round(np.mean(errors), 2), 'degrees.')
improvement_baseline = 100 * abs(np.mean(errors) - np.mean(baseline_testerrors)) / np.mean(baseline_testerrors)
print('Improvement over baseline:', round(improvement_baseline, 2), '%.')

Average model error: 3.93 degrees.
Improvement over baseline: 22.42 %.
