In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import yaml
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import sklearn.metrics as metrics
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import pickle

In [2]:
try:
    with open ("../config.yaml", 'r') as file:
        config = yaml.safe_load(file)
except Exception as e:
    print('Error reading the config file')

In [3]:
df = pd.read_csv(config['data']+'data_models.csv')
df.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,AC,rain,sun,refill_liters,specials_AC rain,specials_AC snow,specials_AC sun,specials_other,specials_rain,specials_rain+sun,specials_snow,specials_sun,gas_type_SP98,refill_gas_SP98,refill_gas_no
0,28.0,5.0,26,21.5,12,0,0,0,45.0,0,0,0,1,0,0,0,0,0,0,0
1,12.0,4.2,30,21.5,13,0,0,0,0.0,0,0,0,1,0,0,0,0,0,0,1
2,11.2,5.5,38,21.5,15,0,0,0,0.0,0,0,0,1,0,0,0,0,0,0,1
3,12.9,3.9,36,21.5,14,0,0,0,0.0,0,0,0,1,0,0,0,0,0,0,1
4,18.5,4.5,46,21.5,15,0,0,0,0.0,0,0,0,1,0,0,0,0,0,0,1


In [4]:
X = df.drop(['consume'], axis = 1)
y = df['consume']

## We need to scale the dataset. We will use Min Max Scaler and save the scaler with pickle in order to be able to make future predictions.

In [5]:
transformer = MinMaxScaler()
X = transformer.fit_transform(X,y)
pickle.dump(transformer, open(config['scalers']+'scaler.pkl', 'wb'))

## Train/Test split for the regression models. We will focus on a simple linear regression model and a Random Forest Regressor.

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 13)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(310, 19) (78, 19) (310,) (78,)


In [7]:
lm = LinearRegression()
lm.fit(X_train, y_train)

y_pred_train = lm.predict(X_train)
y_pred_test = lm.predict(X_test)

In [8]:
print('\nError tests\n')
print(f'R2 train = {r2_score(y_train, y_pred_train):.4f}')
print(f'R2 test = {r2_score(y_test, y_pred_test):.4f}')
print()
print(f'RMSE train = {(np.sqrt(mean_squared_error(y_train,y_pred_train))):.4f}')
print(f'RMSE test = {(np.sqrt(mean_squared_error(y_test,y_pred_test))):.4f}')
print()
print (f'MAE train = {(metrics.mean_absolute_error(y_train, y_pred_train)):.4f}')
print (f'MAE test = {(metrics.mean_absolute_error(y_test, y_pred_test)):.4f}')
print()
print (f'MSE train = {(metrics.mean_squared_error(y_train, y_pred_train)):.4f}')
print (f'MAE test = {(metrics.mean_squared_error(y_test, y_pred_test)):.4f}')
print()


Error tests

R2 train = 0.2620
R2 test = 0.0448

RMSE train = 0.7692
RMSE test = 1.4115

MAE train = 0.5307
MAE test = 0.8219

MSE train = 0.5917
MAE test = 1.9924



## Not good results. We will try with the Random Forests Regressor model and feed some parameters to tune it to the best option:

In [9]:
min_samples_leaf = [2,4,5,7,9]
min_samples_split = [2,4,5,7,9]


In [10]:
cv_err_lst = []
for i in min_samples_leaf:
    for j in min_samples_split:
        rfr = RandomForestRegressor(n_estimators = 100,  criterion = 'squared_error', min_samples_leaf= i, min_samples_split= i)
        cv_err = cross_val_score(rfr, X_train, y_train, cv=10)
        print('CV error for min_samples_leaf =', i, 'and min_samples_split =', j, ':', cv_err.mean())
        cv_err_lst.append(cv_err.mean())

CV error for min_samples_leaf = 2 and min_samples_split = 2 : 0.45336743975430444
CV error for min_samples_leaf = 2 and min_samples_split = 4 : 0.436503797623808
CV error for min_samples_leaf = 2 and min_samples_split = 5 : 0.44616894786724304
CV error for min_samples_leaf = 2 and min_samples_split = 7 : 0.45302488114475603
CV error for min_samples_leaf = 2 and min_samples_split = 9 : 0.4532801049430364
CV error for min_samples_leaf = 4 and min_samples_split = 2 : 0.5022446492463953
CV error for min_samples_leaf = 4 and min_samples_split = 4 : 0.4949131899105019
CV error for min_samples_leaf = 4 and min_samples_split = 5 : 0.492717135265455
CV error for min_samples_leaf = 4 and min_samples_split = 7 : 0.49290056099203133
CV error for min_samples_leaf = 4 and min_samples_split = 9 : 0.4972889822200818
CV error for min_samples_leaf = 5 and min_samples_split = 2 : 0.48612544045780215
CV error for min_samples_leaf = 5 and min_samples_split = 4 : 0.48496475155027596
CV error for min_samples

In [11]:
max_value = max(cv_err_lst)
max_index = cv_err_lst.index(max_value)

print('Best CV error value is', round(max_value,2), 'for configuration', max_index+1)

Best CV error value is 0.5 for configuration 6


In [12]:
rfr = RandomForestRegressor(n_estimators = 100,  criterion = 'squared_error', min_samples_leaf= 4, min_samples_split= 2)
cv_err = cross_val_score(rfr, X_train, y_train, cv=10)
print(cv_err.mean())



0.4986604739387695


In [13]:
rfr.fit(X_train, y_train)
y_pred_train = rfr.predict(X_train)
y_pred_test = rfr.predict(X_test)

In [14]:
r2_score(y_train, y_pred_train)

0.7749059386370067

We got a very acceptable R2 for this dataset.

In [15]:
pickle.dump(rfr, open(config['models']+'model.pkl', 'wb'))

# We have successfully exported the model to make future predictions.