In [1]:
#Importing libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV

In [2]:
#Importing and viewing Data
data_path = "./data/"
data = pd.read_csv(data_path + "forestfires.csv", delimiter = ',')
data.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [3]:
data.shape

(517, 13)

In [4]:
data.dtypes

X          int64
Y          int64
month     object
day       object
FFMC     float64
DMC      float64
DC       float64
ISI      float64
temp     float64
RH         int64
wind     float64
rain     float64
area     float64
dtype: object

In [5]:
#Encoding Categorical Data
data.month.replace(('jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec'),(1,2,3,4,5,6,7,8,9,10,11,12), inplace=True)
data.day.replace(('mon','tue','wed','thu','fri','sat','sun'),(1,2,3,4,5,6,7), inplace=True)

X = data.iloc[:, 0:12].values
y = data.iloc[:, 12].values

In [6]:
sc = StandardScaler()
X = sc.fit_transform(X)

# Tuning Parameters
ridgeR = Ridge()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
grid = dict()
grid['alpha'] = np.arange(0.0,1.0,0.01)
search = GridSearchCV(ridgeR, grid, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
results = search.fit(X, y)
print('MSE: %.3f' % (-results.best_score_))
print('Config: %s' % results.best_params_)

MSE: 4053.691
Config: {'alpha': 0.99}


In [7]:
ridgeR = Ridge(alpha=0.99)
scores = cross_val_score(ridgeR, X, y, scoring="neg_mean_squared_error", cv=10)
rmse_ridge = np.sqrt(-scores)
rmse_ridge

array([ 18.76706841,  14.34868177,  14.61083569,  10.64326518,
       161.64889944,  18.32642984,  13.2319799 ,  31.14863866,
       105.32059017,  44.06046763])

In [8]:
print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard deviation:", scores.std())

Scores: [  -352.20285684   -205.8846684    -213.47651956   -113.27909371
 -26130.36668951   -335.85803071   -175.08529198   -970.23769015
 -11092.42671417  -1941.32480775]
Mean: -4153.014236277964
Standard deviation: 7990.2510577417415


In [9]:
lassoR = Lasso()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
grid = dict()
grid['alpha'] = np.arange(0.0,1.0,0.01)
search = GridSearchCV(lassoR, grid, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
results = search.fit(X, y)
print('MSE: %.3f' % (-results.best_score_))
print('Config: %s' % results.best_params_)

MSE: 4039.352
Config: {'alpha': 0.28}


In [10]:
lassoR = Lasso(alpha = 0.28)
scores = cross_val_score(lassoR, X, y, scoring = "neg_mean_squared_error", cv=10)
rmse_lasso = np.sqrt(-scores)
rmse_lasso

array([ 18.49343187,  14.08981564,  14.29115481,  10.40261117,
       161.72311703,  15.16069054,  14.3293843 ,  30.42532186,
       105.32587868,  42.87396159])

In [11]:
print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard deviation:", scores.std())

Scores: [  -342.00702225   -198.52290485   -204.23710592   -108.2143192
 -26154.36658341   -229.84653777   -205.33125447   -925.70021043
 -11093.54071919  -1838.17658213]
Mean: -4129.994323960453
Standard deviation: 8006.80718870534


In [12]:
elasticR = ElasticNet()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
grid = dict()
grid['alpha'] = np.arange(0.0,1.0,0.01)
search = GridSearchCV(elasticR, grid, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
results = search.fit(X, y)
print('MSE: %.3f' % (-results.best_score_))
print('Config: %s' % results.best_params_)

MSE: 4025.285
Config: {'alpha': 0.93}


In [13]:
elasticR = ElasticNet(alpha = 0.93)
scores = cross_val_score(elasticR, X, y, scoring = "neg_mean_squared_error", cv=10)
rmse_elastic = np.sqrt(-scores)
rmse_elastic

array([ 16.54838313,  13.2744481 ,  13.65204701,   9.42251753,
       161.66089401,  14.76229105,  13.73734582,  28.82824367,
       105.66839216,  42.43024721])

In [14]:
print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard deviation:", scores.std())

Scores: [  -273.84898411   -176.21097248   -186.37838762    -88.78383656
 -26134.24465079   -217.92523692   -188.71467012   -831.06763335
 -11165.80910099  -1800.32587843]
Mean: -4106.330935137845
Standard deviation: 8020.087987170149
