In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import r2_score,mean_squared_error

from sklearn.ensemble import GradientBoostingRegressor

In [5]:
concrete = pd.read_csv('dataset/Concrete_Data.csv')

concrete.head()

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [6]:
x = concrete.drop(['csMPa', 'flyash', 'coarseaggregate', 'fineaggregate'], axis = 1)
y = concrete['csMPa']

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [13]:
baseline = GradientBoostingRegressor(n_estimators = 50, max_depth = 3)

baseline.fit(x_train, y_train)

GradientBoostingRegressor(n_estimators=50)

In [14]:
y_pred = baseline.predict(x_test)

r2_score(y_test,y_pred)

0.8935194830507225

In [15]:
important_features = pd.Series(baseline.feature_importances_, index = x.columns).sort_values(ascending = True)

important_features

water               0.090948
slag                0.098392
superplasticizer    0.110050
cement              0.324211
age                 0.376399
dtype: float64

In [16]:
gbr = GradientBoostingRegressor(max_depth = 3)

In [19]:
parameters = {'n_estimators' : [1, 5, 10, 50, 100, 200, 300, 400, 500]}

gridsearch_reg = GridSearchCV(gbr, parameters, cv = 3)

In [20]:
gridsearch_reg.fit(x_train, y_train)

GridSearchCV(cv=3, estimator=GradientBoostingRegressor(),
             param_grid={'n_estimators': [1, 5, 10, 50, 100, 200, 300, 400,
                                          500]})

In [21]:
gridsearch_reg.best_params_

{'n_estimators': 500}

In [22]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [25]:
gbr_best = GradientBoostingRegressor(n_estimators = gridsearch_reg.best_params_['n_estimators'], max_depth = 3)

gbr_best.fit(x_train, y_train)

GradientBoostingRegressor(n_estimators=500)

In [26]:
y_pred = gbr_best.predict(x_test)

r2_score(y_test, y_pred)

0.9206101558946057

In [27]:
gbr = GradientBoostingRegressor(max_depth = 3, warm_start = True)

In [29]:
min_val_error = float('inf')
error_increasng = 0 

for n_estimators in range(1,1000):
    gbr.n_estomators = n_estimators 
    gbr.fit(x_train, y_train)
    
    y_pred = gbr.predict(x_test)
    val_error = mean_squared_error(y_test, y_pred)
    
    print('Number of estimators', gbr.n_estimators_)
    print('validation error', val_error)
    
    if val_error < min_val_error:
        min_val_error = val_error
        error_increasng = 0
    else:
        error_increasng += 1
        if error_increasng == 10:
            break

Number of estimators 100
validation error 29.69497304786748
Number of estimators 101
validation error 29.69497304786748
Number of estimators 101
validation error 29.69497304786748
Number of estimators 101
validation error 29.69497304786748
Number of estimators 101
validation error 29.69497304786748
Number of estimators 101
validation error 29.69497304786748
Number of estimators 101
validation error 29.69497304786748
Number of estimators 101
validation error 29.69497304786748
Number of estimators 101
validation error 29.69497304786748
Number of estimators 101
validation error 29.69497304786748
Number of estimators 101
validation error 29.69497304786748


In [30]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [32]:
gbr_best  = GradientBoostingRegressor(max_depth = 3, n_estimators = 101)

gbr_best.fit(x_train, y_train)

GradientBoostingRegressor(n_estimators=101)

In [33]:
y_pred = gbr_best.predict(x_test)

r2_score(y_test, y_pred)

0.9146235622658778