In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import Normalizer

# ignore warnings
import warnings; warnings.simplefilter('ignore')

# Pretty display for notebook
%matplotlib inline

In [2]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor

In [68]:
# Load the data
try:
    data = pd.read_excel('Property Transaction Data More Samples.xlsx')
    data.drop(['Company','Year'], axis=1, inplace=True)
    print("Pricing data has {} data points with {} features".format(*data.shape))
except:
    print("Data not loaded. Is the dataset missing?")

Pricing data has 81 data points with 10 features


In [69]:
seed = 12
new_data = data.drop(['ROL'], axis=1)
#log_data = np.log(new_data)
x_train, x_test, y_train, y_test = train_test_split(new_data, data['ROL'].values.reshape(-1), \
                                                    test_size=0.3, random_state=seed)
print('X_Train size: {} \nX_Test: {} \nY_Train: {}, \nY_Test: {}'.\
      format(x_train.shape, x_test.shape, y_train.shape, y_test.shape))
print('Seed: {}'.format(seed))

X_Train size: (56, 9) 
X_Test: (25, 9) 
Y_Train: (56,), 
Y_Test: (25,)
Seed: 12


In [70]:
# Setting test options
num_folds = 10
num_instances = len(x_train)
scoring = 'r2'

In [71]:
norm = Normalizer().fit(x_train)
normalizedX = norm.transform(x_train)

In [23]:
norm = Normalizer().fit(x_train)
normalizedX = norm.transform(x_train)
param_grid = dict(n_estimators=np.array([50,100,150,200,250,300,350,400]), \
                  loss=['ls','lad','huber'], \
                  max_depth=np.array([1,2,3,4,5,6,7,8,9,10]))
model = GradientBoostingRegressor(random_state=seed)
kfold = KFold(n_splits=num_folds, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(normalizedX, y_train)

In [19]:
param_grid

{'loss': ['ls', 'lad', 'huber'],
 'n_estimators': array([ 50, 100, 150, 200, 250, 300, 350, 400])}

In [7]:
grid_result

GridSearchCV(cv=KFold(n_splits=10, random_state=7, shuffle=False),
       error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=7,
             subsample=1.0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': array([ 50, 100, 150, 200, 250, 300, 350, 400])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='r2', verbose=0)

In [20]:
grid_result.grid_scores_

[mean: 0.90828, std: 0.09852, params: {'loss': 'ls', 'n_estimators': 50},
 mean: 0.91153, std: 0.09660, params: {'loss': 'ls', 'n_estimators': 100},
 mean: 0.91162, std: 0.09643, params: {'loss': 'ls', 'n_estimators': 150},
 mean: 0.91161, std: 0.09643, params: {'loss': 'ls', 'n_estimators': 200},
 mean: 0.91161, std: 0.09643, params: {'loss': 'ls', 'n_estimators': 250},
 mean: 0.91161, std: 0.09643, params: {'loss': 'ls', 'n_estimators': 300},
 mean: 0.91161, std: 0.09643, params: {'loss': 'ls', 'n_estimators': 350},
 mean: 0.91161, std: 0.09643, params: {'loss': 'ls', 'n_estimators': 400},
 mean: 0.87603, std: 0.12709, params: {'loss': 'lad', 'n_estimators': 50},
 mean: 0.89702, std: 0.11049, params: {'loss': 'lad', 'n_estimators': 100},
 mean: 0.89928, std: 0.10840, params: {'loss': 'lad', 'n_estimators': 150},
 mean: 0.90047, std: 0.10946, params: {'loss': 'lad', 'n_estimators': 200},
 mean: 0.90103, std: 0.10935, params: {'loss': 'lad', 'n_estimators': 250},
 mean: 0.90136, std: 0

In [24]:
print('Best score: {} using {}'.format(grid_result.best_score_, grid_result.best_params_))
for params, mean_score, scores in grid_result.grid_scores_:
    print('Score(mean): {} (Std. dev: {}) with: {}'.format(scores.mean(), scores.std(), params))

Best score: 0.9363761595995699 using {'loss': 'huber', 'max_depth': 2, 'n_estimators': 400}
Score(mean): 0.8111339362432173 (Std. dev: 0.13673988162985484) with: {'loss': 'ls', 'max_depth': 1, 'n_estimators': 50}
Score(mean): 0.8526322843448082 (Std. dev: 0.1280969583197328) with: {'loss': 'ls', 'max_depth': 1, 'n_estimators': 100}
Score(mean): 0.8685478382890626 (Std. dev: 0.1234634269027368) with: {'loss': 'ls', 'max_depth': 1, 'n_estimators': 150}
Score(mean): 0.8752618006214629 (Std. dev: 0.12240637152718882) with: {'loss': 'ls', 'max_depth': 1, 'n_estimators': 200}
Score(mean): 0.878971917263015 (Std. dev: 0.12168252217714906) with: {'loss': 'ls', 'max_depth': 1, 'n_estimators': 250}
Score(mean): 0.8813516434138948 (Std. dev: 0.12148168079602482) with: {'loss': 'ls', 'max_depth': 1, 'n_estimators': 300}
Score(mean): 0.883250843486276 (Std. dev: 0.12082429224736368) with: {'loss': 'ls', 'max_depth': 1, 'n_estimators': 350}
Score(mean): 0.8845190223671748 (Std. dev: 0.12058980717756

In [25]:
x_train.head(5)

Unnamed: 0,Layer,Limit,Retention,100% Prem,Prob. of Act.,Prob. of Exh.,100% Ceded AAL,100% Std. Dev.,Loss on Line
26,1,100000000,100000000,18750000.0,0.2727,0.0993,17684970.0,37731190.0,0.1768
32,3,20000000,20000000,758000.0,0.017,0.0064,205280.0,1857387.0,0.0103
33,1,11000000,4000000,1182500.0,0.1359,0.0499,869965.0,2801299.0,0.0791
54,3,100000000,300000000,5700000.0,0.0366,0.0202,2844761.0,16440760.0,0.0284
71,3,20000000,30000000,750000.0,0.0174,0.0068,216055.8,1906872.0,0.010803


In [32]:
normalizedX[:,0]

array([6.72856051e-09, 1.05797461e-07, 8.24779306e-08, 9.47212088e-09,
       8.30695021e-08, 6.76451539e-08, 4.32540694e-08, 8.36042515e-08,
       9.33427244e-08, 9.46922479e-09, 8.92772529e-09, 1.40436466e-07,
       2.72857398e-08, 6.77997902e-09, 8.87127065e-09, 3.87901090e-08,
       9.33028493e-08, 8.92527985e-09, 7.03062695e-08, 3.37432126e-08,
       2.09814565e-07, 6.32851047e-08, 1.34171507e-07, 9.33114793e-08,
       2.72188726e-08, 7.03239450e-08, 9.33521899e-08, 3.51456439e-08,
       4.52912369e-08, 8.88964984e-09, 2.09866728e-07, 5.50260206e-08,
       1.40274422e-07, 8.25876036e-08, 6.75894474e-08, 1.05714148e-07,
       4.22265114e-08, 8.93071338e-09, 8.91656170e-08, 6.69940103e-09,
       5.28498277e-08, 8.25272598e-08, 2.63031143e-07, 2.34993826e-08,
       8.30527464e-08, 9.33674793e-08, 2.72649184e-08, 7.84377024e-08,
       4.23882296e-08, 7.84346868e-08, 5.61200057e-08, 2.74698474e-07,
       7.03050801e-08, 1.10547689e-07, 4.70011029e-08, 5.23179860e-08])

In [34]:
import math

In [40]:
np.square(normalizedX[:,0])

array([4.52735266e-17, 1.11931028e-14, 6.80260903e-15, 8.97210741e-17,
       6.90054218e-15, 4.57586685e-15, 1.87091452e-15, 6.98967087e-15,
       8.71286420e-15, 8.96662182e-17, 7.97042788e-17, 1.97224011e-14,
       7.44511599e-16, 4.59681155e-17, 7.86994430e-17, 1.50467256e-15,
       8.70542169e-15, 7.96606205e-17, 4.94297153e-15, 1.13860440e-15,
       4.40221517e-14, 4.00500448e-15, 1.80019933e-14, 8.70703217e-15,
       7.40867028e-16, 4.94545724e-15, 8.71463136e-15, 1.23521629e-15,
       2.05129614e-15, 7.90258743e-17, 4.40440435e-14, 3.02786294e-15,
       1.96769135e-14, 6.82071226e-15, 4.56833340e-15, 1.11754811e-14,
       1.78307826e-15, 7.97576415e-17, 7.95050725e-15, 4.48819742e-17,
       2.79310428e-15, 6.81074861e-15, 6.91853824e-14, 5.52220984e-16,
       6.89775869e-15, 8.71748618e-15, 7.43375776e-16, 6.15247316e-15,
       1.79676201e-15, 6.15200010e-15, 3.14945504e-15, 7.54592516e-14,
       4.94280429e-15, 1.22207914e-14, 2.20910367e-15, 2.73717166e-15])

In [43]:
sum(np.square(normalizedX[1]))

1.0

In [44]:
grid_result.estimator

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=7,
             subsample=1.0, verbose=0, warm_start=False)

In [47]:
model1 = GradientBoostingRegressor(random_state=seed, n_estimators=50)
model1.fit(normalizedX, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=50, presort='auto', random_state=7,
             subsample=1.0, verbose=0, warm_start=False)

In [48]:
model1.feature_importances_

array([3.49444875e-05, 1.07509587e-01, 8.11870729e-02, 6.30492185e-01,
       0.00000000e+00, 0.00000000e+00, 1.59791887e-01, 2.09843229e-02,
       0.00000000e+00])

In [72]:
model4 = GradientBoostingRegressor(random_state=seed, n_estimators=50)
model4.fit(normalizedX, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=50, presort='auto', random_state=12,
             subsample=1.0, verbose=0, warm_start=False)

In [76]:
tree = model4.estimators_[0, 0].tree_
leaf_mask = tree.children_left == -1
w_i = tree.value[leaf_mask, 0, 0]

In [77]:
w_i

array([-0.07839893, -0.06145893, -0.03159393,  0.01394464,  0.08158274,
        0.16341607,  0.19508274,  0.15571607])

In [80]:
model4.feature_importances_

array([2.37860208e-04, 7.53837506e-02, 8.93663347e-02, 5.84431980e-01,
       0.00000000e+00, 0.00000000e+00, 2.41774224e-01, 8.80585026e-03,
       0.00000000e+00])