# Regression
Kali ini kita akan membandingkan hasil regresi pada beberapa pada satu dataset. 

Data yang akan kita gunakan ada `insurance`

In [1]:
# Import library
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt 

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [29]:
data = pd.read_csv('datawi.csv')

In [30]:
data.columns

Index(['nama_diklat', 'start', 'end', 'jamlat', 'nama_peserta', 'nip', 'born',
       'age', 'gender', 'ujian', 'eselon2', 'education', 'posi_desc'],
      dtype='object')

In [31]:
data.head()

Unnamed: 0,nama_diklat,start,end,jamlat,nama_peserta,nip,born,age,gender,ujian,eselon2,education,posi_desc
0,Pelatihan Tata Naskah Dinas Angkatan I,24/01/2019,08/02/2019,35.0,Efi Dyah Indrawati,~ 197206291999032001,Malang,48,WANITA,TIDAK,KU,S2,Wi_Madya
1,Pelatihan Tata Naskah Dinas Angkatan I,24/01/2019,08/02/2019,35.0,Theresia Vera Yuliastanti,~ 197707021999032001,Cilacap,43,WANITA,TIDAK,KU,S2,Wi_Madya
2,Pelatihan Tata Naskah Dinas Angkatan I,24/01/2019,08/02/2019,35.0,Eri Hariyanto,~ 197309011994021001,Yogyakarta,47,PRIA,TIDAK,KU,S2,Wi_Madya
3,Pelatihan Tata Naskah Dinas Angkatan I,31/01/2019,15/02/2019,55.0,Koko Inarto,~ 198404132004121003,Mojokerto,36,PRIA,TIDAK,KNPK,S2,Wi_Muda
4,Pelatihan Penyelesaian Sengketa/Perkara Melalu...,01/02/2019,01/03/2019,151.0,Agus Suharsono,~ 196912311995031001,Sleman,51,PRIA,TIDAK,PUSPA,S2,Wi_Madya


In [21]:
data.posi_desc.unique()

array(['Wi_Madya', 'Wi_Muda', 'Wi_Utama', 'Widyaiswara', 'Wi_Pertama'],
      dtype=object)

In [22]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1392 entries, 0 to 1391
Data columns (total 13 columns):
nama_diklat     1392 non-null object
start           1392 non-null object
end             1392 non-null object
jamlat          1392 non-null float64
nama_peserta    1392 non-null object
nip             1392 non-null object
born            1392 non-null object
age             1392 non-null int64
gender          1392 non-null object
ujian           1392 non-null object
eselon2         1347 non-null object
education       1358 non-null object
posi_desc       1392 non-null object
dtypes: float64(1), int64(1), object(11)
memory usage: 141.5+ KB


## Encoding
### Label Encoder
Secara umum, semua data yang bersifat **categorical** harus diubah menjadi bentuk angka, dalam hal ini disebut dengan **encoding**. Caranya adalah sebagai berikut:

In [32]:
data.gender = LabelEncoder().fit_transform(data.gender)

data.ujian = LabelEncoder().fit_transform(data.ujian)

In [33]:
data.head()

Unnamed: 0,nama_diklat,start,end,jamlat,nama_peserta,nip,born,age,gender,ujian,eselon2,education,posi_desc
0,Pelatihan Tata Naskah Dinas Angkatan I,24/01/2019,08/02/2019,35.0,Efi Dyah Indrawati,~ 197206291999032001,Malang,48,1,0,KU,S2,Wi_Madya
1,Pelatihan Tata Naskah Dinas Angkatan I,24/01/2019,08/02/2019,35.0,Theresia Vera Yuliastanti,~ 197707021999032001,Cilacap,43,1,0,KU,S2,Wi_Madya
2,Pelatihan Tata Naskah Dinas Angkatan I,24/01/2019,08/02/2019,35.0,Eri Hariyanto,~ 197309011994021001,Yogyakarta,47,0,0,KU,S2,Wi_Madya
3,Pelatihan Tata Naskah Dinas Angkatan I,31/01/2019,15/02/2019,55.0,Koko Inarto,~ 198404132004121003,Mojokerto,36,0,0,KNPK,S2,Wi_Muda
4,Pelatihan Penyelesaian Sengketa/Perkara Melalu...,01/02/2019,01/03/2019,151.0,Agus Suharsono,~ 196912311995031001,Sleman,51,0,0,PUSPA,S2,Wi_Madya


In [25]:
data.posi_desc.value_counts()

Wi_Madya       898
Wi_Muda        280
Wi_Utama       124
Widyaiswara     50
Wi_Pertama      40
Name: posi_desc, dtype: int64

### One Hot Encoder
One Hot Encoder bekerja dengan cara mengabil nilai pada satu kolom yang memiliki data categorical dan memisahkannya menjadi kolom terpisah. Setiap kolom akan diberikan nilai 0 apabila tidak ada dan 1 apabila ada.

In [34]:
posi_desc = data.iloc[:,12:13].values #ndarray

## ohe for region
ohe = OneHotEncoder() 

posi_desc = ohe.fit_transform(posi_desc).toarray()
posi_desc = pd.DataFrame(posi_desc)
posi_desc.columns = ['wi_madya', 'wi_muda', 'wi_utama', 'widyaiswara','wi_pertama']

In [35]:
# kita hapus kolom region
del data['posi_desc']

# kita gabungkan region ke dalam tabel data
data = pd.concat([data, posi_desc], axis=1)

# reorder kolom pada data
columns = ['nama_diklat', 'start', 'end', 'jamlat', 'nama_peserta', 'nip', 'born',
       'age', 'gender', 'ujian', 'eselon2', 'education', 'wi_utama','wi_madya', 'wi_muda', 'wi_pertama','widyaiswara' ]

data = data[columns]

In [46]:
data.head()

Unnamed: 0,nama_diklat,start,end,jamlat,nama_peserta,nip,born,age,gender,ujian,eselon2,education,wi_utama,wi_madya,wi_muda,wi_pertama,widyaiswara
0,Pelatihan Tata Naskah Dinas Angkatan I,24/01/2019,08/02/2019,35.0,Efi Dyah Indrawati,~ 197206291999032001,Malang,48,1,0,KU,S2,0.0,1.0,0.0,0.0,0.0
1,Pelatihan Tata Naskah Dinas Angkatan I,24/01/2019,08/02/2019,35.0,Theresia Vera Yuliastanti,~ 197707021999032001,Cilacap,43,1,0,KU,S2,0.0,1.0,0.0,0.0,0.0
2,Pelatihan Tata Naskah Dinas Angkatan I,24/01/2019,08/02/2019,35.0,Eri Hariyanto,~ 197309011994021001,Yogyakarta,47,0,0,KU,S2,0.0,1.0,0.0,0.0,0.0
3,Pelatihan Tata Naskah Dinas Angkatan I,31/01/2019,15/02/2019,55.0,Koko Inarto,~ 198404132004121003,Mojokerto,36,0,0,KNPK,S2,0.0,0.0,1.0,0.0,0.0
4,Pelatihan Penyelesaian Sengketa/Perkara Melalu...,01/02/2019,01/03/2019,151.0,Agus Suharsono,~ 196912311995031001,Sleman,51,0,0,PUSPA,S2,0.0,1.0,0.0,0.0,0.0


In [42]:
data.eselon2.value_counts(dropna=False)

PUSPA      633
AP         172
KU         155
PUSDBC     146
PSDM       133
KNPK       107
NaN         45
PKNSTAN      1
Name: eselon2, dtype: int64

In [45]:
OneHotEncoder?

[0;31mInit signature:[0m
[0mOneHotEncoder[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mn_values[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcategorical_features[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcategories[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdrop[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msparse[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdtype[0m[0;34m=[0m[0;34m<[0m[0;32mclass[0m [0;34m'numpy.float64'[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mhandle_unknown[0m[0;34m=[0m[0;34m'error'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Encode categorical integer features as a one-hot numeric array.

The input to this transformer should be an array-like of integers or
strings, denoting the values taken on by categorical (discrete) features.
The

In [44]:
eselon2 = data.iloc[:,10:11].values #ndarray

## ohe for region
ohe = OneHotEncoder(drop=True) 

eselon2 = ohe.fit_transform(eselon2).toarray()
eselon2 = pd.DataFrame(eselon2)
eselon2.columns = ['PUSPA', 'AP', 'KU', 'PUSDBC','PSDM','KNPK','PKNSTAN']

ValueError: Input contains NaN

In [None]:
# kita hapus kolom region
del data['eselon2']

# kita gabungkan region ke dalam tabel data
data = pd.concat([data, eselon2], axis=1)

# reorder kolom pada data
columns = ['nama_diklat', 'start', 'end', 'jamlat', 'nama_peserta', 'nip', 'born',
       'age', 'gender', 'ujian', 'education', 'wi_utama','wi_madya', 'wi_muda', 'wi_pertama','widyaiswara',
           'PUSPA', 'AP', 'KU', 'PUSDBC','PSDM','KNPK','PKNSTAN' ]

data = data[columns]

In [None]:
data

In [9]:
#putting the data together:

##take the numerical data from the original data
X_num = data[['age', 'bmi', 'children']].copy()

# Ambil data X saja
X_final = data.drop(['charges'], 1)

#define y as being the "charges column" from the original dataset
y_final = data.charges

#Test train split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size = 0.33, random_state = 0 )


## Pembuatan Model

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import r2_score

### Linear Regression

In [11]:
lr = LinearRegression().fit(X_train,y_train)
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

#print score
print("lr.coef_: {}".format(lr.coef_))
print("lr.intercept_: {}".format(lr.intercept_))
print('lr train score %.3f, lr test score: %.3f' % (
lr.score(X_train,y_train),
lr.score(X_test, y_test)))

lr.coef_: [  259.00998437   -89.10791545   319.22120573   550.30215293
   564.2141075    -80.26025643  -245.66875106  -238.28510001
 23286.12557703]
lr.intercept_: -12081.453079488738
lr train score 0.728, lr test score: 0.786


### Polynomial Regression

In [12]:
poly = PolynomialFeatures (degree = 3)
X_poly = poly.fit_transform(X_final)

X_train,X_test,y_train,y_test = train_test_split(X_poly,y_final, test_size = 0.33, random_state = 0)

#standard scaler (fit transform on train, fit only on test)
sc = StandardScaler()
X_train = sc.fit_transform(X_train.astype(np.float))
X_test= sc.transform(X_test.astype(np.float))

#fit model
poly_lr = LinearRegression().fit(X_train,y_train)

y_train_pred = poly_lr.predict(X_train)
y_test_pred = poly_lr.predict(X_test)

#print score
print('poly train score %.3f, poly test score: %.3f' % (
poly_lr.score(X_train,y_train),
poly_lr.score(X_test, y_test)))

poly train score 0.831, poly test score: 0.841


### Support Vector Regression

In [13]:
svr = SVR(kernel='linear', C = 300)

#test train split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size = 0.33, random_state = 0 )

#standard scaler (fit transform on train, fit only on test)
sc = StandardScaler()
X_train = sc.fit_transform(X_train.astype(np.float))
X_test= sc.transform(X_test.astype(np.float))

#fit model
svr = svr.fit(X_train,y_train.values.ravel())
y_train_pred = svr.predict(X_train)
y_test_pred = svr.predict(X_test)

#print score
print('svr train score %.3f, svr test score: %.3f' % (
svr.score(X_train,y_train),
svr.score(X_test, y_test)))

svr train score 0.598, svr test score: 0.628


## Decision Tree Regressor

In [14]:
dt = DecisionTreeRegressor(random_state=0)

#test train split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size = 0.33, random_state = 0 )

#standard scaler (fit transform on train, fit only on test)
sc = StandardScaler()
X_train = sc.fit_transform(X_train.astype(np.float))
X_test= sc.transform(X_test.astype(np.float))


#fit model
dt = dt.fit(X_train,y_train.values.ravel())
y_train_pred = dt.predict(X_train)
y_test_pred = dt.predict(X_test)

#print score
print('dt train score %.3f, dt test score: %.3f' % (
dt.score(X_train,y_train),
dt.score(X_test, y_test)))


dt train score 0.999, dt test score: 0.716


## Random Forest Regressor

In [15]:
forest = RandomForestRegressor(n_estimators = 100,
                              criterion = 'mse',
                              random_state = 1,
                              n_jobs = -1)
#test train split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size = 0.33, random_state = 0 )

#standard scaler (fit transform on train, fit only on test)
sc = StandardScaler()
X_train = sc.fit_transform(X_train.astype(np.float))
X_test= sc.transform(X_test.astype(np.float))

#fit model
forest.fit(X_train,y_train.values.ravel())
y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)

#print score
print('forest train score %.3f, forest test score: %.3f' % (
forest.score(X_train,y_train),
forest.score(X_test, y_test)))

forest train score 0.973, forest test score: 0.858


## Grid Search CV

In [16]:
from sklearn.model_selection import GridSearchCV

#Function to print best hyperparamaters: 
def print_best_params(gd_model):
    param_dict = gd_model.best_estimator_.get_params()
    model_str = str(gd_model.estimator).split('(')[0]
    print("\n*** {} Best Parameters ***".format(model_str))
    for k in param_dict:
        print("{}: {}".format(k, param_dict[k]))
    print()

#test train split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size = 0.33, random_state = 0 )

### GridSearch for SVR

In [17]:
#standard scaler (fit transform on train, fit only on test)
sc = StandardScaler()
X_train = sc.fit_transform(X_train.astype(np.float))
X_test= sc.transform(X_test.astype(np.float))

###Challenge 1: SVR parameter grid###
param_grid_svr = dict(kernel=[ 'linear', 'poly'],
                     degree=[2],
                     C=[600, 700, 800, 900],
                     epsilon=[0.0001, 0.00001, 0.000001])
svr = GridSearchCV(SVR(), param_grid=param_grid_svr, cv=5, verbose=3)


#fit model
svr = svr.fit(X_train,y_train.values.ravel())

#print score
print('\n\nsvr train score %.3f, svr test score: %.3f' % (
svr.score(X_train,y_train),
svr.score(X_test, y_test)))
#print(svr.best_estimator_.get_params())

print_best_params(svr)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] C=600, degree=2, epsilon=0.0001, kernel=linear ..................
[CV]  C=600, degree=2, epsilon=0.0001, kernel=linear, score=0.671, total=   0.0s
[CV] C=600, degree=2, epsilon=0.0001, kernel=linear ..................
[CV]  C=600, degree=2, epsilon=0.0001, kernel=linear, score=0.663, total=   0.0s
[CV] C=600, degree=2, epsilon=0.0001, kernel=linear ..................
[CV]  C=600, degree=2, epsilon=0.0001, kernel=linear, score=0.571, total=   0.0s
[CV] C=600, degree=2, epsilon=0.0001, kernel=linear ..................
[CV]  C=600, degree=2, epsilon=0.0001, kernel=linear, score=0.636, total=   0.0s
[CV] C=600, degree=2, epsilon=0.0001, kernel=linear ..................
[CV]  C=600, degree=2, epsilon=0.0001, kernel=linear, score=0.555, total=   0.0s
[CV] C=600, degree=2, epsilon=0.0001, kernel=poly ....................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV]  C=600, degree=2, epsilon=0.0001, kernel=poly, score=0.488, total=   0.1s
[CV] C=600, degree=2, epsilon=0.0001, kernel=poly ....................
[CV]  C=600, degree=2, epsilon=0.0001, kernel=poly, score=0.335, total=   0.0s
[CV] C=600, degree=2, epsilon=0.0001, kernel=poly ....................
[CV]  C=600, degree=2, epsilon=0.0001, kernel=poly, score=0.277, total=   0.0s
[CV] C=600, degree=2, epsilon=0.0001, kernel=poly ....................
[CV]  C=600, degree=2, epsilon=0.0001, kernel=poly, score=0.372, total=   0.0s
[CV] C=600, degree=2, epsilon=0.0001, kernel=poly ....................
[CV]  C=600, degree=2, epsilon=0.0001, kernel=poly, score=0.222, total=   0.0s
[CV] C=600, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=600, degree=2, epsilon=1e-05, kernel=linear, score=0.671, total=   0.0s
[CV] C=600, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=600, degree=2, epsilon=1e-05, kernel=linear, score=0.663, total=   0.0s




[CV] C=600, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=600, degree=2, epsilon=1e-05, kernel=linear, score=0.571, total=   0.1s
[CV] C=600, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=600, degree=2, epsilon=1e-05, kernel=linear, score=0.636, total=   0.0s
[CV] C=600, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=600, degree=2, epsilon=1e-05, kernel=linear, score=0.555, total=   0.0s
[CV] C=600, degree=2, epsilon=1e-05, kernel=poly .....................
[CV]  C=600, degree=2, epsilon=1e-05, kernel=poly, score=0.488, total=   0.0s
[CV] C=600, degree=2, epsilon=1e-05, kernel=poly .....................
[CV]  C=600, degree=2, epsilon=1e-05, kernel=poly, score=0.335, total=   0.0s
[CV] C=600, degree=2, epsilon=1e-05, kernel=poly .....................




[CV]  C=600, degree=2, epsilon=1e-05, kernel=poly, score=0.277, total=   0.0s
[CV] C=600, degree=2, epsilon=1e-05, kernel=poly .....................
[CV]  C=600, degree=2, epsilon=1e-05, kernel=poly, score=0.372, total=   0.0s
[CV] C=600, degree=2, epsilon=1e-05, kernel=poly .....................
[CV]  C=600, degree=2, epsilon=1e-05, kernel=poly, score=0.222, total=   0.0s
[CV] C=600, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=600, degree=2, epsilon=1e-06, kernel=linear, score=0.671, total=   0.0s
[CV] C=600, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=600, degree=2, epsilon=1e-06, kernel=linear, score=0.663, total=   0.0s
[CV] C=600, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=600, degree=2, epsilon=1e-06, kernel=linear, score=0.571, total=   0.0s
[CV] C=600, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=600, degree=2, epsilon=1e-06, kernel=linear, score=0.636, total=   0.0s
[CV] C=600, degree=2



[CV]  C=700, degree=2, epsilon=0.0001, kernel=linear, score=0.671, total=   0.1s
[CV] C=700, degree=2, epsilon=0.0001, kernel=linear ..................
[CV]  C=700, degree=2, epsilon=0.0001, kernel=linear, score=0.662, total=   0.0s
[CV] C=700, degree=2, epsilon=0.0001, kernel=linear ..................
[CV]  C=700, degree=2, epsilon=0.0001, kernel=linear, score=0.572, total=   0.0s
[CV] C=700, degree=2, epsilon=0.0001, kernel=linear ..................
[CV]  C=700, degree=2, epsilon=0.0001, kernel=linear, score=0.636, total=   0.0s
[CV] C=700, degree=2, epsilon=0.0001, kernel=linear ..................
[CV]  C=700, degree=2, epsilon=0.0001, kernel=linear, score=0.555, total=   0.0s
[CV] C=700, degree=2, epsilon=0.0001, kernel=poly ....................
[CV]  C=700, degree=2, epsilon=0.0001, kernel=poly, score=0.535, total=   0.0s
[CV] C=700, degree=2, epsilon=0.0001, kernel=poly ....................




[CV]  C=700, degree=2, epsilon=0.0001, kernel=poly, score=0.360, total=   0.0s
[CV] C=700, degree=2, epsilon=0.0001, kernel=poly ....................
[CV]  C=700, degree=2, epsilon=0.0001, kernel=poly, score=0.309, total=   0.0s
[CV] C=700, degree=2, epsilon=0.0001, kernel=poly ....................
[CV]  C=700, degree=2, epsilon=0.0001, kernel=poly, score=0.406, total=   0.0s
[CV] C=700, degree=2, epsilon=0.0001, kernel=poly ....................
[CV]  C=700, degree=2, epsilon=0.0001, kernel=poly, score=0.248, total=   0.0s
[CV] C=700, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=700, degree=2, epsilon=1e-05, kernel=linear, score=0.671, total=   0.1s
[CV] C=700, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=700, degree=2, epsilon=1e-05, kernel=linear, score=0.662, total=   0.0s
[CV] C=700, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=700, degree=2, epsilon=1e-05, kernel=linear, score=0.572, total=   0.0s
[CV] C=700, degree



[CV]  C=700, degree=2, epsilon=1e-05, kernel=poly, score=0.248, total=   0.0s
[CV] C=700, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=700, degree=2, epsilon=1e-06, kernel=linear, score=0.671, total=   0.0s
[CV] C=700, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=700, degree=2, epsilon=1e-06, kernel=linear, score=0.662, total=   0.1s
[CV] C=700, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=700, degree=2, epsilon=1e-06, kernel=linear, score=0.572, total=   0.0s
[CV] C=700, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=700, degree=2, epsilon=1e-06, kernel=linear, score=0.636, total=   0.0s
[CV] C=700, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=700, degree=2, epsilon=1e-06, kernel=linear, score=0.555, total=   0.0s
[CV] C=700, degree=2, epsilon=1e-06, kernel=poly .....................




[CV]  C=700, degree=2, epsilon=1e-06, kernel=poly, score=0.535, total=   0.0s
[CV] C=700, degree=2, epsilon=1e-06, kernel=poly .....................
[CV]  C=700, degree=2, epsilon=1e-06, kernel=poly, score=0.360, total=   0.0s
[CV] C=700, degree=2, epsilon=1e-06, kernel=poly .....................
[CV]  C=700, degree=2, epsilon=1e-06, kernel=poly, score=0.309, total=   0.0s
[CV] C=700, degree=2, epsilon=1e-06, kernel=poly .....................
[CV]  C=700, degree=2, epsilon=1e-06, kernel=poly, score=0.406, total=   0.0s
[CV] C=700, degree=2, epsilon=1e-06, kernel=poly .....................
[CV]  C=700, degree=2, epsilon=1e-06, kernel=poly, score=0.248, total=   0.0s
[CV] C=800, degree=2, epsilon=0.0001, kernel=linear ..................
[CV]  C=800, degree=2, epsilon=0.0001, kernel=linear, score=0.686, total=   0.1s
[CV] C=800, degree=2, epsilon=0.0001, kernel=linear ..................
[CV]  C=800, degree=2, epsilon=0.0001, kernel=linear, score=0.621, total=   0.0s
[CV] C=800, degree=2, 




[CV] C=800, degree=2, epsilon=0.0001, kernel=poly ....................
[CV]  C=800, degree=2, epsilon=0.0001, kernel=poly, score=0.442, total=   0.0s
[CV] C=800, degree=2, epsilon=0.0001, kernel=poly ....................
[CV]  C=800, degree=2, epsilon=0.0001, kernel=poly, score=0.267, total=   0.0s
[CV] C=800, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=800, degree=2, epsilon=1e-05, kernel=linear, score=0.686, total=   0.0s
[CV] C=800, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=800, degree=2, epsilon=1e-05, kernel=linear, score=0.621, total=   0.0s
[CV] C=800, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=800, degree=2, epsilon=1e-05, kernel=linear, score=0.572, total=   0.0s
[CV] C=800, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=800, degree=2, epsilon=1e-05, kernel=linear, score=0.588, total=   0.1s
[CV] C=800, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=800, degree=2, e




[CV] C=800, degree=2, epsilon=1e-05, kernel=poly .....................
[CV]  C=800, degree=2, epsilon=1e-05, kernel=poly, score=0.442, total=   0.0s
[CV] C=800, degree=2, epsilon=1e-05, kernel=poly .....................
[CV]  C=800, degree=2, epsilon=1e-05, kernel=poly, score=0.267, total=   0.0s
[CV] C=800, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=800, degree=2, epsilon=1e-06, kernel=linear, score=0.686, total=   0.0s
[CV] C=800, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=800, degree=2, epsilon=1e-06, kernel=linear, score=0.621, total=   0.1s
[CV] C=800, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=800, degree=2, epsilon=1e-06, kernel=linear, score=0.572, total=   0.1s
[CV] C=800, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=800, degree=2, epsilon=1e-06, kernel=linear, score=0.588, total=   0.0s
[CV] C=800, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=800, degree=2, eps




[CV] C=800, degree=2, epsilon=1e-06, kernel=poly .....................
[CV]  C=800, degree=2, epsilon=1e-06, kernel=poly, score=0.377, total=   0.1s
[CV] C=800, degree=2, epsilon=1e-06, kernel=poly .....................
[CV]  C=800, degree=2, epsilon=1e-06, kernel=poly, score=0.343, total=   0.0s
[CV] C=800, degree=2, epsilon=1e-06, kernel=poly .....................
[CV]  C=800, degree=2, epsilon=1e-06, kernel=poly, score=0.442, total=   0.0s
[CV] C=800, degree=2, epsilon=1e-06, kernel=poly .....................
[CV]  C=800, degree=2, epsilon=1e-06, kernel=poly, score=0.267, total=   0.0s
[CV] C=900, degree=2, epsilon=0.0001, kernel=linear ..................
[CV]  C=900, degree=2, epsilon=0.0001, kernel=linear, score=0.709, total=   0.1s
[CV] C=900, degree=2, epsilon=0.0001, kernel=linear ..................
[CV]  C=900, degree=2, epsilon=0.0001, kernel=linear, score=0.610, total=   0.1s
[CV] C=900, degree=2, epsilon=0.0001, kernel=linear ..................
[CV]  C=900, degree=2, epsil




[CV] C=900, degree=2, epsilon=0.0001, kernel=poly ....................
[CV]  C=900, degree=2, epsilon=0.0001, kernel=poly, score=0.373, total=   0.0s
[CV] C=900, degree=2, epsilon=0.0001, kernel=poly ....................
[CV]  C=900, degree=2, epsilon=0.0001, kernel=poly, score=0.461, total=   0.0s
[CV] C=900, degree=2, epsilon=0.0001, kernel=poly ....................
[CV]  C=900, degree=2, epsilon=0.0001, kernel=poly, score=0.294, total=   0.0s
[CV] C=900, degree=2, epsilon=1e-05, kernel=linear ...................




[CV]  C=900, degree=2, epsilon=1e-05, kernel=linear, score=0.709, total=   0.1s
[CV] C=900, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=900, degree=2, epsilon=1e-05, kernel=linear, score=0.610, total=   0.1s
[CV] C=900, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=900, degree=2, epsilon=1e-05, kernel=linear, score=0.573, total=   0.1s
[CV] C=900, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=900, degree=2, epsilon=1e-05, kernel=linear, score=0.571, total=   0.1s
[CV] C=900, degree=2, epsilon=1e-05, kernel=linear ...................
[CV]  C=900, degree=2, epsilon=1e-05, kernel=linear, score=0.555, total=   0.1s
[CV] C=900, degree=2, epsilon=1e-05, kernel=poly .....................
[CV]  C=900, degree=2, epsilon=1e-05, kernel=poly, score=0.584, total=   0.0s
[CV] C=900, degree=2, epsilon=1e-05, kernel=poly .....................
[CV]  C=900, degree=2, epsilon=1e-05, kernel=poly, score=0.386, total=   0.0s
[CV] C=900, degree



[CV]  C=900, degree=2, epsilon=1e-05, kernel=poly, score=0.461, total=   0.0s
[CV] C=900, degree=2, epsilon=1e-05, kernel=poly .....................
[CV]  C=900, degree=2, epsilon=1e-05, kernel=poly, score=0.294, total=   0.1s
[CV] C=900, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=900, degree=2, epsilon=1e-06, kernel=linear, score=0.709, total=   0.1s
[CV] C=900, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=900, degree=2, epsilon=1e-06, kernel=linear, score=0.610, total=   0.0s
[CV] C=900, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=900, degree=2, epsilon=1e-06, kernel=linear, score=0.573, total=   0.1s
[CV] C=900, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=900, degree=2, epsilon=1e-06, kernel=linear, score=0.571, total=   0.1s
[CV] C=900, degree=2, epsilon=1e-06, kernel=linear ...................
[CV]  C=900, degree=2, epsilon=1e-06, kernel=linear, score=0.555, total=   0.1s
[CV] C=900, degree






svr train score 0.683, svr test score: 0.734

*** SVR Best Parameters ***
C: 700
cache_size: 200
coef0: 0.0
degree: 2
epsilon: 1e-06
gamma: auto_deprecated
kernel: linear
max_iter: -1
shrinking: True
tol: 0.001
verbose: False



[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:    4.9s finished


### GridSearch for DT

In [18]:
###Challenge 2:Decision Tree parameter grid###
param_grid_dt = dict(min_samples_leaf=np.arange(9, 13, 1, int), 
                  max_depth = np.arange(4,7,1, int),
                  min_impurity_decrease = [0, 1, 2],
                 )

dt = GridSearchCV(DecisionTreeRegressor(random_state=0), param_grid=param_grid_dt, cv=5,  verbose=3)



#fit model
dt = dt.fit(X_train,y_train.values.ravel())


#print score
print('\n\ndt train score %.3f, dt test score: %.3f' % (
dt.score(X_train,y_train),
dt.score(X_test, y_test)))
print_best_params(dt)



Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] max_depth=4, min_impurity_decrease=0, min_samples_leaf=9 ........
[CV]  max_depth=4, min_impurity_decrease=0, min_samples_leaf=9, score=0.817, total=   0.0s
[CV] max_depth=4, min_impurity_decrease=0, min_samples_leaf=9 ........
[CV]  max_depth=4, min_impurity_decrease=0, min_samples_leaf=9, score=0.819, total=   0.0s
[CV] max_depth=4, min_impurity_decrease=0, min_samples_leaf=9 ........
[CV]  max_depth=4, min_impurity_decrease=0, min_samples_leaf=9, score=0.846, total=   0.0s
[CV] max_depth=4, min_impurity_decrease=0, min_samples_leaf=9 ........
[CV]  max_depth=4, min_impurity_decrease=0, min_samples_leaf=9, score=0.812, total=   0.0s
[CV] max_depth=4, min_impurity_decrease=0, min_samples_leaf=9 ........
[CV]  max_depth=4, min_impurity_decrease=0, min_samples_leaf=9, score=0.796, total=   0.0s
[CV] max_depth=4, min_impurity_decrease=0, min_samples_leaf=10 .......
[CV]  max_depth=4, min_impurity_decrease=0, min_samples_l

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s



[CV] max_depth=4, min_impurity_decrease=1, min_samples_leaf=12 .......
[CV]  max_depth=4, min_impurity_decrease=1, min_samples_leaf=12, score=0.815, total=   0.0s
[CV] max_depth=4, min_impurity_decrease=1, min_samples_leaf=12 .......
[CV]  max_depth=4, min_impurity_decrease=1, min_samples_leaf=12, score=0.796, total=   0.0s
[CV] max_depth=4, min_impurity_decrease=2, min_samples_leaf=9 ........
[CV]  max_depth=4, min_impurity_decrease=2, min_samples_leaf=9, score=0.817, total=   0.0s
[CV] max_depth=4, min_impurity_decrease=2, min_samples_leaf=9 ........
[CV]  max_depth=4, min_impurity_decrease=2, min_samples_leaf=9, score=0.819, total=   0.0s
[CV] max_depth=4, min_impurity_decrease=2, min_samples_leaf=9 ........
[CV]  max_depth=4, min_impurity_decrease=2, min_samples_leaf=9, score=0.846, total=   0.0s
[CV] max_depth=4, min_impurity_decrease=2, min_samples_leaf=9 ........
[CV]  max_depth=4, min_impurity_decrease=2, min_samples_leaf=9, score=0.812, total=   0.0s
[CV] max_depth=4, min_imp

[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed:    0.7s finished


### GridSearch for RF

In [19]:
###Challenge 3:Random Forest parameter grid###
param_grid_rf = dict(n_estimators=[20],
                     max_depth=np.arange(1, 13, 2),
                     min_samples_split=[2],
                     min_samples_leaf= np.arange(1, 15, 2, int),
                     bootstrap=[True, False],
                     oob_score=[False, ])


forest = GridSearchCV(RandomForestRegressor(random_state=0), param_grid=param_grid_rf, cv=5, verbose=3)

#fit model
forest.fit(X_train,y_train.values.ravel())


#print score
print('\n\nforest train score %.3f, forest test score: %.3f' % (
forest.score(X_train,y_train),
forest.score(X_test, y_test)))

print_best_params(forest)

Fitting 5 folds for each of 84 candidates, totalling 420 fits
[CV] bootstrap=True, max_depth=1, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=True, max_depth=1, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False, score=0.616, total=   0.0s
[CV] bootstrap=True, max_depth=1, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=True, max_depth=1, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False, score=0.454, total=   0.0s
[CV] bootstrap=True, max_depth=1, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=True, max_depth=1, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False, score=0.646, total=   0.0s

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s



[CV] bootstrap=True, max_depth=1, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=True, max_depth=1, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False, score=0.595, total=   0.0s
[CV] bootstrap=True, max_depth=1, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=True, max_depth=1, min_samples_leaf=1, min_samples_split=2, n_estimators=20, oob_score=False, score=0.582, total=   0.0s
[CV] bootstrap=True, max_depth=1, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=True, max_depth=1, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False, score=0.616, total=   0.0s
[CV] bootstrap=True, max_depth=1, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False 
[CV]  bootstrap=True, max_depth=1, min_samples_leaf=3, min_samples_split=2, n_estimators=20, oob_score=False, score=0.454, total=   0.0s
[CV] bootstrap=

[Parallel(n_jobs=1)]: Done 420 out of 420 | elapsed:   18.0s finished
