# Размер случайного леса

In [23]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold, cross_val_score

In [10]:
data = pd.read_csv('data/abalone.csv')
data.head()

Unnamed: 0,Sex,Length,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [11]:
data['Sex'] = data['Sex'].map(lambda x: 1 if x == 'M' else (-1 if x == 'F' else 0))
data.head()

Unnamed: 0,Sex,Length,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings
0,1,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,-1,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,1,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,0,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [21]:
y_train = data['Rings']
X_train = data.loc[:, 'Sex':'ShellWeight']

In [24]:
kfold = KFold(n_splits=5, shuffle=True, random_state=1)
scores = []
for i in range(1, 51):
    forest = RandomForestRegressor(n_estimators=i, random_state=1)
    forest.fit(X_train, y_train)
    val_score = cross_val_score(forest, X_train, y_train, cv=kfold, scoring='r2')
    scores.append(val_score)

In [58]:
pd.DataFrame(scores).mean(1).map(lambda x: x if x > 0.52 else None).sort_values(ascending=True).head(1)

21    0.520158
dtype: float64

# Градиентный бустинг над решающими деревьями

In [105]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
import math
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
%matplotlib inline

In [76]:
data = pd.read_csv('data/gbm-data.csv')
data_values = data.values
X = data.loc[:, 'D1':'D1776']
y = data['Activity']

In [77]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=241)

In [103]:
# learning_rate = [1, 0.5, 0.3, 0.2, 0.1] 
learning_rate = [0.2] 
for i in learning_rate:
    clf = GradientBoostingClassifier(learning_rate=i, n_estimators=250, verbose=True, random_state=241)
    clf.fit(X_train, y_train)
    
    ll_train = []
    for x in clf.staged_decision_function(X_train):
        y_pred = [1 / (1 + math.exp(-y)) for y in x]
        ll_train.append(log_loss(y_train, y_pred))
        
    ll_test = []
    for x in clf.staged_decision_function(X_test):
        y_pred = [1 / (1 + math.exp(-y)) for y in x]
        ll_test.append(log_loss(y_test, y_pred))
    
#     plt.figure()
#     plt.plot(ll_train, 'r', linewidth=2)    
#     plt.plot(ll_test, 'g', linewidth=2)
#     plt.legend(['train', 'test'])


      Iter       Train Loss   Remaining Time 
         1           1.2613           15.30s
         2           1.1715           13.12s
         3           1.1009           12.56s
         4           1.0529           12.32s
         5           1.0130           12.22s
         6           0.9740           11.99s
         7           0.9475           11.07s
         8           0.9197           11.01s
         9           0.8979           10.43s
        10           0.8730           10.42s
        20           0.7207            8.04s
        30           0.6055            7.18s
        40           0.5244            6.52s
        50           0.4501            6.08s
        60           0.3908            5.67s
        70           0.3372            5.35s
        80           0.3009            5.03s
        90           0.2603            4.73s
       100           0.2327            4.37s
       200           0.0835            1.39s


In [104]:
print(min(ll_train), ll_train.index(min(ll_train)))
print(min(ll_test), ll_test.index(min(ll_test)))

0.024906124380084917 249
0.5314507963190638 36


In [106]:
clf = RandomForestClassifier(n_estimators=36, random_state=241)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=36, n_jobs=1,
            oob_score=False, random_state=241, verbose=0, warm_start=False)

In [107]:
y_pred = clf.predict_proba(X_test)
log_loss(y_test, y_pred)

0.5413812861804069