In [1]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from scipy.io.arff import loadarff 


In [2]:
raw_data = loadarff('Dry_Bean_Dataset.arff')
df = pd.DataFrame(raw_data[0])

df.Class = pd.factorize(df.Class)[0]

df.head()

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,28395.0,610.291,208.178117,173.888747,1.197191,0.549812,28715.0,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724,0
1,28734.0,638.018,200.524796,182.734419,1.097356,0.411785,29172.0,191.27275,0.783968,0.984986,0.887034,0.953861,0.006979,0.003564,0.909851,0.99843,0
2,29380.0,624.11,212.82613,175.931143,1.209713,0.562727,29690.0,193.410904,0.778113,0.989559,0.947849,0.908774,0.007244,0.003048,0.825871,0.999066,0
3,30008.0,645.884,210.557999,182.516516,1.153638,0.498616,30724.0,195.467062,0.782681,0.976696,0.903936,0.928329,0.007017,0.003215,0.861794,0.994199,0
4,30140.0,620.134,201.847882,190.279279,1.060798,0.33368,30417.0,195.896503,0.773098,0.990893,0.984877,0.970516,0.006697,0.003665,0.9419,0.999166,0


In [3]:
X = df.drop('Class', axis=1)  
y = df['Class']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state = 1)

In [4]:
from itertools import product
n_estimators = [10, 100]
max_features = ['sqrt', 'log2']
max_depth = [1,15]
min_samples_split = [2,20,50]
min_samples_leaf = [2,20,50]
bootstrap = [True, False]

t = pd.DataFrame(columns=[
    "n_estimators", "max_features","max_depth","min_samples_split",
    "min_samples_leaf","bootstrap","score"
])


for n,f,d,s,l,b in product(n_estimators, max_features,max_depth, min_samples_split, min_samples_leaf,bootstrap):
    rf = RandomForestClassifier(
        n_estimators=n,
        max_features=f,
        min_samples_leaf=l,
        max_depth=d,
        bootstrap=b
    )

    rf.fit(X_train,y_train)
    sc = rf.score(X_test,y_test)
    t.loc[t.shape[0]] = [n,f,d,s,l,b,sc]


t = t.sort_values('score', ascending=False)
t.head(10)

Unnamed: 0,n_estimators,max_features,max_depth,min_samples_split,min_samples_leaf,bootstrap,score
126,100,log2,15,2,2,True,0.92431
103,100,sqrt,15,50,2,False,0.924087
132,100,log2,15,20,2,True,0.923865
90,100,sqrt,15,2,2,True,0.923197
102,100,sqrt,15,50,2,True,0.923197
138,100,log2,15,50,2,True,0.922974
96,100,sqrt,15,20,2,True,0.922752
139,100,log2,15,50,2,False,0.922084
97,100,sqrt,15,20,2,False,0.921193
55,10,log2,15,2,2,False,0.920971


In [5]:
rf = RandomForestClassifier(
        n_estimators=t.iloc[0]['n_estimators'],
        max_features=t.iloc[0]['max_features'],
        min_samples_leaf=t.iloc[0]['min_samples_leaf'],
        max_depth=t.iloc[0]['max_depth'],
        bootstrap=t.iloc[0]['bootstrap']
    )

rf.fit(X_train,y_train)

print(f'Точность\nна обучающей выборке:{rf.score(X_train,y_train):.4f}\nна тестовой:{rf.score(X_test,y_test):.4f}')


Точность
на обучающей выборке:0.9838
на тестовой:0.9254


In [6]:
rf = RandomForestClassifier(
        n_estimators=t.iloc[0]['n_estimators'],
        max_features=t.iloc[0]['max_features'],
        min_samples_leaf=t.iloc[0]['min_samples_leaf'],
        max_depth=t.iloc[0]['max_depth'],
        bootstrap=t.iloc[0]['bootstrap']
    )

rf.fit(X_train,y_train)

print(f'Точность\nна обучающей выборке:{rf.score(X_train,y_train):.4f}\nна тестовой:{rf.score(X_test,y_test):.4f}')


Точность
на обучающей выборке:0.9822
на тестовой:0.9232


In [7]:
rf = RandomForestClassifier(
        n_estimators=t.iloc[0]['n_estimators'],
        max_features=t.iloc[0]['max_features'],
        min_samples_leaf=t.iloc[0]['min_samples_leaf'],
        max_depth=t.iloc[0]['max_depth'],
        bootstrap=t.iloc[0]['bootstrap']
    )

rf.fit(X_train,y_train)

print(f'Точность\nна обучающей выборке:{rf.score(X_train,y_train):.4f}\nна тестовой:{rf.score(X_test,y_test):.4f}')

Точность
на обучающей выборке:0.9843
на тестовой:0.9239
