In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
#from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier as rf

In [4]:
# 데이터 불러오기
raw_data = pd.read_csv('final_data.csv')
raw_data.head()

Unnamed: 0,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,...,t493,t494,t495,t496,t497,t498,t499,t500,t501,y
0,0.019336,0.0,0.0,0.0,0.003223,0.0,0.0,0.0,0.0,0.0,...,0.029004,0.009668,0.012891,0.0,0.0,0.0,0.003223,0.003223,0.0,0
1,0.0,0.0,0.012891,0.0,0.016113,0.0,0.006445,0.0,0.003223,0.022559,...,0.0,0.0,0.0,0.009668,0.0,0.0,0.0,0.009668,0.0,0
2,0.0,0.009668,0.0,0.0,0.006445,0.012891,0.0,0.0,0.029004,0.025781,...,0.006445,0.003223,0.012891,0.0,0.0,0.0,0.0,0.003223,0.0,0
3,0.0,0.0,0.0,0.016113,0.006445,0.003223,0.0,0.022559,0.012891,0.0,...,0.0,0.003223,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.016113,0.0,0.0,0.0,0.012891,0.0,0.0,0.003223,0.003223,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003223,0.0,0


In [5]:
# x와 y로 데이터 구분하기
x = raw_data.iloc[:, 0:501]
y = raw_data.iloc[:, 501]

In [6]:
# train/test로 데이터 나누기
seed = 7
test_size = 0.3
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = test_size, random_state=seed)

In [7]:
# fit model no training data
model = rf()
model.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [8]:
# make predictions for test data
y_pred = model.predict(x_test)

In [9]:
y_test.values

array([0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0])

In [10]:
# evaluate predictions
confusion_mat = confusion_matrix(y_test, y_pred)

acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f_score = f1_score(y_test.values, y_pred)

In [11]:
print(confusion_mat)
print('acc:',acc)
print('precision:',precision)
print('recall:',recall)
print('fscore:',f_score)

[[18  1]
 [ 4 10]]
acc: 0.8484848484848485
precision: 0.9090909090909091
recall: 0.7142857142857143
fscore: 0.8


In [140]:
#parameter tuning
from pprint import pprint

print('Parameters currently in use:\n')
pprint(model.get_params())

Parameters currently in use:

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


In [141]:
#from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 350, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(55, 65, num = 10)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [3,4,5]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 3]

# Method of selecting samples for training each tree
bootstrap = [True]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

pprint(random_grid)

{'bootstrap': [True],
 'max_depth': [55, 56, 57, 58, 59, 60, 61, 62, 63, 65, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 3],
 'min_samples_split': [3, 4, 5],
 'n_estimators': [200, 216, 233, 250, 266, 283, 300, 316, 333, 350]}


In [142]:
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = GridSearchCV(estimator = model, param_grid = random_grid, scoring = 'accuracy',
                        cv = 5, verbose=2, n_jobs = 4)

rf_random = GridSearchCV(estimator = model, param_grid = random_grid, scoring = 'accuracy',
                        cv = 5, verbose=2, n_jobs = 4)

In [None]:
rf_random.fit(x_train, y_train)

In [144]:
print(rf_random.best_params_)
#print(pd.DataFrame(rf_random.cv_results_))

{'bootstrap': True, 'max_depth': 55, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 3, 'n_estimators': 283}


In [145]:
rf_random_pred = rf_random.predict(x_test)

confusion_mat_tuned = confusion_matrix(y_test, rf_random_pred)
acc_tuned = accuracy_score(y_test, rf_random_pred)
precision_tuned = precision_score(y_test, rf_random_pred)
recall_tuned = recall_score(y_test, rf_random_pred)
f_score_tuned = f1_score(y_test.values, rf_random_pred)

In [146]:
print(confusion_mat_tuned)
print('acc:',acc_tuned)
print('precision:',precision_tuned)
print('recall:',recall_tuned)
print('fscore:',f_score_tuned)

[[18  1]
 [ 1 13]]
acc: 0.9393939393939394
precision: 0.9285714285714286
recall: 0.9285714285714286
fscore: 0.9285714285714286
