### 랜덤포레스트
- ensemble을 기반으로 고정밀 분류, 회귀, 클러스터링 등을 구현
- 학습 전용 데이터를 기반으로 다수의 의사결정 트리(C-tree)를 만들고, 각 트리의 예측을 다수결로 종합해 결과를 유도하므로 높은 정밀도를 얻을 수 있다

In [1]:
import pandas as pd


In [2]:
# Load the iris dataset from the local CSV file into a DataFrame.
csv = pd.read_csv(filepath_or_buffer='../data/iris.csv')

# Preview the first five rows.
csv.head(n=5)

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Name
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
import numpy as np

In [4]:
# Separate the table into features and target.

# Target: the 'Name' column.
csv_label = csv.loc[:, 'Name']
# Features: every column except the last one.
csv_data = csv.iloc[:, 0:-1]



In [5]:
# Split the iris data into a training part and a test part.
from sklearn.model_selection import train_test_split

# stratify keeps the label proportions the same in both parts; the fixed
# random_state makes the split repeatable.
train_data, test_data, train_label, test_label = train_test_split(
    csv_data, csv_label, stratify=csv_label, random_state=42
)


In [6]:
# 학습시키기

from sklearn.ensemble import RandomForestClassifier

In [25]:
# Create a random forest with default hyper-parameters and train it in one
# chained call (fit returns the estimator itself).
clf = RandomForestClassifier().fit(train_data, train_label)

# Mean accuracy on the same data the model was trained on.
train_accuracy = clf.score(train_data, train_label)
print(train_accuracy)

1.0


In [26]:
# Measure generalization: mean accuracy on the held-out test split.
clf.score(X=test_data, y=test_label)


0.9210526315789473

In [27]:
# 정답률 구하기
from sklearn import metrics

In [28]:
# Compute the predicted label for every test sample.
pred = clf.predict(X=test_data)

In [29]:
# Per-class precision / recall / F1 of the predictions against the true labels.
report = metrics.classification_report(y_true=test_label, y_pred=pred)
print(report)

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        12
Iris-versicolor       0.86      0.92      0.89        13
 Iris-virginica       0.92      0.85      0.88        13

       accuracy                           0.92        38
      macro avg       0.92      0.92      0.92        38
   weighted avg       0.92      0.92      0.92        38



---
## 독버섯과 관련된 데이터를 사용한 머신러닝

In [33]:
from urllib import request as req

# Remote source of the mushroom dataset and the local path it is saved to.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
local = '../data/mushroom.csv'

# Download the file; 'OK' is printed once urlretrieve has returned.
req.urlretrieve(url=url, filename=local)
print('OK')

OK


In [35]:
# header=None: no row is treated as a header, so columns get integer labels.
mr = pd.read_csv(filepath_or_buffer="../data/mushroom.csv", header=None)

# Preview the first five rows.
mr.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


- 한줄이 버섯 한종류
- 0 : 분류(정답 레이블) -> p(독버섯, Poisonous), e(식용버섯, Edible)
- 1 : 버섯의 모양 -> b(종모양), c(원뿔), x(볼록), f(평평), k(혹), s(오목)
- 2 : 버섯 갓의 표면 -> f(섬유질), g(홈), y(비늘), s(매끄러움)
- 3 : 버섯의 머리 색깔 -> n(갈색), b(황갈색), c(연한갈색) ...

In [37]:
# To use machine learning here, every feature has to be numeric.
# ord() turns a single character into its integer code point.
ord('x')

120

In [47]:
# Column 0 is kept as-is (still a letter).
label_column = mr.iloc[:, 0]
# Every remaining cell is converted from a letter to its code point.
# NOTE: DataFrame.map needs pandas >= 2.1 (older versions call it applymap).
feature_codes = mr.iloc[:, 1:].map(ord)

# Put the untouched label column back in front of the encoded features.
mr2 = pd.concat([label_column, feature_codes], axis=1)
mr2.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,p,120,115,110,116,112,102,99,110,107,...,115,119,119,112,119,111,112,107,115,117
1,e,120,115,121,116,97,102,99,98,107,...,115,119,119,112,119,111,112,110,110,103
2,e,98,115,119,116,108,102,99,98,110,...,115,119,119,112,119,111,112,110,110,109
3,p,120,121,119,116,112,102,99,110,110,...,115,119,119,112,119,111,112,107,115,117
4,e,120,115,103,102,110,102,119,98,107,...,115,119,119,112,119,111,101,110,97,103


In [93]:
# Features: the encoded columns 1 onward; target: the label in column 0.
mr2_data = mr2.iloc[:, 1:]
mr2_target = mr2.iloc[:, 0]

# Fixed seed so the split (and every score derived from it) is reproducible,
# consistent with the other train/test splits in this notebook.
# stratify keeps the e/p proportions the same in both parts.
train_data, test_data, train_label, test_label = train_test_split(
    mr2_data,
    mr2_target,
    random_state=42,
    stratify=mr2_target
)


In [94]:
# Train a default random forest on the code-point-encoded mushroom features.
clf = RandomForestClassifier().fit(train_data, train_label)

# Print the accuracy on the training split first, then on the test split.
for split_x, split_y in ((train_data, train_label), (test_data, test_label)):
    print(clf.score(split_x, split_y))

1.0
1.0


In [95]:
# Predict the test labels and print the per-class metrics.
pred = clf.predict(X=test_data)
print(metrics.classification_report(y_true=test_label, y_pred=pred))

In [96]:
# Print the classification report for the current predictions.
report = metrics.classification_report(test_label, pred)
print(report)

              precision    recall  f1-score   support

           e       1.00      1.00      1.00      1052
           p       1.00      1.00      1.00       979

    accuracy                           1.00      2031
   macro avg       1.00      1.00      1.00      2031
weighted avg       1.00      1.00      1.00      2031



### One-Hot Encoding
: 숫자데이터가 숫자로서 의미가 있으면 상관 없지만, 위의 데이터는 분류를 위한 데이터이므로, 숫자 크기의 의미가 없다.  
이때 사용하는 것이 One-Hot Encoding이다.

In [97]:
# Check which distinct values occur in column 1.
mr2.loc[:, 1].unique()

array([120,  98, 115, 102, 107,  99])

In [99]:
# Practice: one-hot encode column 1 only; the generated columns are
# named '1_<value>'.
pd.get_dummies(mr2_data, prefix='1', columns=[1])

Unnamed: 0,2,3,4,5,6,7,8,9,10,11,...,19,20,21,22,1_98,1_99,1_102,1_107,1_115,1_120
0,115,110,116,112,102,99,110,107,101,101,...,112,107,115,117,False,False,False,False,False,True
1,115,121,116,97,102,99,98,107,101,99,...,112,110,110,103,False,False,False,False,False,True
2,115,119,116,108,102,99,98,110,101,99,...,112,110,110,109,True,False,False,False,False,False
3,121,119,116,112,102,99,110,110,101,101,...,112,107,115,117,False,False,False,False,False,True
4,115,103,102,110,102,119,98,107,116,101,...,101,110,97,103,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,115,110,102,110,97,99,98,121,101,63,...,112,98,99,108,False,False,False,True,False,False
8120,115,110,102,110,97,99,98,121,101,63,...,112,98,118,108,False,False,False,False,False,True
8121,115,110,102,110,97,99,98,110,101,63,...,112,98,99,108,False,False,True,False,False,False
8122,121,110,102,121,102,99,110,98,116,63,...,101,119,118,108,False,False,False,True,False,False


In [112]:
# Apply it to the whole table: one-hot encode feature columns 1 through 22,
# prefixing each generated column with the label of the column it came from.
feature_columns = range(1, 23)
mr3 = pd.get_dummies(mr2_data, columns=feature_columns, prefix=feature_columns)
mr3

Unnamed: 0,1_98,1_99,1_102,1_107,1_115,1_120,2_102,2_103,2_115,2_121,...,21_115,21_118,21_121,22_100,22_103,22_108,22_109,22_112,22_117,22_119
0,False,False,False,False,False,True,False,False,True,False,...,True,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,True,False,False,True,False,...,False,False,False,False,True,False,False,False,False,False
2,True,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False
3,False,False,False,False,False,True,False,False,False,True,...,True,False,False,False,False,False,False,False,True,False
4,False,False,False,False,False,True,False,False,True,False,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,False,False,False,True,False,False,False,False,True,False,...,False,False,False,False,False,True,False,False,False,False
8120,False,False,False,False,False,True,False,False,True,False,...,False,True,False,False,False,True,False,False,False,False
8121,False,False,True,False,False,False,False,False,True,False,...,False,False,False,False,False,True,False,False,False,False
8122,False,False,False,True,False,False,False,False,False,True,...,False,True,False,False,False,True,False,False,False,False


In [172]:
# Use only the first 50 one-hot columns as features.
# NOTE(review): mr3 appears to contain more columns than this; presumably an
# intentional experiment with a reduced feature set - confirm.
kept_columns = range(50)
mr3_data = mr3.iloc[:, kept_columns]
# The labels still come from column 0 of mr2.
mr3_target = mr2.iloc[:, 0]

# Stratified, seeded split so the class balance and the result are repeatable.
train_data, test_data, train_label, test_label = train_test_split(
    mr3_data, mr3_target, stratify=mr3_target, random_state=42
)

In [173]:
# Fit a default random forest on the one-hot encoded features.
clf = RandomForestClassifier()
clf.fit(X=train_data, y=train_label)

# Accuracy on the training split, then on the test split.
train_accuracy = clf.score(train_data, train_label)
test_accuracy = clf.score(test_data, test_label)
print(train_accuracy)
print(test_accuracy)

0.999015263417036
0.9970457902511078


In [174]:
# Predict the test labels, then print the per-class metrics.
pred = clf.predict(test_data)
report = metrics.classification_report(test_label, pred)
print(report)

              precision    recall  f1-score   support

           e       1.00      1.00      1.00      1052
           p       0.99      1.00      1.00       979

    accuracy                           1.00      2031
   macro avg       1.00      1.00      1.00      2031
weighted avg       1.00      1.00      1.00      2031

