## 随机森林的直接调用  

#### 这一节课我们利用现成的随机森林库函数对蘑菇进行有毒和无毒的简单分类   

- 数据来源： https://www.kaggle.com/uciml/mushroom-classification/data  
- 对比模型： 随机森林，决策树，Logistic回归模型

In [3]:

%matplotlib inline
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

np.random.seed(19)

### 读入数据

In [4]:
data_folder = ""
data = pd.read_csv(os.path.join(data_folder, "mushrooms.csv"), header=None)


In [5]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
1,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
2,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
3,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
4,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u


### 处理二分类问题的标签

In [6]:
data[0] = data.apply(lambda row: 0 if row[0] == 'e' else 1, axis=1)

### 处理每列的数据

In [7]:
# 每一列如果有null，用"missing"代替
cols = np.arange(1,23)
for col in cols:
    if np.any(data[col].isnull()):
        data.loc[data[col].isnull(), col] = 'missing'


In [8]:
labelEncoders = dict()

# 对每一列进行one hot encoding
for col in cols:
    encoder = LabelEncoder()
    values = data[col].tolist()
    values.append('missing')  #加入missing这种值
    encoder.fit(values)
    labelEncoders[col] = encoder

# 计算one hot encoding之后的列数
dimensionality = 0
for col, encoder in labelEncoders.items():
    dimensionality += len(encoder.classes_)
print("dimensionality:  %d" % (dimensionality))

dimensionality:  161


In [9]:
# 用于测试数据的变换
def transform(df):
    N, _ = df.shape
    X = np.zeros((N, dimensionality))
    i = 0
    for col ,encoder in labelEncoders.items():
        k = len(encoder.classes_)
        X[np.arange(N), encoder.transform(df[col]) + i] = 1
        i += k
    return X

In [12]:
# 准备数据和标签
X = transform(data.iloc[:,1:])
Y = data[0].to_numpy()

### Logistic回归的表现

In [13]:
logistic_model = LogisticRegression()
print("logistic Regression performance: %f" % (cross_val_score(logistic_model, X, Y, cv=8).mean()))

logistic Regression performance: 0.925634


### 决策树的表现

In [14]:
tree_model = DecisionTreeClassifier()
print("Decision Tree performance: %f" % (cross_val_score(tree_model, X, Y, cv=8).mean()))

Decision Tree performance: 0.930819


### 随机森林的表现

In [15]:
forest = RandomForestClassifier(n_estimators=20)
print("Random Forest performance: %f" % (cross_val_score(tree_model, X, Y, cv=8).mean()))

Random Forest performance: 0.945475


### Bagging决策树的实现

In [49]:
class BaggedTreeClassifier(BaseEstimator):
    def __init__(self, M):
        self.M = M

    def fit(self, X, Y):
        N = len(X)
        self.models = []
        for m in range(self.M):
            idx = np.random.choice(N, size=N, replace=True)
            Xb = X[idx]
            Yb = Y[idx]

            model = DecisionTreeClassifier(max_depth=2)
            model.fit(Xb, Yb)
            self.models.append(model)

    def predict(self, X):
        # no need to keep a dictionary since we are doing binary classification
        predictions = np.zeros(len(X))
        for model in self.models:
            predictions += model.predict(X)
        return np.round(predictions / self.M)

    def score(self, X, Y):
        P = self.predict(X)
        return np.mean(Y == P)

In [19]:
baggedtc = BaggedTreeClassifier(20)

In [20]:
cross_val_score(baggedtc, X, Y, cv=8).mean()

0.9055898723866413

In [21]:
fakerf = FakeRandomForest(20)

In [22]:
cross_val_score(fakerf, X, Y, cv=8).mean()

0.8887257088553586