随机森林模型实例

作者：谢文伟

邮件：jim.xie.cn@outlook.com

主页：https://github.com/jim-xie-cn/ai-cv

In [None]:
#引用以下包做数据处理
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
#引用以下包做数据可视化
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from warnings import filterwarnings

In [None]:
# Notebook-wide display configuration.
filterwarnings('ignore')  # suppress all warnings in the cell output
pd.set_option('display.float_format',lambda x:'%.2f'%x) # print floats with 2 decimals instead of scientific notation
sns.set(font_scale=1.5) # scale up the fonts used in plots
# The font overrides are kept after sns.set(), which applies its own rc settings.
plt.rcParams['font.sans-serif']=['SimHei'] # font with CJK glyphs so Chinese text renders in plots
plt.rcParams['axes.unicode_minus']=False # SimHei has no U+2212 glyph; use ASCII '-' for negative tick labels
plt.style.use({'figure.figsize':(24, 8)})  # default canvas size

In [None]:
# Scatter plot of the dataset
def plot_dataset(x, y):
    """Draw a scatter plot of a two-feature dataset, one marker per class.

    Samples labelled 0, 1 and 2 are drawn on the current matplotlib axes with
    the markers '*', '1' and '+' respectively; samples carrying any other
    label are not drawn.

    Args:
        x: Array-like with samples in rows; only columns 0 and 1 are plotted.
        y: Array-like of class labels aligned with the rows of ``x``.
    """
    # Coerce to ndarrays so the boolean-mask indexing below also works when
    # plain Python sequences are passed in.
    x = np.asarray(x)
    y = np.asarray(y)
    for label, marker in ((0, '*'), (1, '1'), (2, '+')):
        members = x[y == label]
        plt.scatter(members[:, 0], members[:, 1], marker=marker, s=150)
# Draw the decision regions of a classifier
def plot_predictions(clf,x):
    """Shade the regions of the plane according to the class ``clf`` predicts.

    A 200 x 200 grid is laid over the range of the first two columns of ``x``
    (padded by 1 on every side), ``clf.predict`` is evaluated on every grid
    point, and the result is drawn as a filled contour plot on the current
    matplotlib axes.

    Args:
        clf: Object exposing ``predict`` that accepts an (n_points, 2) array.
        x: Array whose columns 0 and 1 define the plotted range.
    """
    feature0, feature1 = x[:, 0], x[:, 1]
    # Grid coordinates along each feature, with one unit of padding.
    x0_range = np.linspace(np.min(feature0) - 1, np.max(feature0) + 1, 200)
    x1_range = np.linspace(np.min(feature1) - 1, np.max(feature1) + 1, 200)
    grid_x0, grid_x1 = np.meshgrid(x0_range, x1_range)
    # One row per grid point: (x0, x1).
    grid_points = np.column_stack((grid_x0.ravel(), grid_x1.ravel()))
    # Predictions are reshaped back to the grid layout for contourf.
    labels = clf.predict(grid_points).reshape(grid_x0.shape)
    plt.contourf(grid_x0, grid_x1, labels, cmap=plt.cm.brg, alpha=0.25)
# Show the decision regions together with the test data
def plot_predict_curve(model,x,y):
    """Overlay the samples ``(x, y)`` on the decision regions of ``model``.

    Calls ``plot_predictions(model, x)`` first and ``plot_dataset(x, y)``
    second, so the scatter points are drawn after the shaded regions.

    Args:
        model: Classifier passed through to ``plot_predictions``.
        x: Feature array passed to both plotting helpers.
        y: Class labels passed to ``plot_dataset``.
    """
    plot_predictions(model,x)
    plot_dataset(x,y)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
# Generate random samples (2 features, 3 classes)
X,Y= make_blobs(n_samples=1000,n_features=2,centers=3)
# Split into training and test sets: 30% of the samples for testing, 70% for training
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.30)
# Define, train and evaluate the model (a forest of 5 decision trees)
model = RandomForestClassifier(n_estimators=5)
model.fit(X_train, Y_train)
# score() is evaluated on the held-out test split
score = model.score(X_test,Y_test)
print("模型得分:",score)
# Visualise the model output: decision regions with the test samples on top
plot_predict_curve(model,X_test, Y_test)