In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier

# 1、获取
titanic = pd.read_csv("./datasets/titanic.csv")
# 2、筛选特征值和目标值
x = titanic[["pclass", "age", "sex"]]
y = titanic["survived"]
# 3、缺失值处理
x["age"].fillna(x["age"].mean(), inplace=True)
# 4、转成字典
x = x.to_dict(orient="records")
# 5、划分
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)
# 6、字典特征抽取
transfer = DictVectorizer()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)
# 7、决策树预估器
estimator = DecisionTreeClassifier(criterion='entropy', max_depth=6)
estimator.fit(x_train, y_train)
# 8、模型评估
y_predict = estimator.predict(x_test)
print("y_predict: ", y_predict)
print("对比真实值和预测值：", y_test == y_predict)
score = estimator.score(x_test, y_test)
print("准确率为：", score)
# 9、可视化决策树（http://webgraphviz.com/）
export_graphviz(estimator, out_file='./graphviz/titanic_tree.dot', feature_names=transfer.get_feature_names())

# 使用随机森林进行预测
# 1、随机森林预估器
estimator = RandomForestClassifier()
# 2、模型选择和调优（网格搜索，交叉验证）
param_dict = {"n_estimators": [120, 200, 300, 500, 800, 1200],
             "max_depth": [5, 8, 15, 25, 30]}
estimator = GridSearchCV(estimator, param_grid=param_dict, cv=3)
estimator.fit(x_train, y_train)
# 3、模型评估
y_predict = estimator.predict(x_test)
print("y_predict: ", y_predict)
print("对比真实值和预测值：", y_test == y_predict)
score = estimator.score(x_test, y_test)
print("准确率为：", score)
print("最佳参数：", estimator.best_params_)
print("最佳结果：", estimator.best_score_)
print("最佳预估器：", estimator.best_estimator_)
print("交叉验证结果：", estimator.cv_results_)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


y_predict:  [0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1
 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 1
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0
 0 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0
 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0
 0 1 1 1 0 0 1 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1]
对比真实值和预测值： 831      True
261      True
1210     True
1155     True
255      True
        ...  
1146     True
1125    False
386      True
1025    False
337      True
Name: survived, Length: 329, dtype: bool
准确率为： 0.7872340425531915
y_predict:  [0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1 0 1 0 0 0
 0