In [17]:
# Decision tree classifier: training and prediction
from pyspark.mllib.tree import DecisionTree,DecisionTreeModel
from pyspark.mllib.util import MLUtils
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
dataPath = "/home/zhb/Desktop/work/DecisionTreeShareProject/app/sample_libsvm_data.txt"
print(dataPath)

# `sc` is the SparkContext; it is not created in this cell and must already exist in the kernel.
data = MLUtils.loadLibSVMFile(sc, dataPath)
# 70% / 30% train/test split.
# NOTE(review): no seed is passed to randomSplit, so the split (and the counts
# printed below) is not pinned and may differ from run to run.
(trainingData, testData) = data.randomSplit([0.7, 0.3])
print("train data count: " + str(trainingData.count()))
print("test data count : " + str(testData.count()))

# Train the decision tree classifier.
# An empty categoricalFeaturesInfo means every feature is treated as continuous.
model = DecisionTree.trainClassifier(
    trainingData,
    numClasses=2,
    categoricalFeaturesInfo={},
    impurity='gini',
    maxDepth=4,
    maxBins=32,
)

# Predict on the test set (features only).
predictions = model.predict(testData.map(lambda x: x.features))
# Pair each true label with its prediction.
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
# Fraction of test samples that were misclassified.
# The pair is indexed instead of unpacked in the lambda signature:
# `lambda (v, p): ...` is Python-2-only syntax and a SyntaxError on Python 3.
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
print('Decision Tree Test Error = %5.3f%%'%(testErr*100))
print("Decision Tree Learned classifiction tree model : ")
print(model.toDebugString())

/home/zhb/Desktop/work/DecisionTreeShareProject/app/sample_libsvm_data.txt
train data count: 82
test data count : 18
Decision Tree Test Error = 0.000%
Decision Tree Learned classifiction tree model : 
DecisionTreeModel classifier of depth 2 with 5 nodes
  If (feature 434 <= 0.0)
   If (feature 100 <= 165.0)
    Predict: 0.0
   Else (feature 100 > 165.0)
    Predict: 1.0
  Else (feature 434 > 0.0)
   Predict: 1.0



In [18]:
# Random forest classifier: training and prediction
from pyspark.mllib.tree import RandomForest,RandomForestModel

# Train the random forest classifier on `trainingData` (defined in an earlier cell).
# An empty categoricalFeaturesInfo means every feature is treated as continuous.
# Use more trees (numTrees) in practice.
# featureSubsetStrategy="auto" lets the algorithm choose the strategy.
RF_Model = RandomForest.trainClassifier(
    trainingData,
    numClasses=2,
    categoricalFeaturesInfo={},
    numTrees=3,
    featureSubsetStrategy="auto",
    impurity='gini',
    maxDepth=4,
    maxBins=32,
)

# Predict on the test set (features only); `testData` comes from an earlier cell.
RF_Predictions = RF_Model.predict(testData.map(lambda x: x.features))
# Pair each true label with its prediction.
RF_LabelsAndPredictions = testData.map(lambda lp: lp.label).zip(RF_Predictions)
# Fraction of test samples that were misclassified.
# The pair is indexed instead of unpacked in the lambda signature:
# `lambda (v, p): ...` is Python-2-only syntax and a SyntaxError on Python 3.
RF_TestErr = RF_LabelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
print('Random Forest Test Error = %5.3f%%'%(RF_TestErr*100))
print("RandomForest Learned classifiction tree model : ")
print(RF_Model.toDebugString())

Random Forest Test Error = 0.000%
RandomForest Learned classifiction tree model : 
TreeEnsembleModel classifier with 3 trees

  Tree 0:
    If (feature 484 <= 0.0)
     If (feature 518 <= 0.0)
      If (feature 241 <= 245.0)
       Predict: 0.0
      Else (feature 241 > 245.0)
       Predict: 1.0
     Else (feature 518 > 0.0)
      Predict: 1.0
    Else (feature 484 > 0.0)
     Predict: 0.0
  Tree 1:
    If (feature 497 <= 0.0)
     If (feature 384 <= 0.0)
      If (feature 378 <= 0.0)
       Predict: 0.0
      Else (feature 378 > 0.0)
       Predict: 1.0
     Else (feature 384 > 0.0)
      Predict: 0.0
    Else (feature 497 > 0.0)
     Predict: 0.0
  Tree 2:
    If (feature 317 <= 0.0)
     If (feature 433 <= 0.0)
      Predict: 0.0
     Else (feature 433 > 0.0)
      Predict: 1.0
    Else (feature 317 > 0.0)
     If (feature 216 <= 253.0)
      Predict: 0.0
     Else (feature 216 > 253.0)
      Predict: 1.0



In [20]:
# GBDT (gradient-boosted trees) classifier: training and prediction
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel

# Train the GBDT classifier on `trainingData` (defined in an earlier cell).
# An empty categoricalFeaturesInfo means every feature is treated as continuous.
# Use more iterations (numIterations) in practice.
GBDT_Model = GradientBoostedTrees.trainClassifier(
    trainingData,
    categoricalFeaturesInfo={},
    numIterations=3,
)

# Predict on the test set (features only); `testData` comes from an earlier cell.
GBDT_Predictions = GBDT_Model.predict(testData.map(lambda x: x.features))
# Pair each true label with its prediction.
GBDT_LabelsAndPredictions = testData.map(lambda lp: lp.label).zip(GBDT_Predictions)
# Fraction of test samples that were misclassified.
# The pair is indexed instead of unpacked in the lambda signature:
# `lambda (v, p): ...` is Python-2-only syntax and a SyntaxError on Python 3.
GBDT_TestErr = GBDT_LabelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
print('GradientBoosted Trees Test Error = %5.3f%%'%(GBDT_TestErr*100))
print("GradientBoosted Trees Learned classifiction tree model : ")
print(GBDT_Model.toDebugString())

GradientBoosted Trees Test Error = 0.000%
GradientBoosted Trees Learned classifiction tree model : 
TreeEnsembleModel classifier with 3 trees

  Tree 0:
    If (feature 434 <= 0.0)
     If (feature 100 <= 165.0)
      Predict: -1.0
     Else (feature 100 > 165.0)
      Predict: 1.0
    Else (feature 434 > 0.0)
     Predict: 1.0
  Tree 1:
    If (feature 434 <= 0.0)
     If (feature 351 <= 251.0)
      If (feature 162 <= 43.0)
       Predict: -0.47681168808847024
      Else (feature 162 > 43.0)
       Predict: -0.4768116880884712
     Else (feature 351 > 251.0)
      Predict: 0.4768116880884712
    Else (feature 434 > 0.0)
     If (feature 348 <= 0.0)
      If (feature 296 <= 196.0)
       Predict: 0.47681168808847024
      Else (feature 296 > 196.0)
       Predict: 0.4768116880884703
     Else (feature 348 > 0.0)
      Predict: 0.4768116880884712
  Tree 2:
    If (feature 434 <= 0.0)
     If (feature 435 <= 0.0)
      Predict: -0.43819358104272066
     Else (feature 435 > 0.0)
      Pr