# Tree Methods
- 참고 : https://spark.apache.org/docs/latest/ml-classification-regression.html#decision-trees

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop2.7.tgz 
!tar xf spark-3.1.2-bin-hadoop2.7.tgz
!pip install -q findspark

import os
import findspark

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop2.7"

findspark.init()
findspark.find()

'/content/spark-3.1.2-bin-hadoop2.7'

In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import (RandomForestClassifier, GBTClassifier, 
                                       DecisionTreeClassifier)

In [3]:
from pyspark.sql import SparkSession
# Create the Spark session for this notebook (or reuse one that is already
# running), registered under the application name 'rf'.
spark = (
    SparkSession.builder
    .appName('rf')
    .getOrCreate()
)

In [4]:
# Read the LIBSVM-formatted sample file bundled with the Spark distribution
# into a DataFrame with a `label` column and a sparse `features` vector column.
data = (
    spark.read
    .format("libsvm")
    .load("/content/spark-3.1.2-bin-hadoop2.7/data/mllib/sample_libsvm_data.txt")
)

In [5]:
# Preview the loaded DataFrame; show() prints only the first 20 rows and
# truncates the long feature vectors.
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [6]:
# Inspect the first row in full: its label plus every non-zero entry of the
# 692-dimensional sparse feature vector.
data.head()

Row(label=0.0, features=SparseVector(692, {127: 51.0, 128: 159.0, 129: 253.0, 130: 159.0, 131: 50.0, 154: 48.0, 155: 238.0, 156: 252.0, 157: 252.0, 158: 252.0, 159: 237.0, 181: 54.0, 182: 227.0, 183: 253.0, 184: 252.0, 185: 239.0, 186: 233.0, 187: 252.0, 188: 57.0, 189: 6.0, 207: 10.0, 208: 60.0, 209: 224.0, 210: 252.0, 211: 253.0, 212: 252.0, 213: 202.0, 214: 84.0, 215: 252.0, 216: 253.0, 217: 122.0, 235: 163.0, 236: 252.0, 237: 252.0, 238: 252.0, 239: 253.0, 240: 252.0, 241: 252.0, 242: 96.0, 243: 189.0, 244: 253.0, 245: 167.0, 262: 51.0, 263: 238.0, 264: 253.0, 265: 253.0, 266: 190.0, 267: 114.0, 268: 253.0, 269: 228.0, 270: 47.0, 271: 79.0, 272: 255.0, 273: 168.0, 289: 48.0, 290: 238.0, 291: 252.0, 292: 252.0, 293: 179.0, 294: 12.0, 295: 75.0, 296: 121.0, 297: 21.0, 300: 253.0, 301: 243.0, 302: 50.0, 316: 38.0, 317: 165.0, 318: 253.0, 319: 233.0, 320: 208.0, 321: 84.0, 328: 253.0, 329: 252.0, 330: 165.0, 343: 7.0, 344: 178.0, 345: 252.0, 346: 240.0, 347: 71.0, 348: 19.0, 349: 28.0,

## train, test 데이터셋 분리

In [7]:
# Randomly split the rows into ~70% training and ~30% test data.
# The split is random, so the seed is fixed: re-running the notebook on the
# same input then reproduces the same partitions and the same metrics below.
train, test = data.randomSplit([0.7, 0.3], seed=42)

In [8]:
# Confirm the training split still exposes the `label` (double) and
# `features` (vector) columns the classifiers read by default.
train.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



## Model

In [9]:
# Three tree-based classifiers to compare. Each reads the default `label` and
# `features` columns. The seed is pinned on every estimator so that any
# randomized step of training is repeatable from run to run.
dtc = DecisionTreeClassifier(seed=42)
rfc = RandomForestClassifier(numTrees=100, seed=42)  # forest of 100 trees
gbt = GBTClassifier(seed=42)

In [10]:
# Fit every estimator on the training split, in the same order as before:
# decision tree, random forest, gradient-boosted trees.
dtc_model, rfc_model, gbt_model = [
    estimator.fit(train) for estimator in (dtc, rfc, gbt)
]

## Make predictions.

In [11]:
# Score the held-out test split with each fitted model; every result is the
# test DataFrame extended with that model's prediction columns.
dtc_preds, rfc_preds, gbt_preds = [
    fitted.transform(test) for fitted in (dtc_model, rfc_model, gbt_model)
]

In [12]:
# Preview the decision tree's output: transform() appended the rawPrediction,
# probability and prediction columns next to label and features.
dtc_preds.show()

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[95,96,97,12...|   [28.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[123,124,125...|   [28.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [28.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [28.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[125,126,127...|   [28.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [28.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [28.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [28.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[129,130,131...|   [28.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[152,153,154...|   [28.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[153,154,155...|   [28.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[154,155,156...|   [0.0,44.0]|  [0.0,1.0]|       1.0|
|  0.0|(69

In [13]:
# Compare predicted and true labels for the first five random-forest results.
(
    rfc_preds
    .select("prediction", "label", "features")
    .show(5)
)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[95,96,97,12...|
|       0.0|  0.0|(692,[123,124,125...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[125,126,127...|
+----------+-----+--------------------+
only showing top 5 rows



## Evaluation

In [14]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [15]:
# One shared evaluator: compares the `prediction` column with the `label`
# column and reports the accuracy metric.
acc_eval = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="label",
)

In [18]:
# Accuracy of the decision tree on the held-out test split.
# The message names the model so this output line can be told apart from the
# other classifiers' results, which previously printed an identical label.
accuracy = acc_eval.evaluate(dtc_preds)
print(f"Decision Tree Test Accuracy = {accuracy}")

Test Accuracy = 0.9285714285714286


In [None]:
# Accuracy of the random forest on the held-out test split.
# The message names the model so this output line can be told apart from the
# other classifiers' results, which previously printed an identical label.
accuracy = acc_eval.evaluate(rfc_preds)
print(f"Random Forest Test Accuracy = {accuracy}")

In [19]:
# Accuracy of the gradient-boosted trees on the held-out test split.
# The message names the model so this output line can be told apart from the
# other classifiers' results, which previously printed an identical label.
accuracy = acc_eval.evaluate(gbt_preds)
print(f"Gradient-Boosted Trees Test Accuracy = {accuracy}")

Test Accuracy = 0.9285714285714286
