## Tree Methods
- We will use a college dataset (College.csv) to classify colleges as Private or Public, based on the numeric features listed in the schema below.

In [None]:
# Initialise findspark before pyspark is imported in the next cell
# (presumably so that the local Spark installation can be located -- confirm for your environment).
import findspark
findspark.init()

In [None]:
from pyspark.sql import SparkSession

# Get the existing session if one is available, otherwise create one named 'treecode'.
spark = (
    SparkSession.builder
    .appName('treecode')
    .getOrCreate()
)

In [None]:
# Read the full college dataset: the first CSV row supplies the column names
# and the column types are inferred from the values.
data = spark.read.csv(
    'Cung cap du lieu buoi 6/College.csv',
    header=True,
    inferSchema=True,
)

In [None]:
# Number of rows (colleges) that were loaded.
data.count()

777

In [None]:
# Print column names and inferred types to check that inferSchema parsed the CSV as expected.
data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [None]:
# Peek at the first row to sanity-check the parsed values.
data.head()

Row(School='Abilene Christian University', Private='Yes', Apps=1660, Accept=1232, Enroll=721, Top10perc=23, Top25perc=52, F_Undergrad=2885, P_Undergrad=537, Outstate=7440, Room_Board=3300, Books=450, Personal=2200, PhD=70, Terminal=78, S_F_Ratio=18.1, perc_alumni=12, Expend=7041, Grad_Rate=60)



```
# This is formatted as code
```

### Spark Formatting of Data

In [None]:
# The classifiers below are configured with two columns:
# a label column and a single vector-valued 'features' column.

# Import VectorAssembler (used to build the 'features' column) and Vectors
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [None]:
# List the column names; they are used to choose the VectorAssembler inputs below.
data.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [None]:
# Pack every numeric column into one 'features' vector.
# 'School' (the name) and 'Private' (the target) are deliberately left out.
assembler = VectorAssembler(
    inputCols=[
        'Apps', 'Accept', 'Enroll',
        'Top10perc', 'Top25perc',
        'F_Undergrad', 'P_Undergrad',
        'Outstate', 'Room_Board', 'Books', 'Personal',
        'PhD', 'Terminal',
        'S_F_Ratio', 'perc_alumni',
        'Expend', 'Grad_Rate',
    ],
    outputCol='features',
)

In [None]:
# Add the assembled 'features' vector column alongside the original columns.
output = assembler.transform(data)

In [None]:
# Preview the frame, including the new 'features' column.
output.show()

+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+--------------------+
|              School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|            features|
+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+--------------------+
|Abilene Christian...|    Yes|1660|  1232|   721|       23|       52|       2885|        537|    7440|      3300|  450|    2200| 70|      78|     18.1|         12|  7041|       60|[1660.0,1232.0,72...|
|  Adelphi University|    Yes|2186|  1924|   512|       16|       29|       2683|       1227|   12280|      6450|  750|    1500| 29|      30|     12.2|         16| 10527|       56|[2186.0,1924

The `Private` column holds the strings 'Yes' / 'No'; convert it to a numeric label column so the classifiers can use it.

In [None]:
from pyspark.ml.feature import StringIndexer

In [None]:
# Encode the 'Private' strings as numeric indices in a new 'PrivateIndex' column
# (in the output shown below, 'Yes' is encoded as 0.0).
indexer = StringIndexer(
    inputCol='Private',
    outputCol='PrivateIndex',
)
output_fixed = (
    indexer
    .fit(output)
    .transform(output)
)

In [None]:
# Preview the frame with the added 'PrivateIndex' label column.
output_fixed.show()

+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+--------------------+------------+
|              School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|            features|PrivateIndex|
+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+--------------------+------------+
|Abilene Christian...|    Yes|1660|  1232|   721|       23|       52|       2885|        537|    7440|      3300|  450|    2200| 70|      78|     18.1|         12|  7041|       60|[1660.0,1232.0,72...|         0.0|
|  Adelphi University|    Yes|2186|  1924|   512|       16|       29|       2683|       1227|   12280|      6450|  750|    1500| 29|      30

In [None]:
# Keep only the two columns the classifiers use: the feature vector and the numeric label.
final_data = output_fixed.select('features', 'PrivateIndex')

In [None]:
# 80/20 train/test split. The seed is fixed (the same seed=42 the classifiers use)
# so that the split -- and therefore the reported accuracies -- is the same on
# every Restart & Run All instead of changing from run to run.
train_data, test_data = final_data.randomSplit([0.8, 0.2], seed=42)

### The Classifiers

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import GBTClassifier, RandomForestClassifier
from pyspark.ml import Pipeline

Create all three models:

In [None]:
# Use mostly defaults to make this comparison 'fair'

# All three estimators get identical settings: the same label and feature
# columns and the same fixed seed. Everything else is left at its default.
dtc = DecisionTreeClassifier(
    labelCol='PrivateIndex',
    featuresCol='features',
    seed=42,
)
rfc = RandomForestClassifier(
    labelCol='PrivateIndex',
    featuresCol='features',
    seed=42,
)
gbt = GBTClassifier(
    labelCol='PrivateIndex',
    featuresCol='features',
    seed=42,
)

Train all three models:

In [None]:
# Fit each estimator on the training split, in the order tree -> forest -> GBT
# (three models, so this may take some time).
dtc_model, rfc_model, gbt_model = (
    estimator.fit(train_data) for estimator in (dtc, rfc, gbt)
)

### Model Comparison
Let's compare each of these models!

In [None]:
# Score the held-out test split with each fitted model.
dtc_predictions, rfc_predictions, gbt_predictions = (
    fitted.transform(test_data)
    for fitted in (dtc_model, rfc_model, gbt_model)
)

<strong>Evaluation Metrics:</strong> accuracy on the held-out test set.

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Evaluator that compares the 'prediction' column with the true 'PrivateIndex'
# label and reports accuracy (the fraction of correct predictions).
acc_evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='PrivateIndex',
    predictionCol='prediction',
)

In [None]:
# Test-set accuracy for each model, in the order tree -> forest -> GBT.
dtc_acc, rfc_acc, gbt_acc = map(
    acc_evaluator.evaluate,
    (dtc_predictions, rfc_predictions, gbt_predictions),
)

In [None]:
# Report each model's accuracy as a percentage, with an 80-dash rule before every line.
print(
    'Results:',
    'A single decision tree - accuracy: {0:2.2f}%'.format(dtc_acc*100),
    'A random forest ensemble - accuracy: {0:2.2f}%'.format(rfc_acc*100),
    'An ensemble using GBT - accuracy: {0:2.2f}%'.format(gbt_acc*100),
    sep='\n' + '-'*80 + '\n',
)

Results:
--------------------------------------------------------------------------------
A single decision tree - accuracy: 91.02%
--------------------------------------------------------------------------------
A random forest ensemble - accuracy: 92.81%
--------------------------------------------------------------------------------
An ensemble using GBT - accuracy: 91.62%


Optional assignment: experiment with the hyperparameters of each of these models and try to squeeze some more accuracy out of them.