In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:96% !important; }</style>"))

## Tree based classification methods

In this notebook we will test out 3 different tree methods:

* A single decision tree
* A random forest
* A gradient boosted tree classifier
    
We will be using a college dataset to try to classify colleges as Private or Public based off these features:

    Private: A factor with levels No and Yes indicating private or public university
    Apps: Number of applications received
    Accept: Number of applications accepted
    Enroll: Number of new students enrolled
    Top10perc: Percentage new students from top 10% of H.S. class
    Top25perc: Percentage new students from top 25% of H.S. class
    F.Undergrad: Number of fulltime undergraduates
    P.Undergrad: Number of parttime undergraduates
    Outstate: Out-of-state tuition
    Room.Board: Room and board costs
    Books: Estimated book costs
    Personal: Estimated personal spending
    PhD: Percentage of faculty with Ph.D.’s
    Terminal: Percentage of faculty with terminal degree
    S.F.Ratio: Student/faculty ratio
    perc.alumni: Percentage alumni who donate
    Expend: Instructional expenditure per student
    Grad.Rate: Graduation rate

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('trees').getOrCreate()

In [5]:
# Load training data
data = spark.read.csv('data/College.csv',inferSchema=True,header=True)

In [6]:
data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [8]:
data.describe().show()

+-------+--------------------+-------+------------------+------------------+----------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+----------------+------------------+
|summary|              School|Private|              Apps|            Accept|          Enroll|         Top10perc|         Top25perc|      F_Undergrad|      P_Undergrad|          Outstate|        Room_Board|             Books|          Personal|               PhD|          Terminal|         S_F_Ratio|       perc_alumni|          Expend|         Grad_Rate|
+-------+--------------------+-------+------------------+------------------+----------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+-------

## Create a feature Vector

In [9]:
# Spark MLlib needs data as two columnsc ("label","features")

# Import VectorAssembler and Vectors
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [10]:
assembler = VectorAssembler(inputCols=['Apps','Accept','Enroll','Top10perc','Top25perc','F_Undergrad','P_Undergrad', 
                                       'Outstate', 'Room_Board', 'Books', 'Personal', 'PhD', 'Terminal', 'S_F_Ratio', 
                                       'perc_alumni', 'Expend', 'Grad_Rate'],
                            outputCol="features")
output = assembler.transform(data)

In [15]:
output.groupBy('Private').count().show()

+-------+-----+
|Private|count|
+-------+-----+
|     No|  212|
|    Yes|  565|
+-------+-----+



### Convert the Private column to a categorical variable

In [16]:
from pyspark.ml.feature import StringIndexer

In [17]:
indexer = StringIndexer(inputCol="Private", outputCol="PrivateIndex")
output_fixed = indexer.fit(output).transform(output)

In [18]:
final_data = output_fixed.select("features",'PrivateIndex')

In [19]:
final_data.show()

+--------------------+------------+
|            features|PrivateIndex|
+--------------------+------------+
|[1660.0,1232.0,72...|         0.0|
|[2186.0,1924.0,51...|         0.0|
|[1428.0,1097.0,33...|         0.0|
|[417.0,349.0,137....|         0.0|
|[193.0,146.0,55.0...|         0.0|
|[587.0,479.0,158....|         0.0|
|[353.0,340.0,103....|         0.0|
|[1899.0,1720.0,48...|         0.0|
|[1038.0,839.0,227...|         0.0|
|[582.0,498.0,172....|         0.0|
|[1732.0,1425.0,47...|         0.0|
|[2652.0,1900.0,48...|         0.0|
|[1179.0,780.0,290...|         0.0|
|[1267.0,1080.0,38...|         0.0|
|[494.0,313.0,157....|         0.0|
|[1420.0,1093.0,22...|         0.0|
|[4302.0,992.0,418...|         0.0|
|[1216.0,908.0,423...|         0.0|
|[1130.0,704.0,322...|         0.0|
|[3540.0,2001.0,10...|         1.0|
+--------------------+------------+
only showing top 20 rows



In [20]:
train_data,test_data = final_data.randomSplit([0.7,0.3])

## Build the classification models

In [21]:
from pyspark.ml.classification import DecisionTreeClassifier,GBTClassifier,RandomForestClassifier
from pyspark.ml import Pipeline

In [22]:
## Using default parameters
dtc = DecisionTreeClassifier(labelCol='PrivateIndex',featuresCol='features')
rfc = RandomForestClassifier(labelCol='PrivateIndex',featuresCol='features')
gbt = GBTClassifier(labelCol='PrivateIndex',featuresCol='features')

In [23]:
# Train the models (its three models, so it might take some time)
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)

#  Evaluate the models

In [24]:
dtc_predictions = dtc_model.transform(test_data)
rfc_predictions = rfc_model.transform(test_data)
gbt_predictions = gbt_model.transform(test_data)

In [27]:
dtc_predictions.show()

+--------------------+------------+-------------+-----------+----------+
|            features|PrivateIndex|rawPrediction|probability|prediction|
+--------------------+------------+-------------+-----------+----------+
|[174.0,146.0,88.0...|         0.0|  [293.0,0.0]|  [1.0,0.0]|       0.0|
|[247.0,189.0,100....|         0.0|  [293.0,0.0]|  [1.0,0.0]|       0.0|
|[261.0,192.0,111....|         0.0|  [293.0,0.0]|  [1.0,0.0]|       0.0|
|[281.0,266.0,139....|         0.0|  [293.0,0.0]|  [1.0,0.0]|       0.0|
|[285.0,280.0,208....|         1.0|    [0.0,4.0]|  [0.0,1.0]|       1.0|
|[291.0,245.0,126....|         0.0|  [293.0,0.0]|  [1.0,0.0]|       0.0|
|[314.0,158.0,132....|         0.0|  [293.0,0.0]|  [1.0,0.0]|       0.0|
|[331.0,331.0,225....|         0.0|   [24.0,0.0]|  [1.0,0.0]|       0.0|
|[344.0,264.0,97.0...|         0.0|  [293.0,0.0]|  [1.0,0.0]|       0.0|
|[353.0,340.0,103....|         0.0|  [293.0,0.0]|  [1.0,0.0]|       0.0|
|[355.0,300.0,142....|         0.0|  [293.0,0.0]|  

In [28]:
rfc_predictions.show()

+--------------------+------------+--------------------+--------------------+----------+
|            features|PrivateIndex|       rawPrediction|         probability|prediction|
+--------------------+------------+--------------------+--------------------+----------+
|[174.0,146.0,88.0...|         0.0|[16.9220846522093...|[0.84610423261046...|       0.0|
|[247.0,189.0,100....|         0.0|[19.8879874687252...|[0.99439937343626...|       0.0|
|[261.0,192.0,111....|         0.0|[19.8879874687252...|[0.99439937343626...|       0.0|
|[281.0,266.0,139....|         0.0|[19.7997161651201...|[0.98998580825600...|       0.0|
|[285.0,280.0,208....|         1.0|[7.84080298786181...|[0.39204014939309...|       1.0|
|[291.0,245.0,126....|         0.0|[18.4926853808226...|[0.92463426904113...|       0.0|
|[314.0,158.0,132....|         0.0|[17.8672629508987...|[0.89336314754493...|       0.0|
|[331.0,331.0,225....|         0.0|[16.2812049089912...|[0.81406024544956...|       0.0|
|[344.0,264.0,97.0...

In [29]:
gbt_predictions.show()

+--------------------+------------+--------------------+--------------------+----------+
|            features|PrivateIndex|       rawPrediction|         probability|prediction|
+--------------------+------------+--------------------+--------------------+----------+
|[174.0,146.0,88.0...|         0.0|[1.50072114099596...|[0.95263924179081...|       0.0|
|[247.0,189.0,100....|         0.0|[1.54158208101341...|[0.95619291657883...|       0.0|
|[261.0,192.0,111....|         0.0|[1.54158208101341...|[0.95619291657883...|       0.0|
|[281.0,266.0,139....|         0.0|[1.54158208101341...|[0.95619291657883...|       0.0|
|[285.0,280.0,208....|         1.0|[-1.0290860050080...|[0.11322924586342...|       1.0|
|[291.0,245.0,126....|         0.0|[1.54101320488511...|[0.95614523364235...|       0.0|
|[314.0,158.0,132....|         0.0|[1.59036241135198...|[0.96010244018177...|       0.0|
|[331.0,331.0,225....|         0.0|[1.09458376407703...|[0.89927252498822...|       0.0|
|[344.0,264.0,97.0...

In [25]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [30]:
# Select (prediction, true label) and compute test error
acc_evaluator = MulticlassClassificationEvaluator(labelCol="PrivateIndex", predictionCol="prediction", metricName="accuracy")

In [31]:
dtc_acc = acc_evaluator.evaluate(dtc_predictions)
rfc_acc = acc_evaluator.evaluate(rfc_predictions)
gbt_acc = acc_evaluator.evaluate(gbt_predictions)

In [32]:
print("Here are the results!")
print('-'*80)
print('A single decision tree had an accuracy of: {0:2.2f}%'.format(dtc_acc*100))
print('-'*80)
print('A random forest ensemble had an accuracy of: {0:2.2f}%'.format(rfc_acc*100))
print('-'*80)
print('A ensemble using GBT had an accuracy of: {0:2.2f}%'.format(gbt_acc*100))

Here are the results!
--------------------------------------------------------------------------------
A single decision tree had an accuracy of: 93.33%
--------------------------------------------------------------------------------
A random forest ensemble had an accuracy of: 94.22%
--------------------------------------------------------------------------------
A ensemble using GBT had an accuracy of: 93.33%


In [33]:
rfc120_model = RandomForestClassifier(labelCol='PrivateIndex',featuresCol='features',numTrees=120).fit(train_data)
rfc120_predictions = rfc120_model.transform(test_data)
rfc120_acc = acc_evaluator.evaluate(rfc120_predictions)

In [34]:
rfc200_model = RandomForestClassifier(labelCol='PrivateIndex',featuresCol='features',numTrees=200).fit(train_data)
rfc200_predictions = rfc200_model.transform(test_data)
rfc200_acc = acc_evaluator.evaluate(rfc200_predictions)

In [36]:
rfc250_model = RandomForestClassifier(labelCol='PrivateIndex',featuresCol='features',numTrees=250).fit(train_data)
rfc250_predictions = rfc250_model.transform(test_data)
rfc250_acc = acc_evaluator.evaluate(rfc250_predictions)

In [37]:
print("Here are the results!")
print('-'*80)
print('A single decision tree had an accuracy of: {0:2.2f}%'.format(dtc_acc*100))
print('-'*80)
print('A random forest ensemble had an accuracy of: {0:2.2f}%'.format(rfc_acc*100))
print('-'*80)
print('A random forest ensemble of 120 trees had an accuracy of: {0:2.2f}%'.format(rfc120_acc*100))
print('-'*80)
print('A random forest ensemble of 200 trees had an accuracy of: {0:2.2f}%'.format(rfc200_acc*100))
print('-'*80)
print('A random forest ensemble of 250 trees had an accuracy of: {0:2.2f}%'.format(rfc250_acc*100))
print('-'*80)
print('A ensemble using GBT had an accuracy of: {0:2.2f}%'.format(gbt_acc*100))

Here are the results!
--------------------------------------------------------------------------------
A single decision tree had an accuracy of: 93.33%
--------------------------------------------------------------------------------
A random forest ensemble had an accuracy of: 94.22%
--------------------------------------------------------------------------------
A random forest ensemble of 120 trees had an accuracy of: 94.22%
--------------------------------------------------------------------------------
A random forest ensemble of 200 trees had an accuracy of: 94.22%
--------------------------------------------------------------------------------
A random forest ensemble of 250 trees had an accuracy of: 94.67%
--------------------------------------------------------------------------------
A ensemble using GBT had an accuracy of: 93.33%
