In [1]:
# We will work through all 3 Tree Methods and compare their results on a college dataset
# This dataset has features of universities and labeled either Private or Public
'''Private A factor with levels No and Yes indicating private or public university
Apps Number of applications received
Accept Number of applications accepted
Enroll Number of new students enrolled
Top10perc Pct. new students from top 10% of H.S. class
Top25perc Pct. new students from top 25% of H.S. class
F.Undergrad Number of fulltime undergraduates
P.Undergrad Number of parttime undergraduates
Outstate Out-of-state tuition
Room.Board Room and board costs
Books Estimated book costs
Personal Estimated personal spending
PhD Pct. of faculty with Ph.D.’s
Terminal Pct. of faculty with terminal degree
S.F.Ratio Student/faculty ratio
perc.alumni Pct. alumni who donate
Expend Instructional expenditure per student
Grad.Rate Graduation rate'''

In [2]:
import os

import findspark

# Locate the Spark installation. The SPARK_HOME environment variable takes precedence so
# the notebook can run on other machines; the original local install path is kept as the
# fallback when the variable is not set.
findspark.init(os.environ.get('SPARK_HOME', '/home/hale/spark-2.4.3-bin-hadoop2.7/'))
from pyspark.sql import SparkSession

In [3]:
# Start (or reuse) the Spark session for this notebook; the application is named 'tree'.
spark = (
    SparkSession.builder
    .appName('tree')
    .getOrCreate()
)

In [5]:
# Load the full college dataset (it is split into train/test later).
# The first CSV row supplies the column names and the column types are inferred.
data = spark.read.csv(
    'College.csv',
    header=True,
    inferSchema=True,
)

In [6]:
# Show each column's name and inferred type to confirm the CSV was parsed as expected.
data.printSchema() # features of the schools

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [56]:
# Peek at the first row to see what a single school's record looks like.
data.head(1)


[Row(School='Abilene Christian University', Private='Yes', Apps=1660, Accept=1232, Enroll=721, Top10perc=23, Top25perc=52, F_Undergrad=2885, P_Undergrad=537, Outstate=7440, Room_Board=3300, Books=450, Personal=2200, PhD=70, Terminal=78, S_F_Ratio=18.1, perc_alumni=12, Expend=7041, Grad_Rate=60)]

In [8]:
# Format the data
from pyspark.ml.feature import VectorAssembler

In [9]:
# List the column names so the model input columns can be picked out for the VectorAssembler.
data.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [10]:
# Pack every numeric column into a single 'features' vector column.
# 'School' (the school's name) and 'Private' (the label to predict) are deliberately left out.
assembler = VectorAssembler(
    inputCols=[
        'Apps', 'Accept', 'Enroll',
        'Top10perc', 'Top25perc',
        'F_Undergrad', 'P_Undergrad',
        'Outstate', 'Room_Board', 'Books', 'Personal',
        'PhD', 'Terminal',
        'S_F_Ratio', 'perc_alumni',
        'Expend', 'Grad_Rate',
    ],
    outputCol='features',
)

In [11]:
# Append the assembled 'features' vector column to every row of the dataset.
output = assembler.transform(dataset=data)

In [14]:
# Remember that the Private column is what we are trying to predict.
# Right now it is a string column holding either 'Yes' or 'No'.
# Spark's ML classifiers need a numeric label, so we need to convert it to 0 or 1 with a StringIndexer.
from pyspark.ml.feature import StringIndexer

In [15]:
# Encode the 'Private' strings as a numeric 'PrivateIndex' label column.
# By default StringIndexer numbers labels by descending frequency, so the more common
# value of 'Private' becomes 0.0.
indexer = StringIndexer(
    inputCol='Private',
    outputCol='PrivateIndex',
)

In [16]:
# Learn the string-to-index mapping from the data, then apply it to add 'PrivateIndex'.
indexer_model = indexer.fit(output)
output_fixed = indexer_model.transform(output)

In [18]:
# Confirm that the 'features' vector and the numeric 'PrivateIndex' label were appended.
output_fixed.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)



In [19]:
# Keep only what the classifiers consume: the feature vector and the numeric label.
final_data = output_fixed.select(
    'features',
    'PrivateIndex',
)

In [20]:
# Hold out roughly 30% of the rows for testing. The fixed seed makes the random split
# repeatable, so the scores computed from it do not change between notebook runs.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [34]:
from pyspark.ml.classification import DecisionTreeClassifier, GBTClassifier, RandomForestClassifier

In [35]:
from pyspark.ml import Pipeline

In [40]:
# One estimator per tree method, all reading the same feature vector and label column
# and otherwise left at their default hyperparameters.
dtc = DecisionTreeClassifier(featuresCol='features', labelCol='PrivateIndex')
rfc = RandomForestClassifier(featuresCol='features', labelCol='PrivateIndex')
gbt = GBTClassifier(featuresCol='features', labelCol='PrivateIndex')

In [41]:
# Train each of the three classifiers on the training split.
dtc_model = dtc.fit(dataset=train_data)
rfc_model = rfc.fit(dataset=train_data)
gbt_model = gbt.fit(dataset=train_data)

In [42]:
# Score the held-out test split with each fitted model.
dtc_predictions = dtc_model.transform(dataset=test_data)
rfc_predictions = rfc_model.transform(dataset=test_data)
gbt_predictions = gbt_model.transform(dataset=test_data)

In [43]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

In [44]:
# Binary evaluator for the 'PrivateIndex' label. The metric is spelled out for readers;
# 'areaUnderROC' is also what BinaryClassificationEvaluator uses when none is given.
my_binary_eval = BinaryClassificationEvaluator(
    labelCol='PrivateIndex',
    metricName='areaUnderROC',
)

In [47]:
# Decision tree score on the test split, printed under its label.
print('DTC', my_binary_eval.evaluate(dtc_predictions), sep='\n')

DTC
0.920420029506205


In [48]:
# Random forest score on the test split, printed under its label.
print('RFC', my_binary_eval.evaluate(rfc_predictions), sep='\n')

RFC
0.9732708496051373


In [None]:
# Here the single decision tree scores noticeably lower than the random forest.
# That is the usual outcome: a forest combines the votes of many trees, each trained on a
# random sample of the rows and features, which smooths out the errors of any single tree.
# It is not guaranteed, though -- on small or very simple datasets a single tree can match
# or even beat a forest, so always compare the models on held-out data.

In [51]:
# Gradient-boosted trees score on the test split, printed under its label.
print('GBT', my_binary_eval.evaluate(gbt_predictions), sep='\n')

GBT
0.9282738870085918


In [53]:
# Second evaluator that reports plain classification accuracy against 'PrivateIndex'.
acc_eval = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='PrivateIndex',
)

In [54]:
# Accuracy of the random forest's predictions on the held-out test split.
rfc_acc = acc_eval.evaluate(rfc_predictions)

In [55]:
# Display the random forest's test accuracy.
rfc_acc

0.9152542372881356