# Trees

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('trees').getOrCreate()

21/09/16 10:59:06 WARN Utils: Your hostname, GBLON1WLZ13699 resolves to a loopback address: 127.0.1.1; using 10.164.15.145 instead (on interface eth2)
21/09/16 10:59:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
21/09/16 10:59:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier

In [3]:
from pathlib import Path
data_path = Path('/home/juvid/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Tree_Methods/sample_libsvm_data.txt')
data_path.exists()

True

In [4]:
df = spark.read.format('libsvm').load(str(data_path))

21/09/16 10:59:11 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.
                                                                                

In [5]:
df.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [6]:
train_df, test_df = df.randomSplit([0.7, 0.3])

In [7]:
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier(numTrees=100)
gbt = GBTClassifier()

In [8]:
dtc_model = dtc.fit(train_df)
rfc_model = rfc.fit(train_df)
gbt_model = gbt.fit(train_df)

                                                                                

In [9]:
dtc_preds = dtc_model.transform(test_df)
rfc_preds = rfc_model.transform(test_df)
gbt_preds = gbt_model.transform(test_df)

In [10]:
dtc_preds.show()

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[121,122,123...|   [29.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[122,123,124...|   [29.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [29.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [29.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [29.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [0.0,39.0]|  [0.0,1.0]|       1.0|
|  0.0|(692,[126,127,128...|   [0.0,39.0]|  [0.0,1.0]|       1.0|
|  0.0|(692,[126,127,128...|   [29.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[127,128,129...|   [29.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[129,130,131...|   [29.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[151,152,153...|   [29.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[152,153,154...|   [29.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(69

# College dataset

Use tree classification on a real dataset

In [17]:
data_path = Path('/home/juvid/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Tree_Methods/College.csv')
data_path.exists()

True

In [12]:
spark = SparkSession.builder.appName('tree').getOrCreate()

In [18]:
data = spark.read.csv(str(data_path), inferSchema=True, header=True)

In [19]:
data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



We will try and predict whether or not the school is private.

In [20]:
data.head(1)

[Row(School='Abilene Christian University', Private='Yes', Apps=1660, Accept=1232, Enroll=721, Top10perc=23, Top25perc=52, F_Undergrad=2885, P_Undergrad=537, Outstate=7440, Room_Board=3300, Books=450, Personal=2200, PhD=70, Terminal=78, S_F_Ratio=18.1, perc_alumni=12, Expend=7041, Grad_Rate=60)]

We need to group all the features into vectors. For that, we'll use a `VectorAssembler`.

In [21]:
from pyspark.ml.feature import VectorAssembler

In [22]:
data.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [23]:
assembler = VectorAssembler(inputCols=[
'Apps',
'Accept',
'Enroll',
'Top10perc',
'Top25perc',
'F_Undergrad',
'P_Undergrad',
'Outstate',
'Room_Board',
'Books',
'Personal',
'PhD',
'Terminal',
'S_F_Ratio',
'perc_alumni',
'Expend',
'Grad_Rate'], outputCol='features')

In [24]:
output = assembler.transform(data)

The target output is a string, but we need to do some binary encoding to make mlspark work.

In [26]:
from pyspark.ml.feature import StringIndexer

In [27]:
indexer = StringIndexer(inputCol='Private', outputCol='PrivateIndex')

This one-liner is an alternative to a pipeline for transforming a single column

In [28]:
output_binarized = indexer.fit(output).transform(output)

                                                                                

In [30]:
output_binarized.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)



In [31]:
data_clean = output_binarized.select(['features', 'PrivateIndex'])

In [32]:
data_clean.show()

+--------------------+------------+
|            features|PrivateIndex|
+--------------------+------------+
|[1660.0,1232.0,72...|         0.0|
|[2186.0,1924.0,51...|         0.0|
|[1428.0,1097.0,33...|         0.0|
|[417.0,349.0,137....|         0.0|
|[193.0,146.0,55.0...|         0.0|
|[587.0,479.0,158....|         0.0|
|[353.0,340.0,103....|         0.0|
|[1899.0,1720.0,48...|         0.0|
|[1038.0,839.0,227...|         0.0|
|[582.0,498.0,172....|         0.0|
|[1732.0,1425.0,47...|         0.0|
|[2652.0,1900.0,48...|         0.0|
|[1179.0,780.0,290...|         0.0|
|[1267.0,1080.0,38...|         0.0|
|[494.0,313.0,157....|         0.0|
|[1420.0,1093.0,22...|         0.0|
|[4302.0,992.0,418...|         0.0|
|[1216.0,908.0,423...|         0.0|
|[1130.0,704.0,322...|         0.0|
|[3540.0,2001.0,10...|         1.0|
+--------------------+------------+
only showing top 20 rows



In [33]:
train, test = data_clean.randomSplit([0.7, 0.3])

In [49]:
dtc = DecisionTreeClassifier(labelCol='PrivateIndex', featuresCol = 'features')
rfc = RandomForestClassifier(labelCol='PrivateIndex', featuresCol = 'features', numTrees=150)
gbt = GBTClassifier(labelCol='PrivateIndex', featuresCol = 'features')

In [50]:
dtc_model = dtc.fit(train)
rfc_model = rfc.fit(train)
gbt_model = gbt.fit(train)

                                                                                

In [51]:
dtc_preds = dtc_model.transform(test)
rfc_preds = rfc_model.transform(test)
gbt_preds = gbt_model.transform(test)

In [52]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [53]:
binary_eval = BinaryClassificationEvaluator(labelCol='PrivateIndex')

In [54]:
print('DTC')
print(binary_eval.evaluate(dtc_preds))

DTC
0.9514933166248956


In [55]:
print('RFC')
print(binary_eval.evaluate(rfc_preds))

RFC
0.9675229741019217


In [56]:
print('GBT')
print(binary_eval.evaluate(gbt_preds))

GBT
0.9613617376775271


We can also grab other performance metrics from multiclass classification

In [57]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [59]:
acc_eval = MulticlassClassificationEvaluator(labelCol='PrivateIndex', metricName='accuracy')

In [62]:
rfc_acc = acc_eval.evaluate(rfc_preds)
rfc_acc

0.9288888888888889