# Decision Tree Model with Discrete Values (Classifier)
In this example, we use the Spark MLlib library to train a decision tree classifier.


In [2]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark import SparkConf, SparkContext
from numpy import array

In [3]:
# Encode a 'Y'/'N' flag as the integer 1/0.
def binary(YN):
    """Return 1 when ``YN`` equals the string 'Y', otherwise 0.

    The comparison is exact, so any other value (for example 'N', 'y' or an
    empty string) yields 0.
    """
    return 1 if YN == 'Y' else 0

In [4]:
# Encode a degree name as an integer in the range 0-3.
def mapEducation(degree):
    """Return the numeric code for ``degree``.

    'BS' -> 1, 'MS' -> 2, 'PhD' -> 3; every other value maps to 0.
    Matching is exact (case-sensitive).
    """
    # Checked in the same order as the codes: BS, MS, PhD.
    for name, code in (('BS', 1), ('MS', 2), ('PhD', 3)):
        if degree == name:
            return code
    return 0

In [26]:
# Convert one parsed CSV row into the numeric form the tree trainer consumes.
def createLabeledPoints(fields):
    """Build a LabeledPoint from one row of string fields.

    ``fields`` is indexed positionally:
        0 years of experience (parsed with int)
        1 employed            ('Y' -> 1, otherwise 0)
        2 previous employers  (parsed with int)
        3 education level     ('BS'/'MS'/'PhD' -> 1/2/3, otherwise 0)
        4 top-tier school     ('Y' -> 1, otherwise 0)
        5 interned            ('Y' -> 1, otherwise 0)
        6 hired               ('Y' -> 1, otherwise 0) -- used as the label

    Returns a LabeledPoint whose label is the encoded ``hired`` flag and whose
    features are the six encoded values above, in that order.
    """
    features = [
        int(fields[0]),
        binary(fields[1]),
        int(fields[2]),
        mapEducation(fields[3]),
        binary(fields[4]),
        binary(fields[5]),
    ]
    label = binary(fields[6])

    return LabeledPoint(label, array(features))

In [1]:
# Load the training CSV, drop its header row, and split each line into fields.
# NOTE(review): `sc` is not created anywhere in the visible notebook; it is
# presumably the SparkContext injected by the PySpark kernel -- confirm.
input_file = "/user/student/PastHires.csv"
raw_data = sc.textFile(input_file)
# The first line of the file is the column header.
header = raw_data.first()
# Keep every line that differs from the header line.
raw_data = raw_data.filter(lambda line: line != header)
# Plain comma split: assumes no field contains a quoted comma.
csv_data = raw_data.map(lambda line: line.split(','))
# Preview a handful of rows only; collect() would ship the entire RDD to the
# driver just to display it.
raw_data.take(5)

Py4JJavaError: An error occurred while calling o31.partitions.
: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://localhost:9000/user/student/PastHires.csv
	at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:287)
	at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:229)
	at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:315)
	at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:204)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
	at org.apache.spark.api.java.JavaRDDLike$class.partitions(JavaRDDLike.scala:61)
	at org.apache.spark.api.java.AbstractJavaRDDLike.partitions(JavaRDDLike.scala:45)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [30]:
# Build the training set: turn each parsed CSV row (a list of string fields)
# into a LabeledPoint via createLabeledPoints.
# NOTE(review): the recorded output under this cell is a collected list of
# LabeledPoints, but the cell has no expression that would display one --
# confirm whether a trailing `training_data.collect()` was dropped.
training_data = csv_data.map(createLabeledPoints)

[LabeledPoint(1.0, [10.0,1.0,4.0,1.0,0.0,0.0]),
 LabeledPoint(1.0, [0.0,0.0,0.0,1.0,1.0,1.0]),
 LabeledPoint(0.0, [7.0,0.0,6.0,1.0,0.0,0.0]),
 LabeledPoint(1.0, [2.0,1.0,1.0,2.0,1.0,0.0]),
 LabeledPoint(0.0, [20.0,0.0,2.0,3.0,1.0,0.0]),
 LabeledPoint(1.0, [0.0,0.0,0.0,3.0,1.0,1.0]),
 LabeledPoint(1.0, [5.0,1.0,2.0,2.0,0.0,1.0]),
 LabeledPoint(1.0, [3.0,0.0,1.0,1.0,0.0,1.0]),
 LabeledPoint(1.0, [15.0,1.0,5.0,1.0,0.0,0.0]),
 LabeledPoint(0.0, [0.0,0.0,0.0,1.0,0.0,0.0]),
 LabeledPoint(0.0, [1.0,0.0,1.0,3.0,1.0,0.0]),
 LabeledPoint(1.0, [4.0,1.0,1.0,1.0,0.0,1.0]),
 LabeledPoint(1.0, [0.0,0.0,0.0,3.0,1.0,0.0])]

In [59]:
# Candidate(s) to score. Each vector follows the feature order produced by
# createLabeledPoints: [yearsExperience, employed, previousEmployers,
# educationLevel, topTier, interned].
test_candidates = [ array([10, 1, 3, 1, 0, 0])]
# Alternative candidate kept for experimentation (swap with the line above):
# test_candidates = [ array([20, 0, 2, 3, 1, 0])]
# Distribute the candidate list as an RDD so the model can score it.
test_data = sc.parallelize(test_candidates)

In [60]:
# Train a two-class decision tree on the labeled points, using Gini impurity
# as the split criterion.
# categoricalFeaturesInfo maps feature index -> number of categories:
#   1 employed (2), 3 educationLevel (4), 4 topTier (2), 5 interned (2).
# Features 0 and 2 are not listed and are therefore treated as continuous.
model = DecisionTree.trainClassifier(
    training_data,
    numClasses=2,
    categoricalFeaturesInfo={1: 2, 3: 4, 4: 2, 5: 2},
    impurity='gini',
    maxDepth=5,
    maxBins=32,
)

In [61]:
# Score the test candidates with the trained tree.
predictions = model.predict(test_data)
print('Hire prediction:')
# Bring the predicted labels back to the driver so they can be printed,
# one per candidate.
results = predictions.collect()
for result in results:
    print(result)

Hire prediction:
1.0


In [62]:
# Print a text description of the trained model (its nested if/else rules).
print('Learned classification tree model:')
print(model.toDebugString())

Learned classification tree model:
DecisionTreeModel classifier of depth 4 with 9 nodes
  If (feature 1 in {0.0})
   If (feature 5 in {0.0})
    If (feature 0 <= 0.5)
     If (feature 3 in {1.0})
      Predict: 0.0
     Else (feature 3 not in {1.0})
      Predict: 1.0
    Else (feature 0 > 0.5)
     Predict: 0.0
   Else (feature 5 not in {0.0})
    Predict: 1.0
  Else (feature 1 not in {0.0})
   Predict: 1.0

