# Decision Tree Model

Import the required packages.

In [1]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark import SparkConf, SparkContext
from numpy import array

In [7]:
# converting Y/N into a binary 0 or 1.
def binary(YN):
    """Encode a yes/no flag as an integer.

    Returns 1 only when YN is exactly 'Y'; every other value
    (including 'N', lowercase 'y', or an empty string) yields 0.
    """
    return 1 if YN == 'Y' else 0

In [8]:
# converting a degree into a number 0-3
def mapEducation(degree):
    """Encode a degree name as an integer level.

    'BS' -> 1, 'MS' -> 2, 'PhD' -> 3; any other value maps to 0.
    Matching is exact (case-sensitive, no whitespace trimming).
    """
    for name, level in (('BS', 1), ('MS', 2), ('PhD', 3)):
        if degree == name:
            return level
    # Unrecognised degree: fall back to the lowest level.
    return 0

In [9]:
# Every CSV field must be mapped to a number before it can go into a LabeledPoint.
def createLabeledPoints(fields):
    """Turn one split CSV record into a LabeledPoint.

    fields is read by position (indices 0-6). Indices 0-5 become the
    feature vector, in that order; index 6 (hired) becomes the label.
    Indices 0 and 2 are parsed with int(); the Y/N columns go through
    binary() and the degree column through mapEducation().
    """
    features = [
        int(fields[0]),           # yearsExperience
        binary(fields[1]),        # employed
        int(fields[2]),           # previousEmployers
        mapEducation(fields[3]),  # educationLevel
        binary(fields[4]),        # topTier
        binary(fields[5]),        # interned
    ]
    label = binary(fields[6])     # hired
    return LabeledPoint(label, array(features))

In [10]:
# Load the training CSV, drop the header, and split each line into string fields.
# (Conversion to numeric values happens later, in createLabeledPoints.)
input_file = "/user/student/PastHires.csv"
raw_data = sc.textFile(input_file)
# The first line of the file is taken to be the column header.
header = raw_data.first()
# Removes every line equal to the header text, not just the first one.
raw_data = raw_data.filter(lambda x:x != header)
csv_data = raw_data.map(lambda x: x.split(','))
# Last expression of the cell: displays the header-free raw lines.
raw_data.collect()

['10,Y,4,BS,N,N,Y',
 '0,N,0,BS,Y,Y,Y',
 '7,N,6,BS,N,N,N',
 '2,Y,1,MS,Y,N,Y',
 '20,N,2,PhD,Y,N,N',
 '0,N,0,PhD,Y,Y,Y',
 '5,Y,2,MS,N,Y,Y',
 '3,N,1,BS,N,Y,Y',
 '15,Y,5,BS,N,N,Y',
 '0,N,0,BS,N,N,N',
 '1,N,1,PhD,Y,N,N',
 '4,Y,1,BS,N,Y,Y',
 '0,N,0,PhD,Y,N,Y']

In [13]:
# Build the training set: convert each row of split CSV fields into a LabeledPoint.
training_data = csv_data.map(createLabeledPoints)
# Last expression of the cell: displays the resulting labeled points.
training_data.collect()

[LabeledPoint(1.0, [10.0,1.0,4.0,1.0,0.0,0.0]),
 LabeledPoint(1.0, [0.0,0.0,0.0,1.0,1.0,1.0]),
 LabeledPoint(0.0, [7.0,0.0,6.0,1.0,0.0,0.0]),
 LabeledPoint(1.0, [2.0,1.0,1.0,2.0,1.0,0.0]),
 LabeledPoint(0.0, [20.0,0.0,2.0,3.0,1.0,0.0]),
 LabeledPoint(1.0, [0.0,0.0,0.0,3.0,1.0,1.0]),
 LabeledPoint(1.0, [5.0,1.0,2.0,2.0,0.0,1.0]),
 LabeledPoint(1.0, [3.0,0.0,1.0,1.0,0.0,1.0]),
 LabeledPoint(1.0, [15.0,1.0,5.0,1.0,0.0,0.0]),
 LabeledPoint(0.0, [0.0,0.0,0.0,1.0,0.0,0.0]),
 LabeledPoint(0.0, [1.0,0.0,1.0,3.0,1.0,0.0]),
 LabeledPoint(1.0, [4.0,1.0,1.0,1.0,0.0,1.0]),
 LabeledPoint(1.0, [0.0,0.0,0.0,3.0,1.0,0.0])]

## Test using `test_candidates`, which holds the same feature values as the training dataset

In [14]:
# Feature vectors to score, one row per candidate, in the order:
# yearsExperience, employed, previousEmployers, educationLevel, topTier, interned.
test_candidates = list(map(array, [
    [10.0, 1.0, 4.0, 1.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 1.0, 1.0, 1.0],
    [7.0, 0.0, 6.0, 1.0, 0.0, 0.0],
    [2.0, 1.0, 1.0, 2.0, 1.0, 0.0],
    [20.0, 0.0, 2.0, 3.0, 1.0, 0.0],
    [0.0, 0.0, 0.0, 3.0, 1.0, 1.0],
    [5.0, 1.0, 2.0, 2.0, 0.0, 1.0],
    [3.0, 0.0, 1.0, 1.0, 0.0, 1.0],
    [15.0, 1.0, 5.0, 1.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
    [1.0, 0.0, 1.0, 3.0, 1.0, 0.0],
    [4.0, 1.0, 1.0, 1.0, 0.0, 1.0],
    [0.0, 0.0, 0.0, 3.0, 1.0, 0.0],
]))

# Turn the in-memory list of candidate vectors into an RDD.
test_data = sc.parallelize(test_candidates)


In [15]:
# Train a two-class decision tree, using Gini impurity as the split criterion.
# categoricalFeaturesInfo maps feature index -> number of categories:
#   1 (employed), 4 (topTier) and 5 (interned) are 0/1 flags -> 2 categories;
#   3 (educationLevel) takes the values 0-3 -> 4 categories.
# Features 0 and 2 are left out of the map (presumably treated as continuous).
model = DecisionTree.trainClassifier(
    training_data,
    numClasses=2,
    categoricalFeaturesInfo={1: 2, 3: 4, 4: 2, 5: 2},
    impurity='gini',
    maxDepth=5,
    maxBins=32
)

In [16]:
# Score every vector in test_data with the trained tree.
predictions = model.predict(test_data)
print('Hire prediction:')
# Gather the predicted labels into a local list, then print one per line.
results = predictions.collect()
for result in results:
    print(result)

Hire prediction:
1.0
1.0
0.0
1.0
0.0
1.0
1.0
1.0
1.0
0.0
0.0
1.0
1.0


## Test with a second set, `test_candidates2`, containing 10 different cases

In [18]:
# Second batch of feature vectors to score, one row per candidate, in the order:
# yearsExperience, employed, previousEmployers, educationLevel, topTier, interned.
test_candidates2 = list(map(array, [
    [9.0, 1.0, 4.0, 1.0, 0.0, 0.0],
    [8.0, 0.0, 0.0, 1.0, 1.0, 1.0],
    [6.0, 0.0, 6.0, 1.0, 0.0, 0.0],
    [8.0, 1.0, 1.0, 2.0, 1.0, 0.0],
    [10.0, 0.0, 2.0, 3.0, 1.0, 0.0],
    [1.0, 0.0, 0.0, 3.0, 1.0, 1.0],
    [9.0, 1.0, 2.0, 2.0, 0.0, 1.0],
    [13.0, 0.0, 1.0, 1.0, 0.0, 1.0],
    [25.0, 1.0, 5.0, 1.0, 0.0, 0.0],
    [10.0, 0.0, 0.0, 3.0, 1.0, 0.0],
]))

# Turn the second list of candidate vectors into an RDD.
test_data2 = sc.parallelize(test_candidates2)

In [19]:
# Score every vector in test_data2 with the trained tree.
# Note: this rebinds the notebook-level names predictions and results.
predictions = model.predict(test_data2)
print('Hire prediction:')
# Gather the predicted labels into a local list, then print one per line.
results = predictions.collect()
for result in results:
    print(result)

Hire prediction:
1.0
1.0
0.0
1.0
0.0
1.0
1.0
1.0
1.0
0.0
