In [1]:
from pyspark.context import SparkContext
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [2]:
sc = SparkContext.getOrCreate()

## Preprocessing the data using OneHotEncoder

In [57]:
data_path = "../input/HR_comma_sep.csv"
dataset = spark.read.options(header="true", parserLib="univocity", inferSchema="true").csv(data_path)
cols = dataset.columns
print dataset.dtypes
dataset.printSchema()

[('satisfaction_level', 'double'), ('last_evaluation', 'double'), ('number_project', 'int'), ('average_montly_hours', 'int'), ('time_spend_company', 'int'), ('Work_accident', 'int'), ('left', 'int'), ('promotion_last_5years', 'int'), ('sales', 'string'), ('salary', 'string')]
root
 |-- satisfaction_level: double (nullable = true)
 |-- last_evaluation: double (nullable = true)
 |-- number_project: integer (nullable = true)
 |-- average_montly_hours: integer (nullable = true)
 |-- time_spend_company: integer (nullable = true)
 |-- Work_accident: integer (nullable = true)
 |-- left: integer (nullable = true)
 |-- promotion_last_5years: integer (nullable = true)
 |-- sales: string (nullable = true)
 |-- salary: string (nullable = true)



In [58]:
categoricalColumns = ["sales", "salary"]
stages = []

for categoricalCol in categoricalColumns: 
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol+"Index")
    encoder = OneHotEncoder(inputCol=categoricalCol+"Index", outputCol=categoricalCol+"classVec")
    stages += [stringIndexer, encoder]

In [59]:
label_string_indexer = StringIndexer(inputCol = "left", outputCol = "label")
stages += [label_string_indexer]
stages[1:5]

[OneHotEncoder_402c8c177081d0a9f247,
 StringIndexer_4e51894d7dfab856a577,
 OneHotEncoder_4fb59c5eb9d88f055191,
 StringIndexer_4b459cdee37211cebbc9]

In [60]:
numericColumns = ['number_project',
                  'average_montly_hours',
                  'time_spend_company',
                  'Work_accident', 
                  'promotion_last_5years']

assemblerInputs = map(lambda c: c + "classVec", categoricalColumns) + numericColumns
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

stages += [assembler]

stages[0:5]

[StringIndexer_48a1a66e1c344f92ac23,
 OneHotEncoder_402c8c177081d0a9f247,
 StringIndexer_4e51894d7dfab856a577,
 OneHotEncoder_4fb59c5eb9d88f055191,
 StringIndexer_4b459cdee37211cebbc9]

In [61]:
print dataset.columns
dataset.corr(col1="satisfaction_level", col2="last_evaluation")
#dataset.

['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company', 'Work_accident', 'left', 'promotion_last_5years', 'sales', 'salary']


0.10502121397148648

In [62]:
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(dataset)
dataset = pipelineModel.transform(dataset)
selectedcols = ["label", "features"] + cols
dataset = dataset.select(selectedcols)
dataset.show()

+-----+--------------------+------------------+---------------+--------------+--------------------+------------------+-------------+----+---------------------+-----+------+
|label|            features|satisfaction_level|last_evaluation|number_project|average_montly_hours|time_spend_company|Work_accident|left|promotion_last_5years|sales|salary|
+-----+--------------------+------------------+---------------+--------------+--------------------+------------------+-------------+----+---------------------+-----+------+
|  1.0|(16,[0,9,11,12,13...|              0.38|           0.53|             2|                 157|                 3|            0|   1|                    0|sales|   low|
|  1.0|(16,[0,10,11,12,1...|               0.8|           0.86|             5|                 262|                 6|            0|   1|                    0|sales|medium|
|  1.0|(16,[0,10,11,12,1...|              0.11|           0.88|             7|                 272|                 4|            0|   

In [63]:
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
print trainingData.count()
print testData.count()

10567
4432


## Fitting the dataset with various machine learning algorithms

In [10]:
lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=750)
lrModel = lr.fit(trainingData)

predictions = lrModel.transform(testData)
predictions.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- satisfaction_level: double (nullable = true)
 |-- last_evaluation: double (nullable = true)
 |-- number_project: integer (nullable = true)
 |-- average_montly_hours: integer (nullable = true)
 |-- time_spend_company: integer (nullable = true)
 |-- Work_accident: integer (nullable = true)
 |-- left: integer (nullable = true)
 |-- promotion_last_5years: integer (nullable = true)
 |-- sales: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = true)



In [11]:
selected = predictions.select("label", "prediction", "probability")
selected.show()

+-----+----------+--------------------+
|label|prediction|         probability|
+-----+----------+--------------------+
|  0.0|       0.0|[0.77697641302549...|
|  0.0|       0.0|[0.59339454079627...|
|  0.0|       0.0|[0.65793686279020...|
|  0.0|       0.0|[0.77017435807291...|
|  0.0|       0.0|[0.76106183794182...|
|  0.0|       0.0|[0.69795973045569...|
|  0.0|       0.0|[0.75281798642968...|
|  0.0|       0.0|[0.75226165823401...|
|  0.0|       0.0|[0.69289775060790...|
|  0.0|       0.0|[0.69289775060790...|
|  0.0|       0.0|[0.68586113805961...|
|  0.0|       0.0|[0.74153270537012...|
|  0.0|       0.0|[0.62076415096320...|
|  0.0|       0.0|[0.73576577328656...|
|  0.0|       0.0|[0.67678068206384...|
|  0.0|       0.0|[0.67547231581425...|
|  0.0|       0.0|[0.73109328439357...|
|  0.0|       0.0|[0.72814659424960...|
|  0.0|       0.0|[0.52874904179595...|
|  0.0|       0.0|[0.59081153751529...|
+-----+----------+--------------------+
only showing top 20 rows



## Evaluating logistic regression

In [12]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
print evaluator.getMetricName()
print evaluator.evaluate(predictions)
evaluator.setMetricName("areaUnderPR")
print evaluator.getMetricName()
print evaluator.evaluate(predictions)

areaUnderROC
0.719831991107
areaUnderPR
0.379709825066


## Crossvalidation on logistic regression model

In [20]:
#from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.5, 2.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .addGrid(lr.maxIter, [1, 5, 10])
             .build())

In [21]:
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=10)
#numFolds=10 indicates 10-fold cross validation.

In [22]:
cvModel = cv.fit(trainingData)
predictions = cvModel.transform(testData)
evaluator.evaluate(predictions)
print 'Model Intercept: ', cvModel.bestModel.intercept
weights = cvModel.bestModel.coefficients
weights = map(lambda w: (float(w),), weights)  # convert numpy type to float, and to tuple
selected = predictions.select("label", "prediction", "probability")
selected.show()

Model Intercept:  -1.16469160387
+-----+----------+--------------------+
|label|prediction|         probability|
+-----+----------+--------------------+
|  0.0|       0.0|[0.76218415822844...|
|  0.0|       0.0|[0.76218415822844...|
|  0.0|       0.0|[0.76218415822844...|
|  0.0|       0.0|[0.76218415822844...|
|  0.0|       0.0|[0.76218415822844...|
|  0.0|       0.0|[0.76218415822844...|
|  0.0|       0.0|[0.76218415822844...|
|  0.0|       0.0|[0.76218415822844...|
|  0.0|       0.0|[0.76218415822844...|
|  0.0|       0.0|[0.76218415822844...|
|  0.0|       0.0|[0.76218415822844...|
|  0.0|       0.0|[0.76218415822844...|
|  0.0|       0.0|[0.76218415822844...|
|  0.0|       0.0|[0.76218415822844...|
|  0.0|       0.0|[0.76218415822844...|
|  0.0|       0.0|[0.76218415822844...|
|  0.0|       0.0|[0.76218415822844...|
|  0.0|       0.0|[0.76218415822844...|
|  0.0|       0.0|[0.76218415822844...|
|  0.0|       0.0|[0.76218415822844...|
+-----+----------+--------------------+
only sh

In [23]:
print evaluator.getMetricName()
print evaluator.evaluate(predictions)
evaluator.setMetricName("areaUnderPR")
print evaluator.getMetricName()
print evaluator.evaluate(predictions)

areaUnderPR
0.619359205776
areaUnderPR
0.619359205776


## Decision Tree

In [33]:
from pyspark.ml.classification import DecisionTreeClassifier

# Create initial Decision Tree Model
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", maxDepth=4)

# Train model with Training Data
dtModel = dt.fit(trainingData)

print "numNodes = ", dtModel.numNodes
print "depth = ", dtModel.depth

# Make predictions on test data using the Transformer.transform() method.
predictions = dtModel.transform(testData)
predictions.printSchema()


numNodes =  29
depth =  4
root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- satisfaction_level: double (nullable = true)
 |-- last_evaluation: double (nullable = true)
 |-- number_project: integer (nullable = true)
 |-- average_montly_hours: integer (nullable = true)
 |-- time_spend_company: integer (nullable = true)
 |-- Work_accident: integer (nullable = true)
 |-- left: integer (nullable = true)
 |-- promotion_last_5years: integer (nullable = true)
 |-- sales: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = true)



In [34]:
# View model's predictions and probabilities of each prediction class
selected = predictions.select("label", "prediction", "probability")
selected.show()


+-----+----------+--------------------+
|label|prediction|         probability|
+-----+----------+--------------------+
|  0.0|       0.0|           [1.0,0.0]|
|  0.0|       0.0|           [1.0,0.0]|
|  0.0|       0.0|           [1.0,0.0]|
|  0.0|       0.0|           [1.0,0.0]|
|  0.0|       0.0|           [1.0,0.0]|
|  0.0|       1.0|[0.09314140558848...|
|  0.0|       0.0|           [1.0,0.0]|
|  0.0|       0.0|           [1.0,0.0]|
|  0.0|       1.0|[0.09314140558848...|
|  0.0|       1.0|[0.09314140558848...|
|  0.0|       0.0|[0.98165137614678...|
|  0.0|       0.0|[0.98165137614678...|
|  0.0|       0.0|[0.98165137614678...|
|  0.0|       0.0|[0.98165137614678...|
|  0.0|       0.0|[0.98165137614678...|
|  0.0|       0.0|[0.98165137614678...|
|  0.0|       0.0|[0.98165137614678...|
|  0.0|       0.0|[0.98165137614678...|
|  0.0|       0.0|[0.98165137614678...|
|  0.0|       0.0|[0.98165137614678...|
+-----+----------+--------------------+
only showing top 20 rows



In [35]:
# Evaluate model
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)


0.9335259176421944

In [36]:
DTree = dtModel.toDebugString
print DTree

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_4ec6b5d0a5714c6df4a1) of depth 4 with 29 nodes
  If (feature 11 <= 2.0)
   If (feature 12 <= 164.0)
    If (feature 12 <= 124.0)
     Predict: 0.0
    Else (feature 12 > 124.0)
     If (feature 13 <= 2.0)
      Predict: 0.0
     Else (feature 13 > 2.0)
      Predict: 1.0
   Else (feature 12 > 164.0)
    If (feature 12 <= 240.0)
     If (feature 1 in {0.0})
      Predict: 0.0
     Else (feature 1 not in {0.0})
      Predict: 0.0
    Else (feature 12 > 240.0)
     If (feature 0 in {0.0})
      Predict: 0.0
     Else (feature 0 not in {0.0})
      Predict: 0.0
  Else (feature 11 > 2.0)
   If (feature 13 <= 3.0)
    If (feature 12 <= 283.0)
     If (feature 11 <= 6.0)
      Predict: 0.0
     Else (feature 11 > 6.0)
      Predict: 1.0
    Else (feature 12 > 283.0)
     If (feature 14 <= 0.0)
      Predict: 1.0
     Else (feature 14 > 0.0)
      Predict: 0.0
   Else (feature 13 > 3.0)
    If (feature 12 <= 219.0)
     If (feature 12

In [37]:
import json
from bson import json_util
from bson.json_util import dumps

In [38]:
# Parser
def parse(lines):
    block = []
    while lines :

        if lines[0].startswith('If'):
            bl = ' '.join(lines.pop(0).split()[1:]).replace('(', '').replace(')', '')
            block.append({'name':bl, 'children':parse(lines)})


            if lines[0].startswith('Else'):
                be = ' '.join(lines.pop(0).split()[1:]).replace('(', '').replace(')', '')
                block.append({'name':be, 'children':parse(lines)})
        elif not lines[0].startswith(('If','Else')):
            block2 = lines.pop(0)
            block.append({'name':block2})
        else:
            break
    return block

In [39]:
def tree_json(tree):
    data = []
    for line in tree.splitlines() : 
        if line.strip():
            line = line.strip()
            data.append(line)
        else : break
        if not line : break
    res = []
    res.append({'name':'Root', 'children':parse(data[1:])})
    with open('/Users/dylanbao/Desktop/SJSU/297ML/Project/Decision-Tree-Visualization-Spark-master/data/structure.json', 'w') as outfile:
        json.dump(res[0], outfile)
    print ('Conversion Success !')

In [40]:
tree_json(DTree)

Conversion Success !


In [41]:
print evaluator.getMetricName()
print evaluator.evaluate(predictions)
evaluator.setMetricName("areaUnderPR")
print evaluator.getMetricName()
print evaluator.evaluate(predictions)

areaUnderROC
0.933525917642
areaUnderPR
0.872263034426


## Crossvalidation on Decision Tree

In [20]:
# Create ParamGrid for Cross Validation

paramGrid = (ParamGridBuilder()
             .addGrid(dt.maxDepth, [1,2,6,10])
             .addGrid(dt.maxBins, [20,40,80])
             .build())


In [21]:
# Create 5-fold CrossValidator
cv = CrossValidator(estimator=dt, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

# Run cross validations
cvModel = cv.fit(trainingData)

print "numNodes = ", cvModel.bestModel.numNodes
print "depth = ", cvModel.bestModel.depth

# Use test set here so we can measure the accuracy of our model on new data
predictions = cvModel.transform(testData)

# cvModel uses the best model found from the Cross Validation
# Evaluate best model
evaluator.evaluate(predictions)


numNodes =  497
depth =  10


0.896319921308404

In [22]:
# View Best model's predictions and probabilities of each prediction class
selected = predictions.select("label", "prediction", "probability")
selected.show()


+-----+----------+--------------------+
|label|prediction|         probability|
+-----+----------+--------------------+
|  0.0|       0.0|           [1.0,0.0]|
|  0.0|       0.0|           [1.0,0.0]|
|  0.0|       0.0|           [1.0,0.0]|
|  0.0|       0.0|           [1.0,0.0]|
|  0.0|       0.0|           [1.0,0.0]|
|  0.0|       1.0|[0.01932367149758...|
|  0.0|       0.0|           [1.0,0.0]|
|  0.0|       0.0|           [1.0,0.0]|
|  0.0|       0.0|           [1.0,0.0]|
|  0.0|       0.0|           [1.0,0.0]|
|  0.0|       0.0|           [1.0,0.0]|
|  0.0|       0.0|           [1.0,0.0]|
|  0.0|       0.0|           [1.0,0.0]|
|  0.0|       0.0|           [1.0,0.0]|
|  0.0|       0.0|           [1.0,0.0]|
|  0.0|       0.0|           [1.0,0.0]|
|  0.0|       0.0|           [1.0,0.0]|
|  0.0|       0.0|           [1.0,0.0]|
|  0.0|       0.0|           [1.0,0.0]|
|  0.0|       0.0|           [1.0,0.0]|
+-----+----------+--------------------+
only showing top 20 rows



In [23]:
print evaluator.getMetricName()
print evaluator.evaluate(predictions)
evaluator.setMetricName("areaUnderPR")
print evaluator.getMetricName()
print evaluator.evaluate(predictions)


areaUnderPR
0.896319921308
areaUnderPR
0.896319921308
