In [1]:
from pyspark.context import SparkContext
#from pyspark.ml.classification import LogisticRegression
'''Random Forest'''
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

In [2]:
sc = SparkContext.getOrCreate()

## Preprocessing the data using OneHotEncoder

In [3]:
data_path = "HR_comma_sep.csv"
dataset = spark.read.options(header="true", parserLib="univocity", inferSchema="true").csv(data_path)
cols = dataset.columns
print dataset.dtypes

[('satisfaction_level', 'double'), ('last_evaluation', 'double'), ('number_project', 'int'), ('average_montly_hours', 'int'), ('time_spend_company', 'int'), ('Work_accident', 'int'), ('left', 'int'), ('promotion_last_5years', 'int'), ('sales', 'string'), ('salary', 'string')]


In [4]:
categoricalColumns = ["sales", "salary"]
stages = []

for categoricalCol in categoricalColumns: 
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol+"Index")
    encoder = OneHotEncoder(inputCol=categoricalCol+"Index", outputCol=categoricalCol+"classVec")
    stages += [stringIndexer, encoder]

In [5]:
label_string_indexer = StringIndexer(inputCol = "left", outputCol = "label")
stages += [label_string_indexer]

In [6]:
numericColumns = ['number_project',
                  'average_montly_hours',
                  'time_spend_company',
                  'Work_accident', 
                  'promotion_last_5years']

assemblerInputs = map(lambda c: c + "classVec", categoricalColumns) + numericColumns
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

stages += [assembler]

In [7]:
print dataset.columns

['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company', 'Work_accident', 'left', 'promotion_last_5years', 'sales', 'salary']


In [8]:
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(dataset)
dataset = pipelineModel.transform(dataset)
selectedcols = ["label", "features"] + cols
dataset = dataset.select(selectedcols)
dataset.show()

+-----+--------------------+------------------+---------------+--------------+--------------------+------------------+-------------+----+---------------------+-----+------+
|label|            features|satisfaction_level|last_evaluation|number_project|average_montly_hours|time_spend_company|Work_accident|left|promotion_last_5years|sales|salary|
+-----+--------------------+------------------+---------------+--------------+--------------------+------------------+-------------+----+---------------------+-----+------+
|  1.0|(16,[0,9,11,12,13...|              0.38|           0.53|             2|                 157|                 3|            0|   1|                    0|sales|   low|
|  1.0|(16,[0,10,11,12,1...|               0.8|           0.86|             5|                 262|                 6|            0|   1|                    0|sales|medium|
|  1.0|(16,[0,10,11,12,1...|              0.11|           0.88|             7|                 272|                 4|            0|   

In [9]:
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
print trainingData.count()
print testData.count()

10567
4432


## Fitting the dataset with various machine learning algorithms

In [10]:
"""lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=1000)
lrModel = lr.fit(trainingData)

predictions = lrModel.transform(testData)
predictions.printSchema()"""
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(trainingData)

rf = RandomForestRegressor(labelCol="label", featuresCol="indexedFeatures")
pipeline = Pipeline(stages=[featureIndexer, rf])

modelRF = pipeline.fit(trainingData)

predictions = modelRF.transform(testData)
predictions.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- satisfaction_level: double (nullable = true)
 |-- last_evaluation: double (nullable = true)
 |-- number_project: integer (nullable = true)
 |-- average_montly_hours: integer (nullable = true)
 |-- time_spend_company: integer (nullable = true)
 |-- Work_accident: integer (nullable = true)
 |-- left: integer (nullable = true)
 |-- promotion_last_5years: integer (nullable = true)
 |-- sales: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- indexedFeatures: vector (nullable = true)
 |-- prediction: double (nullable = true)



In [11]:
selected = predictions.select("label", "prediction")#, no "probability")
selected.show()

+-----+-------------------+
|label|         prediction|
+-----+-------------------+
|  0.0|0.25435765670357735|
|  0.0| 0.3615279201610565|
|  0.0| 0.3615279201610565|
|  0.0|0.25435765670357735|
|  0.0|  0.391266994761577|
|  0.0| 0.8661694432035836|
|  0.0| 0.3827405393657931|
|  0.0| 0.3827405393657931|
|  0.0| 0.7559610748208307|
|  0.0| 0.7559610748208307|
|  0.0|0.09348429154470488|
|  0.0|0.06968954124545983|
|  0.0|0.16435098333330914|
|  0.0|0.06968954124545983|
|  0.0|0.09348429154470488|
|  0.0|0.09348429154470488|
|  0.0|0.06968954124545983|
|  0.0|0.07779281280891363|
|  0.0|0.16435098333330914|
|  0.0|0.16435098333330914|
+-----+-------------------+
only showing top 20 rows



## Evaluating logistic regression

In [60]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
print evaluator.getMetricName()
print evaluator.evaluate(predictions)
evaluator.setMetricName("areaUnderPR")
print evaluator.getMetricName()
print evaluator.evaluate(predictions)

areaUnderROC
0.969401701884
areaUnderPR
0.928127032358


## Crossvalidation on logistic regression model

In [61]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

paramGrid = (ParamGridBuilder()
             .build())

In [64]:
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=10)
#numFolds=10 indicates 10-fold cross validation.

In [65]:
cvModel = cv.fit(trainingData)

Py4JError: An error occurred while calling o155.getParam. Trace:
py4j.Py4JException: Target Object ID does not exist for this gateway :o155
	at py4j.Gateway.invoke(Gateway.java:277)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:745)



In [37]:
predictions = cvModel.transform(testData)
evaluator.evaluate(predictions)

NameError: name 'cvModel' is not defined

In [125]:
print 'Model Intercept: ', cvModel.bestModel.intercept
weights = cvModel.bestModel.coefficients
weights = map(lambda w: (float(w),), weights)  # convert numpy type to float, and to tuple
selected = predictions.select("label", "prediction", "probability")
selected.show()

Model Intercept:  -1.5825557498
+-----+----------+--------------------+
|label|prediction|         probability|
+-----+----------+--------------------+
|  0.0|       0.0|[0.76602145926450...|
|  0.0|       0.0|[0.72781081717132...|
|  0.0|       0.0|[0.74027245699381...|
|  0.0|       0.0|[0.76420031630123...|
|  0.0|       0.0|[0.76180407108775...|
|  0.0|       0.0|[0.74782807789455...|
|  0.0|       0.0|[0.75967588046341...|
|  0.0|       0.0|[0.75953353988539...|
|  0.0|       0.0|[0.74665027150570...|
|  0.0|       0.0|[0.74665027150570...|
|  0.0|       0.0|[0.74502486391997...|
|  0.0|       0.0|[0.75681813375813...|
|  0.0|       0.0|[0.73209782082355...|
|  0.0|       0.0|[0.75538064013335...|
|  0.0|       0.0|[0.74294626698678...|
|  0.0|       0.0|[0.74264842159795...|
|  0.0|       0.0|[0.75422651586944...|
|  0.0|       0.0|[0.75350332665462...|
|  0.0|       0.0|[0.71401159920671...|
|  0.0|       0.0|[0.72562813273494...|
+-----+----------+--------------------+
only sho

In [126]:
print evaluator.getMetricName()
print evaluator.evaluate(predictions)
evaluator.setMetricName("areaUnderPR")
print evaluator.getMetricName()
print evaluator.evaluate(predictions)

areaUnderPR
0.619359205776
areaUnderPR
0.619359205776


In [135]:
print type(modelRF)

<class 'pyspark.ml.pipeline.PipelineModel'>


In [142]:
print rf.explainParams()

cacheNodeIds: If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval. (default: False)
checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. (default: 10)
featureSubsetStrategy: The number of features to consider for splits at each tree node. Supported options: auto, all, onethird, sqrt, log2, (0.0-1.0], [1-n]. (default: auto)
featuresCol: features column name. (default: features, current: indexedFeatures)
impurity: Criterion used for information gain calculation (case-insensitive). Supported options: variance (default: variance)
labelCol: label column name. (default: label, current: label)
maxBins: Max number of bins for discretizing continuous features. 