In [1]:
from pyspark.sql import SparkSession
# Driver memory is fixed when the driver JVM starts, so it has to be passed to
# the builder; calling spark.conf.set() on a live session is too late to change
# it (and recent Spark versions reject it outright). If a session already exists
# in this kernel, getOrCreate() returns it and this setting is ignored.
spark = (
    SparkSession.builder
    .appName("Ch04")
    .config("spark.driver.memory", "6g")
    .getOrCreate()
)
sc = spark.sparkContext

In [2]:
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType

In [3]:
# Column order of the headerless CSV: ten numeric measurements, four
# Wilderness_Area_* indicator columns, forty Soil_Type_* indicator columns,
# and finally the Cover_Type label.
colNames = (
    [
        "Elevation",
        "Aspect",
        "Slope",
        "Horizontal_Distance_To_Hydrology",
        "Vertical_Distance_To_Hydrology",
        "Horizontal_Distance_To_Roadways",
        "Hillshade_9am",
        "Hillshade_Noon",
        "Hillshade_3pm",
        "Horizontal_Distance_To_Fire_Points",
    ]
    + ["Wilderness_Area_" + str(n) for n in range(4)]
    + ["Soil_Type_" + str(n) for n in range(40)]
    + ["Cover_Type"]
)

In [4]:
# Explicit schema for the CSV reader: every column is a nullable integer except
# the Cover_Type label, which is declared as a double.
schema = StructType([
    StructField(
        column,
        DoubleType() if column == "Cover_Type" else IntegerType(),
        True)
    for column in colNames
])

In [5]:
# Read the headerless CSV with the explicit schema and keep a 70% sample.
# The sample is seeded so that every downstream split, model and metric is
# reproducible across kernel restarts.
data = (
    spark.read
    .csv("covtype.data", header=False, schema=schema)
    .sample(withReplacement=False, fraction=0.7, seed=42)
)

AnalysisException: 'Path does not exist: file:/C:/Project/LS/covtype.data;'

In [None]:
# Print the column names and types Spark attached to `data`
data.printSchema()

In [None]:
# Peek at a single row to sanity-check the parsed values
data.take(1)

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [None]:
# 90/10 train/test split; seeded so the same rows land in each side on re-runs
(trainData, testData) = data.randomSplit([0.9, 0.1], seed=42)

In [None]:
# Every column except the label is used as a predictor
inputCols = [column for column in trainData.columns if column != 'Cover_Type']

In [None]:
# Pack all predictor columns into the single vector column the classifier reads
assembler = VectorAssembler(outputCol="featureVector", inputCols=inputCols)
assembledTrainData = assembler.transform(trainData)

# Show the assembled vectors in full (no column truncation)
assembledTrainData.select("featureVector").show(truncate=False)

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier
# Decision tree with default hyperparameters, trained on the assembled vectors
classifier = DecisionTreeClassifier(
    predictionCol="prediction",
    featuresCol="featureVector",
    labelCol="Cover_Type",
)
model = classifier.fit(assembledTrainData)

# Text dump of the learned tree, followed by the per-feature importances
print(model.toDebugString)
print(model.featureImportances)

In [None]:
# Score the same frame the model was trained on and compare label vs. prediction
predictions = model.transform(assembledTrainData)
predictions.select("Cover_Type", "prediction", "probability").show(truncate=False)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Evaluator comparing the Cover_Type label with the model's prediction column;
# the metric is chosen per call via setMetricName.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="Cover_Type",
)

In [None]:
# Accuracy on `predictions`; those were produced from the training frame, so
# this is not a held-out estimate.
evaluator.setMetricName("accuracy").evaluate(predictions)

In [None]:
# F1 score on the same `predictions` frame (note: setMetricName mutates the
# shared evaluator, which stays on "f1" afterwards).
evaluator.setMetricName("f1").evaluate(predictions)

#### confusion matrix - not supported in PySpark ML library

In [None]:
from pyspark.ml import Pipeline

# Select predictors by name rather than by position: slicing off the last column
# silently breaks if the label is ever not the final column, and this matches
# how the predictor list is built earlier in the notebook.
inputCols = trainData.drop('Cover_Type').columns
assembler = VectorAssembler(inputCols=inputCols, outputCol="featureVector")
classifier = DecisionTreeClassifier(labelCol="Cover_Type", featuresCol="featureVector", predictionCol="prediction")

# Chain assembly and classification so tuning can fit them as one estimator
pipeline = Pipeline(stages=[assembler, classifier])

In [None]:
from pyspark.ml.tuning import ParamGridBuilder

# Two candidate values for each of four hyperparameters -> 16 combinations
paramGrid = (
    ParamGridBuilder()
    .addGrid(classifier.impurity, ["gini", "entropy"])
    .addGrid(classifier.maxDepth, [10, 30])
    .addGrid(classifier.maxBins, [40, 100])
    .addGrid(classifier.minInfoGain, [0.0, 0.05])
    .build()
)

In [None]:
# Accuracy-only evaluator, used as the model-selection criterion during tuning
multiclassEval = MulticlassClassificationEvaluator(
    metricName="accuracy", labelCol="Cover_Type", predictionCol="prediction"
)
multiclassEval.evaluate(predictions)

In [None]:
from pyspark.ml.tuning import TrainValidationSplit

# Fit the pipeline once per grid entry, holding out 10% of the training data for
# validation. The split is seeded (as in the random-forest validator later on)
# so the selected hyperparameters do not change from run to run.
validator = TrainValidationSplit(
    seed=42,
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=multiclassEval,
    trainRatio=0.9)
validatorModel = validator.fit(trainData)

In [None]:
# Pipeline model fitted with the best-scoring hyperparameter combination
bestModel = validatorModel.bestModel

In [None]:
# Parameters of the last pipeline stage (the classifier) in the best model
bestModel.stages[-1].extractParamMap()

In [None]:
# validationMetrics alone is a bare list of scores; pair each score with the
# param map that produced it (both lists are in grid order) and rank best-first.
# Sort on the score only, so ties never try to compare the param-map dicts.
paramsAndMetrics = sorted(
    zip(validatorModel.validationMetrics, validatorModel.getEstimatorParamMaps()),
    key=lambda pair: pair[0],
    reverse=True)
paramsAndMetrics

In [None]:
# Accuracy of the best pipeline on the held-out test split
multiclassEval.evaluate(bestModel.transform(testData))

#### undoing the one-hot encoding

In [None]:
# Names of the four wilderness-area indicator columns
wildernessCols = ["Wilderness_Area_" + str(n) for n in range(4)]

In [None]:
# Collect the wilderness indicator columns into one vector column
wildernessAssembler = VectorAssembler(outputCol="wilderness", inputCols=wildernessCols)

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, DoubleType, StructType

def _unhot(vec):
    """Return the position of the single non-zero entry of a one-hot vector.

    Args:
        vec: a vector object exposing ``toArray()`` (as produced by
            VectorAssembler).

    Returns:
        The index of the non-zero entry, as a float.

    Raises:
        ValueError: if the vector does not contain exactly one non-zero entry,
            i.e. the row is not a valid one-hot encoding.
    """
    hot = vec.toArray().nonzero()[0]
    if hot.size != 1:
        raise ValueError(
            "expected exactly one non-zero entry, found %d" % hot.size)
    # Take the element out explicitly: float() applied to a 1-element array
    # relies on an implicit array-to-scalar conversion that NumPy has deprecated.
    return float(hot[0])


unhotudf = udf(_unhot, DoubleType())

In [None]:
# Assemble the indicator columns into a vector, then swap that vector for the
# index of its hot entry and discard the original indicator columns.
withWilderness = wildernessAssembler.transform(data)
withWilderness = (
    withWilderness
    .drop(*wildernessCols)
    .withColumn("wilderness", unhotudf(withWilderness["wilderness"]))
)
withWilderness.take(1)


In [None]:
# Names of the forty soil-type indicator columns
soilCols = ["Soil_Type_" + str(n) for n in range(40)]


In [None]:
# Collect the soil indicator columns into one vector column
soilAssembler = VectorAssembler(
    inputCols=soilCols,
    outputCol="soil")

# Keep the assembled frame under its own name instead of overwriting
# `withWilderness`: rebinding the input made this cell fail on a second run,
# because the assembler's "soil" output column would already exist.
withSoil = soilAssembler.transform(withWilderness)
unencData = withSoil\
    .drop(*soilCols)\
    .withColumn("soil", unhotudf(withSoil['soil']))
unencData.take(1)

#### Decision Tree Classifier with unencoded data

In [None]:
# 90/10 split of the un-encoded data; seeded to match the seeded models below
(unencTrainData, unencTestData) = unencData.randomSplit([0.9, 0.1], seed=42)

In [None]:
from pyspark.ml.feature import VectorIndexer

inputCols = unencTrainData.drop('Cover_Type').columns
assembler = VectorAssembler(
    inputCols=inputCols,
    outputCol="featureVector")
# Features with at most 40 distinct values are marked categorical, which covers
# the un-encoded wilderness (4 values) and soil (40 values) columns.
indexer = VectorIndexer(
    maxCategories=40,
    inputCol="featureVector",
    outputCol="indexedVector")
# maxBins must be at least the number of values of the largest categorical
# feature; the default of 32 is too small for the 40-value soil feature and
# would make fit() fail, so raise it to 40 (as the random forest below does).
classifier = DecisionTreeClassifier(
    seed=42,
    maxBins=40,
    labelCol="Cover_Type",
    featuresCol="indexedVector",
    predictionCol="prediction")
pipeline = Pipeline(stages=[assembler, indexer, classifier])

#### Random Forest Classifier

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest over the indexed feature vector; maxBins is 40 so the 40-value
# categorical feature allowed by the indexer fits.
classifier = RandomForestClassifier(
    labelCol="Cover_Type",
    featuresCol="indexedVector",
    predictionCol="prediction",
    maxBins=40,
    seed=42,
)

# Reuse the existing assembler and indexer stages, swapping in the forest
pipeline = Pipeline(stages=[assembler, indexer, classifier])

In [None]:
# 2 x 2 grid over minimum information gain and forest size
paramGrid = (
    ParamGridBuilder()
    .addGrid(classifier.minInfoGain, [0.0, 0.05])
    .addGrid(classifier.numTrees, [1, 10])
    .build()
)

In [None]:
# Accuracy evaluator used to rank the forest candidates
multiclassEval = MulticlassClassificationEvaluator(
    metricName="accuracy", labelCol="Cover_Type", predictionCol="prediction"
)

In [None]:
# Seeded train/validation search over the forest grid (10% held out)
validator = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=multiclassEval,
    trainRatio=0.9,
    seed=42,
)

In [None]:
%%time
# Fit the pipeline for every grid entry on the un-encoded training split;
# %%time reports how long the whole search takes.
validatorModel = validator.fit(unencTrainData)
bestModel = validatorModel.bestModel
# The last stage of the best pipeline is the fitted classifier (the forest)
forestModel = bestModel.stages[-1]
print(forestModel.extractParamMap())

In [None]:
# Inspect the tree count of the selected forest (read as an attribute here,
# not called)
forestModel.getNumTrees

In [None]:
# Pair each input column with its importance in the forest, largest first
sorted(
    zip(inputCols, forestModel.featureImportances),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# Accuracy of the best pipeline on the held-out un-encoded test split
testAccuracy = multiclassEval.evaluate(bestModel.transform(unencTestData))
testAccuracy

In [None]:
# Run the pipeline on rows with the label column removed, as it would be used
# on new, unlabelled data.
bestModel.transform(unencTestData.drop("Cover_Type")).show()

### Logistic Regression

In [None]:
# LogisticRegression must be imported here: the original cell imported only
# DecisionTreeClassifier and then raised NameError when constructing the model.
from pyspark.ml.classification import DecisionTreeClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Fresh 90/10 split of the one-hot encoded data; seeded for reproducibility
(trainData, testData) = data.randomSplit([0.9, 0.1], seed=42)

inputCols = trainData.drop('Cover_Type').columns

assembler = VectorAssembler(
    inputCols=inputCols,
    outputCol="featureVector")
assembledTrainData = assembler.transform(trainData)
assembledTrainData.select('featureVector').show(truncate=False)


classifier = LogisticRegression(labelCol="Cover_Type", featuresCol="featureVector", predictionCol="prediction")
model = classifier.fit(assembledTrainData)

# The tree diagnostics printed for the decision tree (toDebugString,
# featureImportances) are not printed for this model.

# Score the training frame itself, as was done for the decision tree
predictions = model.transform(assembledTrainData)
predictions.select(["Cover_Type", "prediction", "probability"]).show(truncate=False)


evaluator = MulticlassClassificationEvaluator(labelCol="Cover_Type", predictionCol="prediction")


In [None]:
# A cell only displays its last expression, so evaluating accuracy and f1 on
# separate lines silently discarded the accuracy. Collect both into one dict
# (evaluated in this order, leaving the evaluator set to "f1" as before).
{
    metric: evaluator.setMetricName(metric).evaluate(predictions)
    for metric in ("accuracy", "f1")
}

### Logistic Regression - ParamGrid

In [None]:
paramGrid = ParamGridBuilder()\
    .addGrid(classifier.regParam, [0.1, 0.2])\
    .addGrid(classifier.elasticNetParam, [0.001, 0.01])\
    .build()

In [None]:
multiclassEval = MulticlassClassificationEvaluator(
    labelCol="Cover_Type",
    predictionCol="prediction",
    metricName="accuracy")

In [None]:
validator = TrainValidationSplit(
    estimator=pipeline,
    evaluator=multiclassEval,
    estimatorParamMaps=paramGrid,
    trainRatio=0.9)

In [None]:
%%time
validatorModel = validator.fit(unencTrainData)
bestModel = validatorModel.bestModel
forestModel = bestModel.stages[-1]
print(forestModel.extractParamMap())

In [None]:
testAccuracy = multiclassEval.evaluate(bestModel.transform(unencTestData))
testAccuracy