In [1]:
from __future__ import division, print_function, unicode_literals 

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

In [2]:
# Build (or reuse) the notebook's Spark session: Hive support enabled, application
# name "spark sql", running against the local master with 4 worker threads.
spark_session = (
    SparkSession.builder
    .enableHiveSupport()
    .appName("spark sql")
    .master("local[4]")
    .getOrCreate()
)

In [3]:
# Load the train and test splits from CSV. Both reads treat the first row as the
# header and let Spark infer each column's type from the data.
train = spark_session.read.options(header=True, inferSchema=True).csv("/data/covertype2/train.csv")
test = spark_session.read.options(header=True, inferSchema=True).csv("/data/covertype2/test.csv")

In [4]:
# Columns fed to the VectorAssembler, in the order they appear in the feature vector.
names = [
    # raw columns read straight from the CSV
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
    # one-hot columns produced by the encoder stages of the pipeline
    'Wild_Type_ohe',
    'Soil_Type_ohe',
]

In [5]:
# Stage 1: map the two categorical columns to numeric category indices.
# NOTE(review): StringIndexer is left at its default handleInvalid setting ("error"
# per the Spark docs), so transform() will fail on a category that was not seen
# during fit() -- confirm the test split contains no unseen values.
wildTypeTransformer = StringIndexer(inputCol="Wild_Type", outputCol="Wild_Type_int")
soilTypeTransformer = StringIndexer(inputCol="Soil_Type", outputCol="Soil_Type_int")

# Stage 2: one-hot encode the category indices.
wildTypeEncoder = OneHotEncoder(inputCol="Wild_Type_int", outputCol="Wild_Type_ohe")
soilTypeEncoder = OneHotEncoder(inputCol="Soil_Type_int", outputCol="Soil_Type_ohe")

# Stage 3: pack every column listed in `names` into a single "features" vector.
assembler = VectorAssembler(inputCols=names, outputCol="features")

# Stage 4: random forest on the assembled vector. The seed is fixed explicitly so
# that re-running the notebook on a fresh kernel trains the same forest; without it
# the result depends on PySpark's default seed.
rf = RandomForestClassifier(
    numTrees=50,
    maxDepth=12,
    seed=42,
    featuresCol="features",
    labelCol="Target",
    predictionCol="prediction",
)

# The metric is spelled out so the reported number is unambiguous; "f1" is the
# evaluator's documented default, so the computed metric is unchanged.
evaluator = MulticlassClassificationEvaluator(
    labelCol="Target",
    predictionCol="prediction",
    metricName="f1",
)

# Stages run in list order: indexers -> encoders -> assembler -> classifier.
pipeline = Pipeline(stages=[
    wildTypeTransformer,
    soilTypeTransformer,
    wildTypeEncoder,
    soilTypeEncoder,
    assembler,
    rf,
])

In [6]:
# Fit every pipeline stage on the training split, then run the fitted pipeline
# over the test split to obtain the prediction DataFrame.
model = pipeline.fit(dataset=train)
predict = model.transform(dataset=test)

In [7]:
# Score the test-set predictions with the evaluator and print the resulting metric.
metric_value = evaluator.evaluate(predict)
print(metric_value)

0.7550895936553261
