In [1]:
# Import the PySpark module
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

In [2]:
# Create the SparkSession first so that the master and appName settings are
# actually applied: if a SparkContext already exists when the builder runs,
# that context is reused as-is and these builder options are ignored.
spark = SparkSession.builder.master('local[*]').appName('oneHot').getOrCreate()

# Expose the session's underlying SparkContext under the usual name
sc = spark.sparkContext

In [3]:
# Load the flights data; the literal string 'NA' is read as null.
# Note: inferSchema=True makes Spark go through the file an extra time to
# work out the column types, so it is not the best choice for large data sets.
flights = spark.read.csv(
    'flights.csv',
    header=True,
    sep=',',
    nullValue='NA',
    inferSchema=True,
)

In [4]:
# Convert the distance to kilometres instead of merely renaming the column:
# a plain rename would leave mile values sitting under a 'km' label.
# NOTE(review): assumes 'mile' holds distances in statute miles — confirm.
KM_PER_MILE = 1.60934
if 'mile' in flights.columns:  # guard keeps the cell safe to re-run
    flights = flights.withColumn('km', flights.mile * KM_PER_MILE).drop('mile')

# Create 'label' column indicating whether flight delayed (1) or not (0):
# a delay of 15 or more counts as delayed.
flights = flights.withColumn('label', (flights.delay >= 15).cast('integer'))

In [5]:
# Keep only the rows that have a label (rows with a null label are dropped)
has_label = flights['label'].isNotNull()
flights = flights.where(has_label)

In [6]:
from pyspark.ml.feature import VectorAssembler

# Predictor columns to pack into a single vector column
feature_cols = ['mon', 'dom', 'dow', 'km', 'depart', 'duration']

# Assembler that writes the predictors into a 'features' vector column
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Apply the assembler, then carry on under the 'flights' name
flights_assembled = assembler.transform(flights)
flights = flights_assembled

In [7]:
# 80:20 train/test split; the fixed seed makes the split repeatable
split_weights = [0.8, 0.2]
flights_train, flights_test = flights.randomSplit(split_weights, seed=17)

In [8]:
# Import the classes required
from pyspark.ml.classification import DecisionTreeClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Create model objects and train on training data
tree = DecisionTreeClassifier().fit(flights_train)
gbt = GBTClassifier().fit(flights_train)

# Compare AUC on testing data. The scores are printed explicitly because a
# notebook only displays a cell's last expression; bare evaluate() calls in
# the middle of the cell would compute the metric and throw it away.
evaluator = BinaryClassificationEvaluator()
tree_auc = evaluator.evaluate(tree.transform(flights_test))
gbt_auc = evaluator.evaluate(gbt.transform(flights_test))
print('Decision tree AUC:', tree_auc)
print('GBT AUC:', gbt_auc)

# Find the number of trees and the relative importance of features
print(len(gbt.trees))
print(gbt.trees)
print(gbt.featureImportances)

[DecisionTreeRegressionModel (uid=dtr_7b00cf39f70f) of depth 5 with 63 nodes, DecisionTreeRegressionModel (uid=dtr_d09c235c87a0) of depth 5 with 63 nodes, DecisionTreeRegressionModel (uid=dtr_a44e713fad43) of depth 5 with 63 nodes, DecisionTreeRegressionModel (uid=dtr_994e7c3ecb58) of depth 5 with 63 nodes, DecisionTreeRegressionModel (uid=dtr_739de57a24e5) of depth 5 with 63 nodes, DecisionTreeRegressionModel (uid=dtr_f967dbeeca22) of depth 5 with 63 nodes, DecisionTreeRegressionModel (uid=dtr_2fa1fd670888) of depth 5 with 63 nodes, DecisionTreeRegressionModel (uid=dtr_53e2703a48ec) of depth 5 with 63 nodes, DecisionTreeRegressionModel (uid=dtr_ab993c53d95f) of depth 5 with 63 nodes, DecisionTreeRegressionModel (uid=dtr_48782d3b5837) of depth 5 with 63 nodes, DecisionTreeRegressionModel (uid=dtr_a6a5ef516293) of depth 5 with 63 nodes, DecisionTreeRegressionModel (uid=dtr_322403624dda) of depth 5 with 63 nodes, DecisionTreeRegressionModel (uid=dtr_f1ba7545c2b8) of depth 5 with 63 nodes