In [1]:
import findspark

In [2]:
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')

In [3]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import (StructField, StringType,
                               IntegerType, StructType)

# from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [4]:
spark = SparkSession.builder.appName('Basics').getOrCreate()

In [5]:
data_schema = [StructField('_c0', IntegerType(), True), 
               StructField('0', IntegerType(), True),
               StructField('1', IntegerType(), True),
               StructField('2', IntegerType(), True),
               StructField('3', IntegerType(), True),
               StructField('4', StringType(), True),
               StructField('5', IntegerType(), True),
               StructField('6', IntegerType(), True)]
final_struc = StructType(fields = data_schema)

In [6]:
df = spark.read.csv('clean_input_sample.csv', header=True, schema=final_struc)
df = df.withColumnRenamed('_c0', 'index') \
        .withColumnRenamed('0', 'station') \
        .withColumnRenamed('1', 'day_of_week') \
        .withColumnRenamed('2', 'time_of_day') \
        .withColumnRenamed('3', 'net_flow') \
        .withColumnRenamed('4', 'date') \
        .withColumnRenamed('5', 'num_of_incid') \
        .withColumnRenamed('6', 'incid_in_effect')
df.head(2)

[Row(index=0, station=0, day_of_week=4, time_of_day=0, net_flow=2488, date='1012016', num_of_incid=0, incid_in_effect=0),
 Row(index=1, station=0, day_of_week=0, time_of_day=2, net_flow=-6066, date='1042016', num_of_incid=0, incid_in_effect=0)]

In [7]:
assembler = VectorAssembler(inputCols = ['day_of_week',
                            'time_of_day', 'net_flow'], 
                            outputCol = 'features')
output = assembler.transform(df)
output.printSchema()

root
 |-- index: integer (nullable = true)
 |-- station: integer (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- time_of_day: integer (nullable = true)
 |-- net_flow: integer (nullable = true)
 |-- date: string (nullable = true)
 |-- num_of_incid: integer (nullable = true)
 |-- incid_in_effect: integer (nullable = true)
 |-- features: vector (nullable = true)



In [8]:
final_data = output.select('features', 'incid_in_effect')
final_data.show()

+-----------------+---------------+
|         features|incid_in_effect|
+-----------------+---------------+
| [4.0,0.0,2488.0]|              0|
|[0.0,2.0,-6066.0]|              0|
|[1.0,2.0,-6975.0]|              0|
|[2.0,2.0,-6740.0]|              0|
| [3.0,0.0,5058.0]|              0|
| [4.0,0.0,5198.0]|              0|
| [5.0,0.0,3362.0]|              0|
| [6.0,4.0,4616.0]|              0|
| [0.0,1.0,-152.0]|              1|
| [1.0,1.0,1016.0]|              0|
|   [2.0,1.0,82.0]|              0|
|  [3.0,1.0,100.0]|              0|
|  [5.0,3.0,386.0]|              0|
|  [6.0,3.0,650.0]|              0|
| [0.0,0.0,2258.0]|              0|
| [1.0,0.0,4678.0]|              0|
|[2.0,4.0,11070.0]|              0|
|[3.0,4.0,11018.0]|              0|
|[4.0,2.0,-5860.0]|              0|
|[5.0,2.0,-1720.0]|              0|
+-----------------+---------------+
only showing top 20 rows



In [9]:
featureIndexer = VectorIndexer(inputCol="features",
                               outputCol="indexedFeatures",
                               maxCategories=7).fit(final_data)

In [10]:
(train_data, test_data) = final_data.randomSplit([0.8, 0.2])

In [11]:
train_data.describe().show()

+-------+--------------------+
|summary|     incid_in_effect|
+-------+--------------------+
|  count|               24021|
|   mean|0.006078015070146955|
| stddev| 0.07772595644143572|
|    min|                   0|
|    max|                   1|
+-------+--------------------+



In [12]:
test_data.describe().show()

+-------+--------------------+
|summary|     incid_in_effect|
+-------+--------------------+
|  count|                5979|
|   mean|0.005853821709315939|
| stddev| 0.07629238478341577|
|    min|                   0|
|    max|                   1|
+-------+--------------------+



In [13]:
rf = RandomForestClassifier(featuresCol = 'indexedFeatures',
                           labelCol="incid_in_effect",
                           seed=100)
gb = GBTClassifier(labelCol = 'incid_in_effect',
                  featuresCol="indexedFeatures",
                  maxIter=10)

In [14]:
re_pipeline = Pipeline(stages=[featureIndexer, rf])
gb_pipeline = Pipeline(stages=[featureIndexer, gb])

In [15]:
rf_model = re_pipeline.fit(train_data)
gb_model = gb_pipeline.fit(train_data)

In [16]:
rf_pred = rf_model.transform(test_data)
gb_pred = gb_model.transform(test_data)

In [17]:
bi_eval = BinaryClassificationEvaluator(labelCol="incid_in_effect")

In [18]:
print('Random Forest')
print(bi_eval.evaluate(rf_pred))

Random Forest
0.6168453182080361


In [19]:
gb_bi_eval = BinaryClassificationEvaluator(labelCol="incid_in_effect",
                                           rawPredictionCol="prediction")

In [20]:
print('Gradient Boosting')
print(gb_bi_eval.evaluate(gb_pred))

Gradient Boosting
0.5


In [22]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [23]:
acc_eval = MulticlassClassificationEvaluator(labelCol='incid_in_effect', 
                                             metricName='accuracy')

In [24]:
rf_acc = acc_eval.evaluate(rf_pred)
rf_acc

0.9941461782906841

In [25]:
acc_eval_f1 = MulticlassClassificationEvaluator(labelCol='incid_in_effect', 
                                             metricName='f1')

In [26]:
rf_acc_f1 = acc_eval_f1.evaluate(rf_pred)
rf_acc_f1

0.9912278593910636

In [27]:
acc_eval_pre = MulticlassClassificationEvaluator(labelCol='incid_in_effect', 
                                             metricName='weightedPrecision')
rf_acc_pre = acc_eval_pre.evaluate(rf_pred)
rf_acc_pre

0.9883266238099725

In [28]:
acc_eval_re = MulticlassClassificationEvaluator(labelCol='incid_in_effect', 
                                             metricName='weightedRecall')
rf_acc_re = acc_eval_re.evaluate(rf_pred)
rf_acc_re

0.9941461782906841