In [1]:
from pyspark.sql import SQLContext
from pyspark.sql import DataFrameNaFunctions
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import Binarizer
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer

In [21]:
# Build a SQLContext on top of `sc`. `sc` is not defined in this notebook;
# presumably it is the SparkContext injected by the PySpark kernel -- TODO confirm.
sqlContext = SQLContext(sc)

# Load the daily weather CSV via the spark-csv data source, asking it to use
# the first line as column names and to infer the column types.
df = sqlContext.read.load(
    path='file:///home/cloudera/Downloads/big-data-4/daily_weather.csv',
    format='com.databricks.spark.csv',
    header='true',
    inferSchema='true'
)

# Show the column names that were read.
df.columns

['number',
 'air_pressure_9am',
 'air_temp_9am',
 'avg_wind_direction_9am',
 'avg_wind_speed_9am',
 'max_wind_direction_9am',
 'max_wind_speed_9am',
 'rain_accumulation_9am',
 'rain_duration_9am',
 'relative_humidity_9am',
 'relative_humidity_3pm']

Define features that will be used for the classification

In [5]:
# Input columns for the classifier: every 9am measurement except
# relative_humidity_9am. relative_humidity_3pm is left out as well because the
# class label is derived from it.
featureColumns = [
    'air_pressure_9am',
    'air_temp_9am',
    'avg_wind_direction_9am',
    'avg_wind_speed_9am',
    'max_wind_direction_9am',
    'max_wind_speed_9am',
    'rain_accumulation_9am',
    'rain_duration_9am',
]

Drop unused columns and incomplete rows. We don't need the "number" column, and any row with a missing value is removed.

In [7]:
# Remove the "number" column, then discard every row that still has a missing
# value (na.drop() called with its default arguments).
df = df.drop('number').na.drop()

In [8]:
# Shape of the cleaned DataFrame as (number of rows, number of columns).
(df.count(), len(df.columns))

(1064, 10)

Create a binary categorical variable from relative_humidity_3pm. This variable is the class label we will predict.

In [9]:
# Turn the continuous 3pm humidity reading into a 0.0/1.0 class label.
# Binarizer emits 1.0 for values strictly greater than the threshold and 0.0
# otherwise; the threshold sits just below 25, presumably so that a reading of
# exactly 25 is labelled 1.0 -- TODO confirm intent.
binarizer = Binarizer(
    inputCol="relative_humidity_3pm",
    outputCol="label",
    threshold=24.999999
)

# Append the "label" column to the cleaned data.
binarizedDF = binarizer.transform(df)

In [10]:
# Spot-check the first rows: raw humidity next to the derived label.
(
    binarizedDF
    .select("relative_humidity_3pm", "label")
    .show(5)
)

+---------------------+-----+
|relative_humidity_3pm|label|
+---------------------+-----+
|   36.160000000000494|  1.0|
|     19.4265967985621|  0.0|
|   14.460000000000045|  0.0|
|   12.742547353761848|  0.0|
|    76.74000000000046|  1.0|
+---------------------+-----+
only showing top 5 rows



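We defined a threshold for the class variable based on the values found in relative_humidity_3pm: rows whose humidity is above the threshold (just under 25) get label 1.0, and all other rows get label 0.0.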

Now we will aggregate the features we will use to make predictions into a single column:

In [12]:
# Pack the selected feature columns into a single vector column named
# "features", which the classifier reads as its input.
assembler = VectorAssembler(
    inputCols=featureColumns,
    outputCol="features"
)
assembled = assembler.transform(binarizedDF)

In [16]:
# Preview the assembled feature vectors for the first rows.
(
    assembled
    .select('features')
    .show(5)
)

+--------------------+
|            features|
+--------------------+
|[918.060000000008...|
|[917.347688117709...|
|[923.040000000008...|
|[920.502751175919...|
|[921.160000000003...|
+--------------------+
only showing top 5 rows



Now we will split the data into training and test sets. Then we will train a decision tree classifier.

In [17]:
# 80/20 random split into training and test sets; the seed is fixed so the
# split is requested with the same random seed on every run.
trainingData, testData = assembled.randomSplit([0.8, 0.2], seed=13234)

# Row counts of the two partitions as (training, test).
(trainingData.count(), testData.count())

(854, 210)

In [18]:
# Decision tree configuration: Gini impurity, depth capped at 5, and at least
# 20 training instances required in every node.
dtree = DecisionTreeClassifier(
    labelCol="label",
    featuresCol="features",
    maxDepth=5,
    minInstancesPerNode=20,
    impurity="gini"
)

# The pipeline has the tree as its only stage; fitting it on the training
# partition produces the trained model.
pipeline = Pipeline(stages=[dtree])
model = pipeline.fit(trainingData)

We need to execute a pipeline to train the decision tree. Now let's get the predictions.

In [19]:
# Apply the fitted pipeline to the held-out test partition.
predictions = model.transform(testData)

# Compare predicted and actual labels for the first rows.
(
    predictions
    .select("prediction", "label")
    .show(10)
)

+----------+-----+
|prediction|label|
+----------+-----+
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       0.0|  0.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
+----------+-----+
only showing top 10 rows



The predictions match the expected label in the first 10 rows shown above; this is only a spot check, not an evaluation of the whole test set.
Now let's save the results to a CSV file.

In [20]:
# Persist the predicted and actual labels as CSV (with a header row) through
# the spark-csv data source.
#
# mode="overwrite" makes this cell re-runnable: with the writer's default save
# mode the call raises an error as soon as the output path already exists, so
# a second "Run All" would fail here. NOTE: any previous output at this path
# is replaced.
predictions.select("prediction", "label").write.save(
    path="file:///home/cloudera/Downloads/big-data-4/predictions.csv",
    format="com.databricks.spark.csv",
    header="true",
    mode="overwrite"
)