In [1]:
from pyspark.sql import SQLContext
from pyspark.sql import DataFrameNaFunctions
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import Binarizer
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer

In [2]:
# Wrap the SparkContext `sc` in a SQLContext. `sc` is not created in this
# notebook -- presumably it is supplied by the PySpark kernel (confirm).
sqlContext = SQLContext(sc)

# Load the daily weather CSV through the spark-csv data source with the
# `header` and `inferSchema` options switched on.
weather_csv_path = 'file:///home/cloudera/workspace/coursera/big-data-4/daily_weather.csv'
csv_reader = sqlContext.read.format('com.databricks.spark.csv')
csv_reader = csv_reader.options(header='true', inferSchema='true')
df = csv_reader.load(weather_csv_path)

# Display the column names of the loaded DataFrame.
df.columns

['number',
 'air_pressure_9am',
 'air_temp_9am',
 'avg_wind_direction_9am',
 'avg_wind_speed_9am',
 'max_wind_direction_9am',
 'max_wind_speed_9am',
 'rain_accumulation_9am',
 'rain_duration_9am',
 'relative_humidity_9am',
 'relative_humidity_3pm']

In [3]:
# Columns used as model inputs: the eight 9am measurements listed below.
# Neither relative-humidity column from df.columns appears in this list.
featureColumns = [
    'air_pressure_9am',
    'air_temp_9am',
    'avg_wind_direction_9am',
    'avg_wind_speed_9am',
    'max_wind_direction_9am',
    'max_wind_speed_9am',
    'rain_accumulation_9am',
    'rain_duration_9am',
]

In [4]:
# Peek at the first five rows (returned as a list of Row objects).
df.head(n=5)

[Row(number=0, air_pressure_9am=918.0600000000087, air_temp_9am=74.82200000000041, avg_wind_direction_9am=271.1, avg_wind_speed_9am=2.080354199999768, max_wind_direction_9am=295.39999999999986, max_wind_speed_9am=2.863283199999908, rain_accumulation_9am=0.0, rain_duration_9am=0.0, relative_humidity_9am=42.42000000000046, relative_humidity_3pm=36.160000000000494),
 Row(number=1, air_pressure_9am=917.3476881177097, air_temp_9am=71.40384263106537, avg_wind_direction_9am=101.93517935618371, avg_wind_speed_9am=2.4430092157340217, max_wind_direction_9am=140.47154847112498, max_wind_speed_9am=3.5333236016106238, rain_accumulation_9am=0.0, rain_duration_9am=0.0, relative_humidity_9am=24.328697291802207, relative_humidity_3pm=19.4265967985621),
 Row(number=2, air_pressure_9am=923.0400000000084, air_temp_9am=60.637999999998776, avg_wind_direction_9am=51.000000000000036, avg_wind_speed_9am=17.067852199999727, max_wind_direction_9am=63.6999999999999, max_wind_speed_9am=22.100967200000003, rain_acc

In [5]:
# Remove the 'number' column; in the preview above it holds 0, 1, 2, ...,
# so it looks like a row index rather than a measurement (confirm).
index_column = 'number'
df = df.drop(index_column)

In [6]:
# Discard every row that has a null in any column ('any' is the default
# policy of na.drop(), spelled out here for the reader).
df = df.na.drop(how='any')

In [7]:
# Report the shape of the cleaned DataFrame as (rows, columns).
cleaned_row_count = df.count()
cleaned_column_count = len(df.columns)
cleaned_row_count, cleaned_column_count

(1064, 10)

In [8]:
# Derive the binary target "label" from the 3pm relative humidity:
# values above the threshold become 1.0, the rest 0.0 (see the preview
# in the next cell: 36.16 -> 1.0, 19.43 -> 0.0).
humidity_threshold = 24.99999
binarizer = Binarizer(
    inputCol="relative_humidity_3pm",
    outputCol="label",
    threshold=humidity_threshold
)
binarizedDF = binarizer.transform(df)

In [9]:
# Print the first four humidity values next to the label derived from them.
humidity_and_label = binarizedDF.select("relative_humidity_3pm", "label")
humidity_and_label.show(4)

+---------------------+-----+
|relative_humidity_3pm|label|
+---------------------+-----+
|   36.160000000000494|  1.0|
|     19.4265967985621|  0.0|
|   14.460000000000045|  0.0|
|   12.742547353761848|  0.0|
+---------------------+-----+
only showing top 4 rows



In [10]:
# Combine the columns named in featureColumns into one vector column
# called "features", which the classifier below reads as its input.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=featureColumns
)
assembled = assembler.transform(binarizedDF)

In [11]:
# Randomly split the assembled rows 80% / 20% into training and test sets;
# the seed is fixed so the split can be reproduced.
trainingData, testData = assembled.randomSplit(weights=[0.8, 0.2], seed=13234)

In [12]:
# Row counts of the two splits, shown as (training, test).
training_rows = trainingData.count()
test_rows = testData.count()
training_rows, test_rows

(854, 210)

In [13]:
# Decision tree classifier configuration: reads "features", predicts
# "label", with depth capped at 5, at least 20 training instances per
# node, and Gini impurity as the split criterion.
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
    impurity="gini",
    maxDepth=5,
    minInstancesPerNode=20
)

In [14]:
# Single-stage pipeline containing only the decision tree, fitted on the
# training split to produce the model.
pipeline_stages = [dt]
pipeline = Pipeline(stages=pipeline_stages)
model = pipeline.fit(trainingData)

In [15]:
# Apply the fitted model to the held-out test split.
predictions = model.transform(dataset=testData)

In [19]:
# Print the first 20 test rows as predicted class next to true label.
predicted_vs_actual = predictions.select("prediction", "label")
predicted_vs_actual.show(20)

+----------+-----+
|prediction|label|
+----------+-----+
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       0.0|  0.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       0.0|  0.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       0.0|  0.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       0.0|  0.0|
|       0.0|  1.0|
|       1.0|  1.0|
+----------+-----+
only showing top 20 rows



In [18]:
# Persist the (prediction, label) pairs as CSV through the spark-csv data
# source, with a header row.
#
# mode="overwrite" replaces output left behind by an earlier run. Without
# it the writer's default save mode raises when the target path already
# exists, so re-running the notebook from the top would fail at this cell.
prediction_pairs = predictions.select("prediction", "label")
prediction_pairs.write.save(
    path="file:///home/cloudera/workspace/coursera/big-data-4/predictions1.csv",
    format="com.databricks.spark.csv",
    mode="overwrite",
    header='true'
)

In [20]:
# Second split of the same assembled data, this time 70% / 30%, using the
# same seed as the 80/20 split above.
trainingData2, testData2 = assembled.randomSplit(weights=[0.7, 0.3], seed=13234)

In [21]:
# Row counts of the 70/30 splits, shown as (training, test).
training_rows_70 = trainingData2.count()
test_rows_30 = testData2.count()
training_rows_70, test_rows_30

(730, 334)