In [22]:
import findspark
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType
from pyspark.sql.types import FloatType
from pyspark.ml.feature import VectorAssembler
import pandas as pd
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [23]:
spark = SparkSession.builder.appName('weather').getOrCreate()

In [24]:
df = spark.read.csv('weather-data.csv')

In [25]:
df = df.withColumnRenamed('_c0', "timestamp")\
        .withColumnRenamed('_c1', "temp")\
        .withColumnRenamed('_c2', "feels_like")\
        .withColumnRenamed('_c3', "pressure")\
        .withColumnRenamed('_c4', "humidity")\
        .withColumnRenamed('_c5', "dew_point")\
        .withColumnRenamed('_c6', "clouds")\
        .withColumnRenamed('_c7', "visibility")\
        .withColumnRenamed('_c8', "wind_speed")\
        .withColumnRenamed('_c9', "wind_deg")\
        .withColumnRenamed('_c10', "weather")\
        .withColumnRenamed('_c11', "pop")

In [26]:
df.show()

+----------+------+----------+--------+--------+---------+------+----------+----------+--------+--------------------+----+
| timestamp|  temp|feels_like|pressure|humidity|dew_point|clouds|visibility|wind_speed|wind_deg|             weather| pop|
+----------+------+----------+--------+--------+---------+------+----------+----------+--------+--------------------+----+
|1603429200|280.44|    277.22|     995|      93|   279.38|    75|     10000|      3.36|     264|[{'id': 500, 'mai...| 0.7|
|1603432800|280.97|    278.47|     996|      91|   279.59|    71|     10000|       2.4|     311|[{'id': 500, 'mai...|0.74|
|1603436400| 280.7|     277.5|     997|      89|   279.01|    95|     10000|      3.22|       9|[{'id': 804, 'mai...|0.74|
|1603440000|   280|    276.14|     999|      86|   277.82|    96|     10000|      3.81|      10|[{'id': 804, 'mai...|0.55|
|1603443600|279.62|    275.77|    1000|      82|   276.77|    97|     10000|      3.52|       8|[{'id': 804, 'mai...|0.54|
|1603447200|279.

In [27]:
df = df.withColumn("temp", df["temp"].cast(FloatType()))
df = df.withColumn("feels_like", df["feels_like"].cast(FloatType()))
df = df.withColumn("pressure", df["pressure"].cast(IntegerType()))
df = df.withColumn("humidity", df["humidity"].cast(IntegerType()))
df = df.withColumn("dew_point", df["dew_point"].cast(FloatType()))
df = df.withColumn("clouds", df["clouds"].cast(IntegerType()))
df = df.withColumn("visibility", df["visibility"].cast(IntegerType()))
df = df.withColumn("wind_speed", df["wind_speed"].cast(FloatType()))
df = df.withColumn("wind_deg", df["wind_deg"].cast(IntegerType()))
df = df.withColumn("weather", df["weather"].cast(IntegerType()))
df = df.withColumn("pop", df["pop"].cast(IntegerType()))

In [28]:
df.printSchema()

root
 |-- timestamp: string (nullable = true)
 |-- temp: float (nullable = true)
 |-- feels_like: float (nullable = true)
 |-- pressure: integer (nullable = true)
 |-- humidity: integer (nullable = true)
 |-- dew_point: float (nullable = true)
 |-- clouds: integer (nullable = true)
 |-- visibility: integer (nullable = true)
 |-- wind_speed: float (nullable = true)
 |-- wind_deg: integer (nullable = true)
 |-- weather: integer (nullable = true)
 |-- pop: integer (nullable = true)



In [29]:
# dataset = pd.read_csv("weather-data.csv")
# dataset.head()

# How does the temperature feel given the features pressure, humidity, dew_point, clouds, visibility and wind_speed?

In [30]:
feature_columns = df.columns[3:9] 
feature_columns

In [33]:
assembler = VectorAssembler(inputCols=feature_columns,outputCol="features")

In [34]:
data_2 = assembler.transform(df)
# train/test split
train, test = data_2.randomSplit([0.7, 0.3])
# define the model
from pyspark.ml.regression import LinearRegression
algo = LinearRegression(featuresCol="features", labelCol="feels_like")
# train the model
model = algo.fit(train)
# evaluation
evaluation_summary = model.evaluate(test)
# predicting values
predictions = model.transform(test)
predictions.select(predictions.columns[13:]).show() # here I am filtering out some columns just for the figure to fit


+------------------+
|        prediction|
+------------------+
| 276.0526722359368|
| 276.4670294882255|
| 278.5277025033061|
|273.48842957897335|
|273.62654866306957|
|275.17025985208903|
|275.38896011088434|
|277.34299589241306|
| 273.3461984068658|
| 275.2642325646095|
| 275.3563353719595|
|274.74749769427854|
| 274.8593396831862|
| 274.9724613470303|
| 274.9724613470303|
|274.99546948540336|
|274.99546948540336|
| 275.0069911180543|
| 275.6772703775378|
| 275.6772703775378|
+------------------+
only showing top 20 rows



In [35]:
evaluation_summary.meanAbsoluteError

0.15809777010106355

In [36]:
evaluation_summary.rootMeanSquaredError

0.2112907204408863

In [39]:
evaluation_summary.r2

0.9917875859691913

# Multilayer Perceptron Classifier

In [None]:
feature_columns = df.columns[3:9] 
assembler = VectorAssembler(inputCols=feature_columns,outputCol="features")


# Split the data into train and test
splits = data_2.randomSplit([0.6, 0.4], 1234)
train = splits[0]
test = splits[1]

# specify layers for the neural network:
# input layer of size 4 (features), two intermediate of size 5 and 4
# and output of size 3 (classes)
layers = [4, 5, 4, 2]

# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234, labelCol="feels_like")

# train the model
model = trainer.fit(train)

# compute accuracy on the test set
result = model.transform(test)
predictionAndLabels = result.select("prediction", "feels_like")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

# TimeSeries Analysis

In [38]:
# maybe we could create our own class labels like this
# dataset['windy?'] = pd.cut(dataset['wind_speed'],
#                    bins=[0, 1.69,2.3,3.17, 5],
#                    labels=['very low wind', 'low wind', 'high wind', 'very high wind'])