In [1]:
import findspark
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType
from pyspark.sql.types import FloatType
from pyspark.ml.feature import VectorAssembler
import pandas as pd
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import functions as F



In [2]:
spark = SparkSession.builder.appName('weather').getOrCreate()

In [3]:
df = spark.read.csv('weather-data.csv')

In [4]:
df = df.withColumnRenamed('_c0', "timestamp")\
        .withColumnRenamed('_c1', "temp")\
        .withColumnRenamed('_c2', "feels_like")\
        .withColumnRenamed('_c3', "pressure")\
        .withColumnRenamed('_c4', "humidity")\
        .withColumnRenamed('_c5', "dew_point")\
        .withColumnRenamed('_c6', "clouds")\
        .withColumnRenamed('_c7', "visibility")\
        .withColumnRenamed('_c8', "wind_speed")\
        .withColumnRenamed('_c9', "wind_deg")\
        .withColumnRenamed('_c10', "weather")\
        .withColumnRenamed('_c11', "pop")

In [5]:
df.show()

+----------+------+----------+--------+--------+---------+------+----------+----------+--------+--------------------+----+
| timestamp|  temp|feels_like|pressure|humidity|dew_point|clouds|visibility|wind_speed|wind_deg|             weather| pop|
+----------+------+----------+--------+--------+---------+------+----------+----------+--------+--------------------+----+
|1603548000|281.41|    277.26|    1000|     100|   281.41|    90|     10000|      5.35|     260|[{'id': 804, 'mai...|0.44|
|1603551600|281.29|    276.56|    1001|      94|   280.38|    88|     10000|      5.84|     279|[{'id': 804, 'mai...|0.43|
|1603555200|280.56|    275.78|    1002|      91|   279.19|    88|     10000|      5.53|     295|[{'id': 804, 'mai...|0.37|
|1603558800|279.71|    275.34|    1004|      88|   277.87|    86|     10000|      4.55|     296|[{'id': 804, 'mai...|0.35|
|1603562400|278.72|    274.78|    1005|      91|   277.37|    85|     10000|      3.81|     300|[{'id': 804, 'mai...|0.32|
|1603566000|278.

In [6]:
# import pyspark.sql.functions as f
# df_split = df.select(f.split(df.weather,":")).rdd.flatMap(
#               lambda x: x).toDF(schema=["col1","col2","col3"])

In [7]:
df = df.withColumn("temp", df["temp"].cast(FloatType()))
df = df.withColumn("feels_like", df["feels_like"].cast(FloatType()))
df = df.withColumn("pressure", df["pressure"].cast(IntegerType()))
df = df.withColumn("humidity", df["humidity"].cast(IntegerType()))
df = df.withColumn("dew_point", df["dew_point"].cast(FloatType()))
df = df.withColumn("clouds", df["clouds"].cast(IntegerType()))
df = df.withColumn("visibility", df["visibility"].cast(IntegerType()))
df = df.withColumn("wind_speed", df["wind_speed"].cast(FloatType()))
df = df.withColumn("wind_deg", df["wind_deg"].cast(IntegerType()))
df = df.withColumn("weather", df["weather"].cast(IntegerType()))
df = df.withColumn("pop", df["pop"].cast(FloatType()))

In [8]:
df.printSchema()

root
 |-- timestamp: string (nullable = true)
 |-- temp: float (nullable = true)
 |-- feels_like: float (nullable = true)
 |-- pressure: integer (nullable = true)
 |-- humidity: integer (nullable = true)
 |-- dew_point: float (nullable = true)
 |-- clouds: integer (nullable = true)
 |-- visibility: integer (nullable = true)
 |-- wind_speed: float (nullable = true)
 |-- wind_deg: integer (nullable = true)
 |-- weather: integer (nullable = true)
 |-- pop: float (nullable = true)



In [9]:
# dataset = pd.read_csv("weather-data.csv")
# dataset.head()

# How does the temperature feel given the features pressure, humidity, dew_point, clouds, visibility and wind_speed?

In [10]:
feature_columns = df.columns[3:9] 
feature_columns

['pressure', 'humidity', 'dew_point', 'clouds', 'visibility', 'wind_speed']

In [11]:
assembler = VectorAssembler(inputCols=feature_columns,outputCol="features")

In [12]:
data_2 = assembler.transform(df)
# train/test split
train, test = data_2.randomSplit([0.7, 0.3])
# define the model
from pyspark.ml.regression import LinearRegression
algo = LinearRegression(featuresCol="features", labelCol="feels_like")
# train the model
model = algo.fit(train)
# evaluation
evaluation_summary = model.evaluate(test)
# predicting values
predictions = model.transform(test)
predictions.select(predictions.columns[2]).show()


+----------+
|feels_like|
+----------+
|    276.03|
|    275.82|
|    275.11|
|    275.11|
|    275.18|
|    275.06|
|    274.86|
|     274.7|
|     274.7|
|     275.8|
|    276.92|
|    277.47|
|    278.06|
|    278.91|
|    278.91|
|    279.45|
|    280.01|
|    280.01|
|    280.65|
|    280.47|
+----------+
only showing top 20 rows



In [13]:
evaluation_summary.meanAbsoluteError

0.058473163749247686

In [14]:
evaluation_summary.rootMeanSquaredError

0.0705299882245731

In [15]:
evaluation_summary.r2

0.9991539903214101

In [16]:
df.show()

+----------+------+----------+--------+--------+---------+------+----------+----------+--------+-------+----+
| timestamp|  temp|feels_like|pressure|humidity|dew_point|clouds|visibility|wind_speed|wind_deg|weather| pop|
+----------+------+----------+--------+--------+---------+------+----------+----------+--------+-------+----+
|1603548000|281.41|    277.26|    1000|     100|   281.41|    90|     10000|      5.35|     260|   null|0.44|
|1603551600|281.29|    276.56|    1001|      94|   280.38|    88|     10000|      5.84|     279|   null|0.43|
|1603555200|280.56|    275.78|    1002|      91|   279.19|    88|     10000|      5.53|     295|   null|0.37|
|1603558800|279.71|    275.34|    1004|      88|   277.87|    86|     10000|      4.55|     296|   null|0.35|
|1603562400|278.72|    274.78|    1005|      91|   277.37|    85|     10000|      3.81|     300|   null|0.32|
|1603566000|278.13|    274.63|    1005|      92|   277.08|    45|     10000|      3.06|     305|   null| 0.0|
|160356960

# Multilayer Perceptron Classifier

In [17]:
df = df.withColumn(
    'label',
    F.when((F.col("pop") >= 0.7), 2)\
    .when((F.col("pop") < 0.7) & (F.col('pop') > 0.3), 1)\
    .when((F.col("pop") <= 0.3), 0)
)

In [18]:
df.show()

+----------+------+----------+--------+--------+---------+------+----------+----------+--------+-------+----+-----+
| timestamp|  temp|feels_like|pressure|humidity|dew_point|clouds|visibility|wind_speed|wind_deg|weather| pop|label|
+----------+------+----------+--------+--------+---------+------+----------+----------+--------+-------+----+-----+
|1603548000|281.41|    277.26|    1000|     100|   281.41|    90|     10000|      5.35|     260|   null|0.44|    1|
|1603551600|281.29|    276.56|    1001|      94|   280.38|    88|     10000|      5.84|     279|   null|0.43|    1|
|1603555200|280.56|    275.78|    1002|      91|   279.19|    88|     10000|      5.53|     295|   null|0.37|    1|
|1603558800|279.71|    275.34|    1004|      88|   277.87|    86|     10000|      4.55|     296|   null|0.35|    1|
|1603562400|278.72|    274.78|    1005|      91|   277.37|    85|     10000|      3.81|     300|   null|0.32|    1|
|1603566000|278.13|    274.63|    1005|      92|   277.08|    45|     10

In [19]:
feature_columns = df.columns[1:10] 
assembler = VectorAssembler(inputCols=feature_columns,outputCol="features")

In [20]:
df2 = assembler.transform(df)

In [21]:

# Split the data into train and test
splits = df2.randomSplit([0.6, 0.4], 1234)
train = splits[0]
test = splits[1]

# specify layers for the neural network:
# input layer of size 4 (features), two intermediate of size 5 and 4
# and output of size 3 (classes)
layers = [9, 5, 4, 3]

# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(labelCol="label", featuresCol="features", maxIter=100, layers=layers, blockSize=128, seed=1234)

# train the model
model = trainer.fit(train)

# compute accuracy on the test set
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

Test set accuracy = 0.6944444444444444
