### A.Creating Spark Session and Loading the Data

##### Step 01: Import Spark Session and initialize Spark

In [1]:
#!pip install pandas


In [48]:
from pyspark import SparkContext # Spark
from pyspark.sql import SparkSession # Spark SQL
# from pyspark.sql.functions import lit
# from pyspark.sql.functions import isnan, when, count, col
import pyspark.sql.functions as F
from pyspark.ml.feature import Imputer
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from ast import literal_eval 
from pyspark.sql import Window
import pyspark.sql.functions as psf
import pandas as pd
from pyspark.sql.types import DoubleType


In [3]:
sc = SparkContext.getOrCreate() # create spark context

In [4]:
if (sc is None):
    sc = SparkContext(master="local[]", appName="PartA")
spark = SparkSession(sparkContext=sc)

##### Step 02: Load the dataset and print the schema and total number of entries

In [5]:
weatherAUS = spark.read.csv('weatherAUS.csv', header = True, inferSchema = True)

In [6]:
print(weatherAUS.count())

142193


### B. Data Cleaning and Processing

##### Step 03: Delete columns from the dataset

In [7]:
weatherAUS.columns

['Date',
 'Location',
 'MinTemp',
 'MaxTemp',
 'Rainfall',
 'Evaporation',
 'Sunshine',
 'WindGustDir',
 'WindGustSpeed',
 'WindDir9am',
 'WindDir3pm',
 'WindSpeed9am',
 'WindSpeed3pm',
 'Humidity9am',
 'Humidity3pm',
 'Pressure9am',
 'Pressure3pm',
 'Cloud9am',
 'Cloud3pm',
 'Temp9am',
 'Temp3pm',
 'RainToday',
 'RainTomorrow']

In [8]:
drop_list = ['Date','Location', 'Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']

In [9]:
weatherAUS_dropped = weatherAUS.select([column for column in weatherAUS.columns if column not in drop_list])
weatherAUS_dropped.show(truncate=False)

+-------+-------+--------+-----------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+---------+------------+
|MinTemp|MaxTemp|Rainfall|WindGustDir|WindGustSpeed|WindDir9am|WindDir3pm|WindSpeed9am|WindSpeed3pm|Humidity9am|Humidity3pm|Pressure9am|Pressure3pm|RainToday|RainTomorrow|
+-------+-------+--------+-----------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+---------+------------+
|13.4   |22.9   |0.6     |W          |44           |W         |WNW       |20          |24          |71         |22         |1007.7     |1007.1     |No       |No          |
|7.4    |25.1   |0       |WNW        |44           |NNW       |WSW       |4           |22          |44         |25         |1010.6     |1007.8     |No       |No          |
|12.9   |25.7   |0       |WSW        |46           |W         |WSW       |19          |26          |38         |30         |1007.6     |1008

In [10]:
# weatherAUS_dropped = weatherAUS.drop('Date','Location', 'Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm')

In [11]:
weatherAUS_dropped

DataFrame[MinTemp: string, MaxTemp: string, Rainfall: string, WindGustDir: string, WindGustSpeed: string, WindDir9am: string, WindDir3pm: string, WindSpeed9am: string, WindSpeed3pm: string, Humidity9am: string, Humidity3pm: string, Pressure9am: string, Pressure3pm: string, RainToday: string, RainTomorrow: string]

In [12]:
# rows = weatherAUS_dropped.count()
# summary = weatherAUS_dropped.describe().filter(col("summary") == "count")
# summary.select(*((lit(rows)-col(c)).alias(c) for c in weatherAUS_dropped.columns)).show()

##### Step 04: Print the number of missing data in each column.

In [13]:
# weatherAUS_dropped.select([count(when(isnan(c) | col(c).isNull() | c == 'NA', c)).alias(c) for c in weatherAUS_dropped.columns]).show()


In [14]:
weatherAUS_dropped.select([F.count(F.when(F.isnan(i) | \
                                   F.col(i).contains('NA') | \
                                   F.col(i).contains('NULL') | \
                                   F.col(i).isNull(), i)).alias(i) \
                    for i in weatherAUS_dropped.columns]).show()

+-------+-------+--------+-----------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+---------+------------+
|MinTemp|MaxTemp|Rainfall|WindGustDir|WindGustSpeed|WindDir9am|WindDir3pm|WindSpeed9am|WindSpeed3pm|Humidity9am|Humidity3pm|Pressure9am|Pressure3pm|RainToday|RainTomorrow|
+-------+-------+--------+-----------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+---------+------------+
|    637|    322|    1406|       9330|         9270|     10013|      3778|        1348|        2630|       1774|       3610|      14014|      13981|     1406|           0|
+-------+-------+--------+-----------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+---------+------------+



In [15]:
# errors.show()

##### Step 05: Fill the missing data with average value and maximum occurrence value.

In [16]:
weatherAUS_dropped.columns

['MinTemp',
 'MaxTemp',
 'Rainfall',
 'WindGustDir',
 'WindGustSpeed',
 'WindDir9am',
 'WindDir3pm',
 'WindSpeed9am',
 'WindSpeed3pm',
 'Humidity9am',
 'Humidity3pm',
 'Pressure9am',
 'Pressure3pm',
 'RainToday',
 'RainTomorrow']

In [17]:
weatherAUS_dropped['MinTemp']

Column<b'MinTemp'>

In [18]:
objectcolumns = ['WindGustDir', 'WindDir3pm', 'WindDir9am','RainToday','RainTomorrow']

In [19]:
weatherAUS_dropped_obj_columns = weatherAUS_dropped[objectcolumns]

In [20]:
weatherAUS_dropped_obj_columns.show()

+-----------+----------+----------+---------+------------+
|WindGustDir|WindDir3pm|WindDir9am|RainToday|RainTomorrow|
+-----------+----------+----------+---------+------------+
|          W|       WNW|         W|       No|          No|
|        WNW|       WSW|       NNW|       No|          No|
|        WSW|       WSW|         W|       No|          No|
|         NE|         E|        SE|       No|          No|
|          W|        NW|       ENE|       No|          No|
|        WNW|         W|         W|       No|          No|
|          W|         W|        SW|       No|          No|
|          W|         W|       SSE|       No|          No|
|        NNW|        NW|        SE|       No|         Yes|
|          W|       SSE|         S|      Yes|          No|
|          N|       ESE|       SSE|       No|         Yes|
|        NNE|       ENE|        NE|      Yes|         Yes|
|          W|       NNW|       NNW|      Yes|         Yes|
|         SW|       SSW|         W|      Yes|          N

In [21]:
weatherAUS_dropped_num_columns = weatherAUS_dropped.select([column for column in weatherAUS_dropped.columns if column not in objectcolumns])


In [22]:
weatherAUS_dropped_num_columns.show()

+-------+-------+--------+-------------+------------+------------+-----------+-----------+-----------+-----------+
|MinTemp|MaxTemp|Rainfall|WindGustSpeed|WindSpeed9am|WindSpeed3pm|Humidity9am|Humidity3pm|Pressure9am|Pressure3pm|
+-------+-------+--------+-------------+------------+------------+-----------+-----------+-----------+-----------+
|   13.4|   22.9|     0.6|           44|          20|          24|         71|         22|     1007.7|     1007.1|
|    7.4|   25.1|       0|           44|           4|          22|         44|         25|     1010.6|     1007.8|
|   12.9|   25.7|       0|           46|          19|          26|         38|         30|     1007.6|     1008.7|
|    9.2|     28|       0|           24|          11|           9|         45|         16|     1017.6|     1012.8|
|   17.5|   32.3|       1|           41|           7|          20|         82|         33|     1010.8|       1006|
|   14.6|   29.7|     0.2|           56|          19|          24|         55|  

In [23]:
# for col_name in weatherAUS_dropped_num_columns.columns:
#     weatherAUS_dropped_num_columns = weatherAUS_dropped_num_columns.withColumn(col_name, F.col(col_name).cast('float'))

In [24]:
weatherAUS_dropped_num_columns.printSchema()

root
 |-- MinTemp: string (nullable = true)
 |-- MaxTemp: string (nullable = true)
 |-- Rainfall: string (nullable = true)
 |-- WindGustSpeed: string (nullable = true)
 |-- WindSpeed9am: string (nullable = true)
 |-- WindSpeed3pm: string (nullable = true)
 |-- Humidity9am: string (nullable = true)
 |-- Humidity3pm: string (nullable = true)
 |-- Pressure9am: string (nullable = true)
 |-- Pressure3pm: string (nullable = true)



In [25]:
weatherAUS_dropped_num_columns.select([F.count(F.when(F.isnan(i) | \
                                   F.col(i).contains('NA') | \
                                   F.col(i).contains('NULL') | \
                                   F.col(i).isNull(), i)).alias(i) \
                    for i in weatherAUS_dropped_num_columns.columns]).show()

+-------+-------+--------+-------------+------------+------------+-----------+-----------+-----------+-----------+
|MinTemp|MaxTemp|Rainfall|WindGustSpeed|WindSpeed9am|WindSpeed3pm|Humidity9am|Humidity3pm|Pressure9am|Pressure3pm|
+-------+-------+--------+-------------+------------+------------+-----------+-----------+-----------+-----------+
|    637|    322|    1406|         9270|        1348|        2630|       1774|       3610|      14014|      13981|
+-------+-------+--------+-------------+------------+------------+-----------+-----------+-----------+-----------+



In [26]:
# imputer = Imputer(
#     inputCols=weatherAUS_dropped_num_columns.columns, 
#     outputCols=["{}_imputed".format(c) for c in weatherAUS_dropped_num_columns.columns]
# )
# imputer.fit(weatherAUS_dropped_num_columns).transform(weatherAUS_dropped_num_columns)

In [27]:
mean_dict = { F.col: 'mean' for F.col in weatherAUS_dropped_num_columns.columns }
# mean_dict
col_avgs = weatherAUS_dropped_num_columns.agg( mean_dict ).collect()[0].asDict()
# col_avgs
col_avgs = { k[4:-1]: v for k,v in col_avgs.items() }
col_avgs
# weatherAUS_dropped_num_columns = weatherAUS_dropped_num_columns.fillna( col_avgs )
for col_name in weatherAUS_dropped_num_columns.columns:
    weatherAUS_dropped_num_columns = weatherAUS_dropped_num_columns.withColumn(col_name, F.regexp_replace(col_name, 'NA', str(col_avgs[col_name])))

weatherAUS_dropped_num_columns.show()


+-------+-------+------------------+-------------+---------------+------------+-----------+-----------+-----------+-----------+
|MinTemp|MaxTemp|          Rainfall|WindGustSpeed|   WindSpeed9am|WindSpeed3pm|Humidity9am|Humidity3pm|Pressure9am|Pressure3pm|
+-------+-------+------------------+-------------+---------------+------------+-----------+-----------+-----------+-----------+
|   13.4|   22.9|               0.6|           44|             20|          24|         71|         22|     1007.7|     1007.1|
|    7.4|   25.1|                 0|           44|              4|          22|         44|         25|     1010.6|     1007.8|
|   12.9|   25.7|                 0|           46|             19|          26|         38|         30|     1007.6|     1008.7|
|    9.2|     28|                 0|           24|             11|           9|         45|         16|     1017.6|     1012.8|
|   17.5|   32.3|                 1|           41|              7|          20|         82|         33| 

In [52]:
# for col_name in weatherAUS_modified.columns:
    weatherAUS_modified = weatherAUS_modified.withColumn(col_name, F.when(F.col(col_name)).cast('double'))

# weatherAUS_dropped_num_columns = weatherAUS_dropped_num_columns.withColumn('MinTemp', weatherAUS_dropped_num_columns["MinTemp"].cast("double"))


UnboundLocalError: local variable 'weatherAUS_modified' referenced before assignment

In [28]:
for col_name in weatherAUS_dropped_obj_columns.columns:
    common = weatherAUS_dropped_obj_columns.dropna().groupBy(col_name).agg(F.count("*")).orderBy('count(1)', ascending=False).first()[col_name]
    weatherAUS_dropped_obj_columns = weatherAUS_dropped_obj_columns.withColumn(col_name, F.regexp_replace(col_name, 'NA', common))

In [29]:
# new_df = weatherAUS_dropped_obj_columns.join(weatherAUS_dropped_num_columns,how='left_outer',on = 'Index')


In [30]:
# new_df.show()

In [31]:
for col_name in weatherAUS_dropped_obj_columns.columns:
    l_indexer = StringIndexer(inputCol=col_name, outputCol=col_name+'_index')
    weatherAUS_dropped_obj_columns = l_indexer.fit(weatherAUS_dropped_obj_columns).transform(weatherAUS_dropped_obj_columns)


In [32]:
weatherAUS_dropped_obj_columns = weatherAUS_dropped_obj_columns.select([column for column in weatherAUS_dropped_obj_columns.columns if column not in objectcolumns])

weatherAUS_dropped_obj_columns.show()

+-----------------+----------------+----------------+---------------+------------------+
|WindGustDir_index|WindDir3pm_index|WindDir9am_index|RainToday_index|RainTomorrow_index|
+-----------------+----------------+----------------+---------------+------------------+
|              0.0|             7.0|             6.0|            0.0|               0.0|
|              9.0|             3.0|             9.0|            0.0|               0.0|
|              6.0|             3.0|             6.0|            0.0|               0.0|
|             13.0|            10.0|             1.0|            0.0|               0.0|
|              0.0|             8.0|            10.0|            0.0|               0.0|
|              9.0|             1.0|             6.0|            0.0|               0.0|
|              0.0|             1.0|             7.0|            0.0|               0.0|
|              0.0|             1.0|             3.0|            0.0|               0.0|
|             14.0|  

In [33]:
weatherAUS_dropped_obj_columns_indexed = weatherAUS_dropped_obj_columns.select("*").withColumn("id", F.monotonically_increasing_id())
weatherAUS_dropped_num_columns_indexed = weatherAUS_dropped_num_columns.select("*").withColumn("id", F.monotonically_increasing_id())


In [34]:
new_df = weatherAUS_dropped_obj_columns_indexed.join(weatherAUS_dropped_num_columns_indexed, on=['id'], how='left_outer')


In [35]:
new_df.show()

+---+-----------------+----------------+----------------+---------------+------------------+-------+-------+------------------+-------------+---------------+------------+-----------+-----------+-----------+-----------+
| id|WindGustDir_index|WindDir3pm_index|WindDir9am_index|RainToday_index|RainTomorrow_index|MinTemp|MaxTemp|          Rainfall|WindGustSpeed|   WindSpeed9am|WindSpeed3pm|Humidity9am|Humidity3pm|Pressure9am|Pressure3pm|
+---+-----------------+----------------+----------------+---------------+------------------+-------+-------+------------------+-------------+---------------+------------+-----------+-----------+-----------+-----------+
|  0|              0.0|             7.0|             6.0|            0.0|               0.0|   13.4|   22.9|               0.6|           44|             20|          24|         71|         22|     1007.7|     1007.1|
|  1|              9.0|             3.0|             9.0|            0.0|               0.0|    7.4|   25.1|                

In [36]:
weatherAUS_modified = new_df.drop('id')

In [37]:
weatherAUS_modified

DataFrame[WindGustDir_index: double, WindDir3pm_index: double, WindDir9am_index: double, RainToday_index: double, RainTomorrow_index: double, MinTemp: string, MaxTemp: string, Rainfall: string, WindGustSpeed: string, WindSpeed9am: string, WindSpeed3pm: string, Humidity9am: string, Humidity3pm: string, Pressure9am: string, Pressure3pm: string]

In [38]:



# l_indexer = StringIndexer(inputCol="label", outputCol="labelIndex")
# df = l_indexer.fit(df).transform(df)


In [39]:
weatherAUS_modified.show()

+-----------------+----------------+----------------+---------------+------------------+-------+-------+------------------+-------------+---------------+------------+-----------+-----------+-----------+-----------+
|WindGustDir_index|WindDir3pm_index|WindDir9am_index|RainToday_index|RainTomorrow_index|MinTemp|MaxTemp|          Rainfall|WindGustSpeed|   WindSpeed9am|WindSpeed3pm|Humidity9am|Humidity3pm|Pressure9am|Pressure3pm|
+-----------------+----------------+----------------+---------------+------------------+-------+-------+------------------+-------------+---------------+------------+-----------+-----------+-----------+-----------+
|              0.0|             7.0|             6.0|            0.0|               0.0|   13.4|   22.9|               0.6|           44|             20|          24|         71|         22|     1007.7|     1007.1|
|              9.0|             3.0|             9.0|            0.0|               0.0|    7.4|   25.1|                 0|           44|   

In [42]:
ignore =['RainToday_index']

In [44]:
assembler = VectorAssembler(
    inputCols=[x for x in weatherAUS_modified.columns if x not in ignore],
    outputCol='features')

In [45]:
assembler.transform(weatherAUS_modified)


IllegalArgumentException: 'Data type string of column MinTemp is not supported.\nData type string of column MaxTemp is not supported.\nData type string of column Rainfall is not supported.\nData type string of column WindGustSpeed is not supported.\nData type string of column WindSpeed9am is not supported.\nData type string of column WindSpeed3pm is not supported.\nData type string of column Humidity9am is not supported.\nData type string of column Humidity3pm is not supported.\nData type string of column Pressure9am is not supported.\nData type string of column Pressure3pm is not supported.'