In [104]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import Window
from pyspark.ml.feature import Imputer
from pyspark.sql.types import FloatType

In [66]:
# Intiate our spark session
spark = SparkSession.builder \
    .master("local") \
    .config("spark.executor.memory", "2gb") \
    .appName("Tech Challenge") \
    .getOrCreate()

In [105]:
# Load data into spark
data = spark.read.csv("paytmteam-de-weather-challenge-beb4fc53605c/data/2019/*", header=True)
countries = spark.read.csv("paytmteam-de-weather-challenge-beb4fc53605c/countrylist.csv", header=True)
stations = spark.read.csv("paytmteam-de-weather-challenge-beb4fc53605c/stationlist.csv", header=True)

In [106]:
# Rename column to make joins easier
stations = stations.withColumnRenamed("COUNTRY_ABBR", "STATION_COUNTRY_ABBR")   

In [107]:
# Convert important columns to int
data = data.withColumn("TEMP", data.TEMP.cast(FloatType()))
data = data.withColumn("WDSP", data.WDSP.cast(FloatType()))

In [108]:
# Join Countries to stations and data to stations.
dataset = stations.join(countries, stations.STATION_COUNTRY_ABBR == countries.COUNTRY_ABBR) \
                    .join(data, col("STN---") == stations.STN_NO)

In [125]:
# Replace missing temperature values with the mean

imputer = Imputer(strategy='mean', inputCol="TEMP", outputCol='TEMP_IMPUTED', missingValue=9999.9)
impute_model = imputer.fit(dataset)

dataset_temp_imputed = impute_model.transform(dataset)

In [126]:
# Replace missing wind speed values with the mean. 
# Ideally this could be done in the same operation as above, 
# however different missing values makes this complicated.

imputer = Imputer(strategy='mean', inputCol="WDSP", outputCol='WDSP_IMPUTED', missingValue=999.9)
impute_model = imputer.fit(dataset_temp_imputed)

dataset_both_imputed = impute_model.transform(dataset_temp_imputed)

# Which country had the hottest average mean temperature over the year?

In [127]:
dataset_both_imputed.printSchema()

root
 |-- STN_NO: string (nullable = true)
 |-- STATION_COUNTRY_ABBR: string (nullable = true)
 |-- COUNTRY_ABBR: string (nullable = true)
 |-- COUNTRY_FULL: string (nullable = true)
 |-- STN---: string (nullable = true)
 |-- WBAN: string (nullable = true)
 |-- YEARMODA: string (nullable = true)
 |-- TEMP: float (nullable = true)
 |-- DEWP: string (nullable = true)
 |-- SLP: string (nullable = true)
 |-- STP: string (nullable = true)
 |-- VISIB: string (nullable = true)
 |-- WDSP: float (nullable = true)
 |-- MXSPD: string (nullable = true)
 |-- GUST: string (nullable = true)
 |-- MAX: string (nullable = true)
 |-- MIN: string (nullable = true)
 |-- PRCP: string (nullable = true)
 |-- SNDP: string (nullable = true)
 |-- FRSHTT: string (nullable = true)
 |-- TEMP_IMPUTED: float (nullable = true)
 |-- WDSP_IMPUTED: float (nullable = true)



In [133]:
dataset_both_imputed.groupby(col("COUNTRY_FULL")) \
        .agg(mean("TEMP_IMPUTED").alias("avg_temp")) \
        .sort(col("avg_temp").desc()) \
        .first()

Row(COUNTRY_FULL='DJIBOUTI', avg_temp=90.06114474836602)

Djibouti had the hottest average temperature for the year at 90.1 degrees Fahrenheit.

# Which country had the most consecutive days of tornadoes/funnel cloud formations?

In [141]:
dataset_both_imputed.printSchema()

root
 |-- STN_NO: string (nullable = true)
 |-- STATION_COUNTRY_ABBR: string (nullable = true)
 |-- COUNTRY_ABBR: string (nullable = true)
 |-- COUNTRY_FULL: string (nullable = true)
 |-- STN---: string (nullable = true)
 |-- WBAN: string (nullable = true)
 |-- YEARMODA: string (nullable = true)
 |-- TEMP: float (nullable = true)
 |-- DEWP: string (nullable = true)
 |-- SLP: string (nullable = true)
 |-- STP: string (nullable = true)
 |-- VISIB: string (nullable = true)
 |-- WDSP: float (nullable = true)
 |-- MXSPD: string (nullable = true)
 |-- GUST: string (nullable = true)
 |-- MAX: string (nullable = true)
 |-- MIN: string (nullable = true)
 |-- PRCP: string (nullable = true)
 |-- SNDP: string (nullable = true)
 |-- FRSHTT: string (nullable = true)
 |-- TEMP_IMPUTED: float (nullable = true)
 |-- WDSP_IMPUTED: float (nullable = true)



In [153]:
dataset_both_imputed


dataset_both_imputed.withColumn("tornado_day", substring("FRSHTT", 6, 1)) \
    .withColumn("consecutive_tornadoes", 
                row_number().over(Window
                                  .partitionBy(col("COUNTRY_FULL"), col("tornado_day"))
                                  .orderBy(col("YEARMODA")))) \
    .filter(col("tornado_day") == 1) \
    .sort(col("consecutive_tornadoes").desc()) \
    .first()
    .select("COUNTRY_FULL", "consecutive_tornadoes")
    .show()
                        
                    

Row(STN_NO='725846', STATION_COUNTRY_ABBR='US', COUNTRY_ABBR='US', COUNTRY_FULL='UNITED STATES', STN---='725846', WBAN='93201', YEARMODA='20191209', TEMP=29.299999237060547, DEWP='28.6', SLP='9999.9', STP='821.7', VISIB='1.1', WDSP=0.5, MXSPD='4.1', GUST='999.9', MAX='41.0', MIN='25.0', PRCP='99.99', SNDP='3.1', FRSHTT='101001', TEMP_IMPUTED=29.299999237060547, WDSP_IMPUTED=0.5, tornado_day='1', consecutive_tornadoes=34)

The United States had the most consecutive tornado days, at 34.

# Which country had the second highest average mean wind speed over the year?

In [154]:
avg_windspeed = dataset_both_imputed.groupby(col("COUNTRY_FULL")) \
        .agg(mean("WDSP_IMPUTED").alias("avg_wdsp")) \
        .sort(col("avg_wdsp").desc()).take(2)[1]

print(avg_windspeed)

Row(COUNTRY_FULL='ARMENIA', avg_wdsp=457.3659429499847)



Armenia had the second highest average windspeed at 457 knots. Although this seems too high, so perhaps my imputer did not work properly. 

While the method of taking 2 and dropping selecting the last one works for selecting the second, it would be very inefficient for high seNth