In [7]:
# `spark` is not defined anywhere in this cell; presumably it is the session
# left over from an earlier notebook cell — TODO confirm. It is stopped here
# before a new session is built further down.
spark.stop()
### possible solution 
from pyspark import StorageLevel
from pyspark.sql import functions as F, SQLContext, SparkSession, Window
from pyspark.sql.types import*
from random import randint
import time
import datetime

# Session for the optimisation workshop: named app on the standalone master,
# with Hive support. Settings are applied in the order listed below.
_session_builder = (
    SparkSession.builder
    .appName("workshop-spark-optimisation")
    .master("spark://spark-master:7077")
)
for _conf_key, _conf_value in (
    ("spark.eventLog.enabled", "true"),
    ("spark.eventLog.dir", "/opt/workspace/history"),
    ("spark.executor.cores", 2),
):
    _session_builder = _session_builder.config(_conf_key, _conf_value)

spark = _session_builder.enableHiveSupport().getOrCreate()

# Observation data, read from a Parquet dataset.
meteo_data_file = "/opt/workspace/data/meteo-data/parquet"
meteo_df = spark.read.format("parquet").load(meteo_data_file)

stations_meta_file = "/opt/workspace/data/meteo-data/stations.csv"

# Explicit schema for the stations CSV (read without a header row); every
# column is declared nullable.
schema = (
    StructType()
    .add("station_identifier", StringType(), True)
    .add("latitude", FloatType(), True)
    .add("longitude", FloatType(), True)
    .add("height_above_sea_level", FloatType(), True)
    .add("station_name", StringType(), True)
)

stations_df = spark.read.csv(stations_meta_file, schema=schema, header="false")

observation_type_file = "/opt/workspace/data/meteo-data/observation_type.csv"

# Two-column lookup (code -> description), read without a header row; both
# columns are declared nullable.
schema = (
    StructType()
    .add("observation_type", StringType(), True)
    .add("description", StringType(), True)
)

observation_type_df = spark.read.csv(
    observation_type_file, schema=schema, header="false"
)


# Climate-zone label derived from the station latitude.
#
# The zones are symmetric around the equator, so the classification is done on
# |latitude| with an ordered chain of upper bounds: each `when` is only reached
# if all previous ones were not true, which gives the half-open bands
# [0, 20) equator, [20, 30) tropics, [30, 40) subtropics, [40, 50)
# warm_temperate, [50, 60) boreal, [60, 70) tundra, and everything else
# ice_cap. Thresholds are numeric literals so the float column is compared
# numerically without relying on implicit string casts.
#
# NOTE(review): a null latitude (e.g. a station with no match in the left join
# below) makes every condition null and therefore falls through to "ice_cap".
# That matches the previous behaviour; confirm it is intended.
abs_latitude = F.abs(F.col("latitude"))
station_zone = (
    F.when(abs_latitude < 20, F.lit("equator"))
    .when(abs_latitude < 30, F.lit("tropics"))
    .when(abs_latitude < 40, F.lit("subtropics"))
    .when(abs_latitude < 50, F.lit("warm_temperate"))
    .when(abs_latitude < 60, F.lit("boreal"))
    .when(abs_latitude < 70, F.lit("tundra"))
    .otherwise(F.lit("ice_cap"))
)

# Observations for 2010 and 2020, enriched with the observation-type
# description and the station's climate zone. `res` is the (lazy) DataFrame
# itself; previously it was bound to the `None` returned by `save()`.
res = (
    meteo_df
    # A separate `yyyy > 2000` filter would be redundant: both selected years
    # already satisfy it.
    .where("yyyy == 2010 or yyyy == 2020")
    .join(stations_df, "station_identifier", "left")
    .join(observation_type_df, "observation_type", "left")
    .select("observation_type", "description", "observation_value", "latitude")
    .withColumn("station_zone", station_zone)
    # latitude was only needed to derive station_zone.
    .drop("latitude")
)

# Reduce to at most 10 partitions before writing, then overwrite any previous
# output at "res.parquet".
res.coalesce(10).write.format("parquet").mode("overwrite").save("res.parquet")

In [8]:
# Read the written result back and count its rows; the count is the cell's
# last expression.
df = spark.read.format("parquet").load("res.parquet")
df.count()

121754216