# Exploring the data
![footer_logo_new](images/logo_new.png)

In [1]:
from pyspark import StorageLevel
from pyspark.sql import functions as F, SQLContext, SparkSession, Window
from pyspark.sql.types import*
from random import randint
import time
import datetime

# Extra session settings, applied in order on top of the app name and master URL:
# event logging (with its output directory) and task speculation.
_session_options = (
    ("spark.eventLog.enabled", "true"),
    ("spark.eventLog.dir", "/opt/workspace/history"),
    ("spark.speculation", "true"),
)

_builder = SparkSession.builder.appName("explore-data").master("spark://spark-master:7077")
for _key, _value in _session_options:
    _builder = _builder.config(_key, _value)

# Hive support is enabled; getOrCreate() reuses an existing session when there is one.
spark = _builder.enableHiveSupport().getOrCreate()

## Meteo observations 
- Parquet format
- Partitioned by year (`yyyy`)

In [2]:
# Location of the meteo observations dataset (Parquet).
meteo_data_file = "data/meteo-data/parquet"

# Load the dataset, then print its schema and preview the first 10 rows
# (cell values longer than 100 characters are truncated).
meteo_df = spark.read.format("parquet").load(meteo_data_file)
meteo_df.printSchema()
meteo_df.show(n=10, truncate=100)

# Be careful with `describe` on large datasets: it triggers a statistical analysis job.
# meteo_df.describe().show()

root
 |-- station_identifier: string (nullable = true)
 |-- date: date (nullable = true)
 |-- observation_type: string (nullable = true)
 |-- observation_value: integer (nullable = true)
 |-- MFLAG1: string (nullable = true)
 |-- QFLAG1: string (nullable = true)
 |-- SFLAG1: string (nullable = true)
 |-- time: string (nullable = true)
 |-- yyyy: integer (nullable = true)

+------------------+----------+----------------+-----------------+------+------+------+----+----+
|station_identifier|      date|observation_type|observation_value|MFLAG1|QFLAG1|SFLAG1|time|yyyy|
+------------------+----------+----------------+-----------------+------+------+------+----+----+
|       AE000041196|2001-01-01|            TMAX|              238|  null|  null|     I|null|2001|
|       USC00127255|2001-11-07|            SNOW|                0|     P|  null|     0|null|2001|
|       AE000041196|2001-01-01|            TMIN|              121|  null|  null|     I|null|2001|
|       USC00127255|2001-11-07|      

### Stations dictionary

In [3]:
stations_meta_file = "data/meteo-data/stations.csv"

# Explicit schema for the stations CSV; every column is nullable.
schema = (
    StructType()
    .add('station_identifier', StringType(), True)
    .add('latitude', FloatType(), True)
    .add('longitude', FloatType(), True)
    .add('height_above_sea_level', FloatType(), True)
    .add('station_name', StringType(), True)
)

# The header option is disabled, so the first line of the file is read as data.
stations_df = spark.read.csv(stations_meta_file, schema=schema, header="false")

# Schema, a 10-row preview and summary statistics.
stations_df.printSchema()
stations_df.show(n=10, truncate=100)
stations_df.describe().show()

root
 |-- station_identifier: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- height_above_sea_level: float (nullable = true)
 |-- station_name: string (nullable = true)

+------------------+--------+---------+----------------------+----------------------+
|station_identifier|latitude|longitude|height_above_sea_level|          station_name|
+------------------+--------+---------+----------------------+----------------------+
|       ACW00011604| 17.1167|  -61.783|                  10.1| ST JOHNS COOLIDGE FLD|
|       ACW00011647| 17.1333|  -61.783|                  19.2|              ST JOHNS|
|       AE000041196|  25.333|   55.517|                  34.0|   SHARJAH INTER. AIRP|
|       AEM00041194|  25.255|   55.364|                  10.4|            DUBAI INTL|
|       AEM00041217|  24.433|   54.651|                  26.8|        ABU DHABI INTL|
|       AEM00041218|  24.262|   55.609|                 264.9|           AL AIN 

### Observation types dictionary

In [4]:
observation_type_file = "data/meteo-data/observation_type.csv"

# Both columns are nullable strings: the observation type code and its description.
schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ('observation_type', 'description')
])

# The header option is disabled, so the first line of the file is read as data.
observation_type_df = spark.read.csv(observation_type_file, schema=schema, header="false")

# Schema, a 10-row preview and summary statistics (values truncated at 100 characters).
observation_type_df.printSchema()
observation_type_df.show(n=10, truncate=100)
observation_type_df.describe().show(truncate=100)

root
 |-- observation_type: string (nullable = true)
 |-- description: string (nullable = true)

+----------------+---------------------------------------------------------------------------------+
|observation_type|                                                                      description|
+----------------+---------------------------------------------------------------------------------+
|            PRCP|                                                     Precipitation (tenths of mm)|
|            SNOW|                                                                    Snowfall (mm)|
|            SNWD|                                                                  Snow depth (mm)|
|            TMAX|                                        Maximum temperature (tenths of degrees C)|
|            TMIN|                                        Minimum temperature (tenths of degrees C)|
|            ACMC| Average cloudiness midnight to midnight from 30-second ceilometer data (perc

### Meteo flags dictionary

In [5]:
flag_description_file = "data/meteo-data/flag_description.csv"

# All four columns of the flags dictionary are nullable strings.
schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ('flag', 'flag_description', 'value', 'value_description')
])

# The header option is disabled, so the first line of the file is read as data.
flag_description_df = spark.read.csv(flag_description_file, schema=schema, header="false")

# Schema, a 10-row preview and summary statistics (values truncated at 100 characters).
flag_description_df.printSchema()
flag_description_df.show(n=10, truncate=100)
flag_description_df.describe().show(truncate=100)

root
 |-- flag: string (nullable = true)
 |-- flag_description: string (nullable = true)
 |-- value: string (nullable = true)
 |-- value_description: string (nullable = true)

+------+------------------------------------------------+-----+----------------------------------------------------------------------------------------------------+
|  flag|                                flag_description|value|                                                                                   value_description|
+------+------------------------------------------------+-----+----------------------------------------------------------------------------------------------------+
|MFLAG1| measurement flag for the first day of the month| null|                                                               no measurement information applicable|
|MFLAG1| measurement flag for the first day of the month|    B|                                                  precipitation total formed from two 12-hour totals|

In [6]:
# Stop the Spark session created in the first cell now that the exploration is done.
spark.stop()