# Exploring the data
![footer_logo_new](images/logo_new.png)

In [None]:
from pyspark import StorageLevel
from pyspark.sql import functions as F, SQLContext, SparkSession, Window
from pyspark.sql.types import*
from random import randint
import time
import datetime

# Spark session for this notebook. It targets the standalone master at
# spark-master:7077 and, via getOrCreate(), reuses a session that already exists.
_session_builder = (
    SparkSession.builder
    .appName("explore-data")
    .master("spark://spark-master:7077")
)

# Event logging is enabled with its directory set explicitly, and
# speculation is switched on; the settings are applied in this order.
for _conf_key, _conf_value in (
    ("spark.eventLog.enabled", "true"),
    ("spark.eventLog.dir", "/opt/workspace/history"),
    ("spark.speculation", "true"),
):
    _session_builder = _session_builder.config(_conf_key, _conf_value)

spark = _session_builder.enableHiveSupport().getOrCreate()

## Meteo observations 
- Parquet format
- Partitioned by year (`yyyy`)

In [None]:
# Location of the meteo observations dataset (Parquet).
meteo_data_file = "data/meteo-data/parquet"

# Load the dataset, print its schema and preview the first 10 rows,
# truncating cell values longer than 100 characters.
meteo_df = spark.read.format("parquet").load(meteo_data_file)
meteo_df.printSchema()
meteo_df.show(n=10, truncate=100)

# NOTE: `describe()` is left disabled on purpose: on a large dataset it
# launches a job that computes summary statistics.
# meteo_df.describe().show()

### Stations dictionary

In [None]:
# Stations CSV file; it is read without a header row.
stations_meta_file = "data/meteo-data/stations.csv"

# Explicit schema for the file, in column order; every column is nullable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ('station_identifier', StringType()),
        ('latitude', FloatType()),
        ('longitude', FloatType()),
        ('height_above_sea_level', FloatType()),
        ('station_name', StringType()),
    )
])

stations_df = spark.read.csv(stations_meta_file, schema=schema, header="false")

# Inspect the result: schema, first 10 rows and summary statistics.
stations_df.printSchema()
stations_df.show(n=10, truncate=100)
stations_df.describe().show()

### Observation types dictionary

In [None]:
# Observation types CSV file; it is read without a header row.
observation_type_file = "data/meteo-data/observation_type.csv"

# Two nullable string columns, in file order.
schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ('observation_type', 'description')
])

observation_type_df = spark.read.csv(
    observation_type_file,
    schema=schema,
    header="false",
)

# Inspect the result: schema, first 10 rows and summary statistics.
observation_type_df.printSchema()
observation_type_df.show(n=10, truncate=100)
observation_type_df.describe().show(truncate=100)

### Meteo flags dictionary

In [None]:
# Flag descriptions CSV file; it is read without a header row.
flag_description_file = "data/meteo-data/flag_description.csv"

# Four nullable string columns, in file order.
schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        'flag',
        'flag_description',
        'value',
        'value_description',
    )
])

flag_description_df = spark.read.csv(
    flag_description_file,
    schema=schema,
    header="false",
)

# Inspect the result: schema, first 10 rows and summary statistics.
flag_description_df.printSchema()
flag_description_df.show(n=10, truncate=100)
flag_description_df.describe().show(truncate=100)

In [None]:
# Stop the Spark session created at the top of the notebook.
spark.stop()