In [1]:
from all_etl import *
#from hurdat_etl import *
from nexrad_etl import *
from sql_queries import *
from nexrad_etl import check_nexrad_data_quality
import numpy as np
import numpy.ma


## You are using the Python ARM Radar Toolkit (Py-ART), an open source
## library for working with weather radar data. Py-ART is partly
## supported by the U.S. Department of Energy as part of the Atmospheric
## Radiation Measurement (ARM) Climate Research Facility, an Office of
## Science user facility.
##
## If you use this software to prepare a publication, please cite:
##
##     JJ Helmus and SM Collis, JORS 2016, doi: 10.5334/jors.119



In [2]:
# Environment sanity check; spark_verify comes from one of the star imports above.
# NOTE(review): the recorded run printed a single number (3.1432) — presumably a
# small test job confirming Spark works; confirm against the helper's source.
spark_verify()

3.1432


In [3]:
# Announce the start of the run.
print("START ETL pipeline process")
# Accumulator for data-quality results; the quality-check cell near the end of
# the notebook appends one entry per check.
results_all = []

START ETL pipeline process


In [4]:
# Path lookup used by the rest of the notebook: path_d["nexrad"] is the radar
# file that gets read, and path_d["output_data"] is the prefix the staged
# parquet output is written under.
path_d = define_paths()

In [5]:
# Create Spark session for the pipeline.
# `spark` is used further down to build the Spark DataFrame, to write the
# staged parquet output and as an argument to the data-quality check.
spark = create_spark_session()

Create Spark session




In [6]:
# Read the NEXRAD volume file located by path_d["nexrad"].
# (An ad-hoc sample path used during development was
#  data/NEXRAD/KCRP20200101_000431_V06.)
# NOTE: despite the `_df` suffix this is the object returned by pyart.io.read —
# a Py-ART radar object accessed below through .fields / .time / .gate_* /
# .ngates — rather than a pandas DataFrame.
nexrad_df = pyart.io.read(path_d["nexrad"])

In [7]:
# Recover the volume start time from the radar's time-units text.
# The slice [14:34] picks the 20 characters that the format string below
# expects (YYYY-MM-DDTHH:MM:SSZ).
# NOTE(review): assumes the units text carries a fixed 14-character prefix
# such as "seconds since " before the stamp — confirm for other inputs.
_time_units_text = nexrad_df.time['units']
_start_stamp = _time_units_text[14:34]
volume_start = datetime.strptime(_start_stamp, '%Y-%m-%dT%H:%M:%SZ')

In [8]:
## Keep a handle on each field's mask before the field arrays are modified.
## rmask belongs to reflectivity, vmask to velocity.
rmask, vmask = (
    nexrad_df.fields[field_name]['data'].mask
    for field_name in ('reflectivity', 'velocity')
)

In [9]:
## Write NaN into every masked gate by direct item assignment
## (deliberately NOT using the masked array's filled() method).
for _field_values, _field_mask in (
    (nexrad_df.fields['reflectivity']['data'], rmask),
    (nexrad_df.fields['velocity']['data'], vmask),
):
    _field_values[_field_mask] = np.nan

In [10]:
### Clear the mask on both field arrays now that masked gates hold NaN.
for _field_name in ('reflectivity', 'velocity'):
    nexrad_df.fields[_field_name]['data'].mask = numpy.ma.nomask

In [11]:
### build initial samples table
# Each source grid is indexed (ray, gate). The table needs one value per gate
# in row-major order: all gates of the first ray, then the second ray, ...
# Flattening is done with NumPy instead of walking every element in Python
# (itertools.chain over the rows), which is far slower and builds lists of
# millions of boxed scalars.
def _flatten_gates(grid):
    """Return a 2-D (ray, gate) grid as a flat float64 array in row-major order.

    np.asarray exposes the underlying data of a masked array and ignores its
    mask, so for the measurement fields this relies on the earlier cells
    having written NaN into the masked gates.
    """
    return np.asarray(grid, dtype=np.float64).ravel()


merged_lat = _flatten_gates(nexrad_df.gate_latitude['data'])
merged_lon = _flatten_gates(nexrad_df.gate_longitude['data'])
merged_alt = _flatten_gates(nexrad_df.gate_altitude['data'])
merged_refl = _flatten_gates(nexrad_df.fields['reflectivity']['data'])
merged_velo = _flatten_gates(nexrad_df.fields['velocity']['data'])

# One timestamp per ray: volume start plus the ray's offset in seconds.
time_x1 = [volume_start + timedelta(seconds=s) for s in nexrad_df.time['data']]
# Every gate of a ray shares that ray's timestamp, so repeat each entry ngates
# times; .tolist() hands back the same datetime objects as a plain list.
time_xgates = np.repeat(time_x1, nexrad_df.ngates).tolist()

In [20]:
# Assemble the per-gate sample table: position (lat / lon / alt), time, and
# the two measurements, one row per gate.
nexrad_sample_df = pd.DataFrame(dict(
    GateLat=merged_lat,
    GateLon=merged_lon,
    GateAlt=merged_alt,
    GateTime=time_xgates,
    Reflectivity=merged_refl,
    Velocity=merged_velo,
))

In [21]:
# Keep a gate when at least one of its two measurements lies inside its
# accepted range (bounds inclusive). NaN never satisfies a range test, so a
# gate with neither measurement is dropped, while a gate with only one valid
# measurement is kept with NaN in the other column.
# NOTE(review): the bounds are presumably the valid value ranges of the two
# products — confirm against the data specification.
# The result is a selection of the original frame; add .copy() if it is ever
# modified in place later.
_refl_in_range = nexrad_sample_df['Reflectivity'].between(-32, 94.5)
_velo_in_range = nexrad_sample_df['Velocity'].between(-95, 95)

nexrad_sample_df = nexrad_sample_df[_refl_in_range | _velo_in_range]

In [22]:
# Preview the filtered sample table; pandas abbreviates the printed frame to
# its first and last rows.
print(nexrad_sample_df)

            GateLat    GateLon  GateAlt                GateTime  Reflectivity  \
0         27.802363 -97.505206     61.0 2020-01-01 00:04:31.183          -8.0   
1         27.804522 -97.504495     63.0 2020-01-01 00:04:31.183         -12.0   
2         27.806680 -97.503783     65.0 2020-01-01 00:04:31.183          -3.0   
3         27.808838 -97.503072     67.0 2020-01-01 00:04:31.183          -3.0   
4         27.810997 -97.502360     69.0 2020-01-01 00:04:31.183          -3.0   
...             ...        ...      ...                     ...           ...   
11869653  27.547467 -97.716142   3837.0 2020-01-01 00:11:18.914         -12.5   
11869693  27.476573 -97.777324   4999.0 2020-01-01 00:11:18.914          -9.5   
11869694  27.474801 -97.778852   5028.0 2020-01-01 00:11:18.914          -3.0   
11869696  27.471256 -97.781908   5086.0 2020-01-01 00:11:18.914          -7.0   
11869697  27.469483 -97.783437   5116.0 2020-01-01 00:11:18.914          -9.5   

          Velocity  
0     

In [23]:
# Spark schema for the staged NEXRAD gate samples.
#  - Coordinates and measurements are doubles so the staged parquet keeps them
#    numeric (a StringType field would store the numbers as text).
#  - GateTime is a timestamp: the per-ray times carry a time of day
#    (e.g. 00:04:31.183), which DateType would truncate to the calendar date.
#  - Reflectivity and Velocity are nullable because the row filter keeps a
#    gate when only ONE of the two measurements is valid, so the other may be
#    missing.
# NOTE(review): downstream consumers that expected string columns must be
# adjusted to the numeric / timestamp types.
nexrad_sample_schema = t.StructType([
                            t.StructField('GateLat', t.DoubleType(), False),
                            t.StructField('GateLon', t.DoubleType(), False),
                            t.StructField('GateAlt', t.DoubleType(), False),
                            t.StructField('GateTime', t.TimestampType(), False),
                            t.StructField('Reflectivity', t.DoubleType(), True),
                            t.StructField('Velocity', t.DoubleType(), True)
                        ])

In [24]:
# Convert the pandas sample table into a Spark DataFrame, applying the explicit
# schema defined above instead of letting Spark infer column types.
nexrad_samples_df_spark = spark.createDataFrame(nexrad_sample_df, schema=nexrad_sample_schema)

In [25]:
# Stage the samples through the project's parquet_wr helper.
# The target is built by plain string concatenation, so
# path_d["output_data"] is expected to already end with a path separator.
parquet_wr(spark, path_d["output_data"] + "nexrad_samples_stage.parquet", nexrad_samples_df_spark)

In [26]:
# Run the NEXRAD data-quality check on the Spark samples and record its result.
# NOTE: the append is not idempotent — re-running this cell without first
# re-running the cell that resets results_all adds another entry (the recorded
# output below shows two entries).
results = check_nexrad_data_quality( spark, nexrad_samples_df_spark)
results_all.append(results)

# Show every quality result collected so far.
print(results_all)

Checking NEXRAD table...
NULLS:
+--------+
|count(1)|
+--------+
|       0|
+--------+

ROWS:
+--------+
|count(1)|
+--------+
|  703436|
+--------+

Checking data quality complete
[{'nexrad_count': 13361, 'nexrad': 'OK'}, {'nexrad_count': 703436, 'nexrad': 'OK'}]


In [27]:
# Final progress marker for the run.
print("ETL pipeline complete")

ETL pipeline complete
