# This notebook describes the scripts for downloading and preprocessing GNSS raw and derived data.


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [8]:
!pwd

/home/kibrom/kwork/sw-GNSS/fdl18_Frontiers/GNSS_data_derived_products


In [9]:
!ls

CHAIN_data_labels.xlsx		  Download_GNSS_CHAIN_data.py
CHAIN_stations_GSV4004Bonly.xlsx  GNSS_ISMR_data_preparation.ipynb
CHAIN_stations_PolaRxSonly.xlsx   Preprocess_GNSS_data_for_pipeline.py


# Step 1: Download raw GNSS data


1.1. Required files:

A). Download_GNSS_CHAIN_data.py --- this script downloads GNSS ISMR data from the 
Canadian High Arctic Ionospheric Network (CHAIN) website (chain.physics.unb.ca). The script requires an input file (see B below).



In [35]:
!tail  Download_GNSS_CHAIN_data.py

	print('numprocessors = {0}'.format(numprocessors))
	
	pool = multiprocessing.Pool(numprocessors)
	datetime_start = datetime.datetime(2017,1,1)
	input_datetimes = [ (datetime_start + datetime.timedelta(days=d)) for d in (range(2)) ]
	pool.map(PolaRxS_batchDataDownloadToLocal,input_datetimes) 


if __name__=='__main__':
	main()


--------------------------------------------------------------------------------
---------------------------------------------------------------------------------

B). CHAIN_data_labels.xlsx - This spreadsheet file contains all (62) ionospheric scintillation monitoring records based on pages 31 - 33 of the PolaRxS Application Manual (https://ftp.space.dtu.dk/pub/bm/Septentrio/polarx5s_User_Manual_1%200%202%20(1).pdf) from Septentrio (https://www.septentrio.com)

NB: PolaRxS is a Multi-GNSS Multi-frequency receiver for ionospheric applications


In [23]:
# Read only column index 1 of the 'PolaRxS_labels' sheet; the sheet is read
# without a header row, so every row is treated as data.
df_labels = pd.read_excel(
    'CHAIN_data_labels.xlsx',
    sheet_name='PolaRxS_labels',
    usecols=[1],
    header=None,
)
# Preview the first and last six label descriptions.
display(df_labels.head(6), df_labels.tail(6))

Unnamed: 0,0
0,"WN, GPS Week Number"
1,"TOW, GPS Time of Week (seconds)"
2,SVID (see numbering convention in the SBF Outl...
3,Value of the RxState field of the ReceiverStat...
4,Azimuth (degrees)
5,Elevation (degrees)


Unnamed: 0,0
56,SI Index on Sig3 (dimensionless)
57,"SI Index on Sig3, numerator only (dB)"
58,"p on Sig3, phase spectral slope in the 0.1 to ..."
59,"T on Sig1, phase power spectral density at 1 H..."
60,"T on Sig2, phase power spectral density at 1 H..."
61,"T on Sig3, phase power spectral density at 1 H..."


1.2. Running Download_GNSS_CHAIN_data.py script

In [38]:
#This will run the script in a multiprocessor mode (2 processors in this specific example).
#  I am running the script to download GNSS data for two days in 2017 (2017-1-1 to 2017-1-2). 
#The date specification is hardcoded in the script which can be changed according to the user's interest
!python Download_GNSS_CHAIN_data.py 2
#The script will create 'PolaRxS_CHAINdata__Year_Doy.csv' files in a 'level1' directory.

numprocessors = 2
-------> working on datetime = 2017-01-02 00:00:00
-------> working on datetime = 2017-01-01 00:00:00
this date = 2017-01-02 00:00:00
this date = 2017-01-01 00:00:00
this hour directory = /gps/ismr/2017/002/00/
this hour directory = /gps/ismr/2017/001/00/
this date = 2017-01-02 00:00:00
this hour directory = /gps/ismr/2017/002/01/
this date = 2017-01-01 00:00:00
this hour directory = /gps/ismr/2017/001/01/
  txt_thishour_thisfile = np.genfromtxt(local_fn_and_dir, delimiter=",", filling_values=99)
this date = 2017-01-01 00:00:00
this hour directory = /gps/ismr/2017/001/02/
this date = 2017-01-02 00:00:00
this hour directory = /gps/ismr/2017/002/02/
this date = 2017-01-01 00:00:00
this hour directory = /gps/ismr/2017/001/03/
this date = 2017-01-02 00:00:00
this hour directory = /gps/ismr/2017/002/03/
this date = 2017-01-01 00:00:00
this hour directory = /gps/ismr/2017/001/04/
this date = 2017-01-02 00:00:00
this hour directory = /gps/ismr/2017/002/04/
this date = 2017-0

In [39]:
!ls

CHAIN_data_labels.xlsx		  GNSS_ISMR_data_preparation.ipynb
CHAIN_stations_GSV4004Bonly.xlsx  level1
CHAIN_stations_PolaRxSonly.xlsx   Preprocess_GNSS_data_for_pipeline.py
Download_GNSS_CHAIN_data.py


In [48]:
#level1 directory contains .csv files for two days
!ls ./level1/


data_download_runtimes		 PolaRxS_CHAINdata__2017_002.csv
PolaRxS_CHAINdata__2017_001.csv


In [61]:
# Load the level1 CSV for 2017 day-of-year 001 produced by the download step.
# NOTE(review): the resulting frame contains an 'Unnamed: 0' column, which looks
# like a saved row index; passing index_col=0 would drop it — confirm against
# how the download script writes the file before changing.
GNSS_raw_data = pd.read_csv('./level1/PolaRxS_CHAINdata__2017_001.csv')

In [121]:
GNSS_raw_data.columns

Index(['Unnamed: 0', 'WN, GPS Week Number', 'TOW, GPS Time of Week (seconds)',
       'SVID (see numbering convention in the SBF Outline section of the Reference Guide)',
       'Value of the RxState field of the ReceiverStatus SBF block',
       'Azimuth (degrees)', 'Elevation (degrees)',
       'Average Sig1 C/N0 over the last minute (dB-Hz)',
       'Total S4 on Sig1 (dimensionless)',
       'Correction to total S4 on Sig1 (thermal noise component only) (dimensionless)',
       'Phi01 on Sig1, 1-second phase sigma (radians)',
       'Phi03 on Sig1, 3-second phase sigma (radians)',
       'Phi10 on Sig1, 10-second phase sigma (radians)',
       'Phi30 on Sig1, 30-second phase sigma (radians)',
       'Phi60 on Sig1, 60-second phase sigma (radians)',
       'AvgCCD on Sig1, average of code/carrier divergence (meters)',
       'SigmaCCD on Sig1, standard deviation of code/carrier divergence (meters)',
       'TEC at TOW-45s (TECU), taking calibration into account (see -C option)',
    

# Step 2: Preprocess GNSS data 


2.1. Required files:

A). Preprocess_GNSS_data_for_pipeline.py --- this script processes the .csv files created in the above step. New ml_database__Year_Doy.csv files with the required features and targets are then created and stored in the 'level2' directory.
The script requires two input files (see B below).



In [42]:
!tail Preprocess_GNSS_data_for_pipeline.py

	print('numprocessors = {0}'.format(numprocessors))

	pool = multiprocessing.Pool(numprocessors)
	datetime_start = datetime.datetime(2017,1,1)
	input_datetimes = [ (datetime_start + datetime.timedelta(days=d)) for d in (range(1)) ]
	pool.map(PolaRxS_MLDatabaseGeneration,input_datetimes) 


if __name__=='__main__':
	main()


B.) In addition to CHAIN_data_labels.xlsx, this script requires 'CHAIN_stations_PolaRxSonly.xlsx', 
which contains the GNSS station list.

------------------------------------------------

In [45]:
# Load the PolaRxS station list (name, abbreviation, latitude/longitude,
# instrument, model, ID) and show it as the cell output.
GNSS_stations = pd.read_excel('CHAIN_stations_PolaRxSonly.xlsx')
GNSS_stations

Unnamed: 0,Name,Abbr,Lat,Lon,Instrument,Model,ID
0,Arctic Bay,arc,73.004093,274.973959,GISTM/GPS,PolaRxS,1
1,Arviat,arv,61.097941,265.928533,GISTM/GPS,PolaRxS,2
2,Churchill,chu,58.759279,265.913402,GISTM/GPS,PolaRxS,4
3,Coral Harbour,cor,64.188201,276.650145,GISTM/GPS,PolaRxS,5
4,Fort McMurray,mcm,56.649535,248.779728,GISTM/GPS,PolaRxS,7
5,Fort Simpson,fsi,61.756554,238.771946,GISTM/GPS,PolaRxS,8
6,Fort Smith,fsm,60.026095,248.067109,GISTM/GPS,PolaRxS,9
7,Gillam,gil,56.3766,265.356197,GISTM/GPS,PolaRxS,10
8,Gjoa Haven,gjo,68.63263,264.151719,GISTM/GPS,PolaRxS,11
9,Grise Fiord,gri,76.423281,277.096506,GISTM/GPS,PolaRxS,12


2.2. Running the Preprocess_GNSS_data_for_pipeline.py script

In [None]:
#This will run the script in a multiprocessor mode (1 processor in this specific example).
#  I am running the script to process GNSS data from the level1 directory for one day in 2017 (2017-1-1).
!python Preprocess_GNSS_data_for_pipeline.py 1
#The script will create an 'ml_database__Year_Doy.csv' file in a 'level2' directory, which will be the final ML input GNSS 
#data to be combined with other sources of data (Solar data, intermagnetic data, etc.)

numprocessors = 1
-------> working on datetime = 2017-01-01 00:00:00


In [49]:
!ls

CHAIN_data_labels.xlsx		  GNSS_ISMR_data_preparation.ipynb
CHAIN_stations_GSV4004Bonly.xlsx  level1
CHAIN_stations_PolaRxSonly.xlsx   level2
Download_GNSS_CHAIN_data.py	  Preprocess_GNSS_data_for_pipeline.py


In [50]:
!ls ./level2/

ml_database__2017_001.csv  ml_db_generator_runtimes


In [92]:
# Load the level2 ML database for 2017 day-of-year 001 produced by the preprocessing step.
# NOTE(review): the resulting frame contains an 'Unnamed: 0' column, which looks
# like a saved row index; passing index_col=0 would drop it — confirm against
# how the preprocessing script writes the file before changing.
ml_database = pd.read_csv('./level2/ml_database__2017_001.csv')

In [98]:
# Parse the 'datetime' column and promote it to the index so that rows can be
# selected by timestamp.
#
# - pd.to_datetime is applied to the whole column at once (vectorized) instead
#   of once per element through .apply, which is much slower on large files.
# - The guard keeps the cell safe to re-run: after the first run 'datetime' is
#   the index and no longer a column, so an unguarded second run would raise
#   KeyError.
# - set_index returns a new frame that is reassigned, instead of mutating the
#   existing object with inplace=True.
if 'datetime' in ml_database.columns:
    ml_database['datetime'] = pd.to_datetime(ml_database['datetime'])
    ml_database = ml_database.set_index('datetime')

In [118]:
ml_database.columns

Index(['Unnamed: 0', 'doy', 'ut', 'azimuth [deg]', 'elevation [deg]',
       'geographic latitude [deg]', 'geographic longitude [deg]',
       'TEC at current time [TECU]', 'dTEC 0min-15s to 0min-0s [TECU]',
       'SI [dimensionless]', 'spectral slope [dimensionless]',
       'S4 [dimensionless]', 'S4 projected to vertical [dimensionless]',
       'sigmaPhi [radians]', 'sigmaPhi projected to vertical [radians]',
       'datetime at prediction time (.5h)', 'ut at prediction time(.5h) [sec]',
       'doy at prediction time(.5h) [sec]',
       'TEC at prediction time(.5h) [TECU]',
       'dTEC at prediction time(.5h) [TECU]',
       'S4 at prediction time(.5h) [dimensionless]',
       'S4 projected to vertical at prediction time(.5h) [dimensionless]',
       'sigmaPhi at prediction time(.5h) [radians]',
       'sigmaPhi projected to vertical at prediction time(.5h) [radians]',
       'datetime at prediction time (1h)', 'ut at prediction time(1h) [sec]',
       'doy at prediction time(1h)