## AKATSUKI Radio Science (RS) Data Archive

In [None]:
import os
import requests
from bs4 import BeautifulSoup
import shutil
import pandas as pd
import numpy as np

In [None]:
# --- local storage ---------------------------------------------------------
# directory that receives the downloaded archives
data_directory = '/home/hb/Desktop/Akatsuki/Data/'

# directory that receives the converted and final merged .csv files
working_data_directory = '/home/hb/Desktop/Akatsuki/working_data/'

# --- remote archive --------------------------------------------------------
# listing page for the l2 RS data
data_url_l2 = 'https://data.darts.isas.jaxa.jp/pub/pds3/vco-v-rs-3-occ-v1.0/'
# listing page for the l3 and l4 RS data
data_url_l3l4 = 'https://data.darts.isas.jaxa.jp/pub/pds3/vco-v-rs-5-occ-v1.0/'

# every listing page that is scanned for archives to download
data_url_list = [
    data_url_l2,
    data_url_l3l4,
]

In [None]:
# create the download, working and merged-output directories if they are missing
# exist_ok=True makes the cell safe to re-run and removes the gap between
# "check that the path exists" and "create it"
for required_directory in (
    data_directory,
    working_data_directory,
    os.path.join(working_data_directory, 'merged_ig_data'),
    os.path.join(working_data_directory, 'merged_eg_data'),
):
    os.makedirs(required_directory, exist_ok=True)

In [None]:
# download all files ending in .zip from the specified url
for data_url in data_url_list:
    # fail loudly on an unreachable or erroring listing page instead of
    # silently parsing an error document
    response = requests.get(data_url, timeout=60)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # href=True skips anchors without an href attribute (a['href'] would raise KeyError)
    zip_files_list = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.zip')]

    for zip_file in zip_files_list:
        dwnld_url = data_url + zip_file
        # keep only the file name so the archive always lands inside data_directory
        data_file = os.path.join(data_directory, os.path.basename(zip_file))
        # stream to disk in chunks so a large archive is never held in memory at once
        with requests.get(dwnld_url, stream=True, timeout=60) as response:
            # check the status before creating the file so an HTTP error page
            # is never saved under a .zip name
            response.raise_for_status()
            with open(data_file, 'wb') as file:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    file.write(chunk)

In [None]:
# unzip all downloaded files and delete original .zip files
# only .zip entries are processed: on a re-run data_directory also holds the
# previously extracted folders, which must not be unpacked or removed
dwnld_zip_files = [entry for entry in os.listdir(data_directory) if entry.endswith('.zip')]
for zip_file in dwnld_zip_files:
    zipfilepath = os.path.join(data_directory, zip_file)
    shutil.unpack_archive(zipfilepath, data_directory)
    # the archive is removed only after it was unpacked without raising
    os.remove(zipfilepath)

In [None]:
# extracting relevant .tab files for conversion to .csv files

# every directory below data_directory whose name contains 'data'
data_subdir_list = [
    os.path.join(parent, dirname)
    for parent, dirnames, _ in os.walk(data_directory)
    for dirname in dirnames
    if 'data' in dirname
]

filtered_l2tabfile_list = []
filtered_l3tabfile_list = []
filtered_l4tabfile_list = []

# checked in this order; a file name goes into the first level whose tag it contains
level_buckets = (
    ('l2', filtered_l2tabfile_list),
    ('l3', filtered_l3tabfile_list),
    ('l4', filtered_l4tabfile_list),
)

# a matching directory nested inside another matching directory is walked
# twice (once on its own, once as part of its parent), so remember which
# paths were already collected to keep each file listed only once
seen_tabfiles = set()

for subdir in data_subdir_list:
    for parent, _, filenames in os.walk(subdir):
        for filename in filenames:
            if '.tab' not in filename:
                continue
            tabfile_path = os.path.join(parent, filename)
            if tabfile_path in seen_tabfiles:
                continue
            for level_tag, bucket in level_buckets:
                if level_tag in filename:
                    seen_tabfiles.add(tabfile_path)
                    bucket.append(tabfile_path)
                    break

# number of l2, l3 and l4 .tab files found
print(len(filtered_l2tabfile_list))
print(len(filtered_l3tabfile_list))
print(len(filtered_l4tabfile_list))


In [None]:
# column headers written to the .csv files converted from the .tab files

# headers for the l2 tables
l2_columns = [
    "SAMPLE_NUMBER",
    "UTC_TIME",
    "DAY_OF_YEAR_WITH_FRACTIONS",
    "EPHEMERIS_SECONDS",
    "DISTANCE",
    "TRANSMIT_FREQUENCY_RAMP_REFERENCE_TIME",
    "TRANSMIT_FREQUENCY_CONSTANT_TERM",
    "TRANSMIT_FREQUENCY_LINEAR_TERM",
    "OBSERVED_X_BAND_ANTENNA_FREQUENCY",
    "PREDICTED_X_BAND_ANTENNA_FREQUENCY",
    "CORRECTION_OF_EARTH_ATMOSPHERE_PROPAGATION",
    "RESIDUAL_CALIBRATED_X_BAND_FREQUENCY_SHIFT",
    "SIGNAL_LEVEL_X_BAND",
    "DIFFERENTIAL_DOPPLER",
    "SIGMA_OBSERVED_X_BAND_ANTENNA_FREQUENCY",
    "SIGNAL_QUALITY_X_BAND",
    "SIGMA_SIGNAL_LEVEL_X_BAND",
]

# headers for the l3 tables
l3_columns = [
    "SAMPLE_NUMBER",
    "UTC_TIME",
    "EPHEMERIS_SECONDS",
    "RESIDUAL_CALIBRATED_X_BAND_FREQUENCY_SHIFT",
    "RESIDUAL_CALIBRATED_X_BAND_FREQUENCY_SHIFT_AFTER_BASELINE_FIT",
    "RECONSTRUCTED_TRANSMIT_FREQUENCY",
    "RADIUS",
    "SIGMA_RADIUS",
    "BENDING_ANGLE",
    "SIGMA_BENDING_ANGLE",
    "REFRACTIVE_INDEX",
    "REFRACTIVITY",
    "SIGMA_REFRACTIVITY",
    "SIGNAL_LEVEL",
    "DIFFERENTIAL_DOPPLER",
    "IMPACT_PARAMETER",
    "SIGMA_IMPACT_PARAMETER",
    "LONGITUDE",
    "LATITUDE",
]

# headers for the l4 tables
l4_columns = [
    "SAMPLE_NUMBER",
    "UTC_TIME",
    "EPHEMERIS_SECONDS",
    "RADIUS",
    "LATITUDE",
    "LONGITUDE",
    "GEOPOTENTIAL",
    "GEOPOTENTIAL_HEIGHT",
    "PRESSURE_LOWER_TEMPERATURE_AT_BOUNDARY",
    "SIGMA_PRESSURE_LOWER_TEMPERATURE_AT_BOUNDARY",
    "PRESSURE_MEDIUM_TEMPERATURE_AT_BOUNDARY",
    "SIGMA_PRESSURE_MEDIUM_TEMPERATURE_AT_BOUNDARY",
    "PRESSURE_HIGHER_TEMPERATURE_AT_BOUNDARY",
    "SIGMA_PRESSURE_HIGHER_TEMPERATURE_AT_BOUNDARY",
    "TEMPERATURE_LOWER_TEMPERATURE_AT_BOUNDARY",
    "SIGMA_TEMPERATURE_LOWER_TEMPERATURE_AT_BOUNDARY",
    "TEMPERATURE_MEDIUM_TEMPERATURE_AT_BOUNDARY",
    "SIGMA_TEMPERATURE_MEDIUM_TEMPERATURE_AT_BOUNDARY",
    "TEMPERATURE_HIGHER_TEMPERATURE_AT_BOUNDARY",
    "SIGMA_TEMPERATURE_HIGHER_TEMPERATURE_AT_BOUNDARY",
    "NUMBER_DENSITY",
    "SIGMA_NUMBER_DENSITY",
    "SOLAR_ZENITH_ANGLE",
    "LOCAL_SOLAR_TIME",
]

# headers present in both the l2 and the l3 tables, in l2 order
common_columns = [column for column in l2_columns if column in l3_columns]
print(common_columns)

In [None]:
# all converted l2, l3, l4 .csv files to be stored in directory 'working_data_directory'

# each list of .tab files is paired with the headers written to its .csv files
for tabfile_list, column_names in (
    (filtered_l2tabfile_list, l2_columns),
    (filtered_l3tabfile_list, l3_columns),
    (filtered_l4tabfile_list, l4_columns),
):
    for tabfile in tabfile_list:
        # sep=r'\s+' is the documented equivalent of the deprecated delim_whitespace=True
        df = pd.read_csv(tabfile, sep=r'\s+', header=None)
        # same base name as the .tab file, with a .csv extension
        csv_file = os.path.join(working_data_directory, os.path.splitext(os.path.basename(tabfile))[0] + '.csv')
        df.to_csv(csv_file, header=column_names, index=False)

In [None]:
# extracting instances when l2 data was recorded
# an instance is identified by the first 18 characters of the l2 file name

data_instances = sorted(
    os.path.basename(l2_tabfile)[:18] for l2_tabfile in filtered_l2tabfile_list
)
print(len(data_instances))

In [None]:
# grouping together similar instances of l2, l3 and l4 data
# ideally there should be 5 files in each data-instance group

# list the working directory once instead of re-walking it for every instance;
# only files directly inside it are considered, so merged outputs written to
# its sub-directories by an earlier run never end up in a group
working_file_names = sorted(
    entry
    for entry in os.listdir(working_data_directory)
    if os.path.isfile(os.path.join(working_data_directory, entry))
)

# one sorted list of file names per data instance, in data_instances order
data_group = [
    [name for name in working_file_names if data_instance in name]
    for data_instance in data_instances
]
print(len(data_group))

In [None]:
# report how many data-instance groups hold exactly 5 files and list the rest
complete_group_count = sum(1 for group in data_group if len(group) == 5)
print('no. of data_instance_groups with 5 files: ',complete_group_count, '\n')

irregular_groups = [group for group in data_group if len(group) != 5]
print('no. of data_instance_groups with more or less than 5 files',len(irregular_groups),'\n\n')

# size and content of every group that does not hold exactly 5 files
for group in irregular_groups:
    print(len(group),':', group, '\n')

In [None]:
# sort each data-instance group

for data_instance_group in data_group:
    data_instance_group.sort()

# preview the first group; guarded because data_group is empty when no
# data instances were found, and indexing it would raise IndexError
if data_group:
    print(data_group[0])
else:
    print('data_group is empty: no data-instance groups to show')

In [None]:
# dataframe merging
#
# For every data-instance group with at least 3 files:
#   * the two files tagged as ingress ('_ai_v10' / '_i_v10') are outer-merged
#     on the time columns, then left-merged with the group's l2 file and
#     written to merged_ig_data/<first 24 chars of first file name>_ing.csv
#   * the two files tagged as egress ('_ae_v10' / '_e_v10') are handled the
#     same way and written to merged_eg_data/<...>_eg.csv
#
# All per-group state is reset at the start of each group and a direction is
# written only when exactly two of its files and the l2 file were found.
# This stops a frame left over from a previous group being merged and saved
# under the current group's name (or a NameError on the first group).

merge_keys = ['UTC_TIME', 'EPHEMERIS_SECONDS']

# (label used in messages, file-name tags, output sub-directory, output suffix)
merge_directions = (
    ('ingress', ('_ai_v10', '_i_v10'), 'merged_ig_data', '_ing.csv'),
    ('egress', ('_ae_v10', '_e_v10'), 'merged_eg_data', '_eg.csv'),
)

for data_instance_group in data_group:
    if len(data_instance_group) < 3:
        continue

    # fresh state for this group
    direction_dataframes = {label: [] for label, _, _, _ in merge_directions}
    l2_df = None

    for csv_file in data_instance_group:
        matched_labels = [
            label
            for label, tags, _, _ in merge_directions
            if any(tag in csv_file for tag in tags)
        ]
        is_l2_file = 'l2' in csv_file
        if not matched_labels and not is_l2_file:
            continue

        # SAMPLE_NUMBER is a per-file row counter and is not used for merging
        df = pd.read_csv(os.path.join(working_data_directory, csv_file))
        df = df.drop(['SAMPLE_NUMBER'], axis=1)

        for label in matched_labels:
            direction_dataframes[label].append(df)
        if is_l2_file:
            l2_df = df

    # output files are named after the first (sorted) file of the group
    output_prefix = os.path.basename(data_instance_group[0])[0:24]

    if l2_df is None:
        print('skipping', output_prefix, ': no l2 file in group')
        continue

    for label, _, output_subdir, output_suffix in merge_directions:
        dataframes = direction_dataframes[label]
        if len(dataframes) != 2:
            print('skipping', label, 'merge for', output_prefix, ':',
                  len(dataframes), 'file(s) found, 2 expected')
            continue

        # combine the two files of this direction; columns present in both
        # keep their name from the first file and get '_duplicate' from the second
        merged_dataframe = pd.merge(dataframes[0], dataframes[1], how='outer', sort=True,
                                    on=merge_keys, suffixes=(None, '_duplicate'))
        # attach the l2 columns to the rows of the merged table
        final_merged_dataframe = pd.merge(merged_dataframe, l2_df, how='left', on=merge_keys)

        final_merged_csv = os.path.join(working_data_directory, output_subdir, output_prefix + output_suffix)
        final_merged_dataframe.to_csv(final_merged_csv, index=False)

        #display(merged_dataframe)
        #display(final_merged_dataframe)
    
    