In [2]:
import os
import requests
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from multiprocessing import Pool

##### Download level 2 data

In [3]:
# Remote archive root for the level 2 download and the local folder files are saved under.
# NOTE(review): base_directory is an absolute, machine-specific path — adjust before
# running this notebook on another machine.
base_url = 'https://data.darts.isas.jaxa.jp/pub/pds3/vco-v-rs-3-occ-v1.0/'
base_directory = '/home/dev/Desktop/Venus/Data/'

# Mapping of old directory names to new directory names:
# key = directory name on the server, value = directory name used locally.
directory_mapping = {
    'vcors_1001': 'vcors_2001',
    'vcors_1002': 'vcors_2002',
    'vcors_1003': 'vcors_2003',
    'vcors_1004': 'vcors_2004'
}

# Create the base directory if it doesn't exist. exist_ok=True avoids the separate
# exists() check (and the race between checking and creating); it raises
# FileExistsError if the path exists but is not a directory.
os.makedirs(base_directory, exist_ok=True)

# Function to download a file from URL
def download_file(url, file_path, timeout=60):
    """Download ``url`` and write the response body to ``file_path``.

    Args:
        url: Address of the file to fetch.
        file_path: Local path the bytes are written to (opened in 'wb' mode,
            so an existing file is overwritten).
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the caller indefinitely.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of saving an error page as if it were data.
    response.raise_for_status()
    with open(file_path, 'wb') as file:
        file.write(response.content)

# Function to convert tab file to CSV
def convert_to_csv(args):
    """Download one .tab file, convert it to CSV and delete the .tab file.

    Args:
        args: ``(directory, subdir, tab_file)`` tuple. Packed into a single
            argument because the function is used with ``pool.map``.
            ``directory`` is the server-side name and is translated to the
            local name through ``directory_mapping``.

    Returns:
        Path of the written CSV file, or ``None`` if pandas could not parse
        the downloaded file (a message is printed in that case).
    """
    directory, subdir, tab_file = args
    url = base_url + directory + '/data/' + subdir + '/'
    target_dir = os.path.join(base_directory, directory_mapping[directory], subdir)
    file_path = os.path.join(target_dir, tab_file)
    download_file(url + tab_file, file_path)
    try:
        # sep=r'\s+' is the documented replacement for the deprecated
        # delim_whitespace=True; the file has no header row.
        df = pd.read_csv(file_path, sep=r'\s+', header=None)
        csv_file = os.path.join(target_dir, os.path.splitext(tab_file)[0] + '.csv')
        df.to_csv(csv_file, index=False)
        return csv_file
    except (pd.errors.ParserError, pd.errors.EmptyDataError):
        # An empty download raises EmptyDataError rather than ParserError;
        # treat it as the same kind of unparseable file and skip it.
        print(f"Error processing file: {file_path}. Skipping...")
        return None
    finally:
        # Delete the .tab file whether the conversion succeeded or not.
        os.remove(file_path)

# List of directories to process (server-side names) and the data levels inside each
directories = ['vcors_1001', 'vcors_1002', 'vcors_1003', 'vcors_1004']
subdirectories = ['l2']

# Create a pool of worker processes
pool = Pool()

try:
    # Convert tab files to CSV using multiprocessing
    for directory in directories:
        directory_path = os.path.join(base_directory, directory_mapping[directory])

        # Create the directory if it doesn't exist
        os.makedirs(directory_path, exist_ok=True)

        for subdir in subdirectories:
            subdir_path = os.path.join(directory_path, subdir)

            # Create the subdirectory if it doesn't exist
            os.makedirs(subdir_path, exist_ok=True)

            # Fetch the directory listing page and collect the links to .tab files
            url = base_url + directory + '/data/' + subdir + '/'
            response = requests.get(url, timeout=60)
            # Stop here on an HTTP error rather than parsing an error page as a listing
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            # href=True skips anchors without an href attribute, which would
            # otherwise raise KeyError on a['href']
            tab_files = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.tab')]

            # Perform parallel conversion for each tab file
            args = [(directory, subdir, tab_file) for tab_file in tab_files]
            csv_files = pool.map(convert_to_csv, args)
            csv_files = [f for f in csv_files if f is not None]  # Remove None values
        print(f"All data downloaded and converted to CSV successfully for the {directory} directory.")
finally:
    # Always release the worker processes, even if a download or conversion failed:
    # close() stops new submissions, join() waits for the workers to exit.
    pool.close()
    pool.join()

print("All data downloaded, converted to CSV, and .tab files deleted successfully.")


All data downloaded and converted to CSV successfully for the vcors_1001 directory.
All data downloaded and converted to CSV successfully for the vcors_1002 directory.
All data downloaded and converted to CSV successfully for the vcors_1003 directory.
All data downloaded and converted to CSV successfully for the vcors_1004 directory.
All data downloaded, converted to CSV, and .tab files deleted successfully.


##### Download level 3 and level 4 data and sort the files into "ingress" and "egress" folders

In [4]:
# Remote archive root for the level 3 / level 4 download and the local folder
# files are saved under.
# NOTE(review): base_directory is an absolute, machine-specific path — adjust before
# running this notebook on another machine.
base_url = 'https://data.darts.isas.jaxa.jp/pub/pds3/vco-v-rs-5-occ-v1.0/'
base_directory = '/home/dev/Desktop/Venus/Data/'

# Create the base directory if it doesn't exist. exist_ok=True avoids the separate
# exists() check (and the race between checking and creating); it raises
# FileExistsError if the path exists but is not a directory.
os.makedirs(base_directory, exist_ok=True)

# Function to download a file from URL
def download_file(url, file_path, timeout=60):
    """Fetch ``url`` and save the response body at ``file_path``.

    Args:
        url: Address of the file to fetch.
        file_path: Local destination; opened in 'wb' mode, so any existing
            file at that path is overwritten.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block the caller forever.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Without this check an HTTP error page would be written to disk as data.
    response.raise_for_status()
    with open(file_path, 'wb') as file:
        file.write(response.content)

# Function to convert tab file to CSV
def convert_to_csv(args):
    """Download one .tab file, convert it to CSV and delete the .tab file.

    Args:
        args: ``(directory, subdir, tab_file)`` tuple. Packed into a single
            argument because the function is used with ``pool.map``.

    Returns:
        Path of the written CSV file. Parsing errors are not caught here and
        propagate to the caller, which expects a path for every file.
    """
    directory, subdir, tab_file = args
    url = base_url + directory + '/data/' + subdir + '/'
    target_dir = os.path.join(base_directory, directory, subdir)
    file_path = os.path.join(target_dir, tab_file)
    download_file(url + tab_file, file_path)
    try:
        # sep=r'\s+' is the documented replacement for the deprecated
        # delim_whitespace=True; the file has no header row.
        df = pd.read_csv(file_path, sep=r'\s+', header=None)
        csv_file = os.path.join(target_dir, os.path.splitext(tab_file)[0] + '.csv')
        df.to_csv(csv_file, index=False)
        return csv_file
    finally:
        # Delete the .tab file even when the conversion fails, so a failed run
        # does not leave stray downloads behind.
        os.remove(file_path)

# List of directories to process and the data levels inside each
directories = ['vcors_2001', 'vcors_2002', 'vcors_2003', 'vcors_2004']
subdirectories = ['l3', 'l4']


def _occultation_folder(file_name):
    """Return 'ingress', 'egress' or None for a CSV file name.

    A standalone underscore-delimited 'i' / 'e' token in the name wins.
    NOTE(review): this assumes the ingress/egress flag is such a token —
    confirm against the data set's file naming documentation.
    If no such token exists, the earlier rule is used: any 'i' in the name
    means ingress, otherwise any 'e' means egress.
    """
    lowered = file_name.lower()
    tokens = os.path.splitext(lowered)[0].split('_')
    if 'i' in tokens:
        return 'ingress'
    if 'e' in tokens:
        return 'egress'
    # Fallback: plain substring test. On its own this would file every name
    # containing an 'i' anywhere under ingress, hence the token check above.
    if 'i' in lowered:
        return 'ingress'
    if 'e' in lowered:
        return 'egress'
    return None


# Create a pool of worker processes
pool = Pool()

try:
    # Convert tab files to CSV using multiprocessing
    for directory in directories:
        directory_path = os.path.join(base_directory, directory)

        # Create the directory if it doesn't exist
        os.makedirs(directory_path, exist_ok=True)

        for subdir in subdirectories:
            subdir_path = os.path.join(directory_path, subdir)

            # Create the subdirectory if it doesn't exist
            os.makedirs(subdir_path, exist_ok=True)

            # Fetch the directory listing page and collect the links to .tab files
            url = base_url + directory + '/data/' + subdir + '/'
            response = requests.get(url, timeout=60)
            # Stop here on an HTTP error rather than parsing an error page as a listing
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            # href=True skips anchors without an href attribute, which would
            # otherwise raise KeyError on a['href']
            tab_files = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.tab')]

            # Perform parallel conversion for each tab file
            args = [(directory, subdir, tab_file) for tab_file in tab_files]
            csv_files = pool.map(convert_to_csv, args)

            # Create subdirectories for "ingress" and "egress"
            ingress_subdir = os.path.join(subdir_path, 'ingress')
            egress_subdir = os.path.join(subdir_path, 'egress')
            os.makedirs(ingress_subdir, exist_ok=True)
            os.makedirs(egress_subdir, exist_ok=True)

            for csv_file in csv_files:
                file_name = os.path.basename(csv_file)
                folder = _occultation_folder(file_name)

                if folder == 'ingress':
                    destination = os.path.join(ingress_subdir, file_name)
                elif folder == 'egress':
                    destination = os.path.join(egress_subdir, file_name)
                else:
                    # Unclassified files stay where they were written
                    destination = os.path.join(subdir_path, file_name)

                # Move the CSV file to the appropriate directory
                os.rename(csv_file, destination)
            print(f"Data downloaded, converted to CSV, and sorted successfully for the {subdir} directory.")

        print(f"All data downloaded, converted to CSV, and sorted successfully for the {directory} directory.")
finally:
    # Always release the worker processes, even if a download or conversion failed:
    # close() stops new submissions, join() waits for the workers to exit.
    pool.close()
    pool.join()

print("All data downloaded, converted to CSV, and sorted successfully.")


Data downloaded, converted to CSV, and sorted successfully for the l3 directory.
Data downloaded, converted to CSV, and sorted successfully for the l4 directory.
All data downloaded, converted to CSV, and sorted successfully for the vcors_2001 directory.
Data downloaded, converted to CSV, and sorted successfully for the l3 directory.
Data downloaded, converted to CSV, and sorted successfully for the l4 directory.
All data downloaded, converted to CSV, and sorted successfully for the vcors_2002 directory.
Data downloaded, converted to CSV, and sorted successfully for the l3 directory.
Data downloaded, converted to CSV, and sorted successfully for the l4 directory.
All data downloaded, converted to CSV, and sorted successfully for the vcors_2003 directory.
Data downloaded, converted to CSV, and sorted successfully for the l3 directory.
Data downloaded, converted to CSV, and sorted successfully for the l4 directory.
All data downloaded, converted to CSV, and sorted successfully for the vc

#### Number of files downloaded

In [1]:
import os
import glob

# Root of the downloaded data and the folder layout below it
path = '/home/dev/Desktop/Venus/Data'
directories = ['vcors_2001', 'vcors_2002', 'vcors_2003', 'vcors_2004']
subdirectories = ['l2', 'l3', 'l4']
subsubdirectories = ['ingress', 'egress']

# Running totals of CSV files whose name contains 'l2', 'l3' or 'l4'
l2_count = l3_count = l4_count = 0

for directory in directories:
    for subdirectory in subdirectories:
        level_root = os.path.join(path, directory, subdirectory)

        # l3 and l4 are searched only in their ingress/egress subfolders;
        # every other level is searched directly in its own folder.
        if subdirectory in ('l3', 'l4'):
            folders = [os.path.join(level_root, phase) for phase in subsubdirectories]
        else:
            folders = [level_root]

        for folder_path in folders:
            for csv_file in glob.glob(os.path.join(folder_path, '*.csv')):
                name = os.path.basename(csv_file).lower()
                # The three tests are independent: a name containing more than
                # one marker is counted once for each marker it contains.
                l2_count += int('l2' in name)
                l3_count += int('l3' in name)
                l4_count += int('l4' in name)

print(f"Number of CSV files with 'l2' in their name: {l2_count}")
print(f"Number of CSV files with 'l3' in their name: {l3_count}")
print(f"Number of CSV files with 'l4' in their name: {l4_count}")


Number of CSV files with 'l2' in their name: 69
Number of CSV files with 'l3' in their name: 111
Number of CSV files with 'l4' in their name: 112
