Import libraries

In [None]:
import math
import matplotlib
import numpy as np
import json
import pandas as pd
import netCDF4 as nc
from netCDF4 import Dataset
import csv
import datetime, time
from datetime import date, timedelta, datetime
import os.path
import re

Formatting CVAO O3 data

In [None]:
# Directory holding the CVAO O3 netCDF files
directory_path = '/scratch/ajp255/homes/mrr32/home/data/PartIII_23/CVAO/O3'

# Every entry found in that directory
file_names = os.listdir(directory_path)

# One row per file; the first run of eight digits in the name is taken as
# the file's date (YYYYMMDD)
df = pd.DataFrame({'file_name': file_names})
date_stamps = df['file_name'].str.extract(r'(\d{8})', expand=False)
df['base_date'] = pd.to_datetime(date_stamps, format='%Y%m%d')

# Order the files chronologically by that date
df = df.sort_values(by='base_date', ascending=True)

def process_netcdf_file(file_info, directory_path, longitude=-24.8672, altitude=300):
    """Read one CVAO O3 netCDF file into a per-observation DataFrame.

    Parameters
    ----------
    file_info : tuple
        ``(file_name, base_date)``. Only the file name is used here.
    directory_path : str
        Directory that contains ``file_name``.
    longitude : float, optional
        Constant longitude assigned to every row. It replaces any longitude
        read from the file. The default is the observatory coordinate.
    altitude : float, optional
        Constant altitude assigned to every row.

    Returns
    -------
    pandas.DataFrame
        One row per time step with the columns that were present in the
        file (``time``, ``latitude``, ``o3_concentration_in_air``,
        ``qc_flag``) plus ``longitude`` and ``altitude``. ``time`` is the
        file's time value divided by the number of seconds in a day.

    Raises
    ------
    KeyError
        If the file has no ``time`` variable.
    """
    name, _base_date = file_info  # base_date is not needed to read the file
    file_path = os.path.join(directory_path, name)

    # Variables copied from the file when they exist
    variable_list = ['time', 'latitude', 'longitude', 'o3_concentration_in_air', 'qc_flag']
    sec_per_day = 24 * 60 * 60

    with Dataset(file_path, 'r') as nc_file:
        data = {
            variable: nc_file.variables[variable][:]
            for variable in variable_list
            if variable in nc_file.variables
        }

    if 'time' not in data:
        raise KeyError(f"'time' variable not found in {file_path}")
    n_obs = len(data['time'])

    # A fixed site stores a single latitude value; broadcast it to one value
    # per row. An array that is already per-observation is left untouched
    # (repeating it would produce n_obs**2 values and break the DataFrame).
    if 'latitude' in data and np.size(data['latitude']) == 1:
        data['latitude'] = np.repeat(data['latitude'], n_obs)

    # Constant longitude and altitude for every row
    data['longitude'] = np.full(n_obs, longitude)
    data['altitude'] = np.full(n_obs, altitude)

    # Convert time to days.
    # NOTE(review): presumably the source time is in seconds since
    # 1970-01-01 -- confirm against the file's time units.
    data['time'] = data['time'] / sec_per_day

    # NOTE(review): qc_flag is carried through unchanged; no quality
    # filtering is applied here.
    return pd.DataFrame(data)

# Lazily read each listed file into its own DataFrame, in the order of df
loaded_frames = (
    process_netcdf_file(file_info, directory_path)
    for file_info in zip(df['file_name'], df['base_date'])
)

# Keep every result that was actually produced
dataframes = [frame for frame in loaded_frames if frame is not None]

# Stack the per-file tables into one table with a fresh 0..n-1 index
flight_data = pd.concat(dataframes, ignore_index=True)

# Define a function to loop over each day in the file range (assuming chronological order)
def daterange(start_date, end_date):
    """Yield ``start_date``, ``start_date`` + 1 day, ... for each whole day
    between ``start_date`` and ``end_date``.

    ``end_date`` itself is not yielded, and a partial trailing day is
    ignored because only the ``days`` part of the difference is used.
    Nothing is yielded when ``end_date`` is not after ``start_date``.
    """
    whole_days = (end_date - start_date).days
    day_offset = 0
    while day_offset < whole_days:
        yield start_date + timedelta(days=day_offset)
        day_offset += 1
        
# Split flight_data into calendar days and write one netCDF file per day.
# NOTE(review): day boundaries follow the machine's local time zone, because
# both datetime.fromtimestamp and time.mktime use local time -- confirm this
# is intended for data stamped relative to 1970-01-01.
sec_per_day = 24*60*60

# Observation times in seconds, computed once instead of twice per day
time_in_seconds = flight_data['time'] * sec_per_day

# Define start date from the beginning of the day (local midnight)
start_date = datetime.fromtimestamp(time_in_seconds.values[0])
starting_point = start_date.hour*60*60 + start_date.minute*60 + start_date.second
start_date = start_date - timedelta(seconds=starting_point, microseconds=start_date.microsecond)

# Define end date: midnight after the last observation. The offset removed
# here must be the end timestamp's own time of day (end_point). Subtracting
# starting_point instead leaves end_date off midnight, and daterange can
# then drop the final day.
end_date = datetime.fromtimestamp(time_in_seconds.values[-1])
end_point = end_date.hour*60*60 + end_date.minute*60 + end_date.second
end_date = end_date - timedelta(seconds=end_point, microseconds=end_date.microsecond) + timedelta(1)

# Iterate through each day and create a daily netCDF file with desired variables
output_directory = '/home/ajp255/nethome/Data/Output/CVAO_O3'
for single_date in daterange(start_date, end_date):
    # [s, s2) is the current day expressed in seconds
    s = time.mktime(single_date.timetuple())
    s2 = time.mktime((single_date + timedelta(1)).timetuple())
    daily_data = flight_data.loc[(time_in_seconds >= s) & (time_in_seconds < s2)]

    # Days without observations produce no file
    if daily_data.empty:
        continue

    obs = len(daily_data)
    date_string = single_date.strftime('%Y%m%d')
    output_path = os.path.join(output_directory, 'CVAO_O3_noalt_data_' + date_string + '.nc')

    #The netCDF file is saved on the local server due to issues with the remote server
    # The format is passed by keyword (the third positional argument of
    # Dataset is clobber). The with-block closes the file even if a write fails.
    with Dataset(output_path, 'w', format='NETCDF4') as ncout:
        ncout.createDimension('obs', obs)

        timevar = ncout.createVariable('time', 'float64', ('obs',))
        timevar[:] = daily_data['time'].values
        timevar.setncattr('units', 'days since 1970-01-01')

        latitude = ncout.createVariable('latitude', 'float32', ('obs',))
        latitude[:] = daily_data['latitude'].values
        latitude.setncattr('units', 'degrees north')

        longitude = ncout.createVariable('longitude', 'float32', ('obs',))
        longitude[:] = daily_data['longitude'].values
        # Added for consistency with latitude, which already carries units
        longitude.setncattr('units', 'degrees east')

        altitude = ncout.createVariable('altitude', 'float32', ('obs',))
        altitude[:] = daily_data['altitude'].values
        altitude.setncattr('units', 'm asl')

        # NOTE(review): values are copied unchanged and labelled ppbv --
        # confirm the source variable is already in ppbv.
        ozone = ncout.createVariable('mole_fraction_of_ozone_in_air', 'float32', ('obs',))
        ozone[:] = daily_data['o3_concentration_in_air'].values
        ozone.setncattr('units', 'ppbv')

        # Create a variable to store a tag (the same label for every observation)
        tag_variable = ncout.createVariable('tag', 'str', ('obs',))
        tags = ['CVAO_O3'] * obs
        tag_variable[:] = np.array(tags)

Formatting CVAO CO data

In [None]:
# Directory holding the CVAO CO netCDF files
directory_path = '/scratch/ajp255/homes/mrr32/home/data/PartIII_23/CVAO/CO'

# Every entry found in that directory
file_names = os.listdir(directory_path)

# One row per file; the first run of eight digits in the name is taken as
# the file's date (YYYYMMDD)
df = pd.DataFrame({'file_name': file_names})
date_stamps = df['file_name'].str.extract(r'(\d{8})', expand=False)
df['base_date'] = pd.to_datetime(date_stamps, format='%Y%m%d')

# Order the files chronologically by that date
df = df.sort_values(by='base_date', ascending=True)

def process_netcdf_file(file_info, directory_path, longitude=-24.8672, altitude=300):
    """Read one CVAO CO netCDF file into a per-observation DataFrame.

    Parameters
    ----------
    file_info : tuple
        ``(file_name, base_date)``. Only the file name is used here.
    directory_path : str
        Directory that contains ``file_name``.
    longitude : float, optional
        Constant longitude assigned to every row. It replaces any longitude
        read from the file. The default is the observatory coordinate.
    altitude : float, optional
        Constant altitude assigned to every row.

    Returns
    -------
    pandas.DataFrame
        One row per time step with the columns that were present in the
        file (``time``, ``latitude``, ``co_concentration_in_air``,
        ``qc_flag``) plus ``longitude`` and ``altitude``. ``time`` is the
        file's time value divided by the number of seconds in a day.

    Raises
    ------
    KeyError
        If the file has no ``time`` variable.
    """
    name, _base_date = file_info  # base_date is not needed to read the file
    file_path = os.path.join(directory_path, name)

    # Variables copied from the file when they exist
    variable_list = ['time', 'latitude', 'longitude', 'co_concentration_in_air', 'qc_flag']
    sec_per_day = 24 * 60 * 60

    with Dataset(file_path, 'r') as nc_file:
        data = {
            variable: nc_file.variables[variable][:]
            for variable in variable_list
            if variable in nc_file.variables
        }

    if 'time' not in data:
        raise KeyError(f"'time' variable not found in {file_path}")
    n_obs = len(data['time'])

    # A fixed site stores a single latitude value; broadcast it to one value
    # per row. An array that is already per-observation is left untouched
    # (repeating it would produce n_obs**2 values and break the DataFrame).
    if 'latitude' in data and np.size(data['latitude']) == 1:
        data['latitude'] = np.repeat(data['latitude'], n_obs)

    # Constant longitude and altitude for every row
    data['longitude'] = np.full(n_obs, longitude)
    data['altitude'] = np.full(n_obs, altitude)

    # Convert time to days.
    # NOTE(review): presumably the source time is in seconds since
    # 1970-01-01 -- confirm against the file's time units.
    data['time'] = data['time'] / sec_per_day

    # NOTE(review): qc_flag is carried through unchanged; no quality
    # filtering is applied here.
    return pd.DataFrame(data)

# Lazily read each listed file into its own DataFrame, in the order of df
loaded_frames = (
    process_netcdf_file(file_info, directory_path)
    for file_info in zip(df['file_name'], df['base_date'])
)

# Keep every result that was actually produced
dataframes = [frame for frame in loaded_frames if frame is not None]

# Stack the per-file tables into one table with a fresh 0..n-1 index
flight_data = pd.concat(dataframes, ignore_index=True)

# Define a function to loop over each day in the file range (assuming chronological order)
def daterange(start_date, end_date):
    """Generate one value per whole day in ``[start_date, end_date)``.

    The number of steps is the ``days`` part of ``end_date - start_date``,
    so a partial trailing day is skipped and a non-positive span yields
    nothing.
    """
    span = end_date - start_date
    yield from (start_date + timedelta(days=step) for step in range(span.days))
        
# Split flight_data into calendar days and write one netCDF file per day.
# NOTE(review): day boundaries follow the machine's local time zone, because
# both datetime.fromtimestamp and time.mktime use local time -- confirm this
# is intended for data stamped relative to 1970-01-01.
sec_per_day = 24*60*60

# Observation times in seconds, computed once instead of twice per day
time_in_seconds = flight_data['time'] * sec_per_day

# Define start date from the beginning of the day (local midnight)
start_date = datetime.fromtimestamp(time_in_seconds.values[0])
starting_point = start_date.hour*60*60 + start_date.minute*60 + start_date.second
start_date = start_date - timedelta(seconds=starting_point, microseconds=start_date.microsecond)

# Define end date: midnight after the last observation. The offset removed
# here must be the end timestamp's own time of day (end_point). Subtracting
# starting_point instead leaves end_date off midnight, and daterange can
# then drop the final day.
end_date = datetime.fromtimestamp(time_in_seconds.values[-1])
end_point = end_date.hour*60*60 + end_date.minute*60 + end_date.second
end_date = end_date - timedelta(seconds=end_point, microseconds=end_date.microsecond) + timedelta(1)

# Iterate through each day and create a daily netCDF file with desired variables
output_directory = '/home/ajp255/nethome/Data/Output/CVAO_CO'
for single_date in daterange(start_date, end_date):
    # [s, s2) is the current day expressed in seconds
    s = time.mktime(single_date.timetuple())
    s2 = time.mktime((single_date + timedelta(1)).timetuple())
    daily_data = flight_data.loc[(time_in_seconds >= s) & (time_in_seconds < s2)]

    # Days without observations produce no file
    if daily_data.empty:
        continue

    obs = len(daily_data)
    date_string = single_date.strftime('%Y%m%d')
    output_path = os.path.join(output_directory, 'CVAO_CO_noalt_data_' + date_string + '.nc')

    #The netCDF file is saved on the local server due to issues with the remote server
    # The format is passed by keyword (the third positional argument of
    # Dataset is clobber). The with-block closes the file even if a write fails.
    with Dataset(output_path, 'w', format='NETCDF4') as ncout:
        ncout.createDimension('obs', obs)

        timevar = ncout.createVariable('time', 'float64', ('obs',))
        timevar[:] = daily_data['time'].values
        timevar.setncattr('units', 'days since 1970-01-01')

        latitude = ncout.createVariable('latitude', 'float32', ('obs',))
        latitude[:] = daily_data['latitude'].values
        latitude.setncattr('units', 'degrees north')

        longitude = ncout.createVariable('longitude', 'float32', ('obs',))
        longitude[:] = daily_data['longitude'].values
        # Added for consistency with latitude, which already carries units
        longitude.setncattr('units', 'degrees east')

        altitude = ncout.createVariable('altitude', 'float32', ('obs',))
        altitude[:] = daily_data['altitude'].values
        altitude.setncattr('units', 'm asl')

        # NOTE(review): values are copied unchanged and labelled ppbv --
        # confirm the source variable is already in ppbv.
        carbon_monoxide = ncout.createVariable('mole_fraction_of_carbon_monoxide_in_air', 'float32', ('obs',))
        carbon_monoxide[:] = daily_data['co_concentration_in_air'].values
        carbon_monoxide.setncattr('units', 'ppbv')

        # Create a variable to store a tag (the same label for every observation)
        tag_variable = ncout.createVariable('tag', 'str', ('obs',))
        tags = ['CVAO_CO'] * obs
        tag_variable[:] = np.array(tags)