# Install eccodes Package

In [None]:
pip install eccodes

# Converting (.GRIB) to (.csv)

Remark:

- base_dir is the directory that contains one sub-folder per month; each sub-folder holds the file to convert
- Each (.GRIB) file should be in a sub-folder named Year-Month (e.g. 2018-12) and be named 6hr_Year.Month.grib (e.g. 6hr_2018.12.grib)


In [None]:
# Root directory passed as `base_dir` to every processing function in this notebook.
base_dir = r"C:\Users\Irene\OneDrive - UNSW\UNSW\Courses Study\Research Thesis\Py\nwp_preprocess\nwp-datasets-lead-6hr"
# First month to process (inclusive).
start_year = 2018
start_month = 12
# Last month to process (inclusive).
end_year = 2020
end_month = 5

In [None]:
import eccodes
import csv
import os
from datetime import datetime

def grib_to_csv(grib_file, csv_file):
    """Convert every GRIB message in ``grib_file`` into rows of ``csv_file``.

    For each message the 'latitudes', 'longitudes', 'values', 'dataDate' and
    'dataTime' keys are read, and one CSV row is written per grid point with
    the columns Date, Time, Latitude, Longitude, Value.

    Args:
        grib_file: Path of the GRIB file to read.
        csv_file: Path of the CSV file to create (overwritten if it exists).

    Raises:
        ValueError: If no GRIB message can be read from ``grib_file``. The
            CSV file is not created in that case.
    """
    with open(grib_file, 'rb') as f:
        gid = eccodes.codes_grib_new_from_file(f)

        if gid is None:
            raise ValueError(f"No valid GRIB found in file: {grib_file}")

        try:
            with open(csv_file, 'w', newline='') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow(['Date', 'Time', 'Latitude', 'Longitude', 'Value'])

                while gid is not None:
                    lats = eccodes.codes_get_array(gid, 'latitudes')
                    lons = eccodes.codes_get_array(gid, 'longitudes')
                    values = eccodes.codes_get_array(gid, 'values')

                    # The slicing assumes dataDate is an integer of the form
                    # YYYYMMDD and dataTime is HHMM (zero-padded to 4 digits).
                    data_date = str(eccodes.codes_get(gid, 'dataDate'))
                    data_time = str(eccodes.codes_get(gid, 'dataTime')).zfill(4)
                    date_str = f"{data_date[:4]}-{data_date[4:6]}-{data_date[6:]}"
                    time_str = f"{data_time[:2]}:{data_time[2:]}"

                    writer.writerows(
                        (date_str, time_str, lat, lon, value)
                        for lat, lon, value in zip(lats, lons, values)
                    )

                    # Clear `gid` before releasing so the `finally` clause
                    # below never releases the same handle twice.
                    finished, gid = gid, None
                    eccodes.codes_release(finished)
                    gid = eccodes.codes_grib_new_from_file(f)
        finally:
            # Reached with a live handle only when something above raised
            # (e.g. the CSV could not be opened or a key could not be read).
            if gid is not None:
                eccodes.codes_release(gid)
    print(f'GRIB data has been successfully converted to {csv_file}')

def convert_all_grib_to_csv(base_dir, start_year, start_month, end_year, end_month):
    """Run ``grib_to_csv`` for every month in the inclusive range given.

    For a month ``YYYY-MM`` the input is ``<base_dir>/YYYY-MM/6hr_YYYY.MM.grib``
    and the output is the same path with a ``.csv`` extension. A missing GRIB
    file is reported and skipped; a ``ValueError`` raised by the conversion
    is printed and the loop continues with the next month.
    """
    cursor = datetime(start_year, start_month, 1)
    last_month = datetime(end_year, end_month, 1)

    while not cursor > last_month:
        folder_name = cursor.strftime("%Y-%m")
        file_stem = f"6hr_{folder_name.replace('-', '.')}"
        month_dir = os.path.join(base_dir, folder_name)
        grib_file_path = os.path.join(month_dir, f"{file_stem}.grib")
        csv_file_path = os.path.join(month_dir, f"{file_stem}.csv")

        if not os.path.exists(grib_file_path):
            print(f"File not found: {grib_file_path}")
        else:
            try:
                grib_to_csv(grib_file_path, csv_file_path)
            except ValueError as e:
                print(e)

        # Step to the first day of the following month.
        if cursor.month == 12:
            cursor = datetime(cursor.year + 1, 1, 1)
        else:
            cursor = datetime(cursor.year, cursor.month + 1, 1)

# Convert the GRIB file of every month in the configured range to CSV.
convert_all_grib_to_csv(base_dir, start_year, start_month, end_year, end_month)

# Filter (.csv) based on Latitudes and Longitudes Needed

In [None]:
import pandas as pd
import os
from datetime import datetime

def filter_and_process_csv_data(csv_file, filtered_csv_file):
    """Keep only the five grid points of interest and relabel their times.

    Reads ``csv_file``, keeps rows whose (Latitude, Longitude) pair is one of
    the five hard-coded points, sorts them by Latitude, Longitude, Date and
    Time, then overwrites the Time column: within each
    (Latitude, Longitude, Date) group the rows are labelled '06', '12', '18',
    '24' in sorted order, cycling every four rows. The result is written to
    ``filtered_csv_file`` without the index.
    """
    grid_points = (
        (-17.5, 177.5),
        (-17.5, 178.0),
        (-18.0, 177.5),
        (-18.0, 178.0),
        (-18.0, 178.5),
    )
    slot_labels = ['06', '12', '18', '24']

    def label_for(position):
        # Position of the row inside its (point, day) group -> time label.
        return slot_labels[position % 4]

    raw = pd.read_csv(csv_file)

    # OR together one exact-match mask per grid point.
    keep = None
    for lat, lon in grid_points:
        at_point = (raw['Latitude'] == lat) & (raw['Longitude'] == lon)
        keep = at_point if keep is None else keep | at_point

    selected = raw[keep]
    selected = selected.sort_values(by=['Latitude', 'Longitude', 'Date', 'Time'])

    position_in_group = selected.groupby(['Latitude', 'Longitude', 'Date']).cumcount()
    selected['Time'] = position_in_group.map(label_for)

    selected.to_csv(filtered_csv_file, index=False)
    print(f'Filtered and processed data has been saved to {filtered_csv_file}')

def filter_and_process_all_csv(base_dir, start_year, start_month, end_year, end_month):
    """Run ``filter_and_process_csv_data`` for every month in the inclusive range.

    For a month ``YYYY-MM`` the input is ``<base_dir>/YYYY-MM/6hr_YYYY.MM.csv``
    and the output is ``filtered_6hr_YYYY.MM.csv`` in the same folder. Months
    whose input file does not exist are reported and skipped.
    """
    cursor = datetime(start_year, start_month, 1)
    last_month = datetime(end_year, end_month, 1)

    while not cursor > last_month:
        folder_name = cursor.strftime("%Y-%m")
        dotted = folder_name.replace('-', '.')
        month_dir = os.path.join(base_dir, folder_name)
        csv_file_path = os.path.join(month_dir, f"6hr_{dotted}.csv")
        filtered_csv_file_path = os.path.join(month_dir, f"filtered_6hr_{dotted}.csv")

        if not os.path.exists(csv_file_path):
            print(f"File not found: {csv_file_path}")
        else:
            filter_and_process_csv_data(csv_file_path, filtered_csv_file_path)

        # Step to the first day of the following month.
        if cursor.month == 12:
            cursor = datetime(cursor.year + 1, 1, 1)
        else:
            cursor = datetime(cursor.year, cursor.month + 1, 1)

# Filter the converted CSV of every month in the configured range.
filter_and_process_all_csv(base_dir, start_year, start_month, end_year, end_month)

# Assign Station ID

In [None]:
import pandas as pd
import os
from datetime import datetime

# Define station IDs and their corresponding latitudes and longitudes
stations = {
    '778601': {'Latitude': -18.0, 'Longitude': 177.5},
    '1770001': {'Latitude': -18.0, 'Longitude': 178.0},
    '2216302': {'Latitude': -18.0, 'Longitude': 178.5},
    '1127500': {'Latitude': -17.5, 'Longitude': 177.5},
    '177599': {'Latitude': -17.5, 'Longitude': 178.0}
}

def create_station_data(filtered_csv_file, station_csv_file):
    """Reshape a filtered CSV into one row per (Date, Time) with a column per station.

    Each input row whose (Latitude, Longitude) matches a station in the
    module-level ``stations`` mapping contributes its Value to that station's
    column on the row for its (Date, Time). Rows appear in the order their
    (Date, Time) first occurs in the input; if the same station has several
    values for one (Date, Time), the last one wins. Stations with no value
    for a given row are left empty in the output.

    Args:
        filtered_csv_file: CSV with Date, Time, Latitude, Longitude and Value
            columns.
        station_csv_file: Path of the CSV to write; its columns are Date,
            Time, then the station IDs in the order of ``stations``.
    """
    df = pd.read_csv(filtered_csv_file)

    # Grid point -> station IDs located there, so each input row is resolved
    # with one dict lookup instead of a scan over all stations.
    ids_by_point = {}
    for station_id, coords in stations.items():
        point = (coords['Latitude'], coords['Longitude'])
        ids_by_point.setdefault(point, []).append(station_id)

    # (Date, Time) -> {station_id: value}. Dicts keep insertion order, which
    # gives the first-seen row order, and plain assignment gives last-wins.
    # Accumulating here replaces a full DataFrame scan (and often a concat)
    # per input row.
    values_by_timestamp = {}
    input_rows = zip(df['Date'], df['Time'], df['Latitude'], df['Longitude'], df['Value'])
    for date, time, latitude, longitude, value in input_rows:
        for station_id in ids_by_point.get((latitude, longitude), ()):
            values_by_timestamp.setdefault((date, time), {})[station_id] = value

    records = [
        {'Date': date, 'Time': time, **station_values}
        for (date, time), station_values in values_by_timestamp.items()
    ]

    # `columns` fixes the output order and creates empty columns for stations
    # that never appeared in the input.
    output_df = pd.DataFrame(records, columns=['Date', 'Time'] + list(stations))

    # Save the resulting DataFrame to a CSV file
    output_df.to_csv(station_csv_file, index=False)
    print(f'Station data has been saved to {station_csv_file}')

def process_all_filtered_csv(base_dir, start_year, start_month, end_year, end_month):
    """Run ``create_station_data`` for every month in the inclusive range.

    For a month ``YYYY-MM`` the input is
    ``<base_dir>/YYYY-MM/filtered_6hr_YYYY.MM.csv`` and the output is
    ``station_6hr_YYYY.MM.csv`` in the same folder. Months whose input file
    does not exist are reported and skipped.
    """
    cursor = datetime(start_year, start_month, 1)
    last_month = datetime(end_year, end_month, 1)

    while not cursor > last_month:
        folder_name = cursor.strftime("%Y-%m")
        dotted = folder_name.replace('-', '.')
        month_dir = os.path.join(base_dir, folder_name)
        filtered_csv_file_path = os.path.join(month_dir, f"filtered_6hr_{dotted}.csv")
        station_csv_file_path = os.path.join(month_dir, f"station_6hr_{dotted}.csv")

        if not os.path.exists(filtered_csv_file_path):
            print(f"File not found: {filtered_csv_file_path}")
        else:
            create_station_data(filtered_csv_file_path, station_csv_file_path)

        # December rolls over to January of the next year; any other month
        # just advances by one.
        cursor = datetime(cursor.year + cursor.month // 12, cursor.month % 12 + 1, 1)

# Build the per-station CSV of every month in the configured range.
process_all_filtered_csv(base_dir, start_year, start_month, end_year, end_month)

# Combine All into a Single (.csv)

In [None]:
import pandas as pd
import os
from datetime import datetime

def combine_station_csv_files(base_dir, start_year, start_month, end_year, end_month, output_file):
    """Concatenate the monthly station CSVs into one file sorted by Date and Time.

    For every month ``YYYY-MM`` in the inclusive range,
    ``<base_dir>/YYYY-MM/station_6hr_YYYY.MM.csv`` is loaded if it exists;
    missing files are reported and skipped. When at least one file was
    loaded, the frames are concatenated, sorted by the Date and Time columns
    and written to ``output_file`` without the index. Otherwise a message is
    printed and nothing is written.
    """
    monthly_frames = []
    cursor = datetime(start_year, start_month, 1)
    last_month = datetime(end_year, end_month, 1)

    while not cursor > last_month:
        folder_name = cursor.strftime("%Y-%m")
        station_csv_file_path = os.path.join(
            base_dir, folder_name, f"station_6hr_{folder_name.replace('-', '.')}.csv"
        )

        if not os.path.exists(station_csv_file_path):
            print(f"File not found: {station_csv_file_path}")
        else:
            monthly_frames.append(pd.read_csv(station_csv_file_path))

        # December rolls over to January of the next year; any other month
        # just advances by one.
        cursor = datetime(cursor.year + cursor.month // 12, cursor.month % 12 + 1, 1)

    if not monthly_frames:
        print('No data to combine.')
        return

    merged = pd.concat(monthly_frames).sort_values(by=['Date', 'Time'])
    merged.to_csv(output_file, index=False)
    print(f'Combined data has been saved to {output_file}')

# Directory paths and parameters
# base_dir is re-assigned here with the same value as in the configuration
# cell; start/end year and month are reused from that cell.
base_dir = r"C:\Users\Irene\OneDrive - UNSW\UNSW\Courses Study\Research Thesis\Py\nwp_preprocess\nwp-datasets-lead-6hr"
# The combined file goes into its own sub-folder of base_dir, created if missing.
output_dir = os.path.join(base_dir, '0_combined_6hr')
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, 'combined_6hr.csv')

# Merge every monthly station CSV in the configured range into output_file.
combine_station_csv_files(base_dir, start_year, start_month, end_year, end_month, output_file)