1. Konfigurasi API dan Inisialisasi Klien
Ganti nilai CDS_API_KEY dengan API key Anda.

In [1]:
import cdsapi
import xarray as xr
import numpy as np
import pandas as pd
import os
import glob
from itertools import product # Digunakan untuk looping tahun dan bulan

# --- KONFIGURASI API & KOORDINAT ---
CDS_API_KEY = "0454455d-cef6-4a4e-ad46-f0e905629467" 
CDS_API_URL = "https://cds.climate.copernicus.eu/api"

# **TARGET TAHUN**
TARGET_YEARS = ['2022'] #, '2023', '2024'

# Bounding Box (Mencakup semua 5 titik) [N, W, S, E]
AREA_BOUNDS = [-6.15, 106.75, -6.36, 106.92] 

# Koordinat 5 Lokasi Spesifik (Digunakan di Cell 4)
LOCATIONS = {
    'bundaran_hi': {'lat': -6.19466, 'lon': 106.8235},
    'kelapa_gading': {'lat': -6.15358, 'lon': 106.91089},
    'jagakarsa': {'lat': -6.35693, 'lon': 106.80367},
    'lubang_buaya': {'lat': -6.28889, 'lon': 106.90919},
    'kebun_jeruk': {'lat': -6.20735, 'lon': 106.75319}
}

# Jalur Penyimpanan Lokal
LOCAL_PATH = 'C:\\Users\\user\\OneDrive\\IPB\\Thesis\\02. Development\\01. Data Praprocessing'
# -------------------------------------

c = cdsapi.Client(url=CDS_API_URL, key=CDS_API_KEY)
os.makedirs(LOCAL_PATH, exist_ok=True)

print(f"Klien CDSAPI siap. Area: {AREA_BOUNDS}.")
print(f"Data akan disimpan di: {LOCAL_PATH}")

Klien CDSAPI siap. Area: [-6.15, 106.75, -6.36, 106.92].
Data akan disimpan di: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing


2. Fungsi Download Data ERA5 (Mengambil Data Setahun Penuh)  
Request ini meminta semua bulan dan semua hari pada tahun yang ditentukan di Cell 2.

In [2]:
all_days = [f'{i:02d}' for i in range(1, 32)] 
all_hours = [f'{h:02d}:00' for h in range(24)]
all_months = [f'{i:02d}' for i in range(2, 3)]

print(f"Memulai request data ERA5 bulanan untuk tahun {TARGET_YEARS}...")

# Melakukan looping untuk setiap kombinasi Tahun dan Bulan
for year, month in product(TARGET_YEARS, all_months):
    MONTHLY_OUTPUT_FILE = os.path.join(LOCAL_PATH, f'data_era5_hourly_{year}_{month}.nc')
    
    print(f"\n--- Memproses Bulan: {year}-{month} ---")
    
    # Lewati jika file sudah ada dan ukurannya logis
    if os.path.exists(MONTHLY_OUTPUT_FILE) and os.path.getsize(MONTHLY_OUTPUT_FILE) > 10000000:
        print(f"File {month}/{year} sudah ada dan utuh. Melanjutkan ke bulan berikutnya.")
        continue

    try:
        c.retrieve(
            'reanalysis-era5-single-levels',
            {
                'product_type': 'reanalysis',
                'format': 'netcdf',
                'variable': [
                    '10m_u_component_of_wind', '10m_v_component_of_wind', 
                    '2m_dewpoint_temperature', '2m_temperature', 
                    'total_precipitation', 
                ],
                'year': year,
                'month': month, 
                'day': all_days,
                'time': all_hours,
                'area': AREA_BOUNDS, # Menggunakan bounding box yang diperbesar
            },
            MONTHLY_OUTPUT_FILE)
        
        print(f"Download {year}-{month} selesai.")

    except Exception as e:
        print(f"!!! ERROR fatal saat download {year}-{month}. ERROR: {e}")

print("\nSemua request bulanan selesai. Siap untuk penggabungan dan pemrosesan lokasi spesifik.")

Memulai request data ERA5 bulanan untuk tahun ['2022']...

--- Memproses Bulan: 2022-02 ---


2025-10-31 21:55:55,688 INFO Request ID is f7d90fa4-03a0-4afe-8297-02555a3ef22b
2025-10-31 21:55:56,015 INFO status has been updated to accepted
2025-10-31 21:56:05,552 INFO status has been updated to running
2025-10-31 21:56:11,114 INFO status has been updated to successful
                                                                                       

Download 2022-02 selesai.

Semua request bulanan selesai. Siap untuk penggabungan dan pemrosesan lokasi spesifik.




3. Pemrosesan Data (Agregasi Harian & Konversi Unit)  
Ganti YEAR_TO_PROCESS dengan tahun yang baru selesai Anda unduh.

In [3]:
import xarray as xr
import pandas as pd
import os
import glob
import numpy as np
from pathlib import Path

def convert_era5_to_csv(nc_file, output_dir=None):
    """
    Mengkonversi file NetCDF ERA5 ke format CSV dengan error handling yang lebih baik
    """
    try:
        print(f"\n{'='*60}")
        print(f"Memproses file: {nc_file}")
        print(f"{'='*60}")
        
        # Periksa apakah file ada dan dapat dibaca
        if not os.path.exists(nc_file):
            print(f"‚ùå File tidak ditemukan: {nc_file}")
            return None
        
        file_size = os.path.getsize(nc_file) / (1024*1024)  # Size in MB
        print(f"Ukuran file: {file_size:.2f} MB")
        
        # Buka file NetCDF dengan error handling
        try:
            ds = xr.open_dataset(nc_file, engine='netcdf4')
        except Exception as e:
            print(f"‚ùå Gagal membuka file NetCDF: {str(e)}")
            # Coba engine alternatif
            try:
                print("Mencoba engine alternatif...")
                ds = xr.open_dataset(nc_file, engine='scipy')
            except:
                print("‚ùå Semua engine gagal. File mungkin rusak atau format tidak didukung.")
                return None
        
        # Ekstrak nama file tanpa ekstensi
        base_name = Path(nc_file).stem
        
        # Tentukan direktori output
        if output_dir is None:
            output_dir = "era5_csv_output"
        
        # Buat direktori output jika belum ada
        os.makedirs(output_dir, exist_ok=True)
        
        # Tampilkan informasi dataset
        print(f"üìä Informasi Dataset:")
        print(f"   Dimensi: {dict(ds.dims)}")
        print(f"   Koordinat: {list(ds.coords)}")
        
        # Konversi ke DataFrame
        variables_to_extract = []
        available_vars = list(ds.data_vars)
        
        print(f"üîç Variabel yang tersedia: {available_vars}")
        
        # Cek variabel yang umum di ERA5
        common_vars = ['t2m', 'd2m', 'u10', 'v10', 'msl', 'sp', 'tp']  # Variabel umum ERA5
        for var in common_vars:
            if var in available_vars:
                variables_to_extract.append(var)
        
        # Jika tidak ada variabel umum, gunakan semua variabel yang ada
        if not variables_to_extract:
            print("‚ö†Ô∏è Tidak ada variabel umum ditemukan, menggunakan semua variabel yang tersedia")
            variables_to_extract = available_vars
        
        print(f"üéØ Variabel yang akan dikonversi: {variables_to_extract}")
        
        # Konversi ke pandas DataFrame
        df_list = []
        
        for var in variables_to_extract:
            try:
                print(f"   üîÑ Memproses variabel: {var}")
                
                # Konversi variabel ke DataFrame
                var_data = ds[var]
                var_df = var_data.to_dataframe().reset_index()
                
                # Tambahkan nama variabel sebagai kolom
                var_df['variable'] = var
                var_df['value'] = var_df[var]
                
                # Pilih kolom yang relevan
                cols_to_keep = ['time', 'latitude', 'longitude', 'variable', 'value']
                
                # Tambahkan kolom tambahan jika ada
                optional_cols = ['number', 'expver', 'step', 'height']
                for col in optional_cols:
                    if col in var_df.columns:
                        cols_to_keep.append(col)
                
                var_df = var_df[cols_to_keep]
                df_list.append(var_df)
                
                print(f"   ‚úÖ {var}: {var_df.shape[0]} baris, {var_df.shape[1]} kolom")
                
            except Exception as var_error:
                print(f"   ‚ùå Error memproses {var}: {str(var_error)}")
                continue
        
        if not df_list:
            print("‚ùå Tidak ada data yang berhasil dikonversi")
            ds.close()
            return None
        
        # Gabungkan semua variabel
        combined_df = pd.concat(df_list, ignore_index=True)
        
        # Konversi tipe data untuk menghemat space
        combined_df['latitude'] = combined_df['latitude'].astype(np.float32)
        combined_df['longitude'] = combined_df['longitude'].astype(np.float32)
        combined_df['value'] = combined_df['value'].astype(np.float32)
        
        # Simpan ke CSV
        output_file = os.path.join(output_dir, f"{base_name}.csv")
        
        # Simpan dengan chunks jika file sangat besar
        if len(combined_df) > 1000000:  # Jika lebih dari 1 juta baris
            print("üì¶ File besar, menyimpan dalam chunks...")
            chunks = np.array_split(combined_df, 10)
            for i, chunk in enumerate(chunks):
                chunk_file = output_file.replace('.csv', f'_part_{i+1:02d}.csv')
                chunk.to_csv(chunk_file, index=False)
                print(f"   ‚úÖ Bagian {i+1}: {chunk_file}")
        else:
            combined_df.to_csv(output_file, index=False)
            print(f"‚úÖ File berhasil dikonversi: {output_file}")
        
        print(f"üìà Statistik data:")
        print(f"   Total baris: {combined_df.shape[0]:,}")
        print(f"   Total kolom: {combined_df.shape[1]}")
        print(f"   Rentang waktu: {combined_df['time'].min()} hingga {combined_df['time'].max()}")
        print(f"   Rentang latitude: {combined_df['latitude'].min():.2f} hingga {combined_df['latitude'].max():.2f}")
        print(f"   Rentang longitude: {combined_df['longitude'].min():.2f} hingga {combined_df['longitude'].max():.2f}")
        
        print(f"üëÄ Preview data:")
        print(combined_df.head(10))
        
        # Tutup dataset
        ds.close()
        
        return output_file
        
    except Exception as e:
        print(f"‚ùå Error processing {nc_file}: {str(e)}")
        return None

def batch_convert_era5_files(input_pattern, output_dir=None):
    """
    Konversi batch semua file ERA5 yang sesuai dengan pattern
    """
    # Cari semua file yang sesuai pattern
    nc_files = glob.glob(input_pattern)
    
    if not nc_files:
        print(f"‚ùå Tidak ada file yang ditemukan dengan pattern: {input_pattern}")
        print("üí° Pastikan file berada di direktori yang benar dan pattern sesuai")
        return []
    
    print(f"üìÅ Menemukan {len(nc_files)} file untuk dikonversi:")
    for i, nc_file in enumerate(sorted(nc_files), 1):
        print(f"   {i:2d}. {nc_file}")
    
    converted_files = []
    failed_files = []
    
    for nc_file in sorted(nc_files):
        output_file = convert_era5_to_csv(nc_file, output_dir)
        if output_file:
            converted_files.append(output_file)
        else:
            failed_files.append(nc_file)
    
    # Summary
    print(f"\n{'='*60}")
    print("üìä SUMMARY KONVERSI")
    print(f"{'='*60}")
    print(f"‚úÖ Berhasil dikonversi: {len(converted_files)} file")
    print(f"‚ùå Gagal dikonversi: {len(failed_files)} file")
    
    if converted_files:
        print(f"\nüìÇ File CSV yang dihasilkan:")
        for cf in converted_files:
            print(f"   üìÑ {cf}")
    
    if failed_files:
        print(f"\n‚ö†Ô∏è File yang gagal:")
        for ff in failed_files:
            print(f"   ‚ùå {ff}")
    
    return converted_files

def check_environment():
    """Memeriksa environment dan dependencies"""
    print("üîç Memeriksa environment...")
    try:
        import xarray
        import pandas
        import numpy
        print("‚úÖ Semua dependencies tersedia")
        return True
    except ImportError as e:
        print(f"‚ùå Dependency missing: {e}")
        print("üí° Install dengan: pip install xarray pandas numpy netcdf4")
        return False

# Contoh penggunaan untuk file bulan Januari-Desember 2022
if __name__ == "__main__":
    print("üöÄ MEMULAI KONVERSI ERA5 NETCDF KE CSV")
    print("=" * 60)
    
    # Periksa environment terlebih dahulu
    if not check_environment():
        exit(1)
    
    # Pattern untuk file bulanan ERA5 tahun 2022
    input_pattern = "data_era5_hourly_2022_*.nc"
    
    # Alternatif pattern jika file memiliki format berbeda
    alternative_patterns = [
        "data_era5_hourly_2022_*.nc",
        "era5_2022_*.nc",
        "*.nc"  # Semua file NetCDF
    ]
    
    # Cari file dengan pattern alternatif jika pattern utama tidak ditemukan
    nc_files = glob.glob(input_pattern)
    if not nc_files:
        print(f"‚ö†Ô∏è Pattern '{input_pattern}' tidak menemukan file, mencoba pattern alternatif...")
        for pattern in alternative_patterns:
            nc_files = glob.glob(pattern)
            if nc_files:
                input_pattern = pattern
                print(f"‚úÖ Menggunakan pattern: {pattern}")
                break
    
    # Direktori output
    output_directory = "era5_csv_output"
    
    # Jalankan konversi batch
    converted_files = batch_convert_era5_files(input_pattern, output_directory)
    
    if converted_files:
        print(f"\nüéâ Konversi selesai! {len(converted_files)} file berhasil dikonversi")
        print(f"üìÅ File CSV tersimpan di: {output_directory}")
    else:
        print(f"\nüí• Tidak ada file yang berhasil dikonversi")
        print("üîß Tips troubleshooting:")
        print("   1. Pastikan file NetCDF ada di direktori yang benar")
        print("   2. Periksa format file dengan: ncdump -h filename.nc")
        print("   3. Coba buka file dengan software NetCDF viewer")
        print("   4. Download ulang file jika rusak")

üöÄ MEMULAI KONVERSI ERA5 NETCDF KE CSV
üîç Memeriksa environment...
‚úÖ Semua dependencies tersedia
üìÅ Menemukan 1 file untuk dikonversi:
    1. data_era5_hourly_2022_02.nc

Memproses file: data_era5_hourly_2022_02.nc
Ukuran file: 0.18 MB
‚ùå Gagal membuka file NetCDF: [Errno -51] NetCDF: Unknown file format: 'c:\\Users\\user\\OneDrive\\IPB\\Thesis\\02. Development\\01. Data Praprocessing\\data_era5_hourly_2022_02.nc'
Mencoba engine alternatif...
‚ùå Semua engine gagal. File mungkin rusak atau format tidak didukung.

üìä SUMMARY KONVERSI
‚úÖ Berhasil dikonversi: 0 file
‚ùå Gagal dikonversi: 1 file

‚ö†Ô∏è File yang gagal:
   ‚ùå data_era5_hourly_2022_02.nc

üí• Tidak ada file yang berhasil dikonversi
üîß Tips troubleshooting:
   1. Pastikan file NetCDF ada di direktori yang benar
   2. Periksa format file dengan: ncdump -h filename.nc
   3. Coba buka file dengan software NetCDF viewer
   4. Download ulang file jika rusak


In [5]:
import cdsapi
import xarray as xr
import numpy as np
import pandas as pd
import os
import glob
import warnings
from itertools import product
from pathlib import Path
import logging

# Setup logging untuk monitoring yang lebih baik
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Suppress warnings yang tidak penting
warnings.filterwarnings('ignore')

# =============================================================================
# KONFIGURASI
# =============================================================================

class ERA5Config:
    """Konfigurasi untuk download dan processing data ERA5"""
    
    # API Configuration - HATI-HATI: Jangan expose API key di production!
    CDS_API_KEY = "0454455d-cef6-4a4e-ad46-f0e905629467" 
    CDS_API_URL = "https://cds.climate.copernicus.eu/api"
    
    # Target years dan months
    TARGET_YEARS = ['2022']
    TARGET_MONTHS = ['03']  # February saja untuk contoh
    
    # Area bounds untuk Jakarta dan sekitarnya
    AREA_BOUNDS = [-6.15, 106.75, -6.36, 106.92] 
    
    # Lokasi spesifik
    LOCATIONS = {
        'bundaran_hi': {'lat': -6.19466, 'lon': 106.8235},
        'kelapa_gading': {'lat': -6.15358, 'lon': 106.91089},
        'jagakarsa': {'lat': -6.35693, 'lon': 106.80367},
        'lubang_buaya': {'lat': -6.28889, 'lon': 106.90919},
        'kebun_jeruk': {'lat': -6.20735, 'lon': 106.75319}
    }
    
    # Variables yang akan didownload
    VARIABLES = [
        '10m_u_component_of_wind', '10m_v_component_of_wind', 
        '2m_dewpoint_temperature', '2m_temperature', 
        'total_precipitation'
    ]
    
    # Path configuration
    BASE_PATH = 'C:\\Users\\user\\OneDrive\\IPB\\Thesis\\02. Development\\01. Data Praprocessing'
    RAW_DATA_PATH = os.path.join(BASE_PATH, 'raw_data')
    PROCESSED_DATA_PATH = os.path.join(BASE_PATH, 'processed_data')
    CSV_OUTPUT_PATH = os.path.join(BASE_PATH, 'csv_output')

# =============================================================================
# FUNGSI UTILITY
# =============================================================================

def setup_directories():
    """Membuat direktori yang diperlukan"""
    paths = [
        ERA5Config.RAW_DATA_PATH,
        ERA5Config.PROCESSED_DATA_PATH, 
        ERA5Config.CSV_OUTPUT_PATH
    ]
    
    for path in paths:
        os.makedirs(path, exist_ok=True)
        logger.info(f"Directory created/verified: {path}")

def check_dependencies():
    """Memeriksa semua dependencies yang diperlukan"""
    required_packages = {
        'xarray': 'xarray',
        'pandas': 'pandas', 
        'numpy': 'numpy',
        'cdsapi': 'cdsapi'
    }
    
    missing_packages = []
    for package, import_name in required_packages.items():
        try:
            __import__(import_name)
            logger.info(f"‚úÖ {package} tersedia")
        except ImportError:
            missing_packages.append(package)
            logger.error(f"‚ùå {package} tidak tersedia")
    
    if missing_packages:
        logger.error(f"Package yang missing: {missing_packages}")
        logger.error("Install dengan: pip install " + " ".join(missing_packages))
        return False
    
    return True

# =============================================================================
# DOWNLOAD DATA ERA5
# =============================================================================

class ERA5Downloader:
    """Class untuk handle download data ERA5"""
    
    def __init__(self):
        self.client = cdsapi.Client(
            url=ERA5Config.CDS_API_URL, 
            key=ERA5Config.CDS_API_KEY
        )
        logger.info("CDS API client initialized")
    
    def download_monthly_data(self, year, month):
        """Download data bulanan untuk tahun dan bulan tertentu"""
        
        output_file = os.path.join(
            ERA5Config.RAW_DATA_PATH, 
            f'data_era5_hourly_{year}_{month}.nc'
        )
        
        # Skip jika file sudah ada dan ukurannya reasonable
        if (os.path.exists(output_file) and 
            os.path.getsize(output_file) > 10_000_000):  # > 10MB
            logger.info(f"File {output_file} sudah ada, skip download")
            return output_file
        
        # Prepare request parameters
        all_days = [f'{i:02d}' for i in range(1, 32)]
        all_hours = [f'{h:02d}:00' for h in range(24)]
        
        request_params = {
            'product_type': 'reanalysis',
            'format': 'netcdf',
            'variable': ERA5Config.VARIABLES,
            'year': year,
            'month': month,
            'day': all_days,
            'time': all_hours,
            'area': ERA5Config.AREA_BOUNDS,
        }
        
        try:
            logger.info(f"Downloading data untuk {year}-{month}...")
            self.client.retrieve(
                'reanalysis-era5-single-levels',
                request_params,
                output_file
            )
            logger.info(f"‚úÖ Download selesai: {output_file}")
            return output_file
            
        except Exception as e:
            logger.error(f"‚ùå Gagal download {year}-{month}: {str(e)}")
            return None
    
    def download_all_data(self):
        """Download semua data berdasarkan konfigurasi"""
        downloaded_files = []
        
        for year, month in product(ERA5Config.TARGET_YEARS, ERA5Config.TARGET_MONTHS):
            result = self.download_monthly_data(year, month)
            if result:
                downloaded_files.append(result)
        
        logger.info(f"Total files downloaded: {len(downloaded_files)}")
        return downloaded_files

# =============================================================================
# PROCESSING DATA NETCDF KE CSV
# =============================================================================

class ERA5Processor:
    """Class untuk processing data ERA5 dari NetCDF ke CSV"""
    
    @staticmethod
    def try_open_netcdf(file_path):
        """Mencoba berbagai engine untuk membuka file NetCDF"""
        engines_to_try = ['netcdf4', 'scipy', 'h5netcdf']
        
        for engine in engines_to_try:
            try:
                logger.info(f"Mencoba buka {file_path} dengan engine: {engine}")
                ds = xr.open_dataset(file_path, engine=engine)
                logger.info(f"‚úÖ Berhasil buka file dengan engine: {engine}")
                return ds
            except Exception as e:
                logger.warning(f"‚ùå Engine {engine} gagal: {str(e)}")
                continue
        
        # Jika semua engine gagal, coba tanpa specify engine
        try:
            logger.info("Mencoba buka file tanpa specify engine...")
            ds = xr.open_dataset(file_path)
            logger.info("‚úÖ Berhasil buka file tanpa specify engine")
            return ds
        except Exception as e:
            logger.error(f"‚ùå Semua metode gagal: {str(e)}")
            return None
    
    @staticmethod
    def convert_era5_to_csv(nc_file_path, output_dir=None):
        """
        Konversi file NetCDF ERA5 ke format CSV dengan error handling yang robust
        """
        try:
            logger.info(f"Memproses file: {nc_file_path}")
            
            # Validasi file
            if not os.path.exists(nc_file_path):
                logger.error(f"File tidak ditemukan: {nc_file_path}")
                return None
            
            file_size = os.path.getsize(nc_file_path) / (1024 * 1024)  # MB
            logger.info(f"Ukuran file: {file_size:.2f} MB")
            
            # Buka file NetCDF dengan multiple engine fallback
            ds = ERA5Processor.try_open_netcdf(nc_file_path)
            if ds is None:
                return None
            
            # Ekstrak informasi dataset
            logger.info(f"Dimensi dataset: {dict(ds.dims)}")
            logger.info(f"Variabel yang tersedia: {list(ds.data_vars)}")
            
            # Mapping nama variabel ERA5 yang lengkap ke nama singkat
            variable_mapping = {
                '10m_u_component_of_wind': 'u10',
                '10m_v_component_of_wind': 'v10', 
                '2m_dewpoint_temperature': 'd2m',
                '2m_temperature': 't2m',
                'total_precipitation': 'tp'
            }
            
            # Process setiap variabel
            df_list = []
            for var_long, var_short in variable_mapping.items():
                if var_long in ds.data_vars:
                    try:
                        logger.info(f"Memproses variabel: {var_long} -> {var_short}")
                        
                        # Extract data untuk variabel
                        var_data = ds[var_long]
                        var_df = var_data.to_dataframe().reset_index()
                        
                        # Tambahkan metadata
                        var_df['variable'] = var_short
                        var_df['value'] = var_df[var_long]
                        
                        # Select kolom yang relevan
                        base_cols = ['time', 'latitude', 'longitude', 'variable', 'value']
                        available_cols = [col for col in base_cols if col in var_df.columns]
                        
                        # Tambahkan kolom optional jika ada
                        optional_cols = ['number', 'expver', 'step', 'height']
                        for opt_col in optional_cols:
                            if opt_col in var_df.columns:
                                available_cols.append(opt_col)
                        
                        var_df = var_df[available_cols]
                        df_list.append(var_df)
                        
                        logger.info(f"‚úÖ {var_short}: {len(var_df)} baris")
                        
                    except Exception as var_error:
                        logger.error(f"‚ùå Error processing {var_long}: {str(var_error)}")
                        continue
            
            if not df_list:
                logger.error("Tidak ada variabel yang berhasil diproses")
                ds.close()
                return None
            
            # Gabungkan semua data
            combined_df = pd.concat(df_list, ignore_index=True)
            
            # Optimasi tipe data
            combined_df = ERA5Processor.optimize_dataframe(combined_df)
            
            # Simpan ke CSV
            base_name = Path(nc_file_path).stem
            if output_dir is None:
                output_dir = ERA5Config.CSV_OUTPUT_PATH
            
            output_file = os.path.join(output_dir, f"{base_name}.csv")
            
            # Handle large files dengan chunks
            if len(combined_df) > 1_000_000:
                logger.info("File besar, menyimpan dalam chunks...")
                ERA5Processor.save_large_dataframe(combined_df, output_file)
            else:
                combined_df.to_csv(output_file, index=False)
                logger.info(f"‚úÖ File CSV disimpan: {output_file}")
            
            # Log statistics
            ERA5Processor.log_data_statistics(combined_df)
            
            ds.close()
            return output_file
            
        except Exception as e:
            logger.error(f"‚ùå Error processing {nc_file_path}: {str(e)}")
            return None
    
    @staticmethod
    def optimize_dataframe(df):
        """Optimasi tipe data untuk menghemat memory"""
        # Optimasi numeric columns
        float_cols = df.select_dtypes(include=['float64']).columns
        for col in float_cols:
            df[col] = df[col].astype(np.float32)
        
        # Optimasi integer columns  
        int_cols = df.select_dtypes(include=['int64']).columns
        for col in int_cols:
            df[col] = df[col].astype(np.int32)
        
        return df
    
    @staticmethod
    def save_large_dataframe(df, output_file, chunk_size=500000):
        """Simpan dataframe besar dalam chunks"""
        base_name = Path(output_file).stem
        
        num_chunks = (len(df) // chunk_size) + 1
        for i in range(num_chunks):
            start_idx = i * chunk_size
            end_idx = min((i + 1) * chunk_size, len(df))
            
            chunk_df = df.iloc[start_idx:end_idx]
            chunk_file = output_file.replace('.csv', f'_part_{i+1:02d}.csv')
            
            chunk_df.to_csv(chunk_file, index=False)
            logger.info(f"‚úÖ Chunk {i+1}/{num_chunks}: {chunk_file}")
    
    @staticmethod
    def log_data_statistics(df):
        """Log statistics data"""
        logger.info("üìä Data Statistics:")
        logger.info(f"   Total rows: {len(df):,}")
        logger.info(f"   Total columns: {len(df.columns)}")
        logger.info(f"   Time range: {df['time'].min()} to {df['time'].max()}")
        
        if 'latitude' in df.columns:
            logger.info(f"   Latitude range: {df['latitude'].min():.3f} to {df['latitude'].max():.3f}")
        if 'longitude' in df.columns:
            logger.info(f"   Longitude range: {df['longitude'].min():.3f} to {df['longitude'].max():.3f}")
        
        logger.info(f"   Variables: {df['variable'].unique().tolist()}")
    
    @staticmethod
    def batch_convert_era5_files(input_pattern=None, output_dir=None):
        """Konversi batch semua file ERA5"""
        
        if input_pattern is None:
            input_pattern = os.path.join(ERA5Config.RAW_DATA_PATH, "data_era5_hourly_*.nc")
        
        if output_dir is None:
            output_dir = ERA5Config.CSV_OUTPUT_PATH
        
        # Cari file NetCDF
        nc_files = glob.glob(input_pattern)
        
        if not nc_files:
            logger.warning(f"Tidak ada file ditemukan dengan pattern: {input_pattern}")
            
            # Coba pattern alternatif
            alternative_patterns = [
                os.path.join(ERA5Config.RAW_DATA_PATH, "*.nc"),
                "data_era5_hourly_*.nc",
                "era5_*.nc"
            ]
            
            for pattern in alternative_patterns:
                nc_files = glob.glob(pattern)
                if nc_files:
                    logger.info(f"Menggunakan pattern alternatif: {pattern}")
                    break
        
        if not nc_files:
            logger.error("Tidak ada file NetCDF yang ditemukan!")
            return []
        
        logger.info(f"Menemukan {len(nc_files)} file untuk dikonversi:")
        for i, nc_file in enumerate(sorted(nc_files), 1):
            logger.info(f"  {i:2d}. {nc_file}")
        
        # Process setiap file
        converted_files = []
        failed_files = []
        
        for nc_file in sorted(nc_files):
            result = ERA5Processor.convert_era5_to_csv(nc_file, output_dir)
            if result:
                converted_files.append(result)
            else:
                failed_files.append(nc_file)
        
        # Summary
        logger.info(f"\n{'='*50}")
        logger.info("SUMMARY KONVERSI:")
        logger.info(f"{'='*50}")
        logger.info(f"‚úÖ Berhasil: {len(converted_files)} file")
        logger.info(f"‚ùå Gagal: {len(failed_files)} file")
        
        if failed_files:
            logger.info("File yang gagal:")
            for ff in failed_files:
                logger.info(f"  - {ff}")
        
        return converted_files

# =============================================================================
# MAIN EXECUTION
# =============================================================================

def main():
    """Main function untuk menjalankan seluruh pipeline"""
    logger.info("üöÄ MEMULAI ERA5 DATA PROCESSING PIPELINE")
    
    # Step 1: Setup environment
    logger.info("üîß Setup environment...")
    if not check_dependencies():
        logger.error("Dependencies tidak lengkap, proses dihentikan")
        return
    
    setup_directories()
    
    # Step 2: Download data
    logger.info("üì• Download data ERA5...")
    downloader = ERA5Downloader()
    downloaded_files = downloader.download_all_data()
    
    if not downloaded_files:
        logger.warning("Tidak ada file yang didownload, lanjut ke file yang sudah ada")
    
    # Step 3: Process data ke CSV
    logger.info("üîÑ Konversi NetCDF ke CSV...")
    processor = ERA5Processor()
    converted_files = processor.batch_convert_era5_files()
    
    # Final summary
    logger.info(f"\nüéâ PIPELINE SELESAI!")
    logger.info(f"üìÅ File CSV tersimpan di: {ERA5Config.CSV_OUTPUT_PATH}")
    
    if converted_files:
        logger.info("File yang berhasil dikonversi:")
        for cf in converted_files:
            logger.info(f"  üìÑ {cf}")

if __name__ == "__main__":
    main()

2025-10-31 22:06:11,706 - INFO - üöÄ MEMULAI ERA5 DATA PROCESSING PIPELINE
2025-10-31 22:06:11,708 - INFO - üîß Setup environment...
2025-10-31 22:06:11,709 - INFO - ‚úÖ xarray tersedia
2025-10-31 22:06:11,710 - INFO - ‚úÖ pandas tersedia
2025-10-31 22:06:11,711 - INFO - ‚úÖ numpy tersedia
2025-10-31 22:06:11,713 - INFO - ‚úÖ cdsapi tersedia
2025-10-31 22:06:11,716 - INFO - Directory created/verified: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\raw_data


2025-10-31 22:06:11,719 - INFO - Directory created/verified: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\processed_data
2025-10-31 22:06:11,722 - INFO - Directory created/verified: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\csv_output
2025-10-31 22:06:11,724 - INFO - üì• Download data ERA5...
2025-10-31 22:08:23,023 - INFO - Retrying now...
2025-10-31 22:10:30,063 - INFO - Retrying now...
2025-10-31 22:12:41,347 - INFO - Retrying now...
2025-10-31 22:12:43,680 - INFO - CDS API client initialized
2025-10-31 22:12:43,682 - INFO - Downloading data untuk 2022-03...
2025-10-31 22:12:44,940 INFO Request ID is ce547519-8fb2-4236-b3f9-48dd3b73dce0
2025-10-31 22:12:44,940 - INFO - Request ID is ce547519-8fb2-4236-b3f9-48dd3b73dce0
2025-10-31 22:12:45,277 INFO status has been updated to accepted
2025-10-31 22:12:45,277 - INFO - status has been updated to accepted
2025-10-31 22:12:54,746 INFO status has been updated to running
2025-10-31

In [9]:
import cdsapi
import xarray as xr
import numpy as np
import pandas as pd
import os
import glob
import warnings
import zipfile
import shutil
from itertools import product
from pathlib import Path
import logging

# Setup logging untuk monitoring yang lebih baik
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Suppress warnings yang tidak penting
warnings.filterwarnings('ignore')

# =============================================================================
# KONFIGURASI
# =============================================================================

class ERA5Config:
    """Konfigurasi untuk download dan processing data ERA5"""
    
    # API Configuration
    CDS_API_KEY = "0454455d-cef6-4a4e-ad46-f0e905629467" 
    CDS_API_URL = "https://cds.climate.copernicus.eu/api"
    
    # Target years dan months
    TARGET_YEARS = ['2022']
    TARGET_MONTHS = ['02', '03']  # February dan March
    
    # Area bounds untuk Jakarta dan sekitarnya
    AREA_BOUNDS = [-6.15, 106.75, -6.36, 106.92] 
    
    # Lokasi spesifik
    LOCATIONS = {
        'bundaran_hi': {'lat': -6.19466, 'lon': 106.8235},
        'kelapa_gading': {'lat': -6.15358, 'lon': 106.91089},
        'jagakarsa': {'lat': -6.35693, 'lon': 106.80367},
        'lubang_buaya': {'lat': -6.28889, 'lon': 106.90919},
        'kebun_jeruk': {'lat': -6.20735, 'lon': 106.75319}
    }
    
    # Variables yang akan didownload
    VARIABLES = [
        '10m_u_component_of_wind', '10m_v_component_of_wind', 
        '2m_dewpoint_temperature', '2m_temperature', 
        'total_precipitation'
    ]
    
    # Path configuration
    BASE_PATH = 'C:\\Users\\user\\OneDrive\\IPB\\Thesis\\02. Development\\01. Data Praprocessing'
    RAW_DATA_PATH = os.path.join(BASE_PATH, 'raw_data')
    PROCESSED_DATA_PATH = os.path.join(BASE_PATH, 'processed_data')
    CSV_OUTPUT_PATH = os.path.join(BASE_PATH, 'csv_output')
    TEMP_EXTRACT_PATH = os.path.join(BASE_PATH, 'temp_extract')

# =============================================================================
# FUNGSI UTILITY
# =============================================================================

def setup_directories():
    """Membuat direktori yang diperlukan"""
    paths = [
        ERA5Config.RAW_DATA_PATH,
        ERA5Config.PROCESSED_DATA_PATH, 
        ERA5Config.CSV_OUTPUT_PATH,
        ERA5Config.TEMP_EXTRACT_PATH
    ]
    
    for path in paths:
        os.makedirs(path, exist_ok=True)
        logger.info(f"Directory created/verified: {path}")

def check_dependencies():
    """Memeriksa semua dependencies yang diperlukan"""
    required_packages = {
        'xarray': 'xarray',
        'pandas': 'pandas', 
        'numpy': 'numpy',
        'cdsapi': 'cdsapi',
        'netcdf4': 'netCDF4'
    }
    
    missing_packages = []
    for package, import_name in required_packages.items():
        try:
            __import__(import_name)
            logger.info(f"‚úÖ {package} tersedia")
        except ImportError:
            missing_packages.append(package)
            logger.error(f"‚ùå {package} tidak tersedia")
    
    if missing_packages:
        logger.error(f"Package yang missing: {missing_packages}")
        logger.error("Install dengan: pip install " + " ".join(missing_packages))
        return False
    
    return True

def extract_zip_file(zip_path, extract_to=None):
    """
    Extract file ZIP dan cari file NetCDF di dalamnya
    """
    if extract_to is None:
        extract_to = ERA5Config.TEMP_EXTRACT_PATH
    
    try:
        logger.info(f"Mengekstrak file ZIP: {zip_path}")
        
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            # Dapatkan list file dalam ZIP
            file_list = zip_ref.namelist()
            logger.info(f"File dalam ZIP: {file_list}")
            
            # Extract semua file
            zip_ref.extractall(extract_to)
            
            # Cari file NetCDF
            nc_files = [f for f in file_list if f.endswith('.nc')]
            if nc_files:
                nc_file_path = os.path.join(extract_to, nc_files[0])
                logger.info(f"File NetCDF ditemukan: {nc_file_path}")
                return nc_file_path
            else:
                logger.warning("Tidak ada file NetCDF dalam ZIP")
                return None
                
    except Exception as e:
        logger.error(f"Error extracting ZIP file: {str(e)}")
        return None

def handle_zip_download(downloaded_file, target_nc_file):
    """
    Handle file yang didownload sebagai ZIP tetapi disimpan sebagai .nc
    """
    try:
        # Cek jika file sebenarnya adalah ZIP
        if zipfile.is_zipfile(downloaded_file):
            logger.info(f"File {downloaded_file} adalah file ZIP")
            
            # Extract ZIP file
            extracted_nc = extract_zip_file(downloaded_file)
            
            if extracted_nc and os.path.exists(extracted_nc):
                # Copy file yang diekstrak ke lokasi target
                shutil.copy2(extracted_nc, target_nc_file)
                logger.info(f"File NetCDF disimpan sebagai: {target_nc_file}")
                
                # Hapus file ZIP asli
                os.remove(downloaded_file)
                logger.info(f"File ZIP asli dihapus: {downloaded_file}")
                
                return True
            else:
                logger.error("Gagal mengekstrak file NetCDF dari ZIP")
                return False
        else:
            logger.info(f"File {downloaded_file} adalah file NetCDF asli")
            return True
            
    except Exception as e:
        logger.error(f"Error handling ZIP download: {str(e)}")
        return False

# =============================================================================
# DOWNLOAD DATA ERA5
# =============================================================================

class ERA5Downloader:
    """Class untuk handle download data ERA5"""
    
    def __init__(self):
        self.client = cdsapi.Client(
            url=ERA5Config.CDS_API_URL, 
            key=ERA5Config.CDS_API_KEY
        )
        logger.info("CDS API client initialized")
    
    def download_monthly_data(self, year, month):
        """Download data bulanan untuk tahun dan bulan tertentu"""
        
        output_file = os.path.join(
            ERA5Config.RAW_DATA_PATH, 
            f'data_era5_hourly_{year}_{month}.nc'
        )
        
        # Skip jika file sudah ada dan ukurannya reasonable
        if (os.path.exists(output_file) and 
            os.path.getsize(output_file) > 10_000_000):  # > 10MB
            logger.info(f"File {output_file} sudah ada, skip download")
            return output_file
        
        # Prepare request parameters
        all_days = [f'{i:02d}' for i in range(1, 32)]
        all_hours = [f'{h:02d}:00' for h in range(24)]
        
        request_params = {
            'product_type': 'reanalysis',
            'format': 'netcdf',
            'variable': ERA5Config.VARIABLES,
            'year': year,
            'month': month,
            'day': all_days,
            'time': all_hours,
            'area': ERA5Config.AREA_BOUNDS,
        }
        
        try:
            logger.info(f"Downloading data untuk {year}-{month}...")
            
            # Download ke temporary file dulu
            temp_file = output_file + '.temp'
            self.client.retrieve(
                'reanalysis-era5-single-levels',
                request_params,
                temp_file
            )
            
            # Handle kemungkinan file ZIP
            if handle_zip_download(temp_file, output_file):
                logger.info(f"‚úÖ Download dan ekstrak selesai: {output_file}")
                return output_file
            else:
                logger.error(f"‚ùå Gagal processing file untuk {year}-{month}")
                return None
            
        except Exception as e:
            logger.error(f"‚ùå Gagal download {year}-{month}: {str(e)}")
            # Hapus file temporary jika ada
            if os.path.exists(temp_file):
                os.remove(temp_file)
            return None
    
    def download_all_data(self):
        """Download semua data berdasarkan konfigurasi"""
        downloaded_files = []
        
        for year, month in product(ERA5Config.TARGET_YEARS, ERA5Config.TARGET_MONTHS):
            result = self.download_monthly_data(year, month)
            if result:
                downloaded_files.append(result)
        
        logger.info(f"Total files downloaded: {len(downloaded_files)}")
        return downloaded_files

# =============================================================================
# PROCESSING DATA NETCDF KE CSV
# =============================================================================

class ERA5Processor:
    """Class untuk processing data ERA5 dari NetCDF ke CSV"""
    
    @staticmethod
    def try_open_netcdf(file_path):
        """Mencoba berbagai engine untuk membuka file NetCDF"""
        engines_to_try = ['netcdf4', 'scipy', 'h5netcdf']
        
        for engine in engines_to_try:
            try:
                logger.info(f"Mencoba buka {file_path} dengan engine: {engine}")
                ds = xr.open_dataset(file_path, engine=engine)
                logger.info(f"‚úÖ Berhasil buka file dengan engine: {engine}")
                return ds
            except Exception as e:
                logger.warning(f"‚ùå Engine {engine} gagal: {str(e)}")
                continue
        
        # Jika semua engine gagal, coba tanpa specify engine
        try:
            logger.info("Mencoba buka file tanpa specify engine...")
            ds = xr.open_dataset(file_path)
            logger.info("‚úÖ Berhasil buka file tanpa specify engine")
            return ds
        except Exception as e:
            logger.error(f"‚ùå Semua metode gagal: {str(e)}")
            return None
    
    @staticmethod
    def convert_era5_to_csv(nc_file_path, output_dir=None):
        """
        Konversi file NetCDF ERA5 ke format CSV dengan error handling yang robust
        """
        try:
            logger.info(f"Memproses file: {nc_file_path}")
            
            # Validasi file
            if not os.path.exists(nc_file_path):
                logger.error(f"File tidak ditemukan: {nc_file_path}")
                return None
            
            file_size = os.path.getsize(nc_file_path) / (1024 * 1024)  # MB
            logger.info(f"Ukuran file: {file_size:.2f} MB")
            
            # Cek jika file adalah ZIP
            if zipfile.is_zipfile(nc_file_path):
                logger.info(f"File {nc_file_path} adalah ZIP file, mengekstrak...")
                extracted_file = extract_zip_file(nc_file_path)
                if extracted_file:
                    nc_file_path = extracted_file
                else:
                    logger.error("Gagal mengekstrak file ZIP")
                    return None
            
            # Buka file NetCDF dengan multiple engine fallback
            ds = ERA5Processor.try_open_netcdf(nc_file_path)
            if ds is None:
                return None
            
            # Ekstrak informasi dataset
            logger.info(f"Dimensi dataset: {dict(ds.dims)}")
            logger.info(f"Variabel yang tersedia: {list(ds.data_vars)}")
            
            # Process setiap variabel
            df_list = []
            for var_name in ds.data_vars:
                try:
                    logger.info(f"Memproses variabel: {var_name}")
                    
                    # Extract data untuk variabel
                    var_data = ds[var_name]
                    var_df = var_data.to_dataframe().reset_index()
                    
                    # Tambahkan metadata
                    var_df['variable'] = var_name
                    var_df['value'] = var_df[var_name]
                    
                    # Select kolom yang relevan
                    base_cols = ['time', 'latitude', 'longitude', 'variable', 'value']
                    available_cols = [col for col in base_cols if col in var_df.columns]
                    
                    # Tambahkan kolom optional jika ada
                    optional_cols = ['number', 'expver', 'step', 'height']
                    for opt_col in optional_cols:
                        if opt_col in var_df.columns:
                            available_cols.append(opt_col)
                    
                    var_df = var_df[available_cols]
                    df_list.append(var_df)
                    
                    logger.info(f"‚úÖ {var_name}: {len(var_df)} baris")
                    
                except Exception as var_error:
                    logger.error(f"‚ùå Error processing {var_name}: {str(var_error)}")
                    continue
            
            if not df_list:
                logger.error("Tidak ada variabel yang berhasil diproses")
                ds.close()
                return None
            
            # Gabungkan semua data
            combined_df = pd.concat(df_list, ignore_index=True)
            
            # Optimasi tipe data
            combined_df = ERA5Processor.optimize_dataframe(combined_df)
            
            # Simpan ke CSV
            base_name = Path(nc_file_path).stem
            if output_dir is None:
                output_dir = ERA5Config.CSV_OUTPUT_PATH
            
            output_file = os.path.join(output_dir, f"{base_name}.csv")
            
            # Handle large files dengan chunks
            if len(combined_df) > 1_000_000:
                logger.info("File besar, menyimpan dalam chunks...")
                ERA5Processor.save_large_dataframe(combined_df, output_file)
            else:
                combined_df.to_csv(output_file, index=False)
                logger.info(f"‚úÖ File CSV disimpan: {output_file}")
            
            # Log statistics
            ERA5Processor.log_data_statistics(combined_df)
            
            ds.close()
            return output_file
            
        except Exception as e:
            logger.error(f"‚ùå Error processing {nc_file_path}: {str(e)}")
            return None
    
    @staticmethod
    def optimize_dataframe(df):
        """Optimasi tipe data untuk menghemat memory"""
        # Optimasi numeric columns
        float_cols = df.select_dtypes(include=['float64']).columns
        for col in float_cols:
            df[col] = df[col].astype(np.float32)
        
        # Optimasi integer columns  
        int_cols = df.select_dtypes(include=['int64']).columns
        for col in int_cols:
            df[col] = df[col].astype(np.int32)
        
        return df
    
    @staticmethod
    def save_large_dataframe(df, output_file, chunk_size=500000):
        """Simpan dataframe besar dalam chunks"""
        base_name = Path(output_file).stem
        
        num_chunks = (len(df) // chunk_size) + 1
        for i in range(num_chunks):
            start_idx = i * chunk_size
            end_idx = min((i + 1) * chunk_size, len(df))
            
            chunk_df = df.iloc[start_idx:end_idx]
            chunk_file = output_file.replace('.csv', f'_part_{i+1:02d}.csv')
            
            chunk_df.to_csv(chunk_file, index=False)
            logger.info(f"‚úÖ Chunk {i+1}/{num_chunks}: {chunk_file}")
    
    @staticmethod
    def log_data_statistics(df):
        """Log statistics data"""
        logger.info("üìä Data Statistics:")
        logger.info(f"   Total rows: {len(df):,}")
        logger.info(f"   Total columns: {len(df.columns)}")
        logger.info(f"   Time range: {df['time'].min()} to {df['time'].max()}")
        
        if 'latitude' in df.columns:
            logger.info(f"   Latitude range: {df['latitude'].min():.3f} to {df['latitude'].max():.3f}")
        if 'longitude' in df.columns:
            logger.info(f"   Longitude range: {df['longitude'].min():.3f} to {df['longitude'].max():.3f}")
        
        logger.info(f"   Variables: {df['variable'].unique().tolist()}")
    
    @staticmethod
    def batch_convert_era5_files(input_pattern=None, output_dir=None):
        """Konversi batch semua file ERA5"""
        
        if input_pattern is None:
            input_pattern = os.path.join(ERA5Config.RAW_DATA_PATH, "data_era5_hourly_*.nc")
        
        if output_dir is None:
            output_dir = ERA5Config.CSV_OUTPUT_PATH
        
        # Cari file NetCDF
        nc_files = glob.glob(input_pattern)
        
        if not nc_files:
            logger.warning(f"Tidak ada file ditemukan dengan pattern: {input_pattern}")
            
            # Coba pattern alternatif
            alternative_patterns = [
                os.path.join(ERA5Config.RAW_DATA_PATH, "*.nc"),
                "data_era5_hourly_*.nc",
                "era5_*.nc"
            ]
            
            for pattern in alternative_patterns:
                nc_files = glob.glob(pattern)
                if nc_files:
                    logger.info(f"Menggunakan pattern alternatif: {pattern}")
                    break
        
        if not nc_files:
            logger.error("Tidak ada file NetCDF yang ditemukan!")
            return []
        
        logger.info(f"Menemukan {len(nc_files)} file untuk dikonversi:")
        for i, nc_file in enumerate(sorted(nc_files), 1):
            logger.info(f"  {i:2d}. {nc_file}")
        
        # Process setiap file
        converted_files = []
        failed_files = []
        
        for nc_file in sorted(nc_files):
            result = ERA5Processor.convert_era5_to_csv(nc_file, output_dir)
            if result:
                converted_files.append(result)
            else:
                failed_files.append(nc_file)
        
        # Summary
        logger.info(f"\n{'='*50}")
        logger.info("SUMMARY KONVERSI:")
        logger.info(f"{'='*50}")
        logger.info(f"‚úÖ Berhasil: {len(converted_files)} file")
        logger.info(f"‚ùå Gagal: {len(failed_files)} file")
        
        if failed_files:
            logger.info("File yang gagal:")
            for ff in failed_files:
                logger.info(f"  - {ff}")
        
        return converted_files

# =============================================================================
# MAIN EXECUTION
# =============================================================================

def main():
    """Main function untuk menjalankan seluruh pipeline"""
    logger.info("üöÄ MEMULAI ERA5 DATA PROCESSING PIPELINE")
    
    # Step 1: Setup environment
    logger.info("üîß Setup environment...")
    if not check_dependencies():
        logger.error("Dependencies tidak lengkap, proses dihentikan")
        return
    
    setup_directories()
    
    # Step 2: Download data
    logger.info("üì• Download data ERA5...")
    downloader = ERA5Downloader()
    downloaded_files = downloader.download_all_data()
    
    if not downloaded_files:
        logger.warning("Tidak ada file yang didownload, lanjut ke file yang sudah ada")
    
    # Step 3: Process data ke CSV
    logger.info("üîÑ Konversi NetCDF ke CSV...")
    processor = ERA5Processor()
    converted_files = processor.batch_convert_era5_files()
    
    # Final summary
    logger.info(f"\nüéâ PIPELINE SELESAI!")
    logger.info(f"üìÅ File CSV tersimpan di: {ERA5Config.CSV_OUTPUT_PATH}")
    
    if converted_files:
        logger.info("File yang berhasil dikonversi:")
        for cf in converted_files:
            logger.info(f"  üìÑ {cf}")
    else:
        logger.error("‚ùå Tidak ada file yang berhasil dikonversi!")
        logger.info("üí° Tips troubleshooting:")
        logger.info("   1. Cek apakah file NetCDF valid dengan: ncdump -h filename.nc")
        logger.info("   2. Pastikan semua dependencies terinstall: pip install netcdf4 h5netcdf")
        logger.info("   3. Coba download ulang file yang bermasalah")

if __name__ == "__main__":
    main()

2025-10-31 22:21:07,394 - INFO - üöÄ MEMULAI ERA5 DATA PROCESSING PIPELINE
2025-10-31 22:21:07,396 - INFO - üîß Setup environment...
2025-10-31 22:21:07,397 - INFO - ‚úÖ xarray tersedia
2025-10-31 22:21:07,398 - INFO - ‚úÖ pandas tersedia
2025-10-31 22:21:07,399 - INFO - ‚úÖ numpy tersedia
2025-10-31 22:21:07,400 - INFO - ‚úÖ cdsapi tersedia
2025-10-31 22:21:07,401 - INFO - ‚úÖ netcdf4 tersedia
2025-10-31 22:21:07,404 - INFO - Directory created/verified: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\raw_data
2025-10-31 22:21:07,405 - INFO - Directory created/verified: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\processed_data
2025-10-31 22:21:07,407 - INFO - Directory created/verified: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\csv_output
2025-10-31 22:21:07,410 - INFO - Directory created/verified: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\temp_extract
2025-10-31 22:21

In [14]:
import cdsapi
import xarray as xr
import numpy as np
import pandas as pd
import os
import glob
import warnings
import zipfile
import shutil
from itertools import product
from pathlib import Path
import logging
from datetime import datetime, timedelta

# Setup logging untuk monitoring yang lebih baik
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Suppress warnings yang tidak penting
warnings.filterwarnings('ignore')

# =============================================================================
# KONFIGURASI
# =============================================================================

class ERA5Config:
    """Konfigurasi untuk download dan processing data ERA5"""
    
    # API Configuration
    CDS_API_KEY = "0454455d-cef6-4a4e-ad46-f0e905629467" 
    CDS_API_URL = "https://cds.climate.copernicus.eu/api"
    
    # Target years dan months
    TARGET_YEARS = ['2024']
    TARGET_MONTHS = ['01','02','03' '04', '05', '06', '07', '08', '09', '10', '11', '12']  # February dan March
    
    # Lokasi spesifik yang ingin diambil datanya
    LOCATIONS = {
        'bundaran_hi': {'lat': -6.19466, 'lon': 106.8235},
        'kelapa_gading': {'lat': -6.15358, 'lon': 106.91089},
        'jagakarsa': {'lat': -6.35693, 'lon': 106.80367},
        'lubang_buaya': {'lat': -6.28889, 'lon': 106.90919},
        'kebun_jeruk': {'lat': -6.20735, 'lon': 106.75319}
    }
    
    # Area bounds untuk download (sedikit lebih besar dari area lokasi)
    AREA_BOUNDS = [
        max(loc['lat'] for loc in LOCATIONS.values()) + 0.1,  # North
        min(loc['lon'] for loc in LOCATIONS.values()) - 0.1,  # West  
        min(loc['lat'] for loc in LOCATIONS.values()) - 0.1,  # South
        max(loc['lon'] for loc in LOCATIONS.values()) + 0.1   # East
    ]
    
    # Variables yang akan didownload
    VARIABLES = [
        '10m_u_component_of_wind', '10m_v_component_of_wind', 
        '2m_dewpoint_temperature', '2m_temperature', 
        'total_precipitation'
    ]
    
    # Path configuration
    BASE_PATH = 'C:\\Users\\user\\OneDrive\\IPB\\Thesis\\02. Development\\01. Data Praprocessing'
    RAW_DATA_PATH = os.path.join(BASE_PATH, 'raw_data')
    PROCESSED_DATA_PATH = os.path.join(BASE_PATH, 'processed_data')
    CSV_OUTPUT_PATH = os.path.join(BASE_PATH, 'csv_output')
    TEMP_EXTRACT_PATH = os.path.join(BASE_PATH, 'temp_extract')

# =============================================================================
# FUNGSI UTILITY
# =============================================================================

def setup_directories():
    """Membuat direktori yang diperlukan"""
    paths = [
        ERA5Config.RAW_DATA_PATH,
        ERA5Config.PROCESSED_DATA_PATH, 
        ERA5Config.CSV_OUTPUT_PATH,
        ERA5Config.TEMP_EXTRACT_PATH
    ]
    
    for path in paths:
        os.makedirs(path, exist_ok=True)
        logger.info(f"Directory created/verified: {path}")

def check_dependencies():
    """Memeriksa semua dependencies yang diperlukan"""
    required_packages = {
        'xarray': 'xarray',
        'pandas': 'pandas', 
        'numpy': 'numpy',
        'cdsapi': 'cdsapi',
        'netcdf4': 'netCDF4'
    }
    
    missing_packages = []
    for package, import_name in required_packages.items():
        try:
            __import__(import_name)
            logger.info(f"‚úÖ {package} tersedia")
        except ImportError:
            missing_packages.append(package)
            logger.error(f"‚ùå {package} tidak tersedia")
    
    if missing_packages:
        logger.error(f"Package yang missing: {missing_packages}")
        logger.error("Install dengan: pip install " + " ".join(missing_packages))
        return False
    
    return True

def extract_zip_file(zip_path, extract_to=None):
    """
    Extract file ZIP dan cari file NetCDF di dalamnya
    """
    if extract_to is None:
        extract_to = ERA5Config.TEMP_EXTRACT_PATH
    
    try:
        logger.info(f"Mengekstrak file ZIP: {zip_path}")
        
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            # Dapatkan list file dalam ZIP
            file_list = zip_ref.namelist()
            logger.info(f"File dalam ZIP: {file_list}")
            
            # Extract semua file
            zip_ref.extractall(extract_to)
            
            # Cari file NetCDF
            nc_files = [f for f in file_list if f.endswith('.nc')]
            if nc_files:
                nc_file_path = os.path.join(extract_to, nc_files[0])
                logger.info(f"File NetCDF ditemukan: {nc_file_path}")
                return nc_file_path
            else:
                logger.warning("Tidak ada file NetCDF dalam ZIP")
                return None
                
    except Exception as e:
        logger.error(f"Error extracting ZIP file: {str(e)}")
        return None

def handle_zip_download(downloaded_file, target_nc_file):
    """
    Handle file yang didownload sebagai ZIP tetapi disimpan sebagai .nc
    """
    try:
        # Cek jika file sebenarnya adalah ZIP
        if zipfile.is_zipfile(downloaded_file):
            logger.info(f"File {downloaded_file} adalah file ZIP")
            
            # Extract ZIP file
            extracted_nc = extract_zip_file(downloaded_file)
            
            if extracted_nc and os.path.exists(extracted_nc):
                # Copy file yang diekstrak ke lokasi target
                shutil.copy2(extracted_nc, target_nc_file)
                logger.info(f"File NetCDF disimpan sebagai: {target_nc_file}")
                
                # Hapus file ZIP asli
                os.remove(downloaded_file)
                logger.info(f"File ZIP asli dihapus: {downloaded_file}")
                
                return True
            else:
                logger.error("Gagal mengekstrak file NetCDF dari ZIP")
                return False
        else:
            logger.info(f"File {downloaded_file} adalah file NetCDF asli")
            return True
            
    except Exception as e:
        logger.error(f"Error handling ZIP download: {str(e)}")
        return False

# =============================================================================
# DOWNLOAD DATA ERA5
# =============================================================================

class ERA5Downloader:
    """Class untuk handle download data ERA5"""
    
    def __init__(self):
        self.client = cdsapi.Client(
            url=ERA5Config.CDS_API_URL, 
            key=ERA5Config.CDS_API_KEY
        )
        logger.info("CDS API client initialized")
    
    def download_monthly_data(self, year, month):
        """Download data bulanan untuk tahun dan bulan tertentu"""
        
        output_file = os.path.join(
            ERA5Config.RAW_DATA_PATH, 
            f'data_era5_hourly_{year}_{month}.nc'
        )
        
        # Skip jika file sudah ada dan ukurannya reasonable
        if (os.path.exists(output_file) and 
            os.path.getsize(output_file) > 10_000_000):  # > 10MB
            logger.info(f"File {output_file} sudah ada, skip download")
            return output_file
        
        # Prepare request parameters
        all_days = [f'{i:02d}' for i in range(1, 32)]
        all_hours = [f'{h:02d}:00' for h in range(24)]
        
        request_params = {
            'product_type': 'reanalysis',
            'format': 'netcdf',
            'variable': ERA5Config.VARIABLES,
            'year': year,
            'month': month,
            'day': all_days,
            'time': all_hours,
            'area': ERA5Config.AREA_BOUNDS,
        }
        
        try:
            logger.info(f"Downloading data untuk {year}-{month}...")
            
            # Download ke temporary file dulu
            temp_file = output_file + '.temp'
            self.client.retrieve(
                'reanalysis-era5-single-levels',
                request_params,
                temp_file
            )
            
            # Handle kemungkinan file ZIP
            if handle_zip_download(temp_file, output_file):
                logger.info(f"‚úÖ Download dan ekstrak selesai: {output_file}")
                return output_file
            else:
                logger.error(f"‚ùå Gagal processing file untuk {year}-{month}")
                return None
            
        except Exception as e:
            logger.error(f"‚ùå Gagal download {year}-{month}: {str(e)}")
            # Hapus file temporary jika ada
            if os.path.exists(temp_file):
                os.remove(temp_file)
            return None
    
    def download_all_data(self):
        """Download semua data berdasarkan konfigurasi"""
        downloaded_files = []
        
        for year, month in product(ERA5Config.TARGET_YEARS, ERA5Config.TARGET_MONTHS):
            result = self.download_monthly_data(year, month)
            if result:
                downloaded_files.append(result)
        
        logger.info(f"Total files downloaded: {len(downloaded_files)}")
        return downloaded_files

# =============================================================================
# PROCESSING DATA NETCDF KE CSV (PER LOKASI SPESIFIK)
# =============================================================================

class ERA5Processor:
    """Class untuk processing data ERA5 dari NetCDF ke CSV per lokasi spesifik"""
    
    @staticmethod
    def try_open_netcdf(file_path):
        """Mencoba berbagai engine untuk membuka file NetCDF"""
        engines_to_try = ['netcdf4', 'scipy', 'h5netcdf']
        
        for engine in engines_to_try:
            try:
                logger.info(f"Mencoba buka {file_path} dengan engine: {engine}")
                ds = xr.open_dataset(file_path, engine=engine)
                logger.info(f"‚úÖ Berhasil buka file dengan engine: {engine}")
                return ds
            except Exception as e:
                logger.warning(f"‚ùå Engine {engine} gagal: {str(e)}")
                continue
        
        # Jika semua engine gagal, coba tanpa specify engine
        try:
            logger.info("Mencoba buka file tanpa specify engine...")
            ds = xr.open_dataset(file_path)
            logger.info("‚úÖ Berhasil buka file tanpa specify engine")
            return ds
        except Exception as e:
            logger.error(f"‚ùå Semua metode gagal: {str(e)}")
            return None

    @staticmethod
    def find_nearest_grid_point(ds, target_lat, target_lon):
        """Mencari titik grid terdekat dengan lokasi target"""
        try:
            # Calculate absolute differences
            lat_diff = np.abs(ds.latitude.values - target_lat)
            lon_diff = np.abs(ds.longitude.values - target_lon)
            
            # Find indices of minimum differences
            lat_idx = np.argmin(lat_diff)
            lon_idx = np.argmin(lon_diff)
            
            # Get actual coordinates
            actual_lat = ds.latitude.values[lat_idx]
            actual_lon = ds.longitude.values[lon_idx]
            
            logger.info(f"Target: ({target_lat:.4f}, {target_lon:.4f}) -> Grid: ({actual_lat:.4f}, {actual_lon:.4f})")
            
            return lat_idx, lon_idx, actual_lat, actual_lon
            
        except Exception as e:
            logger.error(f"Error finding nearest grid point: {str(e)}")
            return None, None, None, None

    @staticmethod
    def extract_data_for_location(ds, location_name, target_lat, target_lon):
        """Extract data untuk satu lokasi spesifik"""
        try:
            # Find nearest grid point
            lat_idx, lon_idx, actual_lat, actual_lon = ERA5Processor.find_nearest_grid_point(
                ds, target_lat, target_lon
            )
            
            if lat_idx is None:
                return None
            
            # Extract data untuk titik tersebut
            location_data = {}
            
            for var_name in ds.data_vars:
                try:
                    # Select data untuk titik lokasi
                    var_data = ds[var_name].isel(latitude=lat_idx, longitude=lon_idx)
                    
                    # Convert to pandas Series
                    var_series = var_data.to_series()
                    
                    # Store in dictionary
                    location_data[var_name] = var_series
                    
                except Exception as e:
                    logger.warning(f"Error extracting {var_name} for {location_name}: {str(e)}")
                    continue
            
            if not location_data:
                logger.error(f"Tidak ada data yang berhasil diekstrak untuk {location_name}")
                return None
            
            # Create DataFrame
            df = pd.DataFrame(location_data)
            df.reset_index(inplace=True)
            
            # Debug: Print column names untuk troubleshooting
            logger.info(f"Columns in DataFrame: {df.columns.tolist()}")
            
            # Cari kolom waktu yang benar
            time_column = None
            possible_time_columns = ['valid_time', 'time', 'datetime', 'index']
            for col in possible_time_columns:
                if col in df.columns:
                    time_column = col
                    break
            
            if time_column is None:
                # Jika tidak ada kolom waktu yang jelas, gunakan index
                df['datetime'] = df.index
                time_column = 'datetime'
            
            # Rename kolom waktu ke 'datetime'
            if time_column != 'datetime':
                df.rename(columns={time_column: 'datetime'}, inplace=True)
            
            # Add location information
            df['location_name'] = location_name
            df['latitude'] = actual_lat
            df['longitude'] = actual_lon
            
            # Rename columns untuk lebih jelas
            column_mapping = {
                't2m': 'temperature_2m',
                'd2m': 'dewpoint_temperature_2m',
                'u10': 'u_wind_10m',
                'v10': 'v_wind_10m',
                'tp': 'precipitation'
            }
            
            df.rename(columns=column_mapping, inplace=True)
            
            # Calculate additional variables
            if 'u_wind_10m' in df.columns and 'v_wind_10m' in df.columns:
                df['wind_speed_10m'] = np.sqrt(df['u_wind_10m']**2 + df['v_wind_10m']**2)
                df['wind_direction_10m'] = np.arctan2(df['v_wind_10m'], df['u_wind_10m']) * 180 / np.pi
                df['wind_direction_10m'] = (df['wind_direction_10m'] + 360) % 360  # Convert to 0-360
            
            # Convert temperature from Kelvin to Celsius
            if 'temperature_2m' in df.columns:
                df['temperature_2m_c'] = df['temperature_2m'] - 273.15
            
            if 'dewpoint_temperature_2m' in df.columns:
                df['dewpoint_temperature_2m_c'] = df['dewpoint_temperature_2m'] - 273.15
            
            logger.info(f"‚úÖ Data untuk {location_name}: {len(df)} records, columns: {df.columns.tolist()}")
            return df
            
        except Exception as e:
            logger.error(f"Error extracting data for {location_name}: {str(e)}")
            return None

    @staticmethod
    def convert_era5_to_csv_per_location(nc_file_path, output_dir=None):
        """
        Konversi file NetCDF ERA5 ke format CSV dengan data per lokasi spesifik
        """
        try:
            logger.info(f"Memproses file: {nc_file_path}")
            
            # Validasi file
            if not os.path.exists(nc_file_path):
                logger.error(f"File tidak ditemukan: {nc_file_path}")
                return None
            
            file_size = os.path.getsize(nc_file_path) / (1024 * 1024)  # MB
            logger.info(f"Ukuran file: {file_size:.2f} MB")
            
            # Cek jika file adalah ZIP
            if zipfile.is_zipfile(nc_file_path):
                logger.info(f"File {nc_file_path} adalah ZIP file, mengekstrak...")
                extracted_file = extract_zip_file(nc_file_path)
                if extracted_file:
                    nc_file_path = extracted_file
                else:
                    logger.error("Gagal mengekstrak file ZIP")
                    return None
            
            # Buka file NetCDF dengan multiple engine fallback
            ds = ERA5Processor.try_open_netcdf(nc_file_path)
            if ds is None:
                return None
            
            # Ekstrak informasi dataset
            logger.info(f"Dimensi dataset: {dict(ds.dims)}")
            logger.info(f"Koordinat dataset: {list(ds.coords)}")
            logger.info(f"Variabel yang tersedia: {list(ds.data_vars)}")
            
            # Extract data untuk setiap lokasi
            all_location_data = []
            
            for location_name, coords in ERA5Config.LOCATIONS.items():
                logger.info(f"üìç Memproses lokasi: {location_name}")
                
                location_df = ERA5Processor.extract_data_for_location(
                    ds, location_name, coords['lat'], coords['lon']
                )
                
                if location_df is not None:
                    all_location_data.append(location_df)
            
            if not all_location_data:
                logger.error("Tidak ada data yang berhasil diekstrak untuk semua lokasi")
                ds.close()
                return None
            
            # Gabungkan data semua lokasi
            combined_df = pd.concat(all_location_data, ignore_index=True)
            
            # Optimasi tipe data
            combined_df = ERA5Processor.optimize_dataframe(combined_df)
            
            # Urutkan data
            if 'datetime' in combined_df.columns:
                combined_df.sort_values(['datetime', 'location_name'], inplace=True)
            
            # Simpan ke CSV
            base_name = Path(nc_file_path).stem
            if output_dir is None:
                output_dir = ERA5Config.CSV_OUTPUT_PATH
            
            output_file = os.path.join(output_dir, f"{base_name}_per_location.csv")
            
            # Simpan ke CSV
            combined_df.to_csv(output_file, index=False)
            logger.info(f"‚úÖ File CSV disimpan: {output_file}")
            
            # Log statistics
            ERA5Processor.log_data_statistics(combined_df)
            
            ds.close()
            return output_file
            
        except Exception as e:
            logger.error(f"‚ùå Error processing {nc_file_path}: {str(e)}")
            import traceback
            logger.error(f"Traceback: {traceback.format_exc()}")
            return None

    @staticmethod
    def optimize_dataframe(df):
        """Optimasi tipe data untuk menghemat memory"""
        # Optimasi numeric columns
        float_cols = df.select_dtypes(include=['float64']).columns
        for col in float_cols:
            if col in df.columns:
                df[col] = df[col].astype(np.float32)
        
        return df
    
    @staticmethod
    def log_data_statistics(df):
        """Log statistics data"""
        logger.info("üìä Data Statistics:")
        logger.info(f"   Total rows: {len(df):,}")
        logger.info(f"   Total columns: {len(df.columns)}")
        
        if 'datetime' in df.columns:
            logger.info(f"   Time range: {df['datetime'].min()} to {df['datetime'].max()}")
        
        logger.info(f"   Locations: {df['location_name'].unique().tolist()}")
        logger.info(f"   Available variables: {[col for col in df.columns if col not in ['datetime', 'location_name', 'latitude', 'longitude']]}")
        
        # Preview data
        logger.info("üëÄ Preview data (first 5 rows):")
        print(df.head().to_string())

    @staticmethod
    def batch_convert_era5_files(input_pattern=None, output_dir=None):
        """Konversi batch semua file ERA5"""
        
        if input_pattern is None:
            input_pattern = os.path.join(ERA5Config.RAW_DATA_PATH, "data_era5_hourly_*.nc")
        
        if output_dir is None:
            output_dir = ERA5Config.CSV_OUTPUT_PATH
        
        # Cari file NetCDF
        nc_files = glob.glob(input_pattern)
        
        if not nc_files:
            logger.warning(f"Tidak ada file ditemukan dengan pattern: {input_pattern}")
            
            # Coba pattern alternatif
            alternative_patterns = [
                os.path.join(ERA5Config.RAW_DATA_PATH, "*.nc"),
                "data_era5_hourly_*.nc",
                "era5_*.nc"
            ]
            
            for pattern in alternative_patterns:
                nc_files = glob.glob(pattern)
                if nc_files:
                    logger.info(f"Menggunakan pattern alternatif: {pattern}")
                    break
        
        if not nc_files:
            logger.error("Tidak ada file NetCDF yang ditemukan!")
            return []
        
        logger.info(f"Menemukan {len(nc_files)} file untuk dikonversi:")
        for i, nc_file in enumerate(sorted(nc_files), 1):
            logger.info(f"  {i:2d}. {nc_file}")
        
        # Process setiap file
        converted_files = []
        failed_files = []
        
        for nc_file in sorted(nc_files):
            result = ERA5Processor.convert_era5_to_csv_per_location(nc_file, output_dir)
            if result:
                converted_files.append(result)
            else:
                failed_files.append(nc_file)
        
        # Summary
        logger.info(f"\n{'='*50}")
        logger.info("SUMMARY KONVERSI:")
        logger.info(f"{'='*50}")
        logger.info(f"‚úÖ Berhasil: {len(converted_files)} file")
        logger.info(f"‚ùå Gagal: {len(failed_files)} file")
        
        if failed_files:
            logger.info("File yang gagal:")
            for ff in failed_files:
                logger.info(f"  - {ff}")
        
        return converted_files

# =============================================================================
# MAIN EXECUTION
# =============================================================================

def main():
    """Main function untuk menjalankan seluruh pipeline"""
    logger.info("üöÄ MEMULAI ERA5 DATA PROCESSING PIPELINE")
    
    # Step 1: Setup environment
    logger.info("üîß Setup environment...")
    if not check_dependencies():
        logger.error("Dependencies tidak lengkap, proses dihentikan")
        return
    
    setup_directories()
    
    # Step 2: Download data
    logger.info("üì• Download data ERA5...")
    downloader = ERA5Downloader()
    downloaded_files = downloader.download_all_data()
    
    if not downloaded_files:
        logger.warning("Tidak ada file yang didownload, lanjut ke file yang sudah ada")
    
    # Step 3: Process data ke CSV per lokasi
    logger.info("üîÑ Konversi NetCDF ke CSV (per lokasi)...")
    processor = ERA5Processor()
    converted_files = processor.batch_convert_era5_files()
    
    # Final summary
    logger.info(f"\nüéâ PIPELINE SELESAI!")
    logger.info(f"üìÅ File CSV tersimpan di: {ERA5Config.CSV_OUTPUT_PATH}")
    
    if converted_files:
        logger.info("File yang berhasil dikonversi:")
        for cf in converted_files:
            logger.info(f"  üìÑ {cf}")
    else:
        logger.error("‚ùå Tidak ada file yang berhasil dikonversi!")

if __name__ == "__main__":
    main()

2025-11-01 06:55:07,110 - INFO - üöÄ MEMULAI ERA5 DATA PROCESSING PIPELINE
2025-11-01 06:55:07,120 - INFO - üîß Setup environment...
2025-11-01 06:55:07,120 - INFO - ‚úÖ xarray tersedia
2025-11-01 06:55:07,121 - INFO - ‚úÖ pandas tersedia
2025-11-01 06:55:07,122 - INFO - ‚úÖ numpy tersedia
2025-11-01 06:55:07,123 - INFO - ‚úÖ cdsapi tersedia
2025-11-01 06:55:07,124 - INFO - ‚úÖ netcdf4 tersedia
2025-11-01 06:55:07,129 - INFO - Directory created/verified: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\raw_data
2025-11-01 06:55:07,131 - INFO - Directory created/verified: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\processed_data
2025-11-01 06:55:07,134 - INFO - Directory created/verified: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\csv_output
2025-11-01 06:55:07,136 - INFO - Directory created/verified: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\temp_extract
2025-11-01 06:55

       datetime  u_wind_10m  v_wind_10m  dewpoint_temperature_2m  temperature_2m  location_name  latitude   longitude  wind_speed_10m  wind_direction_10m  temperature_2m_c  dewpoint_temperature_2m_c
0    2022-01-01    2.772162    0.900436               296.432098      299.064911    bundaran_hi    -6.206  106.903999        2.914733           17.994507         25.914917                  23.282104
1488 2022-01-01    1.274543    0.846031               295.718964      297.858521      jagakarsa    -6.457  106.903999        1.529780           33.575867         24.708527                  22.568970
2976 2022-01-01    2.749457    1.250961               296.521790      299.121338    kebun_jeruk    -6.206  106.653000        3.020665           24.464813         25.971344                  23.371796
744  2022-01-01    2.772162    0.900436               296.432098      299.064911  kelapa_gading    -6.206  106.903999        2.914733           17.994507         25.914917                  23.282104
2232 

2025-11-01 07:26:00,875 - INFO - Columns in DataFrame: ['valid_time', 'u10', 'v10', 'd2m', 't2m']
2025-11-01 07:26:00,884 - INFO - ‚úÖ Data untuk kelapa_gading: 744 records, columns: ['datetime', 'u_wind_10m', 'v_wind_10m', 'dewpoint_temperature_2m', 'temperature_2m', 'location_name', 'latitude', 'longitude', 'wind_speed_10m', 'wind_direction_10m', 'temperature_2m_c', 'dewpoint_temperature_2m_c']
2025-11-01 07:26:00,885 - INFO - üìç Memproses lokasi: jagakarsa
2025-11-01 07:26:00,887 - INFO - Target: (-6.3569, 106.8037) -> Grid: (-6.4570, 106.9040)
2025-11-01 07:26:00,922 - INFO - Columns in DataFrame: ['valid_time', 'u10', 'v10', 'd2m', 't2m']
2025-11-01 07:26:00,931 - INFO - ‚úÖ Data untuk jagakarsa: 744 records, columns: ['datetime', 'u_wind_10m', 'v_wind_10m', 'dewpoint_temperature_2m', 'temperature_2m', 'location_name', 'latitude', 'longitude', 'wind_speed_10m', 'wind_direction_10m', 'temperature_2m_c', 'dewpoint_temperature_2m_c']
2025-11-01 07:26:00,932 - INFO - üìç Memproses 

       datetime  u_wind_10m  v_wind_10m  dewpoint_temperature_2m  temperature_2m  location_name  latitude   longitude  wind_speed_10m  wind_direction_10m  temperature_2m_c  dewpoint_temperature_2m_c
0    2022-04-01   -0.627819    0.551236               296.979675      298.482117    bundaran_hi    -6.206  106.903999        0.835475          138.716278         25.332123                  23.829681
1440 2022-04-01   -0.067577    0.890958               295.748962      296.750458      jagakarsa    -6.457  106.903999        0.893517           94.337463         23.600464                  22.598969
2880 2022-04-01   -0.115780    0.565709               296.837891      298.516876    kebun_jeruk    -6.206  106.653000        0.577436          101.566589         25.366882                  23.687897
720  2022-04-01   -0.627819    0.551236               296.979675      298.482117  kelapa_gading    -6.206  106.903999        0.835475          138.716278         25.332123                  23.829681
2160 

2025-11-01 07:26:01,415 - INFO - Memproses file: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\raw_data\data_era5_hourly_2022_06.nc
2025-11-01 07:26:01,417 - INFO - Ukuran file: 0.13 MB
2025-11-01 07:26:01,421 - INFO - Mencoba buka C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\raw_data\data_era5_hourly_2022_06.nc dengan engine: netcdf4
2025-11-01 07:26:01,446 - INFO - ‚úÖ Berhasil buka file dengan engine: netcdf4
2025-11-01 07:26:01,448 - INFO - Dimensi dataset: {'valid_time': 720, 'latitude': 2, 'longitude': 2}
2025-11-01 07:26:01,449 - INFO - Koordinat dataset: ['number', 'valid_time', 'latitude', 'longitude', 'expver']
2025-11-01 07:26:01,450 - INFO - Variabel yang tersedia: ['u10', 'v10', 'd2m', 't2m']
2025-11-01 07:26:01,451 - INFO - üìç Memproses lokasi: bundaran_hi
2025-11-01 07:26:01,452 - INFO - Target: (-6.1947, 106.8235) -> Grid: (-6.2060, 106.9040)
2025-11-01 07:26:01,462 - INFO - Columns in DataFrame: ['valid_time', 'u

       datetime  u_wind_10m  v_wind_10m  dewpoint_temperature_2m  temperature_2m  location_name  latitude   longitude  wind_speed_10m  wind_direction_10m  temperature_2m_c  dewpoint_temperature_2m_c
0    2022-06-01    0.416042    1.764739               295.836578      296.928619    bundaran_hi    -6.206  106.903999        1.813118           76.734558         23.778625                  22.686584
1440 2022-06-01    0.426029    1.792865               294.285339      295.013641      jagakarsa    -6.457  106.903999        1.842787           76.632996         21.863647                  21.135345
2880 2022-06-01    0.345837    1.981212               295.915009      296.878571    kebun_jeruk    -6.206  106.653000        2.011169           80.098328         23.728577                  22.765015
720  2022-06-01    0.416042    1.764739               295.836578      296.928619  kelapa_gading    -6.206  106.903999        1.813118           76.734558         23.778625                  22.686584
2160 

2025-11-01 07:26:01,818 - INFO -    Time range: 2022-07-01 00:00:00 to 2022-07-31 23:00:00
2025-11-01 07:26:01,819 - INFO -    Locations: ['bundaran_hi', 'jagakarsa', 'kebun_jeruk', 'kelapa_gading', 'lubang_buaya']
2025-11-01 07:26:01,820 - INFO -    Available variables: ['u_wind_10m', 'v_wind_10m', 'dewpoint_temperature_2m', 'temperature_2m', 'wind_speed_10m', 'wind_direction_10m', 'temperature_2m_c', 'dewpoint_temperature_2m_c']
2025-11-01 07:26:01,821 - INFO - üëÄ Preview data (first 5 rows):
2025-11-01 07:26:01,826 - INFO - Memproses file: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\raw_data\data_era5_hourly_2022_08.nc
2025-11-01 07:26:01,827 - INFO - Ukuran file: 0.13 MB
2025-11-01 07:26:01,830 - INFO - Mencoba buka C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\raw_data\data_era5_hourly_2022_08.nc dengan engine: netcdf4
2025-11-01 07:26:01,846 - INFO - ‚úÖ Berhasil buka file dengan engine: netcdf4
2025-11-01 07:26:01,847 - I

       datetime  u_wind_10m  v_wind_10m  dewpoint_temperature_2m  temperature_2m  location_name  latitude   longitude  wind_speed_10m  wind_direction_10m  temperature_2m_c  dewpoint_temperature_2m_c
0    2022-07-01   -0.416493    1.447820               294.918335      296.342224    bundaran_hi    -6.206  106.903999        1.506535          106.048889         23.192230                  21.768341
1488 2022-07-01    0.032650    1.277471               293.002777      294.094971      jagakarsa    -6.457  106.903999        1.277888           88.535950         20.944977                  19.852783
2976 2022-07-01   -0.307804    1.274953               294.954071      296.159424    kebun_jeruk    -6.206  106.653000        1.311582          103.572845         23.009430                  21.804077
744  2022-07-01   -0.416493    1.447820               294.918335      296.342224  kelapa_gading    -6.206  106.903999        1.506535          106.048889         23.192230                  21.768341
2232 

2025-11-01 07:26:02,026 - INFO - Target: (-6.1947, 106.8235) -> Grid: (-6.2060, 106.9040)
2025-11-01 07:26:02,039 - INFO - Columns in DataFrame: ['valid_time', 'u10', 'v10', 'd2m', 't2m']
2025-11-01 07:26:02,048 - INFO - ‚úÖ Data untuk bundaran_hi: 720 records, columns: ['datetime', 'u_wind_10m', 'v_wind_10m', 'dewpoint_temperature_2m', 'temperature_2m', 'location_name', 'latitude', 'longitude', 'wind_speed_10m', 'wind_direction_10m', 'temperature_2m_c', 'dewpoint_temperature_2m_c']
2025-11-01 07:26:02,051 - INFO - üìç Memproses lokasi: kelapa_gading
2025-11-01 07:26:02,053 - INFO - Target: (-6.1536, 106.9109) -> Grid: (-6.2060, 106.9040)
2025-11-01 07:26:02,063 - INFO - Columns in DataFrame: ['valid_time', 'u10', 'v10', 'd2m', 't2m']
2025-11-01 07:26:02,082 - INFO - ‚úÖ Data untuk kelapa_gading: 720 records, columns: ['datetime', 'u_wind_10m', 'v_wind_10m', 'dewpoint_temperature_2m', 'temperature_2m', 'location_name', 'latitude', 'longitude', 'wind_speed_10m', 'wind_direction_10m', '

       datetime  u_wind_10m  v_wind_10m  dewpoint_temperature_2m  temperature_2m  location_name  latitude   longitude  wind_speed_10m  wind_direction_10m  temperature_2m_c  dewpoint_temperature_2m_c
0    2022-09-01   -1.018948    0.844041               296.792725      299.220886    bundaran_hi    -6.206  106.903999        1.323125          140.363495         26.070892                  23.642731
1440 2022-09-01   -0.050808    1.054002               295.536499      297.370972      jagakarsa    -6.457  106.903999        1.055225           92.759796         24.220978                  22.386505
2880 2022-09-01   -0.601803    0.841080               296.868835      298.958801    kebun_jeruk    -6.206  106.653000        1.034206          125.584167         25.808807                  23.718842
720  2022-09-01   -1.018948    0.844041               296.792725      299.220886  kelapa_gading    -6.206  106.903999        1.323125          140.363495         26.070892                  23.642731
2160 

2025-11-01 07:26:02,458 - INFO - Memproses file: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\raw_data\data_era5_hourly_2022_11.nc
2025-11-01 07:26:02,459 - INFO - Ukuran file: 0.13 MB
2025-11-01 07:26:02,461 - INFO - Mencoba buka C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\raw_data\data_era5_hourly_2022_11.nc dengan engine: netcdf4
2025-11-01 07:26:02,489 - INFO - ‚úÖ Berhasil buka file dengan engine: netcdf4
2025-11-01 07:26:02,491 - INFO - Dimensi dataset: {'valid_time': 720, 'latitude': 2, 'longitude': 2}
2025-11-01 07:26:02,492 - INFO - Koordinat dataset: ['number', 'valid_time', 'latitude', 'longitude', 'expver']
2025-11-01 07:26:02,493 - INFO - Variabel yang tersedia: ['u10', 'v10', 'd2m', 't2m']
2025-11-01 07:26:02,494 - INFO - üìç Memproses lokasi: bundaran_hi
2025-11-01 07:26:02,495 - INFO - Target: (-6.1947, 106.8235) -> Grid: (-6.2060, 106.9040)
2025-11-01 07:26:02,504 - INFO - Columns in DataFrame: ['valid_time', 'u

       datetime  u_wind_10m  v_wind_10m  dewpoint_temperature_2m  temperature_2m  location_name  latitude   longitude  wind_speed_10m  wind_direction_10m  temperature_2m_c  dewpoint_temperature_2m_c
0    2022-11-01   -0.634461    1.553469               296.826965      299.394592    bundaran_hi    -6.206  106.903999        1.678036          112.215881         26.244598                  23.676971
1440 2022-11-01   -0.204270    1.266443               295.752686      297.835663      jagakarsa    -6.457  106.903999        1.282811           99.162567         24.685669                  22.602692
2880 2022-11-01   -0.578835    1.542246               296.763794      299.290039    kebun_jeruk    -6.206  106.653000        1.647292          110.572083         26.140045                  23.613800
720  2022-11-01   -0.634461    1.553469               296.826965      299.394592  kelapa_gading    -6.206  106.903999        1.678036          112.215881         26.244598                  23.676971
2160 

2025-11-01 07:26:02,861 - INFO - ‚úÖ File CSV disimpan: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\csv_output\data_era5_hourly_2022_12_per_location.csv
2025-11-01 07:26:02,863 - INFO - üìä Data Statistics:
2025-11-01 07:26:02,865 - INFO -    Total rows: 3,720
2025-11-01 07:26:02,867 - INFO -    Total columns: 12
2025-11-01 07:26:02,870 - INFO -    Time range: 2022-12-01 00:00:00 to 2022-12-31 23:00:00
2025-11-01 07:26:02,873 - INFO -    Locations: ['bundaran_hi', 'jagakarsa', 'kebun_jeruk', 'kelapa_gading', 'lubang_buaya']
2025-11-01 07:26:02,875 - INFO -    Available variables: ['u_wind_10m', 'v_wind_10m', 'dewpoint_temperature_2m', 'temperature_2m', 'wind_speed_10m', 'wind_direction_10m', 'temperature_2m_c', 'dewpoint_temperature_2m_c']
2025-11-01 07:26:02,878 - INFO - üëÄ Preview data (first 5 rows):
2025-11-01 07:26:02,889 - INFO - Memproses file: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\raw_data\data_era5_hourly_2023_

       datetime  u_wind_10m  v_wind_10m  dewpoint_temperature_2m  temperature_2m  location_name  latitude   longitude  wind_speed_10m  wind_direction_10m  temperature_2m_c  dewpoint_temperature_2m_c
0    2022-12-01    0.741255    0.528609               296.288940      298.946747    bundaran_hi    -6.206  106.903999        0.910432           35.493713         25.796753                  23.138947
1488 2022-12-01    0.437514    0.818648               295.130585      297.344757      jagakarsa    -6.457  106.903999        0.928226           61.878448         24.194763                  21.980591
2976 2022-12-01    0.884955    0.881194               296.058136      298.697235    kebun_jeruk    -6.206  106.653000        1.248859           44.877991         25.547241                  22.908142
744  2022-12-01    0.741255    0.528609               296.288940      298.946747  kelapa_gading    -6.206  106.903999        0.910432           35.493713         25.796753                  23.138947
2232 

2025-11-01 07:26:03,160 - INFO - ‚úÖ File CSV disimpan: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\csv_output\data_era5_hourly_2023_01_per_location.csv
2025-11-01 07:26:03,162 - INFO - üìä Data Statistics:
2025-11-01 07:26:03,162 - INFO -    Total rows: 3,720
2025-11-01 07:26:03,163 - INFO -    Total columns: 12
2025-11-01 07:26:03,165 - INFO -    Time range: 2023-01-01 00:00:00 to 2023-01-31 23:00:00
2025-11-01 07:26:03,166 - INFO -    Locations: ['bundaran_hi', 'jagakarsa', 'kebun_jeruk', 'kelapa_gading', 'lubang_buaya']
2025-11-01 07:26:03,167 - INFO -    Available variables: ['u_wind_10m', 'v_wind_10m', 'dewpoint_temperature_2m', 'temperature_2m', 'wind_speed_10m', 'wind_direction_10m', 'temperature_2m_c', 'dewpoint_temperature_2m_c']
2025-11-01 07:26:03,168 - INFO - üëÄ Preview data (first 5 rows):
2025-11-01 07:26:03,175 - INFO - Memproses file: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\raw_data\data_era5_hourly_2023_

       datetime  u_wind_10m  v_wind_10m  dewpoint_temperature_2m  temperature_2m  location_name  latitude   longitude  wind_speed_10m  wind_direction_10m  temperature_2m_c  dewpoint_temperature_2m_c
0    2023-01-01    3.075758    0.363742               296.163696      297.533478    bundaran_hi    -6.206  106.903999        3.097192            6.744507         24.383484                  23.013702
1488 2023-01-01    1.991102   -0.163923               295.289459      296.147369      jagakarsa    -6.457  106.903999        1.997838          355.293579         22.997375                  22.139465
2976 2023-01-01    2.074171    0.687701               296.270691      297.626465    kebun_jeruk    -6.206  106.653000        2.185204           18.343170         24.476471                  23.120697
744  2023-01-01    3.075758    0.363742               296.163696      297.533478  kelapa_gading    -6.206  106.903999        3.097192            6.744507         24.383484                  23.013702
2232 

2025-11-01 07:26:03,375 - INFO - üëÄ Preview data (first 5 rows):
2025-11-01 07:26:03,382 - INFO - Memproses file: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\raw_data\data_era5_hourly_2023_05.nc
2025-11-01 07:26:03,384 - INFO - Ukuran file: 0.13 MB
2025-11-01 07:26:03,387 - INFO - Mencoba buka C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\raw_data\data_era5_hourly_2023_05.nc dengan engine: netcdf4
2025-11-01 07:26:03,408 - INFO - ‚úÖ Berhasil buka file dengan engine: netcdf4
2025-11-01 07:26:03,409 - INFO - Dimensi dataset: {'valid_time': 744, 'latitude': 2, 'longitude': 2}
2025-11-01 07:26:03,409 - INFO - Koordinat dataset: ['number', 'valid_time', 'latitude', 'longitude', 'expver']
2025-11-01 07:26:03,410 - INFO - Variabel yang tersedia: ['u10', 'v10', 'd2m', 't2m']
2025-11-01 07:26:03,411 - INFO - üìç Memproses lokasi: bundaran_hi
2025-11-01 07:26:03,411 - INFO - Target: (-6.1947, 106.8235) -> Grid: (-6.2060, 106.9040)
2025-

       datetime  u_wind_10m  v_wind_10m  dewpoint_temperature_2m  temperature_2m  location_name  latitude   longitude  wind_speed_10m  wind_direction_10m  temperature_2m_c  dewpoint_temperature_2m_c
0    2023-02-01    2.091321    0.208390               296.850830      298.195862    bundaran_hi    -6.206  106.903999        2.101677            5.690460         25.045868                  23.700836
1344 2023-02-01    1.059491    0.646531               295.754852      296.526428      jagakarsa    -6.457  106.903999        1.241178           31.392700         23.376434                  22.604858
2688 2023-02-01    1.012463    0.550752               296.784637      298.345642    kebun_jeruk    -6.206  106.653000        1.152566           28.544952         25.195648                  23.634644
672  2023-02-01    2.091321    0.208390               296.850830      298.195862  kelapa_gading    -6.206  106.903999        2.101677            5.690460         25.045868                  23.700836
2016 

2025-11-01 07:26:03,586 - INFO - Dimensi dataset: {'valid_time': 720, 'latitude': 2, 'longitude': 2}
2025-11-01 07:26:03,587 - INFO - Koordinat dataset: ['number', 'valid_time', 'latitude', 'longitude', 'expver']
2025-11-01 07:26:03,588 - INFO - Variabel yang tersedia: ['u10', 'v10', 'd2m', 't2m']
2025-11-01 07:26:03,588 - INFO - üìç Memproses lokasi: bundaran_hi
2025-11-01 07:26:03,590 - INFO - Target: (-6.1947, 106.8235) -> Grid: (-6.2060, 106.9040)
2025-11-01 07:26:03,598 - INFO - Columns in DataFrame: ['valid_time', 'u10', 'v10', 'd2m', 't2m']
2025-11-01 07:26:03,603 - INFO - ‚úÖ Data untuk bundaran_hi: 720 records, columns: ['datetime', 'u_wind_10m', 'v_wind_10m', 'dewpoint_temperature_2m', 'temperature_2m', 'location_name', 'latitude', 'longitude', 'wind_speed_10m', 'wind_direction_10m', 'temperature_2m_c', 'dewpoint_temperature_2m_c']
2025-11-01 07:26:03,604 - INFO - üìç Memproses lokasi: kelapa_gading
2025-11-01 07:26:03,605 - INFO - Target: (-6.1536, 106.9109) -> Grid: (-6.2

       datetime  u_wind_10m  v_wind_10m  dewpoint_temperature_2m  temperature_2m  location_name  latitude   longitude  wind_speed_10m  wind_direction_10m  temperature_2m_c  dewpoint_temperature_2m_c
0    2023-06-01   -0.381478    1.445623               296.112213      298.812134    bundaran_hi    -6.206  106.903999        1.495110          104.782501         25.662140                  22.962219
1440 2023-06-01    0.025749    1.441328               294.633972      296.503723      jagakarsa    -6.457  106.903999        1.441558           88.976562         23.353729                  21.483978
2880 2023-06-01   -0.218873    1.386560               296.223663      298.519104    kebun_jeruk    -6.206  106.653000        1.403729           98.970276         25.369110                  23.073669
720  2023-06-01   -0.381478    1.445623               296.112213      298.812134  kelapa_gading    -6.206  106.903999        1.495110          104.782501         25.662140                  22.962219
2160 

2025-11-01 07:26:04,051 - INFO - ‚úÖ File CSV disimpan: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\csv_output\data_era5_hourly_2023_07_per_location.csv
2025-11-01 07:26:04,053 - INFO - üìä Data Statistics:
2025-11-01 07:26:04,055 - INFO -    Total rows: 3,720
2025-11-01 07:26:04,056 - INFO -    Total columns: 12
2025-11-01 07:26:04,058 - INFO -    Time range: 2023-07-01 00:00:00 to 2023-07-31 23:00:00
2025-11-01 07:26:04,061 - INFO -    Locations: ['bundaran_hi', 'jagakarsa', 'kebun_jeruk', 'kelapa_gading', 'lubang_buaya']
2025-11-01 07:26:04,062 - INFO -    Available variables: ['u_wind_10m', 'v_wind_10m', 'dewpoint_temperature_2m', 'temperature_2m', 'wind_speed_10m', 'wind_direction_10m', 'temperature_2m_c', 'dewpoint_temperature_2m_c']
2025-11-01 07:26:04,067 - INFO - üëÄ Preview data (first 5 rows):
2025-11-01 07:26:04,082 - INFO - Memproses file: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\raw_data\data_era5_hourly_2023_

       datetime  u_wind_10m  v_wind_10m  dewpoint_temperature_2m  temperature_2m  location_name  latitude   longitude  wind_speed_10m  wind_direction_10m  temperature_2m_c  dewpoint_temperature_2m_c
0    2023-07-01    0.192395    0.970269               296.788330      299.159546    bundaran_hi    -6.206  106.903999        0.989160           78.784302         26.009552                  23.638336
1488 2023-07-01    0.345147    1.163254               295.924835      297.001221      jagakarsa    -6.457  106.903999        1.213378           73.473938         23.851227                  22.774841
2976 2023-07-01    0.359372    1.011723               297.004822      298.800781    kebun_jeruk    -6.206  106.653000        1.073654           70.444580         25.650787                  23.854828
744  2023-07-01    0.192395    0.970269               296.788330      299.159546  kelapa_gading    -6.206  106.903999        0.989160           78.784302         26.009552                  23.638336
2232 

2025-11-01 07:26:04,318 - INFO - ‚úÖ File CSV disimpan: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\csv_output\data_era5_hourly_2023_08_per_location.csv
2025-11-01 07:26:04,319 - INFO - üìä Data Statistics:
2025-11-01 07:26:04,321 - INFO -    Total rows: 3,720
2025-11-01 07:26:04,322 - INFO -    Total columns: 12
2025-11-01 07:26:04,324 - INFO -    Time range: 2023-08-01 00:00:00 to 2023-08-31 23:00:00
2025-11-01 07:26:04,325 - INFO -    Locations: ['bundaran_hi', 'jagakarsa', 'kebun_jeruk', 'kelapa_gading', 'lubang_buaya']
2025-11-01 07:26:04,327 - INFO -    Available variables: ['u_wind_10m', 'v_wind_10m', 'dewpoint_temperature_2m', 'temperature_2m', 'wind_speed_10m', 'wind_direction_10m', 'temperature_2m_c', 'dewpoint_temperature_2m_c']
2025-11-01 07:26:04,328 - INFO - üëÄ Preview data (first 5 rows):
2025-11-01 07:26:04,337 - INFO - Memproses file: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\raw_data\data_era5_hourly_2023_

       datetime  u_wind_10m  v_wind_10m  dewpoint_temperature_2m  temperature_2m  location_name  latitude   longitude  wind_speed_10m  wind_direction_10m  temperature_2m_c  dewpoint_temperature_2m_c
0    2023-08-01   -0.974089    1.624541               293.660248      297.564636    bundaran_hi    -6.206  106.903999        1.894197          120.947266         24.414642                  20.510254
1488 2023-08-01   -0.029326    1.511008               292.360748      295.029541      jagakarsa    -6.457  106.903999        1.511293           91.111877         21.879547                  19.210754
2976 2023-08-01   -0.648436    1.508925               294.194122      297.419922    kebun_jeruk    -6.206  106.653000        1.642354          113.254822         24.269928                  21.044128
744  2023-08-01   -0.974089    1.624541               293.660248      297.564636  kelapa_gading    -6.206  106.903999        1.894197          120.947266         24.414642                  20.510254
2232 

2025-11-01 07:26:04,581 - INFO - Memproses file: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\raw_data\data_era5_hourly_2023_10.nc
2025-11-01 07:26:04,583 - INFO - Ukuran file: 0.13 MB
2025-11-01 07:26:04,586 - INFO - Mencoba buka C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\raw_data\data_era5_hourly_2023_10.nc dengan engine: netcdf4
2025-11-01 07:26:04,614 - INFO - ‚úÖ Berhasil buka file dengan engine: netcdf4
2025-11-01 07:26:04,617 - INFO - Dimensi dataset: {'valid_time': 744, 'latitude': 2, 'longitude': 2}
2025-11-01 07:26:04,619 - INFO - Koordinat dataset: ['number', 'valid_time', 'latitude', 'longitude', 'expver']
2025-11-01 07:26:04,620 - INFO - Variabel yang tersedia: ['u10', 'v10', 'd2m', 't2m']
2025-11-01 07:26:04,621 - INFO - üìç Memproses lokasi: bundaran_hi
2025-11-01 07:26:04,623 - INFO - Target: (-6.1947, 106.8235) -> Grid: (-6.2060, 106.9040)
2025-11-01 07:26:04,635 - INFO - Columns in DataFrame: ['valid_time', 'u

       datetime  u_wind_10m  v_wind_10m  dewpoint_temperature_2m  temperature_2m  location_name  latitude   longitude  wind_speed_10m  wind_direction_10m  temperature_2m_c  dewpoint_temperature_2m_c
0    2023-09-01   -0.403240    0.924969               295.054321      297.682251    bundaran_hi    -6.206  106.903999        1.009045          113.554779         24.532257                  21.904327
1440 2023-09-01    0.059269    1.299397               292.852539      295.142151      jagakarsa    -6.457  106.903999        1.300748           87.388397         21.992157                  19.702545
2880 2023-09-01   -0.121258    1.048833               294.450317      297.257935    kebun_jeruk    -6.206  106.653000        1.055819           96.594818         24.107941                  21.300323
720  2023-09-01   -0.403240    0.924969               295.054321      297.682251  kelapa_gading    -6.206  106.903999        1.009045          113.554779         24.532257                  21.904327
2160 

2025-11-01 07:26:04,819 - INFO - ‚úÖ File CSV disimpan: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\csv_output\data_era5_hourly_2023_10_per_location.csv
2025-11-01 07:26:04,820 - INFO - üìä Data Statistics:
2025-11-01 07:26:04,821 - INFO -    Total rows: 3,720
2025-11-01 07:26:04,822 - INFO -    Total columns: 12
2025-11-01 07:26:04,823 - INFO -    Time range: 2023-10-01 00:00:00 to 2023-10-31 23:00:00
2025-11-01 07:26:04,824 - INFO -    Locations: ['bundaran_hi', 'jagakarsa', 'kebun_jeruk', 'kelapa_gading', 'lubang_buaya']
2025-11-01 07:26:04,825 - INFO -    Available variables: ['u_wind_10m', 'v_wind_10m', 'dewpoint_temperature_2m', 'temperature_2m', 'wind_speed_10m', 'wind_direction_10m', 'temperature_2m_c', 'dewpoint_temperature_2m_c']
2025-11-01 07:26:04,826 - INFO - üëÄ Preview data (first 5 rows):
2025-11-01 07:26:04,834 - INFO - Memproses file: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\raw_data\data_era5_hourly_2023_

       datetime  u_wind_10m  v_wind_10m  dewpoint_temperature_2m  temperature_2m  location_name  latitude   longitude  wind_speed_10m  wind_direction_10m  temperature_2m_c  dewpoint_temperature_2m_c
0    2023-10-01   -1.237019    0.699842               295.908295      299.254578    bundaran_hi    -6.206  106.903999        1.421265          150.501038         26.104584                  22.758301
1488 2023-10-01   -0.630329    0.990385               294.463593      296.853210      jagakarsa    -6.457  106.903999        1.173958          122.474670         23.703217                  21.313599
2976 2023-10-01   -0.911442    0.837797               295.552155      299.171753    kebun_jeruk    -6.206  106.653000        1.237994          137.410767         26.021759                  22.402161
744  2023-10-01   -1.237019    0.699842               295.908295      299.254578  kelapa_gading    -6.206  106.903999        1.421265          150.501038         26.104584                  22.758301
2232 

2025-11-01 07:26:05,042 - INFO - ‚úÖ File CSV disimpan: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\csv_output\data_era5_hourly_2023_11_per_location.csv
2025-11-01 07:26:05,044 - INFO - üìä Data Statistics:
2025-11-01 07:26:05,044 - INFO -    Total rows: 3,600
2025-11-01 07:26:05,045 - INFO -    Total columns: 12
2025-11-01 07:26:05,047 - INFO -    Time range: 2023-11-01 00:00:00 to 2023-11-30 23:00:00
2025-11-01 07:26:05,050 - INFO -    Locations: ['bundaran_hi', 'jagakarsa', 'kebun_jeruk', 'kelapa_gading', 'lubang_buaya']
2025-11-01 07:26:05,050 - INFO -    Available variables: ['u_wind_10m', 'v_wind_10m', 'dewpoint_temperature_2m', 'temperature_2m', 'wind_speed_10m', 'wind_direction_10m', 'temperature_2m_c', 'dewpoint_temperature_2m_c']
2025-11-01 07:26:05,051 - INFO - üëÄ Preview data (first 5 rows):
2025-11-01 07:26:05,058 - INFO - Memproses file: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\raw_data\data_era5_hourly_2023_

       datetime  u_wind_10m  v_wind_10m  dewpoint_temperature_2m  temperature_2m  location_name  latitude   longitude  wind_speed_10m  wind_direction_10m  temperature_2m_c  dewpoint_temperature_2m_c
0    2023-11-01   -0.248286    1.182060               297.310944      300.142761    bundaran_hi    -6.206  106.903999        1.207854          101.862244         26.992767                  24.160950
1440 2023-11-01    0.194707    0.941246               296.174866      298.400696      jagakarsa    -6.457  106.903999        0.961174           78.312561         25.250702                  23.024872
2880 2023-11-01   -0.310740    1.354133               296.770905      300.693665    kebun_jeruk    -6.206  106.653000        1.389330          102.924194         27.543671                  23.620911
720  2023-11-01   -0.248286    1.182060               297.310944      300.142761  kelapa_gading    -6.206  106.903999        1.207854          101.862244         26.992767                  24.160950
2160 

2025-11-01 07:26:05,259 - INFO - Ukuran file: 0.13 MB
2025-11-01 07:26:05,283 - INFO - Mencoba buka C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\raw_data\data_era5_hourly_2024_01.nc dengan engine: netcdf4
2025-11-01 07:26:05,317 - INFO - ‚úÖ Berhasil buka file dengan engine: netcdf4
2025-11-01 07:26:05,319 - INFO - Dimensi dataset: {'valid_time': 744, 'latitude': 2, 'longitude': 2}
2025-11-01 07:26:05,322 - INFO - Koordinat dataset: ['number', 'valid_time', 'latitude', 'longitude', 'expver']
2025-11-01 07:26:05,324 - INFO - Variabel yang tersedia: ['u10', 'v10', 'd2m', 't2m']
2025-11-01 07:26:05,325 - INFO - üìç Memproses lokasi: bundaran_hi
2025-11-01 07:26:05,328 - INFO - Target: (-6.1947, 106.8235) -> Grid: (-6.2060, 106.9040)
2025-11-01 07:26:05,339 - INFO - Columns in DataFrame: ['valid_time', 'u10', 'v10', 'd2m', 't2m']
2025-11-01 07:26:05,346 - INFO - ‚úÖ Data untuk bundaran_hi: 744 records, columns: ['datetime', 'u_wind_10m', 'v_wind_10m', 'dewpoint

       datetime  u_wind_10m  v_wind_10m  dewpoint_temperature_2m  temperature_2m  location_name  latitude   longitude  wind_speed_10m  wind_direction_10m  temperature_2m_c  dewpoint_temperature_2m_c
0    2024-01-01    0.185202    1.067482               297.475189      299.127625    bundaran_hi    -6.206  106.903999        1.083429           80.157471         25.977631                  24.325195
1488 2024-01-01    0.151395    0.943985               296.344513      297.065979      jagakarsa    -6.457  106.903999        0.956048           80.888611         23.915985                  23.194519
2976 2024-01-01    0.179879    1.115333               297.601257      299.015564    kebun_jeruk    -6.206  106.653000        1.129746           80.838318         25.865570                  24.451263
744  2024-01-01    0.185202    1.067482               297.475189      299.127625  kelapa_gading    -6.206  106.903999        1.083429           80.157471         25.977631                  24.325195
2232 

2025-11-01 07:26:05,687 - INFO - ‚úÖ Berhasil buka file dengan engine: netcdf4
2025-11-01 07:26:05,688 - INFO - Dimensi dataset: {'valid_time': 744, 'latitude': 2, 'longitude': 2}
2025-11-01 07:26:05,689 - INFO - Koordinat dataset: ['number', 'valid_time', 'latitude', 'longitude', 'expver']
2025-11-01 07:26:05,690 - INFO - Variabel yang tersedia: ['u10', 'v10', 'd2m', 't2m']
2025-11-01 07:26:05,690 - INFO - üìç Memproses lokasi: bundaran_hi
2025-11-01 07:26:05,691 - INFO - Target: (-6.1947, 106.8235) -> Grid: (-6.2060, 106.9040)
2025-11-01 07:26:05,698 - INFO - Columns in DataFrame: ['valid_time', 'u10', 'v10', 'd2m', 't2m']
2025-11-01 07:26:05,705 - INFO - ‚úÖ Data untuk bundaran_hi: 744 records, columns: ['datetime', 'u_wind_10m', 'v_wind_10m', 'dewpoint_temperature_2m', 'temperature_2m', 'location_name', 'latitude', 'longitude', 'wind_speed_10m', 'wind_direction_10m', 'temperature_2m_c', 'dewpoint_temperature_2m_c']
2025-11-01 07:26:05,706 - INFO - üìç Memproses lokasi: kelapa_gad

       datetime  u_wind_10m  v_wind_10m  dewpoint_temperature_2m  temperature_2m  location_name  latitude   longitude  wind_speed_10m  wind_direction_10m  temperature_2m_c  dewpoint_temperature_2m_c
0    2024-05-01   -1.526669    0.446248               298.397278      300.037079    bundaran_hi    -6.206  106.903999        1.590551          163.706299         26.887085                  25.247284
1488 2024-05-01   -0.562496    0.783314               297.239197      298.413177      jagakarsa    -6.457  106.903999        0.964356          125.682037         25.263184                  24.089203
2976 2024-05-01   -1.041622    0.368023               298.403900      300.081177    kebun_jeruk    -6.206  106.653000        1.104725          160.540771         26.931183                  25.253906
744  2024-05-01   -1.526669    0.446248               298.397278      300.037079  kelapa_gading    -6.206  106.903999        1.590551          163.706299         26.887085                  25.247284
2232 

2025-11-01 07:26:06,083 - INFO - ‚úÖ File CSV disimpan: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\csv_output\data_era5_hourly_2024_06_per_location.csv
2025-11-01 07:26:06,085 - INFO - üìä Data Statistics:
2025-11-01 07:26:06,087 - INFO -    Total rows: 3,600
2025-11-01 07:26:06,089 - INFO -    Total columns: 12
2025-11-01 07:26:06,090 - INFO -    Time range: 2024-06-01 00:00:00 to 2024-06-30 23:00:00
2025-11-01 07:26:06,092 - INFO -    Locations: ['bundaran_hi', 'jagakarsa', 'kebun_jeruk', 'kelapa_gading', 'lubang_buaya']
2025-11-01 07:26:06,094 - INFO -    Available variables: ['u_wind_10m', 'v_wind_10m', 'dewpoint_temperature_2m', 'temperature_2m', 'wind_speed_10m', 'wind_direction_10m', 'temperature_2m_c', 'dewpoint_temperature_2m_c']
2025-11-01 07:26:06,095 - INFO - üëÄ Preview data (first 5 rows):
2025-11-01 07:26:06,110 - INFO - Memproses file: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\raw_data\data_era5_hourly_2024_

       datetime  u_wind_10m  v_wind_10m  dewpoint_temperature_2m  temperature_2m  location_name  latitude   longitude  wind_speed_10m  wind_direction_10m  temperature_2m_c  dewpoint_temperature_2m_c
0    2024-06-01    0.073136    1.273479               297.621948      299.176300    bundaran_hi    -6.206  106.903999        1.275578           86.713135         26.026306                  24.471954
1440 2024-06-01    0.188870    1.313763               296.414612      297.598389      jagakarsa    -6.457  106.903999        1.327269           81.819061         24.448395                  23.264618
2880 2024-06-01    0.009621    1.154423               297.807037      299.088837    kebun_jeruk    -6.206  106.653000        1.154463           89.522522         25.938843                  24.657043
720  2024-06-01    0.073136    1.273479               297.621948      299.176300  kelapa_gading    -6.206  106.903999        1.275578           86.713135         26.026306                  24.471954
2160 

2025-11-01 07:26:06,331 - INFO - ‚úÖ File CSV disimpan: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\csv_output\data_era5_hourly_2024_07_per_location.csv
2025-11-01 07:26:06,334 - INFO - üìä Data Statistics:
2025-11-01 07:26:06,335 - INFO -    Total rows: 3,720
2025-11-01 07:26:06,336 - INFO -    Total columns: 12
2025-11-01 07:26:06,338 - INFO -    Time range: 2024-07-01 00:00:00 to 2024-07-31 23:00:00
2025-11-01 07:26:06,340 - INFO -    Locations: ['bundaran_hi', 'jagakarsa', 'kebun_jeruk', 'kelapa_gading', 'lubang_buaya']
2025-11-01 07:26:06,341 - INFO -    Available variables: ['u_wind_10m', 'v_wind_10m', 'dewpoint_temperature_2m', 'temperature_2m', 'wind_speed_10m', 'wind_direction_10m', 'temperature_2m_c', 'dewpoint_temperature_2m_c']
2025-11-01 07:26:06,342 - INFO - üëÄ Preview data (first 5 rows):
2025-11-01 07:26:06,353 - INFO - Memproses file: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\raw_data\data_era5_hourly_2024_

       datetime  u_wind_10m  v_wind_10m  dewpoint_temperature_2m  temperature_2m  location_name  latitude   longitude  wind_speed_10m  wind_direction_10m  temperature_2m_c  dewpoint_temperature_2m_c
0    2024-07-01   -0.236145    0.102023               297.243591      298.854462    bundaran_hi    -6.206  106.903999        0.257241          156.633850         25.704468                  24.093597
1488 2024-07-01   -0.077522    0.221650               296.273499      297.211243      jagakarsa    -6.457  106.903999        0.234816          109.277252         24.061249                  23.123505
2976 2024-07-01   -0.455452    0.225404               297.064850      298.813599    kebun_jeruk    -6.206  106.653000        0.508176          153.669128         25.663605                  23.914856
744  2024-07-01   -0.236145    0.102023               297.243591      298.854462  kelapa_gading    -6.206  106.903999        0.257241          156.633850         25.704468                  24.093597
2232 

2025-11-01 07:26:06,558 - INFO - Mencoba buka C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\raw_data\data_era5_hourly_2024_09.nc dengan engine: netcdf4
2025-11-01 07:26:06,579 - INFO - ‚úÖ Berhasil buka file dengan engine: netcdf4
2025-11-01 07:26:06,581 - INFO - Dimensi dataset: {'valid_time': 720, 'latitude': 2, 'longitude': 2}
2025-11-01 07:26:06,582 - INFO - Koordinat dataset: ['number', 'valid_time', 'latitude', 'longitude', 'expver']
2025-11-01 07:26:06,582 - INFO - Variabel yang tersedia: ['u10', 'v10', 'd2m', 't2m']
2025-11-01 07:26:06,583 - INFO - üìç Memproses lokasi: bundaran_hi
2025-11-01 07:26:06,585 - INFO - Target: (-6.1947, 106.8235) -> Grid: (-6.2060, 106.9040)
2025-11-01 07:26:06,592 - INFO - Columns in DataFrame: ['valid_time', 'u10', 'v10', 'd2m', 't2m']
2025-11-01 07:26:06,597 - INFO - ‚úÖ Data untuk bundaran_hi: 720 records, columns: ['datetime', 'u_wind_10m', 'v_wind_10m', 'dewpoint_temperature_2m', 'temperature_2m', 'location_name', '

       datetime  u_wind_10m  v_wind_10m  dewpoint_temperature_2m  temperature_2m  location_name  latitude   longitude  wind_speed_10m  wind_direction_10m  temperature_2m_c  dewpoint_temperature_2m_c
0    2024-09-01   -0.836740    1.116497               295.653687      299.119812    bundaran_hi    -6.206  106.903999        1.395242          126.849121         25.969818                  22.503693
1440 2024-09-01   -0.304422    0.992313               294.443939      297.233276      jagakarsa    -6.457  106.903999        1.037959          107.054962         24.083282                  21.293945
2880 2024-09-01   -0.459482    1.174556               295.723785      299.010376    kebun_jeruk    -6.206  106.653000        1.261232          111.365234         25.860382                  22.573792
720  2024-09-01   -0.836740    1.116497               295.653687      299.119812  kelapa_gading    -6.206  106.903999        1.395242          126.849121         25.969818                  22.503693
2160 

2025-11-01 07:26:06,960 - INFO - ‚úÖ Berhasil buka file dengan engine: netcdf4
2025-11-01 07:26:06,962 - INFO - Dimensi dataset: {'valid_time': 720, 'latitude': 2, 'longitude': 2}
2025-11-01 07:26:06,962 - INFO - Koordinat dataset: ['number', 'valid_time', 'latitude', 'longitude', 'expver']
2025-11-01 07:26:06,963 - INFO - Variabel yang tersedia: ['u10', 'v10', 'd2m', 't2m']
2025-11-01 07:26:06,964 - INFO - üìç Memproses lokasi: bundaran_hi
2025-11-01 07:26:06,965 - INFO - Target: (-6.1947, 106.8235) -> Grid: (-6.2060, 106.9040)
2025-11-01 07:26:06,974 - INFO - Columns in DataFrame: ['valid_time', 'u10', 'v10', 'd2m', 't2m']
2025-11-01 07:26:06,979 - INFO - ‚úÖ Data untuk bundaran_hi: 720 records, columns: ['datetime', 'u_wind_10m', 'v_wind_10m', 'dewpoint_temperature_2m', 'temperature_2m', 'location_name', 'latitude', 'longitude', 'wind_speed_10m', 'wind_direction_10m', 'temperature_2m_c', 'dewpoint_temperature_2m_c']
2025-11-01 07:26:06,980 - INFO - üìç Memproses lokasi: kelapa_gad

       datetime  u_wind_10m  v_wind_10m  dewpoint_temperature_2m  temperature_2m  location_name  latitude   longitude  wind_speed_10m  wind_direction_10m  temperature_2m_c  dewpoint_temperature_2m_c
0    2024-11-01   -0.186165    1.210639               296.925293      300.331543    bundaran_hi    -6.206  106.903999        1.224869           98.742126         27.181549                  23.775299
1440 2024-11-01    0.054893    1.245880               295.976990      298.171936      jagakarsa    -6.457  106.903999        1.247089           87.477173         25.021942                  22.826996
2880 2024-11-01   -0.013359    1.191086               296.543091      300.594055    kebun_jeruk    -6.206  106.653000        1.191161           90.642578         27.444061                  23.393097
720  2024-11-01   -0.186165    1.210639               296.925293      300.331543  kelapa_gading    -6.206  106.903999        1.224869           98.742126         27.181549                  23.775299
2160 

2025-11-01 07:26:07,373 - INFO - ‚úÖ Data untuk jagakarsa: 744 records, columns: ['datetime', 'u_wind_10m', 'v_wind_10m', 'dewpoint_temperature_2m', 'temperature_2m', 'location_name', 'latitude', 'longitude', 'wind_speed_10m', 'wind_direction_10m', 'temperature_2m_c', 'dewpoint_temperature_2m_c']
2025-11-01 07:26:07,375 - INFO - üìç Memproses lokasi: lubang_buaya
2025-11-01 07:26:07,377 - INFO - Target: (-6.2889, 106.9092) -> Grid: (-6.2060, 106.9040)
2025-11-01 07:26:07,448 - INFO - Columns in DataFrame: ['valid_time', 'u10', 'v10', 'd2m', 't2m']
2025-11-01 07:26:07,458 - INFO - ‚úÖ Data untuk lubang_buaya: 744 records, columns: ['datetime', 'u_wind_10m', 'v_wind_10m', 'dewpoint_temperature_2m', 'temperature_2m', 'location_name', 'latitude', 'longitude', 'wind_speed_10m', 'wind_direction_10m', 'temperature_2m_c', 'dewpoint_temperature_2m_c']
2025-11-01 07:26:07,471 - INFO - üìç Memproses lokasi: kebun_jeruk
2025-11-01 07:26:07,473 - INFO - Target: (-6.2073, 106.7532) -> Grid: (-6.20

       datetime  u_wind_10m  v_wind_10m  dewpoint_temperature_2m  temperature_2m  location_name  latitude   longitude  wind_speed_10m  wind_direction_10m  temperature_2m_c  dewpoint_temperature_2m_c
0    2024-12-01    1.508229    2.084509               296.948364      299.915802    bundaran_hi    -6.206  106.903999        2.572923           54.112701         26.765808                  23.798370
1488 2024-12-01    0.708638    2.061834               296.045746      298.481720      jagakarsa    -6.457  106.903999        2.180213           71.032532         25.331726                  22.895752
2976 2024-12-01    1.737996    2.173889               296.996887      300.255432    kebun_jeruk    -6.206  106.653000        2.783240           51.358124         27.105438                  23.846893
744  2024-12-01    1.508229    2.084509               296.948364      299.915802  kelapa_gading    -6.206  106.903999        2.572923           54.112701         26.765808                  23.798370
2232 

In [1]:
import pandas as pd
import numpy as np
import os
import glob
import logging
from pathlib import Path

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class CSVCombiner:
    """Class untuk menggabungkan semua file CSV menjadi satu file daily"""
    
    def __init__(self, base_path):
        self.BASE_PATH = base_path
        self.CSV_OUTPUT_PATH = os.path.join(base_path, 'csv_output')
        self.COMBINED_OUTPUT_PATH = os.path.join(base_path, 'combined_output')
        
        # Buat direktori jika belum ada
        os.makedirs(self.COMBINED_OUTPUT_PATH, exist_ok=True)
    
    def find_all_csv_files(self):
        """Mencari semua file CSV di folder csv_output"""
        pattern = os.path.join(self.CSV_OUTPUT_PATH, "*.csv")
        csv_files = glob.glob(pattern)
        
        logger.info(f"Menemukan {len(csv_files)} file CSV:")
        for i, file in enumerate(sorted(csv_files), 1):
            logger.info(f"  {i:2d}. {file}")
        
        return sorted(csv_files)
    
    def read_and_combine_csv(self, csv_files):
        """Membaca dan menggabungkan semua file CSV"""
        all_dataframes = []
        
        for csv_file in csv_files:
            try:
                logger.info(f"Membaca file: {csv_file}")
                
                # Baca CSV file
                df = pd.read_csv(csv_file)
                
                # Validasi kolom yang diperlukan
                required_cols = ['datetime', 'location_name', 'latitude', 'longitude']
                missing_cols = [col for col in required_cols if col not in df.columns]
                
                if missing_cols:
                    logger.warning(f"File {csv_file} missing columns: {missing_cols}")
                    continue
                
                # Konversi kolom datetime ke format datetime
                df['datetime'] = pd.to_datetime(df['datetime'])
                
                # Tambahkan source file info untuk debugging
                df['source_file'] = Path(csv_file).name
                
                all_dataframes.append(df)
                logger.info(f"‚úÖ Berhasil membaca {len(df)} rows dari {csv_file}")
                
            except Exception as e:
                logger.error(f"‚ùå Gagal membaca {csv_file}: {str(e)}")
                continue
        
        if not all_dataframes:
            logger.error("Tidak ada data yang berhasil dibaca!")
            return None
        
        # Gabungkan semua dataframe
        combined_df = pd.concat(all_dataframes, ignore_index=True, sort=False)
        logger.info(f"üìä Total data setelah digabung: {len(combined_df):,} rows")
        
        return combined_df
    
    def convert_to_daily(self, df):
        """Mengkonversi data hourly ke daily"""
        try:
            logger.info("Mengkonversi data hourly ke daily...")
            
            # Pastikan datetime dalam format yang benar
            df['datetime'] = pd.to_datetime(df['datetime'])
            
            # Extract date untuk grouping
            df['date'] = df['datetime'].dt.date
            
            # Group by date dan location, lalu aggregate
            daily_data = []
            
            for (date, location_name), group in df.groupby(['date', 'location_name']):
                daily_record = {
                    'date': date,
                    'location_name': location_name,
                    'latitude': group['latitude'].iloc[0],
                    'longitude': group['longitude'].iloc[0]
                }
                
                # Aggregasi untuk setiap variabel meteorologi
                variable_aggregations = {
                    'temperature_2m': ['mean', 'min', 'max'],
                    'temperature_2m_c': ['mean', 'min', 'max'],
                    'dewpoint_temperature_2m': ['mean'],
                    'dewpoint_temperature_2m_c': ['mean'],
                    'u_wind_10m': ['mean'],
                    'v_wind_10m': ['mean'],
                    'wind_speed_10m': ['mean', 'max'],
                    'wind_direction_10m': ['mean'],
                    'precipitation': ['sum']  # Total harian untuk precipitation
                }
                
                # Apply aggregations
                for var, aggs in variable_aggregations.items():
                    if var in group.columns:
                        for agg_func in aggs:
                            if agg_func == 'mean':
                                daily_record[f'{var}_daily_avg'] = group[var].mean()
                            elif agg_func == 'min':
                                daily_record[f'{var}_daily_min'] = group[var].min()
                            elif agg_func == 'max':
                                daily_record[f'{var}_daily_max'] = group[var].max()
                            elif agg_func == 'sum':
                                daily_record[f'{var}_daily_total'] = group[var].sum()
                
                # Tambahkan count observations
                daily_record['hourly_observations_count'] = len(group)
                
                daily_data.append(daily_record)
            
            daily_df = pd.DataFrame(daily_data)
            
            # Urutkan data
            daily_df.sort_values(['date', 'location_name'], inplace=True)
            daily_df.reset_index(drop=True, inplace=True)
            
            logger.info(f"‚úÖ Data daily berhasil dibuat: {len(daily_df):,} records")
            return daily_df
            
        except Exception as e:
            logger.error(f"‚ùå Error converting to daily: {str(e)}")
            import traceback
            logger.error(f"Traceback: {traceback.format_exc()}")
            return None
    
    def optimize_dataframe(self, df):
        """Optimasi tipe data untuk menghemat memory"""
        # Optimasi numeric columns
        float_cols = df.select_dtypes(include=['float64']).columns
        for col in float_cols:
            if col in df.columns:
                df[col] = df[col].astype(np.float32)
        
        # Convert date column to datetime
        if 'date' in df.columns:
            df['date'] = pd.to_datetime(df['date'])
        
        return df
    
    def save_combined_data(self, hourly_df, daily_df):
        """Menyimpan data combined"""
        try:
            # File paths
            hourly_output = os.path.join(self.COMBINED_OUTPUT_PATH, "era5_combined_hourly.csv")
            daily_output = os.path.join(self.COMBINED_OUTPUT_PATH, "era5_combined_daily.csv")
            
            # Simpan hourly data
            hourly_df.to_csv(hourly_output, index=False)
            logger.info(f"‚úÖ Data hourly disimpan: {hourly_output}")
            
            # Simpan daily data
            daily_df.to_csv(daily_output, index=False)
            logger.info(f"‚úÖ Data daily disimpan: {daily_output}")
            
            return hourly_output, daily_output
            
        except Exception as e:
            logger.error(f"‚ùå Error saving combined data: {str(e)}")
            return None, None
    
    def generate_summary_report(self, hourly_df, daily_df):
        """Generate summary report"""
        logger.info("\n" + "="*60)
        logger.info("üìä SUMMARY REPORT")
        logger.info("="*60)
        
        logger.info(f"HOURLY DATA:")
        logger.info(f"  ‚Ä¢ Total records: {len(hourly_df):,}")
        logger.info(f"  ‚Ä¢ Date range: {hourly_df['datetime'].min()} to {hourly_df['datetime'].max()}")
        logger.info(f"  ‚Ä¢ Locations: {hourly_df['location_name'].unique().tolist()}")
        logger.info(f"  ‚Ä¢ Variables: {[col for col in hourly_df.columns if col not in ['datetime', 'date', 'location_name', 'latitude', 'longitude', 'source_file']]}")
        
        logger.info(f"\nDAILY DATA:")
        logger.info(f"  ‚Ä¢ Total records: {len(daily_df):,}")
        logger.info(f"  ‚Ä¢ Date range: {daily_df['date'].min()} to {daily_df['date'].max()}")
        logger.info(f"  ‚Ä¢ Locations: {daily_df['location_name'].unique().tolist()}")
        logger.info(f"  ‚Ä¢ Variables: {[col for col in daily_df.columns if col not in ['date', 'location_name', 'latitude', 'longitude', 'hourly_observations_count']]}")
        
        # Data preview
        logger.info(f"\nüëÄ DAILY DATA PREVIEW (first 10 rows):")
        print(daily_df.head(10).to_string(index=False))
    
    def run_combination(self):
        """Menjalankan proses penggabungan"""
        logger.info("üöÄ MEMULAI PROCESS GABUNGKAN CSV FILES")
        
        # Step 1: Cari semua file CSV
        csv_files = self.find_all_csv_files()
        if not csv_files:
            logger.error("Tidak ada file CSV yang ditemukan!")
            return
        
        # Step 2: Baca dan gabungkan semua CSV
        combined_hourly = self.read_and_combine_csv(csv_files)
        if combined_hourly is None:
            return
        
        # Step 3: Konversi ke daily
        combined_daily = self.convert_to_daily(combined_hourly)
        if combined_daily is None:
            return
        
        # Step 4: Optimasi data
        combined_hourly = self.optimize_dataframe(combined_hourly)
        combined_daily = self.optimize_dataframe(combined_daily)
        
        # Step 5: Simpan data
        hourly_file, daily_file = self.save_combined_data(combined_hourly, combined_daily)
        
        # Step 6: Generate report
        self.generate_summary_report(combined_hourly, combined_daily)
        
        logger.info(f"\nüéâ PROCESS SELESAI!")
        logger.info(f"üìÅ File output tersimpan di: {self.COMBINED_OUTPUT_PATH}")
        logger.info(f"üìÑ File hourly: {hourly_file}")
        logger.info(f"üìÑ File daily: {daily_file}")

def main():
    """Main function"""
    base_path = 'C:\\Users\\user\\OneDrive\\IPB\\Thesis\\02. Development\\01. Data Praprocessing'
    
    combiner = CSVCombiner(base_path)
    combiner.run_combination()

if __name__ == "__main__":
    main()

2025-11-03 12:25:47,487 - INFO - üöÄ MEMULAI PROCESS GABUNGKAN CSV FILES
2025-11-03 12:25:47,491 - INFO - Menemukan 34 file CSV:
2025-11-03 12:25:47,493 - INFO -    1. C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\csv_output\data_era5_hourly_2022_01_per_location.csv
2025-11-03 12:25:47,494 - INFO -    2. C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\csv_output\data_era5_hourly_2022_02.csv
2025-11-03 12:25:47,495 - INFO -    3. C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\csv_output\data_era5_hourly_2022_02_per_location.csv
2025-11-03 12:25:47,496 - INFO -    4. C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\csv_output\data_era5_hourly_2022_03.csv
2025-11-03 12:25:47,497 - INFO -    5. C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\csv_output\data_era5_hourly_2022_03_per_location.csv
2025-11-03 12:25:47,499 - INFO -    6. C:\Users\user\OneDrive\IPB\Thesi

      date location_name  latitude  longitude  temperature_2m_daily_avg  temperature_2m_daily_min  temperature_2m_daily_max  temperature_2m_c_daily_avg  temperature_2m_c_daily_min  temperature_2m_c_daily_max  dewpoint_temperature_2m_daily_avg  dewpoint_temperature_2m_c_daily_avg  u_wind_10m_daily_avg  v_wind_10m_daily_avg  wind_speed_10m_daily_avg  wind_speed_10m_daily_max  wind_direction_10m_daily_avg  hourly_observations_count
2022-01-01   bundaran_hi    -6.206 106.903999                300.702637                297.792480                304.699585                   27.552633                   24.642487                   31.549591                         296.441681                            23.291676              3.336495              0.671866                  3.462669                  5.167665                     73.572205                         24
2022-01-01     jagakarsa    -6.457 106.903999                299.498322                296.033752                303.750977           