In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

def create_complete_daily_timeseries(csv_file):
    # Load data
    df = pd.read_csv(r"C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\04. Data AOD\20251003-Jakarta_AOD_MODIS_Daily_2022_2024.csv")
    
    # Convert date column
    df['date'] = pd.to_datetime(df['date'])
    
    # Filter untuk tahun 2022-2024
    df = df[df['date'].dt.year.isin([2022, 2023, 2024])].copy()
    
    # Get unique locations
    locations = df['name'].unique()
    
    # Create complete date range
    start_date = pd.Timestamp('2022-01-01')
    end_date = pd.Timestamp('2024-12-31')
    all_dates = pd.date_range(start=start_date, end=end_date, freq='D')
    
    # Create empty DataFrame untuk hasil akhir
    complete_data = []
    
    for location in locations:
        # Filter data untuk lokasi tertentu
        loc_data = df[df['name'] == location].copy()
        
        # Create DataFrame dengan semua tanggal
        loc_full_dates = pd.DataFrame({'date': all_dates})
        loc_full_dates['name'] = location
        
        # Merge dengan data existing
        loc_merged = pd.merge(loc_full_dates, loc_data[['date', 'Optical_Depth_047', 'system:index', '.geo']], 
                             on='date', how='left')
        
        # Sort by date
        loc_merged = loc_merged.sort_values('date')
        
        # Interpolasi untuk mengisi missing values
        loc_merged['Optical_Depth_047_filled'] = loc_merged['Optical_Depth_047'].interpolate(
            method='linear', limit_direction='both'
        )
        
        # Fill remaining NaN dengan seasonal pattern (monthly average)
        loc_merged['month'] = loc_merged['date'].dt.month
        monthly_avg = loc_merged.groupby('month')['Optical_Depth_047_filled'].transform('mean')
        loc_merged['Optical_Depth_047_filled'] = loc_merged['Optical_Depth_047_filled'].fillna(monthly_avg)
        
        # Jika masih ada NaN, gunakan overall average
        overall_avg = loc_merged['Optical_Depth_047_filled'].mean()
        loc_merged['Optical_Depth_047_filled'] = loc_merged['Optical_Depth_047_filled'].fillna(overall_avg)
        
        complete_data.append(loc_merged)
    
    # Combine semua lokasi
    final_df = pd.concat(complete_data, ignore_index=True)
    
    return final_df

# Fungsi untuk smoothing data (optional)
def apply_smoothing(df, window=7):
    """Apply moving average smoothing to reduce noise"""
    df_smoothed = df.copy()
    
    for location in df['name'].unique():
        mask = df['name'] == location
        df_smoothed.loc[mask, 'Optical_Depth_047_smoothed'] = (
            df.loc[mask, 'Optical_Depth_047_filled']
            .rolling(window=window, center=True, min_periods=1)
            .mean()
        )
    
    return df_smoothed

# Eksekusi utama
def main():
    # Process data
    print("Membuat time series daily 2022-2024...")
    result_df = create_complete_daily_timeseries('20251003-Jakarta_AOD_MODIS_Daily_2022_2024.csv')
    
    # Apply smoothing (optional)
    result_df = apply_smoothing(result_df, window=7)
    
    # Print summary
    print(f"\n=== SUMMARY ===")
    print(f"Periode: {result_df['date'].min()} sampai {result_df['date'].max()}")
    print(f"Total days: {result_df['date'].nunique()}")
    print(f"Lokasi: {list(result_df['name'].unique())}")
    print(f"Total records: {len(result_df)}")
    
    # Check data quality
    original_data_points = result_df['Optical_Depth_047'].notna().sum()
    filled_data_points = len(result_df) - original_data_points
    print(f"Data points asli: {original_data_points}")
    print(f"Data points diisi: {filled_data_points}")
    print(f"Persentase data diisi: {filled_data_points/len(result_df)*100:.2f}%")
    
    # Simpan hasil
    output_columns = ['date', 'name', 'Optical_Depth_047', 'Optical_Depth_047_filled', 
                     'Optical_Depth_047_smoothed', 'system:index', '.geo']
    
    # Hanya ambil kolom yang ada
    available_columns = [col for col in output_columns if col in result_df.columns]
    result_df[available_columns].to_csv('Jakarta_AOD_Daily_2022_2024_Complete.csv', index=False)
    
    print(f"\nData disimpan ke: Jakarta_AOD_Daily_2022_2024_Complete.csv")
    
    # Preview data
    print(f"\n=== PREVIEW DATA ===")
    print(result_df[available_columns].head(10))
    
    return result_df

# Jalankan script
if __name__ == "__main__":
    final_data = main()

Membuat time series daily 2022-2024...

=== SUMMARY ===
Periode: 2022-01-01 00:00:00 sampai 2024-12-31 00:00:00
Total days: 1096
Lokasi: ['jagakarsa', 'kebun_jeruk', 'bundaran_hi', 'kelapa_gading', 'lubang_buaya']
Total records: 5712
Data points asli: 1237
Data points diisi: 4475
Persentase data diisi: 78.34%

Data disimpan ke: Jakarta_AOD_Daily_2022_2024_Complete.csv

=== PREVIEW DATA ===
        date       name  Optical_Depth_047  Optical_Depth_047_filled  \
0 2022-01-01  jagakarsa                NaN                417.000000   
1 2022-01-02  jagakarsa                NaN                417.000000   
2 2022-01-03  jagakarsa              417.0                417.000000   
3 2022-01-04  jagakarsa                NaN                393.833333   
4 2022-01-05  jagakarsa                NaN                370.666667   
5 2022-01-06  jagakarsa                NaN                347.500000   
6 2022-01-07  jagakarsa                NaN                324.333333   
7 2022-01-08  jagakarsa        