In [2]:
import pandas as pd
import numpy as np  # Kita perlu numpy untuk membuat nilai 'NaN'
import os

# --- 1. PENGATURAN FILE ---

input_path = r"C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\01. Data PM25\processed_data\ispu_daily_complete.csv"
output_path = r"C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\01. Data PM25\processed_data\ispu_pm25_interpolated_no_zeros.csv"

print(f"Membaca file: {input_path}")

try:
    # --- 2. MEMBACA & MEMPERSIAPKAN DATA ---
    
    df = pd.read_csv(input_path)
    
    # Menggunakan format='mixed' untuk mengatasi tanggal yang tidak konsisten
    df['TANGGAL'] = pd.to_datetime(df['TANGGAL'], format='mixed')
    
    df = df.set_index('TANGGAL')
    
    df_pm25 = df[['ISPU_PM2_5']]

    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # --- INI ADALAH PERBAIKAN PENTING ---
    # Mengganti semua nilai 0 dengan 'NaN' (Not a Number / Kosong)
    print("Mengganti semua nilai 0 di ISPU_PM2_5 menjadi NaN...")
    df_pm25['ISPU_PM2_5'] = df_pm25['ISPU_PM2_5'].replace(0, np.nan)
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

    # --- 3. MEMBUAT RENTANG TANGGAL LENGKAP ---
    
    start_date = df_pm25.index.min()
    end_date = df_pm25.index.max()
    
    print(f"Rentang data: {start_date.date()} sampai {end_date.date()}")
    
    daily_index = pd.date_range(start=start_date, end=end_date, freq='D')

    # --- 4. RE-INDEX & INTERPOLASI ---
    
    df_reindexed = df_pm25.reindex(daily_index)
    
    print("Melakukan interpolasi linear pada data yang hilang (termasuk yang 0)...")
    
    # 1. Lakukan interpolasi linear
    df_interpolated = df_reindexed.interpolate(method='linear')
    
    # 2. Mengisi sisa 'NA' di Awal atau Akhir
    # (Interpolasi linear tidak bisa mengisi NA di awal/akhir data)
    print("Mengisi sisa NA di awal/akhir data (back-fill & forward-fill)...")
    df_interpolated = df_interpolated.bfill() # Isi NA di awal (menggunakan data valid pertama)
    df_interpolated = df_interpolated.ffill() # Isi NA di akhir (jika masih ada)
    
    # Pembulatan
    df_interpolated['ISPU_PM2_5'] = df_interpolated['ISPU_PM2_5'].round(2)
    df_interpolated.index.name = 'TANGGAL'

    # --- 5. SIMPAN HASIL ---
    
    df_interpolated.to_csv(output_path)
    
    print("\n--- PROSES SELESAI ---")
    print(f"Data hasil interpolasi (NA dan 0) telah disimpan ke:\n{output_path}")
    
    # Menampilkan contoh data yang telah diisi (berdasarkan data Anda)
    print("\nContoh hasil interpolasi (mengisi gap tgl 13 s/d 18):")
    print(df_interpolated.loc['2022-01-13':'2022-01-18'])


except FileNotFoundError:
    print(f"ERROR: File tidak ditemukan di path:\n{input_path}")
except Exception as e:
    print(f"Terjadi error: {e}")

Membaca file: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\01. Data PM25\processed_data\ispu_daily_complete.csv
Mengganti semua nilai 0 di ISPU_PM2_5 menjadi NaN...
Rentang data: 2022-01-01 sampai 2025-01-01
Melakukan interpolasi linear pada data yang hilang (termasuk yang 0)...
Mengisi sisa NA di awal/akhir data (back-fill & forward-fill)...

--- PROSES SELESAI ---
Data hasil interpolasi (NA dan 0) telah disimpan ke:
C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\01. Data PM25\processed_data\ispu_pm25_interpolated_no_zeros.csv

Contoh hasil interpolasi (mengisi gap tgl 13 s/d 18):
            ISPU_PM2_5
TANGGAL               
2022-01-13        59.0
2022-01-14        65.6
2022-01-15        72.2
2022-01-16        78.8
2022-01-17        85.4
2022-01-18        92.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pm25['ISPU_PM2_5'] = df_pm25['ISPU_PM2_5'].replace(0, np.nan)


In [3]:
import pandas as pd
import numpy as np
import os

# --- 1. PENGATURAN FILE ---

input_path = r"C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\01. Data PM25\processed_data\rawdata_pm25.xlsx"
output_path = r"C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\01. Data PM25\processed_data\final_ispupm25.csv"

print(f"Membaca file: {input_path}")

try:
    # --- 2. MEMBACA & TRANSFORMASI DATA ---
    
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # --- INI ADALAH PERBAIKANNYA ---
    # Gunakan pd.read_excel() untuk file .xlsx
    df = pd.read_excel(input_path)
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    
    # Mengidentifikasi kolom ID dan kolom yang akan di-melt
    # (Asumsi nama kolom di Excel Anda sama seperti contoh sebelumnya)
    id_vars = ['datetime']
    value_vars = [col for col in df.columns if col.startswith('ISPU_PM25_') or col.startswith('ISPU PM25_')]
    
    print(f"Kolom stasiun yang ditemukan: {value_vars}")
    
    # Mengubah dari format LEBAR ke format PANJANG
    df_long = df.melt(
        id_vars=id_vars,
        value_vars=value_vars,
        var_name='station_name',
        value_name='ispu_pm25'
    )
    
    # --- 3. PEMBERSIHAN DATA ---
    
    # Membersihkan nama stasiun
    df_long['station_name'] = df_long['station_name'].str.replace('ISPU_PM25_', '').str.replace('ISPU PM25_', '')
    
    # Mengonversi 'datetime' ke format tanggal (Excel biasanya sudah benar,
    # tapi 'mixed' tetap aman jika ada format teks)
    df_long['datetime'] = pd.to_datetime(df_long['datetime'], format='mixed')
    
    # Mengonversi nilai '0' dan string kosong menjadi NaN (data hilang)
    df_long['ispu_pm25'] = pd.to_numeric(df_long['ispu_pm25'], errors='coerce') 
    df_long['ispu_pm25'] = df_long['ispu_pm25'].replace(0, np.nan) 
    
    # Menyortir data berdasarkan stasiun dan tanggal
    df_long = df_long.sort_values(by=['station_name', 'datetime'])

    print("Data berhasil di-transformasi ke format 'long'.")
    print("Mengganti nilai 0 dan kosong menjadi NaN...")

    # --- 4. INTERPOLASI LINEAR (PER STASIUN) ---
    
    print("Melakukan interpolasi linear per stasiun...")

    # Grup berdasarkan stasiun dan lakukan interpolasi
    df_long['ispu_pm25'] = df_long.groupby('station_name')['ispu_pm25'].transform(
        lambda x: x.interpolate(method='linear').bfill().ffill()
    )
    
    df_long['ispu_pm25'] = df_long['ispu_pm25'].round(2)

    # --- 5. SIMPAN HASIL ---
    
    df_final = df_long.sort_values(by=['datetime', 'station_name'])
    
    df_final.to_csv(output_path, index=False)
    
    print("\n--- PROSES SELESAI ---")
    print(f"Data hasil transformasi dan interpolasi telah disimpan ke:\n{output_path}")

    print("\nContoh 10 baris pertama data hasil akhir:")
    print(df_final.head(10))

except FileNotFoundError:
    print(f"ERROR: File tidak ditemukan di path:\n{input_path}")
except ImportError:
    print("ERROR: Library 'openpyxl' tidak ditemukan.")
    print("Harap jalankan 'pip install openpyxl' di terminal Anda.")
except Exception as e:
    print(f"Terjadi error: {e}")

Membaca file: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\01. Data PM25\processed_data\rawdata_pm25.xlsx
Kolom stasiun yang ditemukan: ['ISPU_PM25_BUNDARANHI', 'ISPU_PM25_JAGAKARSA', 'ISPU_PM25_KEBUNJERUK', 'ISPU PM25_LUBANGBUAYA', 'ISPU_PM25_KELAPAGADING']
Data berhasil di-transformasi ke format 'long'.
Mengganti nilai 0 dan kosong menjadi NaN...
Melakukan interpolasi linear per stasiun...

--- PROSES SELESAI ---
Data hasil transformasi dan interpolasi telah disimpan ke:
C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\01. Data PM25\processed_data\final_ispupm25.csv

Contoh 10 baris pertama data hasil akhir:
       datetime  station_name  ispu_pm25
0    2022-01-01    BUNDARANHI      53.00
1096 2022-01-01     JAGAKARSA      52.75
2192 2022-01-01    KEBUNJERUK      51.75
4384 2022-01-01  KELAPAGADING      86.55
3288 2022-01-01   LUBANGBUAYA      50.75
1    2022-01-02    BUNDARANHI      53.00
1097 2022-01-02     JAGAKARSA      52.75
21

In [4]:
df.describe()

Unnamed: 0,datetime,ISPU_PM25_BUNDARANHI,ISPU_PM25_JAGAKARSA,ISPU_PM25_KEBUNJERUK,ISPU PM25_LUBANGBUAYA,ISPU_PM25_KELAPAGADING
count,1096,895.0,895.0,895.0,895.0,994.0
mean,2023-07-02 12:00:00,70.16779,71.611575,56.498878,72.479265,67.64451
min,2022-01-01 00:00:00,0.0,0.0,0.0,0.0,0.719603
25%,2022-10-01 18:00:00,55.575188,62.027778,4.611111,57.574405,54.091129
50%,2023-07-02 12:00:00,73.5,72.611111,68.8,75.555556,69.525322
75%,2024-04-01 06:00:00,86.088346,82.90873,84.95,88.25,82.393786
max,2024-12-31 00:00:00,125.166667,137.166667,153.777778,131.722222,109.554816
std,,21.520937,19.450265,37.947254,20.705581,19.614856


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1096 entries, 0 to 1095
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   datetime                1096 non-null   datetime64[ns]
 1   ISPU_PM25_BUNDARANHI    895 non-null    float64       
 2   ISPU_PM25_JAGAKARSA     895 non-null    float64       
 3   ISPU_PM25_KEBUNJERUK    895 non-null    float64       
 4   ISPU PM25_LUBANGBUAYA   895 non-null    float64       
 5   ISPU_PM25_KELAPAGADING  994 non-null    float64       
dtypes: datetime64[ns](1), float64(5)
memory usage: 51.5 KB


In [6]:
df.isnull().sum()

datetime                    0
ISPU_PM25_BUNDARANHI      201
ISPU_PM25_JAGAKARSA       201
ISPU_PM25_KEBUNJERUK      201
ISPU PM25_LUBANGBUAYA     201
ISPU_PM25_KELAPAGADING    102
dtype: int64

In [7]:
# Transform dari long ke wide format
df_wide = df_final.pivot(index='datetime', columns='station_name', values='ispu_pm25')

# Reset index agar datetime jadi kolom biasa
df_wide = df_wide.reset_index()

# Rename columns jika perlu
df_wide.columns.name = None

print("Data setelah di-transform ke wide format:")
print(df_wide.head())
print(f"\nShape: {df_wide.shape}")
print(f"\nColumns: {df_wide.columns.tolist()}")

# Simpan hasil
df_wide.to_csv(output_path, index=False)

print(f"\nFile wide format disimpan di: {output_path}")

# Sekarang bisa cek summary statistics
print("\n" + "="*50)
print("SUMMARY STATISTICS:")
print("="*50)
print(df_wide.describe())

print("\n" + "="*50)
print("INFO:")
print("="*50)
print(df_wide.info())




Data setelah di-transform ke wide format:
    datetime  BUNDARANHI  JAGAKARSA  KEBUNJERUK  KELAPAGADING  LUBANGBUAYA
0 2022-01-01        53.0      52.75       51.75         86.55        50.75
1 2022-01-02        53.0      52.75       51.75         86.55        50.75
2 2022-01-03        53.0      52.75       51.75         86.55        50.75
3 2022-01-04        53.0      52.75       51.75         86.55        50.75
4 2022-01-05        53.0      52.75       51.75         86.55        50.75

Shape: (1096, 6)

Columns: ['datetime', 'BUNDARANHI', 'JAGAKARSA', 'KEBUNJERUK', 'KELAPAGADING', 'LUBANGBUAYA']

File wide format disimpan di: C:\Users\user\OneDrive\IPB\Thesis\02. Development\01. Data Praprocessing\01. Data PM25\processed_data\final_ispupm25.csv

SUMMARY STATISTICS:
                  datetime   BUNDARANHI    JAGAKARSA   KEBUNJERUK  \
count                 1096  1096.000000  1096.000000  1096.000000   
mean   2023-07-02 12:00:00    70.215119    71.167628    70.030201   
min    2022-01-

In [8]:
df_wide.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1096 entries, 0 to 1095
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   datetime      1096 non-null   datetime64[ns]
 1   BUNDARANHI    1096 non-null   float64       
 2   JAGAKARSA     1096 non-null   float64       
 3   KEBUNJERUK    1096 non-null   float64       
 4   KELAPAGADING  1096 non-null   float64       
 5   LUBANGBUAYA   1096 non-null   float64       
dtypes: datetime64[ns](1), float64(5)
memory usage: 51.5 KB


In [9]:
df_wide.describe()

Unnamed: 0,datetime,BUNDARANHI,JAGAKARSA,KEBUNJERUK,KELAPAGADING,LUBANGBUAYA
count,1096,1096.0,1096.0,1096.0,1096.0,1096.0
mean,2023-07-02 12:00:00,70.215119,71.167628,70.030201,66.407162,71.471743
min,2022-01-01 00:00:00,3.33,2.55,4.22,0.72,7.39
25%,2022-10-01 18:00:00,55.835,60.9825,59.3875,52.675,56.2125
50%,2023-07-02 12:00:00,73.03,71.84,68.91,68.13,75.08
75%,2024-04-01 06:00:00,84.745,81.795,82.995,81.8,86.6825
max,2024-12-31 00:00:00,125.17,137.17,153.78,109.55,131.72
std,,19.733289,17.889785,20.925447,20.22993,20.391049
