In [1]:
# ==============================================================================
# KODE GABUNGAN UNTUK EKSPLORASI DATASET TESIS (VERSI FINAL REVISI 8)
# ==============================================================================
# Perubahan:
# - Menambahkan perhitungan jumlah data outlier menggunakan metode IQR
#   (Interquartile Range).
# - Menambahkan kolom "Jumlah Outlier" dan "% Outlier" pada tabel
#   rangkuman komprehensif di akhir eksekusi.
#
# Perbaikan sebelumnya dipertahankan.
# ==============================================================================

# --- Instalasi Library (jika belum ada) ---
# !pip install neuralforecast statsmodels requests pandas numpy matplotlib seaborn -q
# print("Instalasi library eksternal selesai.")

# --- Import Libraries Utama ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from scipy.stats import describe
import requests
from io import StringIO
import os
import traceback
import random
import warnings

# Silence FutureWarning and UserWarning messages for the rest of the session so
# that the printed analysis output is not interleaved with library warnings.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

print("Library Python berhasil diimpor.")

# --- Configuration & seed ---
# One fixed seed is applied to both NumPy's global RNG and the stdlib `random`
# module.
seed = 42
np.random.seed(seed)
random.seed(seed)

# --- DATASET CONFIGURATION ---
# Maps a dataset index to the settings consumed by the functions in this file:
#   'name'                 - display name; also used to build the CSV series id.
#   'type'                 - 'csv' or 'tsf'; selects the parser in load_and_parse_data.
#   'source'               - URL or local path handed to load_data_from_source.
#   'freq'                 - pandas frequency alias used to rebuild timestamps and
#                            to pick the seasonal period for decomposition.
#   'parser_variant'       - forwarded to parse_tsf_data (None for CSV entries).
#   'value_column' /
#   'time_column'          - column names forwarded to parse_csv_data (None for TSF).
#   'justification_points' - free-text bullet points printed by explore_dataset.
#   'notes'                - optional free-text remark printed by explore_dataset.
DATASET_CONFIG = {
    1: {
        'name': 'Bike Sharing', 'type': 'csv',
        'source': "https://raw.githubusercontent.com/kanadakurniawan/loss-function-comparison/8ae1f330d2d94645a6b647ab357fa786a5e1f956/dataset/bike-sharing.csv",
        'freq': 'H', 'parser_variant': None, 'value_column': 'cnt', 'time_column': 'datetime',
        'justification_points': ["Frekuensi tinggi (jam)", "Domain transportasi", "Musiman ganda (harian, mingguan)", "Pengaruh faktor eksternal (cuaca)"],
    },
    2: {
        'name': 'Pasut BMKG', 'type': 'csv',
        'source': 'https://raw.githubusercontent.com/kanadakurniawan/loss-function-comparison/6912fb758bcf4f6984c3d09c70dee9972b987a4b/dataset/db-2022-2024-pasut.csv',
        'freq': 'H', 'parser_variant': None, 'value_column': 'pasut', 'time_column': 'datetime',
        'justification_points': ["Frekuensi tinggi (jam)", "Domain lingkungan/oseanografi", "Pola musiman sangat kuat dan reguler (pasang surut)", "Data dunia nyata dari BMKG"],
    },
    3: {
        'name': 'Parking Birmingham', 'type': 'csv',
        'source': 'https://raw.githubusercontent.com/kanadakurniawan/loss-function-comparison/8ae1f330d2d94645a6b647ab357fa786a5e1f956/dataset/parking-birmingham.csv',
        'freq': '30min', 'parser_variant': None, 'value_column': 'Occupancy', 'time_column': 'datetime',
        'justification_points': ["Frekuensi tinggi (30 menit)", "Domain urban/transportasi", "Potensi pola harian dan mingguan yang kompleks", "Mengandung nilai nol atau mendekati nol (malam hari)"],
    },
    4: {
        'name': 'Solar Power Generation', 'type': 'csv',
        'source': 'https://raw.githubusercontent.com/kanadakurniawan/loss-function-comparison/8ae1f330d2d94645a6b647ab357fa786a5e1f956/dataset/Actual_31.85_-110.85_2006_UPV_100MW_5_Min.csv',
        'freq': '5min', 'parser_variant': None, 'value_column': 'Power(MW)', 'time_column': 'datetime',
        'justification_points': ["Frekuensi sangat tinggi (5 menit)", "Domain energi terbarukan", "Intermittency tinggi (nilai nol di malam hari)", "Sangat dipengaruhi cuaca (volatilitas)"],
    },
    5: {
        'name': 'Cacar Air Hungaria', 'type': 'csv',
        'source': 'https://raw.githubusercontent.com/kanadakurniawan/loss-function-comparison/8ae1f330d2d94645a6b647ab357fa786a5e1f956/dataset/cacar-air-hungaria.csv',
        'freq': 'W', 'parser_variant': None, 'value_column': 'BUDAPEST', 'time_column': 'datetime',
        'justification_points': ["Frekuensi rendah (mingguan)", "Domain epidemiologi/kesehatan", "Musiman tahunan yang jelas", "Contoh data count (jumlah kasus)"],
        'notes': "File ini berisi banyak series (per wilayah). Kode ini akan menganalisis kolom 'BUDAPEST' sebagai contoh."
    },
    6: {
        'name': 'M4 Hourly Dataset', 'type': 'tsf',
        'source': 'https://raw.githubusercontent.com/kanadakurniawan/loss-function-comparison/8f605f9fbc107e6303174d3f615a5d591785d55e/dataset/m4_hourly_dataset.tsf',
        'freq': 'H', 'parser_variant': 'standard', 'value_column': None, 'time_column': None,
        'justification_points': ["Frekuensi tinggi (jam)", "Benchmark klasik dari kompetisi M4", "Berisi beragam jenis data (industri, finansial, dll.)", "Tantangan dalam noise dan variabilitas pola"],
    }
}
# Imputation strategy name passed to handle_nan_values by explore_dataset.
NAN_IMPUTATION_METHOD = 'ffill_bfill'
# NOTE(review): nothing in this file reads this variable; presumably it is
# consumed by the neuralforecast/Nixtla libraries - TODO confirm it is still needed.
os.environ['NIXTLA_ID_AS_COL'] = '1'

print("Konfigurasi dataset berhasil diperbarui.")

# --- Fungsi Helper (Loading, Parsing, Cleaning, Preparing) ---

def load_data_from_source(source_path_or_url):
    """Return the text content of a remote URL or of a local file.

    Args:
        source_path_or_url: An ``http://``/``https://`` URL or a local file
            path (anything that ``str()`` turns into one of those).

    Returns:
        The decoded text, or ``None`` when the download fails, the local file
        does not exist, or the file cannot be read. Failures are reported with
        ``print`` instead of being raised.
    """
    source_str = str(source_path_or_url)

    # Match the full scheme: a bare "http" prefix would also capture local
    # files whose names merely start with those letters (e.g. "http_dump.csv").
    if source_str.lower().startswith(('http://', 'https://')):
        print(f"Mengunduh data dari: {source_str}")
        try:
            response = requests.get(source_str, timeout=60)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error mengunduh data: {e}")
            return None
        print("Data berhasil diunduh.")
        encoding = response.encoding or 'utf-8'
        try:
            return response.content.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # LookupError covers a charset name Python does not know.
            # ISO-8859-1 maps every byte, so this fallback cannot fail.
            return response.content.decode('iso-8859-1')

    if not os.path.exists(source_str):
        print(f"Error: File tdk ditemukan di {source_str}")
        return None
    try:
        try:
            with open(source_str, 'r', encoding='utf-8') as f:
                return f.read()
        except UnicodeDecodeError:
            # Retry inside the outer handler so an I/O error raised by the
            # fallback read is reported instead of escaping.
            with open(source_str, 'r', encoding='iso-8859-1') as f:
                return f.read()
    except OSError as e:
        print(f"Error membaca file lokal: {e}")
        return None

def _parse_tsf_timestamp(text):
    """Parse a TSF start timestamp, trying the ``YYYY-MM-DD HH-MM-SS`` layout first."""
    text = text.strip()
    try:
        # TSF files write the time part with dashes; generic date inference is
        # not guaranteed to understand that, so try the explicit layout first.
        return pd.to_datetime(text, format="%Y-%m-%d %H-%M-%S")
    except (ValueError, TypeError):
        return pd.to_datetime(text)


def parse_tsf_data(raw_content, parser_variant):
    """Parse the text of a ``.tsf`` file into parallel lists.

    Only lines after ``@data`` are read. Each data line is expected to look
    like ``series_name:start_timestamp:v1,v2,...`` where ``?`` marks a missing
    value (stored as NaN). Lines that cannot be parsed are counted and skipped.

    Args:
        raw_content: Full text of the TSF file.
        parser_variant: Label echoed in the progress message; it does not
            change how the file is parsed.

    Returns:
        A tuple ``(series_ids, start_times, parsed_series)`` of equally long
        lists: series names, start timestamps, and lists of float values.
    """
    header_prefixes = ("#", "@relation", "@attribute", "@frequency",
                       "@horizon", "@missing", "@equallength")
    parsed_series, series_ids, start_times = [], [], []
    print(f"Memulai parsing TSF (varian: {parser_variant})...")
    reading_data = False
    skipped_lines = 0

    for line in raw_content.splitlines():
        line = line.strip()
        if not line or line.startswith(header_prefixes):
            continue
        if line.startswith("@data"):
            reading_data = True
            continue
        if not reading_data:
            continue

        parts = line.split(":")
        if len(parts) < 3:
            skipped_lines += 1
            continue
        series_name, start_time_str, values_str = parts[0], parts[1], parts[2]
        try:
            start_time = _parse_tsf_timestamp(start_time_str)
            # Empty tokens (e.g. from a trailing comma) are dropped; '?' is NaN.
            time_series = [
                np.nan if token == '?' else float(token)
                for token in (raw.strip() for raw in values_str.split(","))
                if token
            ]
        except (ValueError, TypeError, OverflowError):
            skipped_lines += 1
            continue

        if not time_series:
            skipped_lines += 1
            continue
        parsed_series.append(time_series)
        series_ids.append(series_name)
        start_times.append(start_time)

    if skipped_lines > 0:
        print(f"Peringatan: Melewati {skipped_lines} baris saat parsing TSF.")
    print(f"Parsing TSF selesai. {len(series_ids)} series berhasil diparsing.")
    return series_ids, start_times, parsed_series

def parse_csv_data(raw_content, time_col, value_col, dataset_name):
    """Parse CSV text into a single time series sorted by timestamp.

    Args:
        raw_content: Full text of the CSV file.
        time_col: Name of the timestamp column.
        value_col: Name of the value column; unparseable entries become NaN.
        dataset_name: Used to build the series id (spaces become underscores,
            suffix ``_Series``).

    Returns:
        ``([series_id], [first_timestamp], [values])`` on success, or
        ``([], [], [])`` after printing the error when anything fails
        (including a missing column).
    """
    print(f"Memulai parsing CSV (time: '{time_col}', value: '{value_col}')...")
    try:
        frame = pd.read_csv(StringIO(raw_content))

        # Validate both required columns before touching the data.
        if time_col not in frame.columns:
            raise ValueError(f"Kolom waktu '{time_col}' TIDAK DITEMUKAN. Kolom: {frame.columns.tolist()}")
        if value_col not in frame.columns:
            raise ValueError(f"Kolom nilai '{value_col}' TIDAK DITEMUKAN. Kolom: {frame.columns.tolist()}")

        frame[time_col] = pd.to_datetime(frame[time_col])
        frame[value_col] = pd.to_numeric(frame[value_col], errors='coerce')
        ordered = frame.sort_values(by=time_col).reset_index(drop=True)

        series_id = dataset_name.replace(' ', '_') + "_Series"
        first_timestamp = ordered[time_col].iloc[0]
        values = ordered[value_col].tolist()
        return [series_id], [first_timestamp], [values]
    except Exception as e:
        print(f"Error parsing CSV: {e}\n{traceback.format_exc()}")
        return [], [], []

def load_and_parse_data(dataset_index, config):
    """Load the configured source for one dataset and parse it.

    Args:
        dataset_index: Key into ``config``.
        config: Mapping of dataset index to a settings dict with the keys
            ``name``, ``source``, ``type``, ``freq`` and, depending on the
            type, ``parser_variant`` or ``time_column``/``value_column``.

    Returns:
        ``(series_ids, start_times, series_values, freq, dataset_name)``.
        All five items are ``None`` when the index is not configured, the
        source cannot be loaded, or the dataset type is unsupported. When
        parsing itself fails the three lists are empty but ``freq`` and
        ``dataset_name`` are still returned.
    """
    failure = (None, None, None, None, None)

    cfg = config.get(dataset_index)
    if cfg is None:
        # Without this guard the lookup below raises TypeError on None.
        print(f"Error: Config u/ dataset index {dataset_index} tdk ditemukan.")
        return failure

    dataset_name = cfg['name']
    print(f"\n-- Memuat Dataset {dataset_index}: {dataset_name} --")
    raw_content = load_data_from_source(cfg['source'])
    if raw_content is None:
        return failure

    if cfg['type'] == 'tsf':
        parsed = parse_tsf_data(raw_content, cfg['parser_variant'])
    elif cfg['type'] == 'csv':
        parsed = parse_csv_data(raw_content, cfg['time_column'], cfg['value_column'], dataset_name)
    else:
        print(f"Error: Tipe dataset '{cfg['type']}' tdk didukung.")
        return failure
    return (*parsed, cfg['freq'], dataset_name)

def select_series(all_ids, all_start_times, all_series_data, index_or_name, dataset_name):
    """Pick one series from parallel lists by position or by id.

    Args:
        all_ids: Series identifiers.
        all_start_times: Start timestamp of each series, aligned with ``all_ids``.
        all_series_data: Values of each series, aligned with ``all_ids``.
        index_or_name: Zero-based position (int) or series id (str).
        dataset_name: Only used in the progress message.

    Returns:
        ``(series_id, start_time, values)`` of the selected series.

    Raises:
        ValueError: If there are no series, the position is out of range, or
            the id is unknown.
        TypeError: If ``index_or_name`` is neither int nor str.
    """
    if not all_ids:
        raise ValueError("Tdk ada data series tersedia u/ dipilih.")

    if isinstance(index_or_name, int):
        last_valid = len(all_ids) - 1
        if index_or_name < 0 or index_or_name > last_valid:
            raise ValueError(f"Indeks {index_or_name} di luar jangkauan (0-{last_valid}).")
        position = index_or_name
    elif isinstance(index_or_name, str):
        try:
            position = all_ids.index(index_or_name)
        except ValueError:
            raise ValueError(f"Nama series '{index_or_name}' tdk ditemukan.")
    else:
        raise TypeError("'index_or_name' hrs int/str.")

    chosen_id = all_ids[position]
    print(f"\nMemilih series: '{chosen_id}' (Index: {position}) dari '{dataset_name}'")
    return chosen_id, all_start_times[position], all_series_data[position]

def handle_nan_values(ts, method='ffill_bfill'):
    """Fill missing values in a sequence and report how many there were.

    Args:
        ts: Sequence of numbers; ``None``/NaN entries count as missing.
        method: Imputation strategy. Only ``'ffill_bfill'`` (forward fill,
            then backward fill) is implemented; any other name triggers a
            warning and falls back to it.

    Returns:
        ``(values, initial_nan_count)`` where ``values`` is a list of floats
        without NaN and ``initial_nan_count`` is the number of missing entries
        in the input. A series made only of missing values is filled with 0.
    """
    ts_series = pd.Series(ts, dtype=float)
    initial_nan_count = ts_series.isna().sum()
    if initial_nan_count == 0:
        print("Tidak ada nilai NaN.")
        return ts_series.tolist(), initial_nan_count

    if method != 'ffill_bfill':
        # Make the fallback visible instead of silently ignoring the request.
        print(f"Peringatan: Metode imputasi '{method}' tdk dikenal; menggunakan 'ffill_bfill'.")
        method = 'ffill_bfill'
    print(f"Menangani {initial_nan_count}/{len(ts_series)} NaN dgn metode: {method}")

    filled_ts = ts_series.ffill().bfill()
    # ffill/bfill leave NaN only when every value was missing; use 0 then.
    filled_ts = filled_ts.fillna(0)
    print("Semua NaN berhasil ditangani.")
    return filled_ts.tolist(), initial_nan_count

def prepare_dataframe_for_neuralforecast(time_series, unique_id, start_time, freq):
    """Build a long-format frame with columns ``ds``, ``y`` and ``unique_id``.

    Timestamps are generated as a regular range starting at ``start_time``
    with step ``freq`` and one entry per value in ``time_series``.
    """
    n_obs = len(time_series)
    ds = pd.date_range(start=start_time, periods=n_obs, freq=freq)
    frame = pd.DataFrame({"ds": ds, "y": time_series})
    return frame.assign(unique_id=unique_id)

# --- Fungsi Analisis & Visualisasi ---

def plot_time_series_analysis(df, series_name, freq):
    """Show the line plot, seasonal decomposition and ACF/PACF of one series.

    Args:
        df: Frame with a ``y`` column, indexed by timestamp or carrying a
            ``ds`` column that is promoted to the index.
        series_name: Label used in titles and messages.
        freq: Frequency alias used to look up the seasonal period; unknown
            aliases skip the decomposition.

    The decomposition and correlation plots use at most the last
    ``3 * 365 * 24`` observations.
    """
    if not isinstance(df.index, pd.DatetimeIndex):
        df = df.set_index('ds')
    y = df['y']
    print(f"\n--- Analisis Visual Time Series: {series_name} ---")

    # 1) Full-length line plot.
    plt.figure(figsize=(15, 6))
    plt.plot(y, label=f'Observed ({series_name})', linewidth=0.8)
    plt.title(f'Time Series Plot: {series_name}')
    plt.xlabel('Timestamp')
    plt.ylabel('Value')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # 2) Restrict the detailed analysis to the tail of very long series.
    MAX_POINTS_FOR_DETAIL = 3 * 365 * 24
    is_subset = len(y) > MAX_POINTS_FOR_DETAIL
    y_analysis = y.tail(MAX_POINTS_FOR_DETAIL) if is_subset else y
    if is_subset:
        print(f"INFO: Analisis detail dilakukan pada {MAX_POINTS_FOR_DETAIL} poin terakhir.")
    subset_tag = " (Subset)" if is_subset else ""

    # 3) Additive seasonal decomposition; needs more than two full periods.
    seasonal_periods = {'5min': 12*24, '30min': 48, 'H': 24, 'D': 7, 'W': 52, 'MS': 12}
    period = seasonal_periods.get(freq)
    if period and len(y_analysis) > 2 * period:
        print(f"Melakukan dekomposisi (Model Aditif, Periode={period})...")
        try:
            decomposition = seasonal_decompose(y_analysis.dropna(), model='additive', period=period)
            fig = decomposition.plot()
            fig.set_size_inches(15, 10)
            plt.suptitle(f'Dekomposisi: {series_name}{subset_tag}', y=1.01)
            plt.show()
        except Exception as e:
            print(f"  Gagal dekomposisi: {e}")
    else:
        print("Dekomposisi musiman dilewati.")

    # 4) ACF / PACF, capped at 48 lags and below half the sample size.
    lags = min(48, len(y_analysis) // 2 - 1)
    if lags > 5:
        print(f"Melakukan plotting ACF dan PACF (lags={lags})...")
        fig, axes = plt.subplots(1, 2, figsize=(15, 5))
        observed = y_analysis.dropna()
        plot_acf(observed, lags=lags, ax=axes[0])
        plot_pacf(observed, lags=lags, ax=axes[1], method='ywm')
        plt.suptitle(f'ACF & PACF: {series_name}{subset_tag}', y=1.01)
        plt.show()

def calculate_and_display_statistics(df, series_name, initial_nan_count):
    """Print descriptive statistics, plot the distribution, return a summary.

    Args:
        df: Frame with a ``y`` column holding the series values.
        series_name: Label used in messages and plot titles.
        initial_nan_count: Missing-value count before imputation; copied into
            the summary unchanged.

    Returns:
        Dict with the keys ``Observasi``, ``Mean``, ``Std Dev``, ``Min``,
        ``Max``, ``Skewness``, ``Kurtosis``, ``Nilai Nol``, ``% Nol``,
        ``Missing (Awal)``, ``Jumlah Outlier`` and ``% Outlier``.
    """
    y = df['y']
    n_total = len(y)
    print(f"\n--- Analisis Statistik & Distribusi: {series_name} ---")

    desc_stats = y.describe()
    print("\nStatistik Deskriptif:")
    print(desc_stats.to_string())

    skewness = y.skew()
    kurtosis = y.kurt()
    zero_count = (y == 0).sum()
    zero_pct = zero_count / n_total * 100

    # Outliers by the IQR rule: anything beyond 1.5 * IQR from the quartiles.
    q1 = y.quantile(0.25)
    q3 = y.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    is_outlier = (y < lower_bound) | (y > upper_bound)
    outlier_count = is_outlier.sum()
    outlier_pct = outlier_count / n_total * 100

    print(f"\n  Skewness: {skewness:.4f}, Kurtosis: {kurtosis:.4f}")
    print(f"  Jumlah Nilai Nol: {zero_count} ({zero_pct:.2f}%)")
    print(f"  Jumlah Outlier (IQR): {outlier_count} ({outlier_pct:.2f}%)")

    # Histogram with KDE on the left, box plot on the right.
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    hist_ax, box_ax = axes[0], axes[1]
    sns.histplot(y.dropna(), kde=True, bins=50, ax=hist_ax)
    hist_ax.set_title(f'Distribusi Nilai: {series_name}')
    sns.boxplot(y=y.dropna(), ax=box_ax)
    box_ax.set_title(f'Box Plot Nilai: {series_name}')
    plt.tight_layout()
    plt.show()

    return {
        'Observasi': int(desc_stats['count']),
        'Mean': desc_stats['mean'],
        'Std Dev': desc_stats['std'],
        'Min': desc_stats['min'],
        'Max': desc_stats['max'],
        'Skewness': skewness,
        'Kurtosis': kurtosis,
        'Nilai Nol': zero_count,
        '% Nol': zero_pct,
        'Missing (Awal)': initial_nan_count,
        'Jumlah Outlier': outlier_count,
        '% Outlier': outlier_pct,
    }

# Progress marker printed once the helper definitions above have executed.
print("Fungsi helper eksplorasi dimuat.")

# --- Fungsi Utama untuk Menjalankan Analisis per Dataset ---
def explore_dataset(dataset_index, series_index_or_name=0):
    """Run the full exploration pipeline for one configured dataset.

    Steps: load and parse the source, pick a series, impute missing values,
    rebuild a timestamped frame, print statistics and show the plots.

    Args:
        dataset_index: Key into ``DATASET_CONFIG``.
        series_index_or_name: Position or id of the series to analyse. It is
            only honoured for ``'tsf'`` datasets; CSV datasets always use
            their single series (position 0).

    Returns:
        A summary dict (dataset name, series id, date range, frequency and
        the statistics returned by ``calculate_and_display_statistics``), or
        ``None`` when any required stage fails. Failures are printed, not
        raised, so a batch run can continue with the next dataset.
    """
    config = DATASET_CONFIG.get(dataset_index)
    if not config:
        print(f"Error: Config u/ dataset index {dataset_index} tdk ditemukan.")
        return None

    print(f"\n{'='*80}\n Memulai Eksplorasi Dataset {dataset_index}: {config['name']} \n{'='*80}")
    print("Justifikasi Kunci:")
    for point in config.get('justification_points', []):
        print(f"- {point}")
    if 'notes' in config:
        print(f"Catatan: {config['notes']}")
    print("-" * 30)

    all_ids, all_starts, all_series, freq, name = load_and_parse_data(dataset_index, DATASET_CONFIG)
    if not all_ids:
        return None

    try:
        selector = series_index_or_name if config['type'] == 'tsf' else 0
        selected_id, start_time, ts_raw = select_series(all_ids, all_starts, all_series, selector, name)
    except (ValueError, IndexError, TypeError) as e:
        print(f"Error saat memilih series: {e}")
        return None

    ts_cleaned, initial_nan_count = handle_nan_values(ts_raw, method=NAN_IMPUTATION_METHOD)
    if not ts_cleaned:
        print("Error: Data kosong setelah cleaning.")
        return None

    try:
        df = prepare_dataframe_for_neuralforecast(ts_cleaned, selected_id, start_time, freq)
        df = df.set_index('ds')
        print(f"DataFrame siap ({len(df)} baris). Contoh:\n{df.head().to_string()}")
    except Exception as e:
        print(f"Error saat mempersiapkan DataFrame: {e}")
        return None

    # Guard the analysis stages like the stages above so that one failing
    # dataset does not abort the whole batch and lose the final summary.
    try:
        stats_summary = calculate_and_display_statistics(df, selected_id, initial_nan_count)
    except Exception as e:
        print(f"Error saat menghitung statistik: {e}\n{traceback.format_exc()}")
        return None
    try:
        plot_time_series_analysis(df, selected_id, freq)
    except Exception as e:
        # Plots are supplementary: report the problem but keep the summary.
        print(f"Error saat membuat plot analisis: {e}\n{traceback.format_exc()}")

    full_summary = {
        'Dataset': config['name'], 'Series ID': selected_id,
        'Start Date': df.index.min().strftime('%Y-%m-%d'),
        'End Date': df.index.max().strftime('%Y-%m-%d'),
        'Frekuensi': freq, **stats_summary
    }
    print(f"\n{'='*80}\n Eksplorasi Selesai: {config['name']} (Series: {selected_id})\n{'='*80}\n\n")
    return full_summary

# --- FUNGSI BARU UNTUK MENAMPILKAN RANGKUMAN AKHIR ---
def display_final_summary(summaries):
    """Print one table row per dataset summary.

    Args:
        summaries: List of summary dicts. Each must contain the numeric keys
            ``Mean``, ``Std Dev``, ``Min``, ``Max``, ``Skewness``,
            ``Kurtosis``, ``% Nol`` and ``% Outlier``. The dicts themselves
            are not modified.

    Prints a notice and returns when ``summaries`` is empty.
    """
    if not summaries:
        print("Tidak ada data untuk dirangkum.")
        return

    table = pd.DataFrame(summaries)

    # Thousands separator with two decimals for plain numbers,
    # two decimals plus a percent sign for the share columns.
    as_number = '{:,.2f}'.format
    as_percent = '{:.2f}%'.format
    for column in ('Mean', 'Std Dev', 'Min', 'Max', 'Skewness', 'Kurtosis'):
        table[column] = table[column].map(as_number)
    for column in ('% Nol', '% Outlier'):
        table[column] = table[column].map(as_percent)

    width = 140
    rule = '=' * width
    print(f"\n{rule}")
    print('RANGKUMAN KOMPREHENSIF SEMUA DATASET'.center(width))
    print(rule)
    print(table.to_string())
    print(rule)

# ==============================================================================
# EKSEKUSI ANALISIS PER DATASET (VERSI OTOMATIS)
# ==============================================================================
# Indices (keys of DATASET_CONFIG) to analyse, in this order.
datasets_to_run = [1, 2, 3, 4, 5, 6]
# Per-dataset series selector (position or id). explore_dataset only honours
# it for 'tsf' datasets; indices missing here default to 0 in the loop below.
series_selectors = {
    6: 0
}
# Collects the summary dict of every dataset that was explored successfully.
all_summaries = []

# --- Execution loop ---
for index in datasets_to_run:
    selector = series_selectors.get(index, 0)
    summary = explore_dataset(dataset_index=index, series_index_or_name=selector)
    # explore_dataset returns None on failure; such datasets are left out.
    if summary:
        all_summaries.append(summary)

# --- Show the final summary table ---
display_final_summary(all_summaries)

print("\n=== SEMUA EKSEKUSI DAN RANGKUMAN TELAH SELESAI ===")

Output hidden; open in https://colab.research.google.com to view.