# Prediksi Konsumsi Listrik Kota L
Notebook ini berisi langkah-langkah memuat data, pra-pemrosesan, pelatihan model XGBoost, LSTM multivariat, ensemble, evaluasi RMSE, dan menyimpan prediksi sesuai format submission.

In [1]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

print('Libraries loaded')

Libraries loaded


In [2]:
# Load data
df_train = pd.read_csv('data_train.csv', parse_dates=['tanggal_waktu'])
df_test = pd.read_csv('data_test.csv', parse_dates=['tanggal_waktu'])
submission = pd.read_csv('submission_format.csv')

print('Data loaded:', df_train.shape, df_test.shape, submission.shape)

Data loaded: (24336, 8) (2160, 7) (2160, 2)


In [3]:
# Pra-pemrosesan data
# Ganti nilai 99999 menjadi NaN
df_train['konsumsi_listrik'] = df_train['konsumsi_listrik'].replace(99999, np.nan)
# Urutkan berdasarkan waktu dan interpolasi linear
df_train = df_train.sort_values('tanggal_waktu').set_index('tanggal_waktu')
df_train['konsumsi_listrik'] = df_train['konsumsi_listrik'].interpolate()

# Label encoding untuk kolom kategorikal
le = LabelEncoder()
df_train['dampak_env_enc'] = le.fit_transform(df_train['dampak_lingkungan'])
df_test = df_test.sort_values('tanggal_waktu').set_index('tanggal_waktu')
df_test['dampak_env_enc'] = le.transform(df_test['dampak_lingkungan'])

# Ekstraksi fitur waktu
for df in [df_train, df_test]:
    df['hour'] = df.index.hour
    df['dayofweek'] = df.index.dayofweek

# Buat fitur lag pada train
df_train['lag1'] = df_train['konsumsi_listrik'].shift(1)
df_train['lag24'] = df_train['konsumsi_listrik'].shift(24)
df_train = df_train.dropna()

features = ['suhu', 'beban_listrik', 'kecepatan_angin', 'tekanan_udara', 'dampak_env_enc', 'hour', 'dayofweek', 'lag1', 'lag24']
X = df_train[features]
y = df_train['konsumsi_listrik']

print('Preprocessing selesai, fitur shape:', X.shape)

Preprocessing selesai, fitur shape: (24312, 9)


In [4]:
# Split data train dan validasi
total = len(df_train)
idx = int(total * 0.8)
X_tr, X_val = X.iloc[:idx], X.iloc[idx:]
y_tr, y_val = y.iloc[:idx], y.iloc[idx:]

print('Train shape:', X_tr.shape, 'Validation shape:', X_val.shape)

Train shape: (19449, 9) Validation shape: (4863, 9)


In [5]:
# XGBoost Regressor
xgb_model = XGBRegressor(n_estimators=200, max_depth=6, random_state=42)
xgb_model.fit(X_tr, y_tr)
pred_xgb_val = xgb_model.predict(X_val)
rmse_xgb = np.sqrt(mean_squared_error(y_val, pred_xgb_val))
print('RMSE XGBoost:', rmse_xgb)

RMSE XGBoost: 34.30677572861834


In [6]:
# LSTM Multivariat
scaler = StandardScaler()
data_scaled = scaler.fit_transform(df_train[features])

def create_sequences(data, target, seq_len=24):
    Xs, ys = [], []
    for i in range(seq_len, len(data)):
        Xs.append(data[i-seq_len:i])
        ys.append(target.values[i])
    return np.array(Xs), np.array(ys)

X_seq, y_seq = create_sequences(data_scaled, df_train['konsumsi_listrik'], seq_len=24)
split_seq = int(len(X_seq) * 0.8)
X_tr_seq, X_val_seq = X_seq[:split_seq], X_seq[split_seq:]
y_tr_seq, y_val_seq = y_seq[:split_seq], y_seq[split_seq:]

model_lstm = Sequential([
    LSTM(50, input_shape=(X_tr_seq.shape[1], X_tr_seq.shape[2])),
    Dense(1)
])
model_lstm.compile(optimizer='adam', loss='mse')
es = EarlyStopping(patience=3, restore_best_weights=True)
model_lstm.fit(X_tr_seq, y_tr_seq, epochs=10, batch_size=32, validation_data=(X_val_seq, y_val_seq), callbacks=[es])
pred_lstm_val = model_lstm.predict(X_val_seq).flatten()
rmse_lstm = np.sqrt(mean_squared_error(y_val_seq, pred_lstm_val))
print('RMSE LSTM:', rmse_lstm)

  super().__init__(**kwargs)


Epoch 1/10
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 14ms/step - loss: 93558.4688 - val_loss: 75991.7500
Epoch 2/10
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - loss: 74035.3750 - val_loss: 61885.4922
Epoch 3/10
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - loss: 61447.9648 - val_loss: 50158.8359
Epoch 4/10
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - loss: 49439.1445 - val_loss: 40404.3477
Epoch 5/10
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - loss: 40004.5547 - val_loss: 32394.7402
Epoch 6/10
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - loss: 32223.8281 - val_loss: 25947.6523
Epoch 7/10
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - loss: 26687.3633 - val_loss: 20911.3770
Epoch 8/10
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - loss: 2

In [7]:
# Ensemble (rata-rata prediksi XGB dan LSTM)
pred_xgb_for_ens = pred_xgb_val[-len(pred_lstm_val):]
pred_ensemble_val = (pred_xgb_for_ens + pred_lstm_val) / 2
rmse_ensemble = np.sqrt(mean_squared_error(y_val_seq, pred_ensemble_val))
print('RMSE Ensemble:', rmse_ensemble)

# Tabel ringkasan RMSE
rmse_df = pd.DataFrame({
    'Model': ['XGBoost', 'LSTM Multivariat', 'Ensemble'],
    'RMSE': [rmse_xgb, rmse_lstm, rmse_ensemble]
})
print(rmse_df)

RMSE Ensemble: 64.61502777547538
              Model        RMSE
0           XGBoost   34.306776
1  LSTM Multivariat  112.125757
2          Ensemble   64.615028


In [8]:
# Prediksi pada data test menggunakan rolling forecast
df_test_full = df_test.copy()
# Siapkan kolom lag
df_test_full['lag1'] = np.nan
df_test_full['lag24'] = np.nan

# Mulai dari nilai terakhir train untuk lag
last_values = df_train['konsumsi_listrik'].values[-24:]

preds = []
for idx, (ts, row) in enumerate(df_test_full.iterrows()):
    if idx == 0:
        lag1 = df_train['konsumsi_listrik'].iloc[-1]
        lag24 = last_values[0]
    else:
        lag1 = preds[-1]
        lag24 = preds[-24] if idx >= 24 else last_values[idx]
    features_row = np.array([[
        row['suhu'], row['beban_listrik'], row['kecepatan_angin'], row['tekanan_udara'],
        row['dampak_env_enc'], row['hour'], row['dayofweek'], lag1, lag24
    ]])
    # Prediksi XGB
    p_xgb = xgb_model.predict(features_row)[0]
    # Prediksi LSTM: perlu scaling dan membuat sequence terakhir
    seq_input = np.vstack([data_scaled[-24+idx:]]).copy()  # adjust if needed
    # Simplifikasi: gunakan XGB untuk rolling, LSTM forecast panjang memerlukan window, skip LSTM dynamic here
    p_ens = p_xgb  # jika tidak bisa rolling LSTM, pakai XGB saja di test
    preds.append(p_ens)

df_test_full['konsumsi_listrik'] = preds
# Simpan hasil prediksi sesuai format submission
submission['konsumsi_listrik'] = df_test_full['konsumsi_listrik'].values
submission.to_csv('prediksi_submission.csv', index=False)
print('File prediksi disimpan: prediksi_submission.csv')

File prediksi disimpan: prediksi_submission.csv
