# Analisis Prediksi Harga Saham dengan XGBoost
## Eksplorasi Data, Modeling, Prediksi, dan Visualisasi

Notebook ini menggabungkan seluruh proses analisis dari eksplorasi data hingga visualisasi hasil prediksi untuk 5 saham: BBCA, TPIA, TLKM, BRPT, dan ASII.


## 1. Import Library dan Setup


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
from datetime import timedelta
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# =====================================================
# PANDAS CONFIGURATION: SHOW ALL OUTPUT WITHOUT TRUNCATION
# =====================================================
# Each entry is (option name, value); None lifts the corresponding limit.
for _option_name, _option_value in (
    ('display.max_rows', None),            # show every row
    ('display.max_columns', None),         # show every column
    ('display.width', None),               # unlimited display width
    ('display.max_colwidth', None),        # unlimited column width
    ('display.expand_frame_repr', False),  # do not wrap DataFrames
    ('display.precision', 4),              # decimal precision
    ('display.float_format', lambda x: f'{x:.4f}'),  # float rendering
):
    pd.set_option(_option_name, _option_value)

# Matplotlib / seaborn configuration: white-grid theme and a 12x6 default
# figure size for every plot created without an explicit figsize.
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Experiment parameters
NLAGS = 5                 # number of lagged-price features per stock
MA_WINDOWS = [3, 5]       # moving-average window lengths
TRAIN_RATIO = 0.8         # leading fraction of rows used for training
HORIZON = 12  # 12 weeks ahead
STOCKS = ["BBCA", "TPIA", "TLKM", "BRPT", "ASII"]

# Keyword arguments passed unchanged to XGBRegressor for every stock.
MODEL_PARAMS = {
    'n_estimators': 300,
    'learning_rate': 0.05,
    'max_depth': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_lambda': 2.0,
    'min_child_weight': 3,
    'objective': 'reg:squarederror',
    'random_state': 42,
    'verbosity': 0,
}

# Banner confirming that the setup cell ran to completion.
print("=" * 60, "LIBRARY BERHASIL DIIMPORT", "=" * 60, sep="\n")


## 2. Eksplorasi Data (Data Exploration)


In [None]:
# Read the semicolon-separated price table.
df = pd.read_csv("DATA 5 SAHAM TERBESAR TERPERCAYA DAN TERGILA.csv", sep=';')

# Parse the Date column (day/month/year) so every later cell sees datetimes.
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y', dayfirst=True)

# Dataset overview: shape, column names, then each titled section in turn.
print("=" * 60, "INFORMASI DATASET", "=" * 60, sep="\n")
print(f"Shape: {df.shape}")
print(f"\nKolom: {list(df.columns)}")
for _section_title, _section_text in (
    ("Tipe Data:", df.dtypes.to_string()),
    ("5 Baris Pertama:", df.head().to_string()),
    ("5 Baris Terakhir:", df.tail().to_string()),
    ("SEMUA DATA (Full Dataset):", df.to_string(index=False)),
):
    print(f"\n{_section_title}")
    print(_section_text)


In [None]:
# Descriptive statistics of the loaded table, printed in full.
print("\n" + "=" * 60, "STATISTIKA DESKRIPTIF", "=" * 60, sep="\n")
print(df.describe().to_string())


In [None]:
# Count missing values per column and report the grand total.
print("\n" + "=" * 60, "CEK MISSING VALUE", "=" * 60, sep="\n")
missing = df.isnull().sum()
print(missing.to_string())
total_missing = missing.sum()
print(f"\nTotal missing value: {total_missing}")


In [None]:
# Section banner for the outlier report.
print("\n" + "=" * 60, "DETEKSI OUTLIER", "=" * 60, sep="\n")

def tampilkan_outlier(data):
    """Print an IQR-based outlier report for every numeric column of *data*.

    A value counts as an outlier when it lies below ``Q1 - 1.5 * IQR`` or
    above ``Q3 + 1.5 * IQR`` of its own column.

    Args:
        data: DataFrame to inspect. Only numeric columns are analysed. When a
            ``Date`` column exists, each outlier is listed next to its date.

    Returns:
        None. The report is written to standard output.
    """
    # Pick columns by dtype instead of by position: a Date column that is not
    # first (or any text column) would otherwise reach the quantile and
    # ``:.2f`` formatting code and fail there.
    numeric_cols = data.select_dtypes(include='number').columns
    has_date = 'Date' in data.columns

    for col in numeric_cols:
        q1 = data[col].quantile(0.25)
        q3 = data[col].quantile(0.75)
        iqr = q3 - q1

        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        outlier_df = data[(data[col] < lower_bound) | (data[col] > upper_bound)]

        if outlier_df.empty:
            print(f"\n✓ Tidak ada outlier pada kolom {col}")
            continue

        shown_cols = ['Date', col] if has_date else [col]
        print(f"\n{'='*50}")
        print(f"Outlier pada kolom {col}")
        print(f"{'='*50}")
        print(f"Jumlah outlier: {len(outlier_df)}")
        print(f"Batas Bawah: {lower_bound:.2f}, Batas Atas: {upper_bound:.2f}")
        print(f"\nData Outlier:")
        print(outlier_df[shown_cols].reset_index(drop=True).to_string(index=False))

# Run the IQR outlier report on the loaded price table.
tampilkan_outlier(df)


## 3. Feature Engineering dan Split Data Train-Test


In [None]:
print("=" * 60)
print("FEATURE ENGINEERING: LAG & MOVING AVERAGE")
print("=" * 60)

# Train/test splits for every stock, keyed by ticker.
data_dict = {}

# Lag and rolling features look backwards through the rows, so the rows must
# be in chronological order (the forecasting cell sorts the same way).
df_chrono = df.sort_values("Date").reset_index(drop=True)

for stock in STOCKS:
    print(f"\n{'='*50}")
    print(f"Processing fitur untuk {stock}")
    print(f"{'='*50}")

    # Work on a copy of the price column only.
    series = df_chrono[[stock]].copy()

    # Lag features: the price 1..NLAGS rows earlier.
    for lag in range(1, NLAGS + 1):
        series[f"{stock}_lag{lag}"] = series[stock].shift(lag)

    # Moving averages over the *previous* w prices. Shifting by one row first
    # keeps the current price (the target) out of its own features and matches
    # make_feature_row(), which averages history only.
    past_prices = series[stock].shift(1)
    for w in MA_WINDOWS:
        series[f"{stock}_MA{w}"] = past_prices.rolling(window=w).mean()

    # Drop the leading rows whose lag / MA features are not yet defined.
    series.dropna(inplace=True)

    # Target is the current row's price; features are everything else.
    y = series[stock]
    X = series.drop(columns=[stock])

    # Chronological split: first TRAIN_RATIO of the rows train, the rest test.
    split_idx = int(len(series) * TRAIN_RATIO)
    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

    data_dict[stock] = {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test
    }

    print(f"✓ {stock} selesai")
    print(f"  Train shape: {X_train.shape}, Test shape: {X_test.shape}")
    print(f"  Features: {list(X_train.columns)}")

    # Full dump of every split.
    print(f"\n  SEMUA DATA X_train ({len(X_train)} baris):")
    print(X_train.to_string())
    print(f"\n  SEMUA DATA y_train ({len(y_train)} nilai):")
    print(y_train.to_string())
    print(f"\n  SEMUA DATA X_test ({len(X_test)} baris):")
    print(X_test.to_string())
    print(f"\n  SEMUA DATA y_test ({len(y_test)} nilai):")
    print(y_test.to_string())

print("\n" + "=" * 60)
print("SELURUH DATA TRAIN-TEST BERHASIL DIBUAT")
print("=" * 60)


## 4. Training Model XGBoost


In [None]:
print("=" * 60)
print("TRAINING MODEL XGBOOST")
print("=" * 60)
print(f"\nParameter Model:")
for key, value in MODEL_PARAMS.items():
    print(f"  {key}: {value}")


def _regression_metrics(y_true, y_pred):
    """Return RMSE, MAE and R2 of *y_pred* against *y_true* as a dict."""
    return {
        "RMSE": mean_squared_error(y_true, y_pred) ** 0.5,
        "MAE": mean_absolute_error(y_true, y_pred),
        "R2": r2_score(y_true, y_pred),
    }


# Fitted model and evaluation artefacts for every stock, keyed by ticker.
models = {}
results = {}

for stock in STOCKS:
    print(f"\n{'='*50}")
    print(f"Training Model untuk {stock}")
    print(f"{'='*50}")

    X_train = data_dict[stock]['X_train']
    X_test = data_dict[stock]['X_test']
    y_train = data_dict[stock]['y_train']
    y_test = data_dict[stock]['y_test']

    # One independent model per stock, all sharing MODEL_PARAMS.
    model = XGBRegressor(**MODEL_PARAMS)
    model.fit(X_train, y_train)

    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)

    metrics_train = _regression_metrics(y_train, pred_train)
    metrics_test = _regression_metrics(y_test, pred_test)

    # Feature importances, most important first.
    feature_importance = pd.DataFrame({
        "Feature": X_train.columns,
        "Importance": model.feature_importances_
    }).sort_values("Importance", ascending=False)

    models[stock] = model
    results[stock] = {
        'metrics_train': metrics_train,
        'metrics_test': metrics_test,
        'pred_train': pred_train,
        'pred_test': pred_test,
        'y_train': y_train,
        'y_test': y_test,
        'feature_importance': feature_importance
    }

    print(f"\n📊 METRICS TRAIN:")
    for k, v in metrics_train.items():
        print(f"   {k}: {v:.4f}")

    print(f"\n📊 METRICS TEST:")
    for k, v in metrics_test.items():
        print(f"   {k}: {v:.4f}")

    print(f"\n📈 FEATURE IMPORTANCE (SEMUA FEATURES):")
    print(feature_importance.to_string(index=False))

    print(f"\n✓ Model {stock} selesai di-training")

print("\n" + "=" * 60)
print("SELURUH MODEL BERHASIL DI-TRAINING")
print("=" * 60)


## 5. Prediksi Masa Depan (12 Minggu ke Depan)


In [None]:
def make_feature_row(stock, history_prices, nlags=None, ma_windows=None):
    """Build one model-input row (lags and moving averages) from past prices.

    Args:
        stock: Ticker used as the prefix of every feature name.
        history_prices: Iterable of past prices ordered oldest to newest.
            Must hold at least one price.
        nlags: Number of lag features to emit. Defaults to the module-level
            ``NLAGS``.
        ma_windows: Iterable of moving-average window lengths. Defaults to the
            module-level ``MA_WINDOWS``.

    Returns:
        dict mapping ``"<stock>_lag1" .. "<stock>_lag<nlags>"`` and
        ``"<stock>_MA<w>"`` to values, lag keys first, then one MA key per
        window in the given order. ``lag1`` is the newest price.

    Raises:
        ValueError: If ``history_prices`` is empty.
    """
    nlags = NLAGS if nlags is None else nlags
    ma_windows = MA_WINDOWS if ma_windows is None else ma_windows

    hist = list(history_prices)
    if not hist:
        raise ValueError("history_prices must contain at least one price")

    # Too little history: repeat the oldest price so every lag exists.
    if len(hist) < nlags:
        hist = [hist[0]] * (nlags - len(hist)) + hist

    row = {f"{stock}_lag{k}": hist[-k] for k in range(1, nlags + 1)}

    # Mean of the most recent w prices; a window longer than the history
    # falls back to the whole history because the slice simply clamps.
    for w in ma_windows:
        row[f"{stock}_MA{w}"] = float(np.mean(hist[-w:]))

    return row

def generate_future_dates(last_date, n_weeks):
    """Return the ``n_weeks`` dates that follow ``last_date`` at weekly steps.

    ``last_date`` may be a date-like object or a ``dd/mm/YYYY`` string; the
    first returned date is one week after it.
    """
    start = (
        pd.to_datetime(last_date, format='%d/%m/%Y')
        if isinstance(last_date, str)
        else last_date
    )
    weekly_dates = []
    for week_no in range(1, n_weeks + 1):
        weekly_dates.append(start + timedelta(weeks=week_no))
    return weekly_dates

print("=" * 60)
print(f"PREDIKSI MASA DEPAN ({HORIZON} MINGGU)")
print("=" * 60)

# Reuse the table loaded earlier; fall back to reading a file only when the
# in-memory frame has no Date column.
if 'Date' in df.columns:
    df_final = df.copy()
else:
    try:
        df_final = pd.read_csv("DATA 5 SAHAM FINAL.csv", parse_dates=["Date"], dayfirst=True)
    except (OSError, ValueError):
        # Missing/unreadable file or unparsable content: use the original
        # semicolon-separated export instead. Other errors propagate.
        df_final = pd.read_csv("DATA 5 SAHAM TERBESAR TERPERCAYA DAN TERGILA.csv", sep=';')

# Parse Date unless it already is a datetime column (dtype-name agnostic).
if not pd.api.types.is_datetime64_any_dtype(df_final['Date']):
    df_final['Date'] = pd.to_datetime(df_final['Date'], format='%d/%m/%Y', dayfirst=True)
df_final = df_final.sort_values("Date").reset_index(drop=True)

# Forecast dates/values and the last observed point, keyed by ticker.
future_predictions = {}

for stock in STOCKS:
    print(f"\n{'='*50}")
    print(f"Prediksi untuk {stock}")
    print(f"{'='*50}")

    model = models[stock]

    # Observed prices for this stock, oldest first, without missing rows.
    price_series = df_final[["Date", stock]].dropna().copy()
    price_series = price_series.sort_values("Date").reset_index(drop=True)

    history = list(price_series[stock].astype(float).values)
    last_date = price_series["Date"].iloc[-1]
    future_dates = generate_future_dates(last_date, HORIZON)

    # Recursive forecasting: every prediction is appended to the history and
    # feeds the lag / moving-average features of the next step.
    preds = []
    hist_for_pred = history.copy()

    for _ in range(HORIZON):
        feat = make_feature_row(stock, hist_for_pred)
        X_row = pd.DataFrame([feat])
        yhat = float(model.predict(X_row)[0])
        preds.append(yhat)
        hist_for_pred.append(yhat)

    future_predictions[stock] = {
        'dates': future_dates,
        'predictions': preds,
        'last_actual_date': last_date,
        'last_actual_price': history[-1]
    }

    print(f"\n📅 Tanggal Terakhir Data: {last_date.strftime('%d/%m/%Y')}")
    print(f"💰 Harga Terakhir: {history[-1]:.2f}")
    print(f"\n🔮 Prediksi {HORIZON} Minggu ke Depan:")
    print(f"{'Tanggal':<15} {'Prediksi':>12}")
    print("-" * 30)
    for date, pred in zip(future_dates, preds):
        print(f"{date.strftime('%d/%m/%Y'):<15} {pred:>12.2f}")

    # Same forecast once more as a DataFrame table.
    pred_df = pd.DataFrame({
        "Date": [d.strftime('%d/%m/%Y') for d in future_dates],
        "Predicted_Price": preds
    })
    print(f"\n📊 Tabel Prediksi:")
    print(pred_df.to_string(index=False))

print("\n" + "=" * 60)
print("SELURUH PREDIKSI MASA DEPAN SELESAI")
print("=" * 60)


In [None]:
print("=" * 60)
print("VISUALISASI HASIL - PER SAHAM")
print("=" * 60)


def _plot_actual_vs_predicted(ax, frame, stock, split_name):
    """Draw the actual and predicted price lines of one split on *ax*."""
    ax.plot(frame.index, frame['Actual'], label=f'Actual {split_name}', linewidth=2, color='blue')
    ax.plot(frame.index, frame['Predicted'], label=f'Predicted {split_name}', linewidth=2,
            color='red', linestyle='--')
    ax.set_title(f'{stock} - Actual vs Predicted ({split_name})')
    ax.set_xlabel('Index')
    ax.set_ylabel('Price')
    ax.legend()
    ax.grid(alpha=0.3)


# One 2x3 dashboard per stock.
for stock in STOCKS:
    print(f"\n{'='*50}")
    print(f"Visualisasi untuk {stock}")
    print(f"{'='*50}")

    # Actual vs predicted frames on a fresh 0..n-1 index.
    df_train = pd.DataFrame({
        'Actual': results[stock]['y_train'].values,
        'Predicted': results[stock]['pred_train']
    })

    df_test = pd.DataFrame({
        'Actual': results[stock]['y_test'].values,
        'Predicted': results[stock]['pred_test']
    })

    df_fi = results[stock]['feature_importance']

    fig, axes = plt.subplots(2, 3, figsize=(18, 10))
    fig.suptitle(f'Visualisasi Hasil untuk {stock}', fontsize=16, fontweight='bold')

    # Top row: train fit, test fit, feature importance.
    _plot_actual_vs_predicted(axes[0, 0], df_train, stock, 'Train')
    _plot_actual_vs_predicted(axes[0, 1], df_test, stock, 'Test')

    # Ascending sort puts the most important feature at the top of the barh.
    df_fi_sorted = df_fi.sort_values('Importance', ascending=True)
    axes[0, 2].barh(df_fi_sorted['Feature'], df_fi_sorted['Importance'], color='darkblue')
    axes[0, 2].set_title(f'{stock} - Feature Importance')
    axes[0, 2].set_xlabel('Importance')

    # Scatter of test predictions with the y = x reference line.
    axes[1, 0].scatter(df_test['Actual'], df_test['Predicted'], alpha=0.6, s=40)
    min_val = min(df_test['Actual'].min(), df_test['Predicted'].min())
    max_val = max(df_test['Actual'].max(), df_test['Predicted'].max())
    axes[1, 0].plot([min_val, max_val], [min_val, max_val], 'r--', linewidth=2)
    axes[1, 0].set_title(f'{stock} - Scatter: Actual vs Predicted (Test)')
    axes[1, 0].set_xlabel('Actual')
    axes[1, 0].set_ylabel('Predicted')
    axes[1, 0].grid(alpha=0.3)

    # Histogram of test errors.
    errors = df_test['Actual'] - df_test['Predicted']
    axes[1, 1].hist(errors, bins=25, color='red', alpha=0.7, edgecolor='black')
    axes[1, 1].axvline(x=0, color='black', linestyle='--', linewidth=2)
    axes[1, 1].set_title(f'{stock} - Error Distribution (Test)')
    axes[1, 1].set_xlabel('Error (Actual - Predicted)')
    axes[1, 1].set_ylabel('Frequency')
    axes[1, 1].grid(alpha=0.3)

    # Forecast panel: the last (up to) 36 observed prices followed by the
    # forecast. Missing prices are dropped before taking the tail so the
    # window ends on the same last observation the forecast started from, and
    # the two series are drawn separately instead of being padded with None
    # into one mixed-type list.
    future_data = future_predictions[stock]
    recent = df_final[['Date', stock]].dropna().tail(36)

    axes[1, 2].plot(recent['Date'], recent[stock], label='Actual (recent)', linewidth=2, color='blue')
    axes[1, 2].plot(future_data['dates'], future_data['predictions'],
                    label=f'Predicted next {HORIZON}w', linewidth=2,
                    linestyle='--', color='red')
    axes[1, 2].scatter(future_data['dates'], future_data['predictions'], color='red', s=50, zorder=5)
    axes[1, 2].set_title(f'{stock} - Forecast {HORIZON} weeks')
    axes[1, 2].set_xlabel('Date')
    axes[1, 2].set_ylabel('Price')
    axes[1, 2].legend()
    axes[1, 2].grid(alpha=0.3)
    axes[1, 2].tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

    # Echo the metrics below the figure.
    print(f"\n📊 METRICS:")
    for split_label, metrics_key in (("Train", 'metrics_train'), ("Test ", 'metrics_test')):
        split_metrics = results[stock][metrics_key]
        print(f"  {split_label} - RMSE: {split_metrics['RMSE']:.4f}, "
              f"MAE: {split_metrics['MAE']:.4f}, "
              f"R²: {split_metrics['R2']:.4f}")

print("\n" + "=" * 60)


## 7. Visualisasi Gabungan (5 Saham dalam Satu Plot)


In [None]:
print("=" * 60)
print(f"VISUALISASI GABUNGAN - TRAIN ({len(STOCKS)} SAHAM)")
print("=" * 60)

# One stacked panel per stock. The row count follows STOCKS, and
# squeeze=False keeps `axes` an array even when there is a single stock.
n_panels = len(STOCKS)
fig, axes = plt.subplots(n_panels, 1, figsize=(14, 3.6 * n_panels), sharex=False, squeeze=False)
fig.suptitle(f'Actual vs Predicted - Train Set ({n_panels} Saham)', fontsize=16, fontweight='bold')

for ax, stock in zip(axes.ravel(), STOCKS):
    df_train = pd.DataFrame({
        'Actual': results[stock]['y_train'].values,
        'Predicted': results[stock]['pred_train']
    })

    ax.plot(df_train.index, df_train['Actual'], label='Actual Train', linewidth=2, color='blue')
    ax.plot(df_train.index, df_train['Predicted'], label='Predicted Train', linewidth=2,
            color='red', linestyle='--')
    ax.set_title(f'{stock} - Train Actual vs Predicted', fontweight='bold')
    ax.set_xlabel('Index')
    ax.set_ylabel('Price')
    ax.legend()
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

print("✓ Plot gabungan Train selesai")


In [None]:
print("\n" + "=" * 60)
print(f"VISUALISASI GABUNGAN - TEST ({len(STOCKS)} SAHAM)")
print("=" * 60)

# One stacked panel per stock. The row count follows STOCKS, and
# squeeze=False keeps `axes` an array even when there is a single stock.
n_panels = len(STOCKS)
fig, axes = plt.subplots(n_panels, 1, figsize=(14, 3.6 * n_panels), sharex=False, squeeze=False)
fig.suptitle(f'Actual vs Predicted - Test Set ({n_panels} Saham)', fontsize=16, fontweight='bold')

for ax, stock in zip(axes.ravel(), STOCKS):
    df_test = pd.DataFrame({
        'Actual': results[stock]['y_test'].values,
        'Predicted': results[stock]['pred_test']
    })

    ax.plot(df_test.index, df_test['Actual'], label='Actual Test', linewidth=2, color='blue')
    ax.plot(df_test.index, df_test['Predicted'], label='Predicted Test', linewidth=2,
            color='red', linestyle='--')
    ax.set_title(f'{stock} - Test Actual vs Predicted', fontweight='bold')
    ax.set_xlabel('Index')
    ax.set_ylabel('Price')
    ax.legend()
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

print("✓ Plot gabungan Test selesai")


## 8. Ringkasan Hasil (Summary)


In [None]:
print("=" * 60)
print("RINGKASAN HASIL SEMUA SAHAM")
print("=" * 60)

# One row of train/test metrics per stock.
summary_data = [
    {
        'Stock': stock,
        'Train_RMSE': results[stock]['metrics_train']['RMSE'],
        'Train_MAE': results[stock]['metrics_train']['MAE'],
        'Train_R2': results[stock]['metrics_train']['R2'],
        'Test_RMSE': results[stock]['metrics_test']['RMSE'],
        'Test_MAE': results[stock]['metrics_test']['MAE'],
        'Test_R2': results[stock]['metrics_test']['R2']
    }
    for stock in STOCKS
]

summary_df = pd.DataFrame(summary_data)
print("\n📊 METRICS SUMMARY:")
print(summary_df.to_string(index=False))

# Full dump of train and test predictions for every stock.
print("\n" + "=" * 60)
print("SEMUA PREDIKSI TRAIN & TEST (FULL OUTPUT)")
print("=" * 60)
for stock in STOCKS:
    print(f"\n{'='*60}")
    print(f"SAHAM: {stock}")
    print(f"{'='*60}")

    pred_train_df = pd.DataFrame({
        'Actual_Train': results[stock]['y_train'].values,
        'Predicted_Train': results[stock]['pred_train']
    })
    print(f"\n📊 PREDIKSI TRAIN ({len(pred_train_df)} baris):")
    print(pred_train_df.to_string(index=False))

    pred_test_df = pd.DataFrame({
        'Actual_Test': results[stock]['y_test'].values,
        'Predicted_Test': results[stock]['pred_test']
    })
    print(f"\n📊 PREDIKSI TEST ({len(pred_test_df)} baris):")
    print(pred_test_df.to_string(index=False))

    print(f"\n📈 FEATURE IMPORTANCE:")
    print(results[stock]['feature_importance'].to_string(index=False))

# Forecast recap. Week numbers and labels come from the stored predictions,
# so the table always lines up with what was actually forecast.
print("\n" + "=" * 60)
print(f"RINGKASAN PREDIKSI MASA DEPAN ({HORIZON} MINGGU)")
print("=" * 60)
for stock in STOCKS:
    future_data = future_predictions[stock]
    forecast = future_data['predictions']
    n_forecast = len(forecast)
    last_price = future_data['last_actual_price']

    print(f"\n{stock}:")
    print(f"  Harga Terakhir: {last_price:.2f}")
    print(f"  Prediksi Minggu ke-{n_forecast}: {forecast[-1]:.2f}")
    print(f"  Perubahan: {((forecast[-1] / last_price) - 1) * 100:+.2f}%")

    pred_future_df = pd.DataFrame({
        'Minggu': range(1, n_forecast + 1),
        'Tanggal': [d.strftime('%d/%m/%Y') for d in future_data['dates']],
        'Prediksi_Harga': forecast
    })
    print(f"\n  📅 SEMUA PREDIKSI {n_forecast} MINGGU:")
    print(pred_future_df.to_string(index=False))

print("\n" + "=" * 60)
print("ANALISIS SELESAI")
print("=" * 60)
