In [5]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score

plt.rcParams['figure.dpi'] = 150

# ========== Load Data ==========
data = pd.read_csv('oil_price_cleaned.csv')
data['Date'] = pd.to_datetime(data['Date'])
data.set_index('Date', inplace=True)
price = data['Price'].astype(float)

N = len(price)
split = int(N * 0.8)
train_price = price.iloc[:split]
test_price = price.iloc[split:]
print(f"Total={N}, Train={len(train_price)}, Test={len(test_price)}")

# ========== Feature Engineering ==========
def make_features(price_series: pd.Series):
    df = pd.DataFrame({'price': price_series})
    df['y'] = (df['price'].shift(-1) > df['price']).astype(int)
    df['price_lag_1'] = df['price'].shift(1)
    df['price_lag_10'] = df['price'].shift(10)

    delta = df['price'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss.replace(0, np.nan)
    df['rsi'] = (100 - (100 / (1 + rs))).shift(1)

    df = df.dropna()
    feature_cols = ['price_lag_1', 'price_lag_10', 'rsi']
    X = df[feature_cols].values.astype(float)
    y = df['y'].values.astype(int)
    idx = df.index

    return X, y, idx, feature_cols

# ========== Threshold Optimization ==========
def optimize_threshold(y_true, proba, metric='balanced_accuracy'):
    thresholds = np.linspace(0.1, 0.9, 81)
    scores = []

    for thresh in thresholds:
        y_pred = (proba >= thresh).astype(int)
        if metric == 'balanced_accuracy':
            score = balanced_accuracy_score(y_true, y_pred)
        elif metric == 'f1':
            score = f1_score(y_true, y_pred)
        else:
            score = accuracy_score(y_true, y_pred)
        scores.append(score)

    best_idx = np.argmax(scores)
    return thresholds[best_idx], scores[best_idx], thresholds, scores

# ========== Prepare Data ==========
X_tr, y_tr, idx_tr, feature_cols = make_features(train_price)

lookback = 30
s = pd.concat([train_price.iloc[-lookback:], test_price])
X_all, y_all, idx_all, _ = make_features(s)
test_mask = idx_all >= test_price.index[0]
X_te = X_all[test_mask]
y_te = y_all[test_mask]
idx_te = idx_all[test_mask]

print(f"Training set: {X_tr.shape}, Test set: {X_te.shape}")
print(f"Training class distribution: 0={sum(y_tr==0)}, 1={sum(y_tr==1)}")
print(f"Test class distribution: 0={sum(y_te==0)}, 1={sum(y_te==1)}")

# ========== Model Config ==========
models = {
    'balanced': Pipeline([
        ('scaler', StandardScaler()),
        ('svc', SVC(kernel='rbf', C=1.0, gamma='scale',
                    class_weight='balanced', probability=True, random_state=42))
    ]),
    'slight_weight': Pipeline([
        ('scaler', StandardScaler()),
        ('svc', SVC(kernel='rbf', C=1.0, gamma='scale',
                    class_weight={0: 1.2, 1: 1.0}, probability=True, random_state=42))
    ]),
    'gamma_0.01': Pipeline([
        ('scaler', StandardScaler()),
        ('svc', SVC(kernel='rbf', C=1.0, gamma=0.01,
                    class_weight='balanced', probability=True, random_state=42))
    ])
}

# ========== Training & Evaluation ==========
results = {}
best_model = None
best_score = -1
best_name = None

for name, model in models.items():
    print(f"\nTraining model: {name}")
    model.fit(X_tr, y_tr)
    proba = model.predict_proba(X_te)[:, 1]
    best_thresh, score, _, _ = optimize_threshold(y_te, proba, metric='balanced_accuracy')

    print(f"  Best threshold: {best_thresh:.2f}, Balanced Accuracy: {score:.3f}")

    y_pred = (proba >= best_thresh).astype(int)

    results[name] = {
        'model': model,
        'proba': proba,
        'threshold': best_thresh,
        'y_pred': y_pred,
        'score': score
    }

    if score > best_score:
        best_score = score
        best_model = model
        best_name = name

print(f"\nBest model: {best_name} (Balanced Accuracy: {best_score:.3f})")

# ========== Single-Step Evaluation ==========
best_result = results[best_name]
proba_single = best_result['proba']
y_pred_single = best_result['y_pred']
best_thresh = best_result['threshold']

acc_single = accuracy_score(y_te, y_pred_single)
f1_single = f1_score(y_te, y_pred_single)

print("\n" + "="*60)
print("Single-Step (Direction) Evaluation")
print("="*60)
print(f"Accuracy: {acc_single:.3f}")
print(f"F1 Score: {f1_single:.3f}")

# ========== Multi-Step Recursive Prediction ==========
print("\nPerforming improved multi-step recursive prediction...")

init_window = 30
history_prices = list(train_price.iloc[-init_window:].values)

def calc_features_from_history(prices):
    if len(prices) < 15:
        return None
    ps = pd.Series(prices)
    feat = [
        ps.iloc[-1],
        ps.iloc[-10] if len(ps) >= 10 else ps.iloc[0],
    ]
    delta = ps.diff()
    gain = delta.where(delta > 0, 0).iloc[-14:].mean()
    loss = (-delta.where(delta < 0, 0)).iloc[-14:].mean()
    rs = gain / loss if loss != 0 else 100
    rsi = 100 - (100 / (1 + rs))
    feat.append(rsi)
    return np.array(feat).reshape(1, -1)

n_steps = len(y_te)
y_pred_multi = []
proba_multi = []
for t in range(n_steps):
    x_t = calc_features_from_history(history_prices)
    if x_t is not None:
        p1 = best_model.predict_proba(x_t)[0, 1]
        yhat = int(p1 >= best_thresh)
    else:
        p1 = 0.5
        yhat = np.random.choice([0, 1])
    proba_multi.append(p1)
    y_pred_multi.append(yhat)
    change = np.random.uniform(0.001, 0.008)
    if yhat == 1:
        next_price = history_prices[-1] * (1 + change)
    else:
        next_price = history_prices[-1] * (1 - change)
    history_prices.append(next_price)
    if len(history_prices) > 50:
        history_prices.pop(0)

y_pred_multi = np.array(y_pred_multi)
proba_multi = np.array(proba_multi)

acc_multi = accuracy_score(y_te, y_pred_multi)
f1_multi = f1_score(y_te, y_pred_multi)

print("\n" + "="*60)
print("Multi-Step (Direction) Evaluation")
print("="*60)
print(f"Accuracy: {acc_multi:.3f}")
print(f"F1 Score: {f1_multi:.3f}")

# ========== Save Results ==========
os.makedirs('svm_balanced', exist_ok=True)

results_df = pd.DataFrame({
    'Date': idx_te,
    'y_true': y_te,
    'y_pred_single': y_pred_single,
    'proba_single': proba_single,
    'y_pred_multi': y_pred_multi,
    'proba_multi': proba_multi
})
results_df.to_csv('svm_balanced/predictions.csv', index=False)

with open('svm_balanced/report.txt', 'w', encoding='utf-8') as f:
    f.write(f"Best model: {best_name}\n")
    f.write(f"Optimized threshold (based on balanced accuracy): {best_thresh:.3f}\n\n")
    f.write(f"Single-step Accuracy: {acc_single:.3f}\n")
    f.write(f"Single-step F1 Score: {f1_single:.3f}\n")
    f.write(f"Multi-step Accuracy: {acc_multi:.3f}\n")
    f.write(f"Multi-step F1 Score: {f1_multi:.3f}\n")

print(f"\nResults saved to svm_balanced/ directory")
print(f"Best configuration: {best_name}, threshold: {best_thresh:.3f}")


Total=2503, Train=2002, Test=501
Training set: (1988, 3), Test set: (501, 3)
Training class distribution: 0=936, 1=1052
Test class distribution: 0=244, 1=257

Training model: balanced
  Best threshold: 0.10, Balanced Accuracy: 0.500

Training model: slight_weight
  Best threshold: 0.50, Balanced Accuracy: 0.502

Training model: gamma_0.01
  Best threshold: 0.53, Balanced Accuracy: 0.532

Best model: gamma_0.01 (Balanced Accuracy: 0.532)

Single-Step (Direction) Evaluation
Accuracy: 0.529
F1 Score: 0.471

Performing improved multi-step recursive prediction...

Multi-Step (Direction) Evaluation
Accuracy: 0.497
F1 Score: 0.484

Results saved to svm_balanced/ directory
Best configuration: gamma_0.01, threshold: 0.530
