# So Sánh Model: Với Technical Indicators vs Không Có Indicators

Notebook này so sánh hiệu suất của các mô hình machine learning khi:
- **Scenario 1**: Chỉ sử dụng dữ liệu gốc (OHLCV - Open, High, Low, Close, Volume)
- **Scenario 2**: Sử dụng dữ liệu gốc + 58 chỉ số kỹ thuật

**Models**:
- Linear Regression
- Random Forest
- XGBoost
- LSTM (Deep Learning)

**Mục tiêu**: Dự đoán giá đóng cửa ngày tiếp theo

## 1. Import Libraries

In [None]:
# Data Processing
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-v0_8-darkgrid')

# Machine Learning Models
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

# Deep Learning
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Preprocessing
from sklearn.preprocessing import MinMaxScaler

print("✅ All libraries imported successfully!")
print(f"TensorFlow version: {tf.__version__}")
print(f"Pandas version: {pd.__version__}")

## 2. Load Datasets

In [None]:
# Load the raw OHLCV dataset (no indicators), parse the 'time' column into
# datetimes, and order the rows by symbol and then by time within each symbol.
df_raw = pd.read_csv('stock_data_2025_raw.csv')
df_raw['time'] = pd.to_datetime(df_raw['time'])
df_raw = df_raw.sort_values(['symbol', 'time']).reset_index(drop=True)

# Load the dataset that also carries the technical-indicator columns and give
# it the same parsing and ordering so both frames are laid out the same way.
df_indicators = pd.read_csv('stock_data_2025_with_indicators.csv')
df_indicators['time'] = pd.to_datetime(df_indicators['time'])
df_indicators = df_indicators.sort_values(['symbol', 'time']).reset_index(drop=True)

# Side-by-side overview of both frames: shape, columns and covered time range.
print("=" * 80)
print("DATASET COMPARISON")
print("=" * 80)
print(f"\n📊 Dataset WITHOUT Indicators (Raw):")
print(f"   Shape: {df_raw.shape}")
print(f"   Columns: {len(df_raw.columns)} - {df_raw.columns.tolist()}")
print(f"   Time Range: {df_raw['time'].min()} to {df_raw['time'].max()}")

print(f"\n📈 Dataset WITH Indicators:")
print(f"   Shape: {df_indicators.shape}")
print(f"   Columns: {len(df_indicators.columns)}")
print(f"   Time Range: {df_indicators['time'].min()} to {df_indicators['time'].max()}")
# The indicator count is derived as the difference in column counts, so it is
# only accurate if the indicator file contains every column of the raw file.
print(f"   Additional Features: {len(df_indicators.columns) - len(df_raw.columns)} technical indicators")

print("\n" + "=" * 80)

# Preview the first rows; display() is the notebook rich-output helper.
print("\n📋 Sample from Raw Data:")
display(df_raw.head())

# Only the first 10 columns are shown to keep the wide indicator frame readable.
print("\n📋 Sample from Data with Indicators (first 10 columns):")
display(df_indicators.iloc[:5, :10])

## 3. Prepare Features

### 3.1 WITHOUT Indicators (Only OHLCV)

In [None]:
def prepare_raw_data(df):
    """
    Build the OHLCV-only feature matrix and next-close target.

    The target of a row is the 'close' value of the following row of the same
    'symbol'; rows that have no following row (NaN target) are discarded.
    The input frame is not modified.

    Args:
        df: DataFrame with 'symbol', 'open', 'high', 'low', 'close' and
            'volume' columns, already ordered by time within each symbol.

    Returns:
        Tuple (X, y, feature_cols): the feature values, the target values and
        the list of feature column names.
    """
    ohlcv_columns = ['open', 'high', 'low', 'close', 'volume']

    # Shift within each symbol so a target never comes from another ticker.
    next_close = df.groupby('symbol')['close'].shift(-1)
    labelled = df.assign(target=next_close).dropna(subset=['target'])

    return labelled[ohlcv_columns].values, labelled['target'].values, ohlcv_columns

# Build the OHLCV-only feature matrix and next-close target from the raw frame.
X_raw, y_raw, features_raw = prepare_raw_data(df_raw)

# Report feature count, feature names and array shapes as a sanity check.
print("=" * 80)
print("RAW DATA (WITHOUT INDICATORS)")
print("=" * 80)
print(f"Features: {len(features_raw)}")
print(f"Feature names: {features_raw}")
print(f"X shape: {X_raw.shape}")
print(f"y shape: {y_raw.shape}")
print("=" * 80)

### 3.2 WITH Indicators

In [None]:
def prepare_indicator_data(df):
    """
    Build the feature matrix and next-close target using every data column.

    The target of a row is the 'close' value of the following row of the same
    'symbol'. Any row containing a NaN in any column (target included) is
    discarded. Every column except 'time', 'symbol' and 'target' is used as a
    feature. The input frame is not modified.

    Args:
        df: DataFrame with 'time', 'symbol' and 'close' columns plus the
            feature columns, already ordered by time within each symbol.

    Returns:
        Tuple (X, y, feature_cols): the feature values, the target values and
        the list of feature column names.
    """
    # Shift within each symbol so a target never comes from another ticker.
    next_close = df.groupby('symbol')['close'].shift(-1)
    complete_rows = df.assign(target=next_close).dropna()

    non_feature = ('time', 'symbol', 'target')
    feature_cols = [name for name in complete_rows.columns if name not in non_feature]

    return complete_rows[feature_cols].values, complete_rows['target'].values, feature_cols

# Build the all-columns feature matrix and next-close target from the
# indicator frame.
X_indicators, y_indicators, features_indicators = prepare_indicator_data(df_indicators)

# Report feature count, the first 20 feature names and array shapes.
print("=" * 80)
print("DATA WITH INDICATORS")
print("=" * 80)
print(f"Features: {len(features_indicators)}")
print(f"Feature names (first 20): {features_indicators[:20]}")
print(f"X shape: {X_indicators.shape}")
print(f"y shape: {y_indicators.shape}")
print("=" * 80)

## 4. Train/Test Split (80/20)

In [None]:
def train_test_split_timeseries(X, y, test_size=0.2):
    """
    Split X and y by position into a leading train part and a trailing test part.

    No shuffling is done: the first int(len(X) * (1 - test_size)) rows form
    the training set and the remaining rows form the test set. The split is
    therefore chronological only if the rows are already in time order.

    NOTE(review): the frames in this notebook are sorted by
    ['symbol', 'time'], so the trailing rows look like the last symbols rather
    than the latest dates -- confirm this is the intended split.

    Args:
        X: Sliceable sequence of feature rows.
        y: Sliceable sequence of targets, aligned with X.
        test_size: Fraction of rows assigned to the test part.

    Returns:
        Tuple (X_train, X_test, y_train, y_test).
    """
    cut = int(len(X) * (1 - test_size))
    return X[:cut], X[cut:], y[:cut], y[cut:]

# Positional 80/20 split of the OHLCV-only arrays (default test_size=0.2).
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split_timeseries(X_raw, y_raw)

# Same split applied independently to the indicator arrays.
# NOTE(review): the two datasets are prepared separately and may differ in row
# count, so the two test sets are not guaranteed to contain the same rows.
X_train_ind, X_test_ind, y_train_ind, y_test_ind = train_test_split_timeseries(X_indicators, y_indicators)

print("=" * 80)
print("TRAIN/TEST SPLIT SUMMARY")
print("=" * 80)

# Sample counts and their share of each dataset.
print("\n📊 RAW Data (No Indicators):")
print(f"   Train: {len(X_train_raw)} samples ({len(X_train_raw)/len(X_raw)*100:.1f}%)")
print(f"   Test:  {len(X_test_raw)} samples ({len(X_test_raw)/len(X_raw)*100:.1f}%)")

print("\n📈 Data with Indicators:")
print(f"   Train: {len(X_train_ind)} samples ({len(X_train_ind)/len(X_indicators)*100:.1f}%)")
print(f"   Test:  {len(X_test_ind)} samples ({len(X_test_ind)/len(X_indicators)*100:.1f}%)")

print("\n" + "=" * 80)

## 5. Model Evaluation Function

In [None]:
def evaluate_model(y_true, y_pred):
    """
    Compute regression metrics for a set of predictions.

    Args:
        y_true: Ground-truth target values (array-like).
        y_pred: Predicted values with the same number of elements as y_true.

    Returns:
        dict with the keys 'MAE', 'RMSE', 'R2' and 'MAPE'. MAPE is expressed
        in percent and is averaged over the samples whose true value is
        non-zero; it is NaN when there is no such sample.
    """
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)

    # Flatten to 1-D float arrays so plain lists work and an (n, 1) prediction
    # column cannot broadcast against an (n,) target into an (n, n) matrix.
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    # The percentage error is undefined for a zero target; dividing anyway
    # would yield inf/NaN, and this notebook silences the resulting warnings.
    nonzero = actual != 0
    if nonzero.any():
        mape = np.mean(np.abs((actual[nonzero] - predicted[nonzero]) / actual[nonzero])) * 100
    else:
        mape = float('nan')

    return {
        'MAE': mae,
        'RMSE': rmse,
        'R2': r2,
        'MAPE': mape
    }

# Accumulates one result dict per model/dataset combination; the model cells
# append to it and the final comparison turns it into a DataFrame.
# Re-running this cell resets the collected results.
all_results = []

print("✅ Evaluation function defined!")

## 6. Linear Regression

In [None]:
print("=" * 80)
print("LINEAR REGRESSION")
print("=" * 80)

# Scenario 1: fit on the OHLCV-only training split, score on its test split.
print("\n🔹 Training WITHOUT Indicators...")
lr_raw = LinearRegression()
lr_raw.fit(X_train_raw, y_train_raw)
y_pred_raw = lr_raw.predict(X_test_raw)
metrics_raw = evaluate_model(y_test_raw, y_pred_raw)

# Record the result row for the final comparison table.
all_results.append({
    'Model': 'Linear Regression',
    'Data': 'Without Indicators',
    'Features': len(features_raw),
    **metrics_raw
})

print(f"   MAE: {metrics_raw['MAE']:.4f}")
print(f"   RMSE: {metrics_raw['RMSE']:.4f}")
print(f"   R²: {metrics_raw['R2']:.4f}")
print(f"   MAPE: {metrics_raw['MAPE']:.2f}%")

# Scenario 2: same model type, fit and scored on the indicator dataset.
# NOTE(review): the indicator test split is built from a separately filtered
# dataset, so it may not contain the same rows as the raw test split --
# confirm before reading the figures below as a like-for-like comparison.
print("\n🔸 Training WITH Indicators...")
lr_ind = LinearRegression()
lr_ind.fit(X_train_ind, y_train_ind)
y_pred_ind = lr_ind.predict(X_test_ind)
metrics_ind = evaluate_model(y_test_ind, y_pred_ind)

all_results.append({
    'Model': 'Linear Regression',
    'Data': 'With Indicators',
    'Features': len(features_indicators),
    **metrics_ind
})

print(f"   MAE: {metrics_ind['MAE']:.4f}")
print(f"   RMSE: {metrics_ind['RMSE']:.4f}")
print(f"   R²: {metrics_ind['R2']:.4f}")
print(f"   MAPE: {metrics_ind['MAPE']:.2f}%")

# Relative change in percent. A positive value means the indicator model is
# better: lower MAE/RMSE, or higher R² (measured relative to |raw R²|).
print("\n📊 Improvement with Indicators:")
print(f"   MAE: {((metrics_raw['MAE'] - metrics_ind['MAE']) / metrics_raw['MAE'] * 100):.2f}%")
print(f"   RMSE: {((metrics_raw['RMSE'] - metrics_ind['RMSE']) / metrics_raw['RMSE'] * 100):.2f}%")
print(f"   R²: {((metrics_ind['R2'] - metrics_raw['R2']) / abs(metrics_raw['R2']) * 100):.2f}%")
print("=" * 80)

## 7. Random Forest

In [None]:
print("=" * 80)
print("RANDOM FOREST")
print("=" * 80)

# Scenario 1: OHLCV-only features. random_state is fixed for repeatable runs
# and n_jobs=-1 is passed to request parallel training.
print("\n🔹 Training WITHOUT Indicators...")
rf_raw = RandomForestRegressor(n_estimators=100, max_depth=15, random_state=42, n_jobs=-1)
rf_raw.fit(X_train_raw, y_train_raw)
y_pred_raw = rf_raw.predict(X_test_raw)
metrics_raw = evaluate_model(y_test_raw, y_pred_raw)

# Record the result row for the final comparison table.
all_results.append({
    'Model': 'Random Forest',
    'Data': 'Without Indicators',
    'Features': len(features_raw),
    **metrics_raw
})

print(f"   MAE: {metrics_raw['MAE']:.4f}")
print(f"   RMSE: {metrics_raw['RMSE']:.4f}")
print(f"   R²: {metrics_raw['R2']:.4f}")
print(f"   MAPE: {metrics_raw['MAPE']:.2f}%")

# Scenario 2: identical hyperparameters, fit and scored on the indicator data.
# NOTE(review): the indicator test split may not contain the same rows as the
# raw test split -- confirm before treating the comparison as like-for-like.
print("\n🔸 Training WITH Indicators...")
rf_ind = RandomForestRegressor(n_estimators=100, max_depth=15, random_state=42, n_jobs=-1)
rf_ind.fit(X_train_ind, y_train_ind)
y_pred_ind = rf_ind.predict(X_test_ind)
metrics_ind = evaluate_model(y_test_ind, y_pred_ind)

all_results.append({
    'Model': 'Random Forest',
    'Data': 'With Indicators',
    'Features': len(features_indicators),
    **metrics_ind
})

print(f"   MAE: {metrics_ind['MAE']:.4f}")
print(f"   RMSE: {metrics_ind['RMSE']:.4f}")
print(f"   R²: {metrics_ind['R2']:.4f}")
print(f"   MAPE: {metrics_ind['MAPE']:.2f}%")

# Relative change in percent. A positive value means the indicator model is
# better: lower MAE/RMSE, or higher R² (measured relative to |raw R²|).
print("\n📊 Improvement with Indicators:")
print(f"   MAE: {((metrics_raw['MAE'] - metrics_ind['MAE']) / metrics_raw['MAE'] * 100):.2f}%")
print(f"   RMSE: {((metrics_raw['RMSE'] - metrics_ind['RMSE']) / metrics_raw['RMSE'] * 100):.2f}%")
print(f"   R²: {((metrics_ind['R2'] - metrics_raw['R2']) / abs(metrics_raw['R2']) * 100):.2f}%")
print("=" * 80)

## 8. XGBoost

In [None]:
print("=" * 80)
print("XGBOOST")
print("=" * 80)

# Scenario 1: OHLCV-only features. random_state is fixed for repeatable runs
# and n_jobs=-1 is passed to request parallel training.
print("\n🔹 Training WITHOUT Indicators...")
xgb_raw = XGBRegressor(n_estimators=100, max_depth=7, learning_rate=0.1, random_state=42, n_jobs=-1)
xgb_raw.fit(X_train_raw, y_train_raw)
y_pred_raw = xgb_raw.predict(X_test_raw)
metrics_raw = evaluate_model(y_test_raw, y_pred_raw)

# Record the result row for the final comparison table.
all_results.append({
    'Model': 'XGBoost',
    'Data': 'Without Indicators',
    'Features': len(features_raw),
    **metrics_raw
})

print(f"   MAE: {metrics_raw['MAE']:.4f}")
print(f"   RMSE: {metrics_raw['RMSE']:.4f}")
print(f"   R²: {metrics_raw['R2']:.4f}")
print(f"   MAPE: {metrics_raw['MAPE']:.2f}%")

# Scenario 2: identical hyperparameters, fit and scored on the indicator data.
# NOTE(review): the indicator test split may not contain the same rows as the
# raw test split -- confirm before treating the comparison as like-for-like.
print("\n🔸 Training WITH Indicators...")
xgb_ind = XGBRegressor(n_estimators=100, max_depth=7, learning_rate=0.1, random_state=42, n_jobs=-1)
xgb_ind.fit(X_train_ind, y_train_ind)
y_pred_ind = xgb_ind.predict(X_test_ind)
metrics_ind = evaluate_model(y_test_ind, y_pred_ind)

all_results.append({
    'Model': 'XGBoost',
    'Data': 'With Indicators',
    'Features': len(features_indicators),
    **metrics_ind
})

print(f"   MAE: {metrics_ind['MAE']:.4f}")
print(f"   RMSE: {metrics_ind['RMSE']:.4f}")
print(f"   R²: {metrics_ind['R2']:.4f}")
print(f"   MAPE: {metrics_ind['MAPE']:.2f}%")

# Relative change in percent. A positive value means the indicator model is
# better: lower MAE/RMSE, or higher R² (measured relative to |raw R²|).
print("\n📊 Improvement with Indicators:")
print(f"   MAE: {((metrics_raw['MAE'] - metrics_ind['MAE']) / metrics_raw['MAE'] * 100):.2f}%")
print(f"   RMSE: {((metrics_raw['RMSE'] - metrics_ind['RMSE']) / metrics_raw['RMSE'] * 100):.2f}%")
print(f"   R²: {((metrics_ind['R2'] - metrics_raw['R2']) / abs(metrics_raw['R2']) * 100):.2f}%")
print("=" * 80)

## 9. LSTM (Deep Learning)

### 9.1 Data Preparation for LSTM

In [None]:
# Separate scalers for features and target of each dataset, so predictions
# can later be mapped back to prices with the matching target scaler.
scaler_X_raw = MinMaxScaler()
scaler_y_raw = MinMaxScaler()
scaler_X_ind = MinMaxScaler()
scaler_y_ind = MinMaxScaler()

# Scale RAW data: each scaler is fitted on the training split only and then
# applied unchanged to the test split. Targets are reshaped to a single
# column for the scaler and flattened back to 1-D afterwards.
X_train_raw_scaled = scaler_X_raw.fit_transform(X_train_raw)
X_test_raw_scaled = scaler_X_raw.transform(X_test_raw)
y_train_raw_scaled = scaler_y_raw.fit_transform(y_train_raw.reshape(-1, 1)).flatten()
y_test_raw_scaled = scaler_y_raw.transform(y_test_raw.reshape(-1, 1)).flatten()

# Scale INDICATOR data the same way (fit on train, apply to test).
X_train_ind_scaled = scaler_X_ind.fit_transform(X_train_ind)
X_test_ind_scaled = scaler_X_ind.transform(X_test_ind)
y_train_ind_scaled = scaler_y_ind.fit_transform(y_train_ind.reshape(-1, 1)).flatten()
y_test_ind_scaled = scaler_y_ind.transform(y_test_ind.reshape(-1, 1)).flatten()

# Create sequences for LSTM
def create_sequences(X, y, time_steps=10):
    """
    Build sliding windows of consecutive rows for a sequence model.

    Each sample is `time_steps` consecutive rows of X paired with the label of
    the LAST row in that window. In this notebook y[k] already holds the
    next-day close for row k, so labelling a window with y of its last row
    makes the model predict the close of the day right after the window.
    Taking the label from the row after the window instead would push the
    prediction one extra day ahead and skip that row's features.

    NOTE(review): windows are cut from consecutive rows of X as given; if X
    concatenates several symbols, windows near a boundary mix two symbols.

    Args:
        X: Array of feature rows, indexable and sliceable along axis 0.
        y: Labels aligned with X (y[k] belongs to row k).
        time_steps: Window length in rows; must be at least 1.

    Returns:
        Tuple (Xs, ys) of numpy arrays holding len(X) - time_steps + 1 windows
        and their labels. Both are empty when X has fewer than `time_steps`
        rows.

    Raises:
        ValueError: If time_steps is smaller than 1.
    """
    if time_steps < 1:
        raise ValueError("time_steps must be at least 1")

    # A non-positive count gives an empty range and therefore empty outputs.
    n_windows = len(X) - time_steps + 1
    Xs = [X[start:start + time_steps] for start in range(n_windows)]
    ys = [y[start + time_steps - 1] for start in range(n_windows)]
    return np.array(Xs), np.array(ys)

# Window length (number of consecutive rows per sample) for both LSTM models.
TIME_STEPS = 10

# Create sequences for RAW data; train and test are windowed separately, so
# no window spans the train/test boundary.
X_train_raw_seq, y_train_raw_seq = create_sequences(X_train_raw_scaled, y_train_raw_scaled, TIME_STEPS)
X_test_raw_seq, y_test_raw_seq = create_sequences(X_test_raw_scaled, y_test_raw_scaled, TIME_STEPS)

# Create sequences for INDICATOR data with the same window length.
X_train_ind_seq, y_train_ind_seq = create_sequences(X_train_ind_scaled, y_train_ind_scaled, TIME_STEPS)
X_test_ind_seq, y_test_ind_seq = create_sequences(X_test_ind_scaled, y_test_ind_scaled, TIME_STEPS)

# Report the resulting 3-D shapes: (samples, timesteps, features).
print("✅ Data prepared for LSTM")
print(f"\nRAW Data Sequences:")
print(f"   X_train shape: {X_train_raw_seq.shape} (samples, timesteps, features)")
print(f"   X_test shape: {X_test_raw_seq.shape}")

print(f"\nIndicator Data Sequences:")
print(f"   X_train shape: {X_train_ind_seq.shape}")
print(f"   X_test shape: {X_test_ind_seq.shape}")

### 9.2 LSTM WITHOUT Indicators

In [None]:
print("=" * 80)
print("LSTM WITHOUT INDICATORS")
print("=" * 80)

# Build LSTM model: two stacked LSTM layers (64 then 32 units), each followed
# by 20% dropout, then a 16-unit dense layer and a single-value output.
# The first LSTM returns the full sequence so the second one can consume it.
lstm_raw = Sequential([
    LSTM(64, activation='relu', return_sequences=True, input_shape=(TIME_STEPS, X_train_raw_seq.shape[2])),
    Dropout(0.2),
    LSTM(32, activation='relu'),
    Dropout(0.2),
    Dense(16, activation='relu'),
    Dense(1)
])

# Trained on mean squared error; MAE is tracked as an additional metric.
lstm_raw.compile(optimizer='adam', loss='mse', metrics=['mae'])

print("\n📋 Model Summary:")
lstm_raw.summary()

# Train with early stopping on validation loss: patience is 10 epochs and
# restore_best_weights=True asks for the best weights to be kept.
print("\n🔹 Training...")
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# validation_split=0.2 holds out 20% of the training sequences for validation.
history_raw = lstm_raw.fit(
    X_train_raw_seq, y_train_raw_seq,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=1
)

# Predict in scaled space, then map predictions and test targets back to
# price units with the raw target scaler so metrics are in original units.
y_pred_raw_scaled = lstm_raw.predict(X_test_raw_seq, verbose=0).flatten()
y_pred_raw = scaler_y_raw.inverse_transform(y_pred_raw_scaled.reshape(-1, 1)).flatten()
y_test_raw_actual = scaler_y_raw.inverse_transform(y_test_raw_seq.reshape(-1, 1)).flatten()

# Overwrites metrics_raw from earlier cells; the LSTM-with-indicators cell
# reads this value for its comparison.
metrics_raw = evaluate_model(y_test_raw_actual, y_pred_raw)

# Record the result row for the final comparison table.
all_results.append({
    'Model': 'LSTM',
    'Data': 'Without Indicators',
    'Features': len(features_raw),
    **metrics_raw
})

print(f"\n✅ Training completed!")
print(f"   MAE: {metrics_raw['MAE']:.4f}")
print(f"   RMSE: {metrics_raw['RMSE']:.4f}")
print(f"   R²: {metrics_raw['R2']:.4f}")
print(f"   MAPE: {metrics_raw['MAPE']:.2f}%")
print("=" * 80)

### 9.3 LSTM WITH Indicators

In [None]:
print("=" * 80)
print("LSTM WITH INDICATORS")
print("=" * 80)

# Build LSTM model: same architecture as the OHLCV-only LSTM; only the input
# feature dimension differs (taken from the indicator sequences).
lstm_ind = Sequential([
    LSTM(64, activation='relu', return_sequences=True, input_shape=(TIME_STEPS, X_train_ind_seq.shape[2])),
    Dropout(0.2),
    LSTM(32, activation='relu'),
    Dropout(0.2),
    Dense(16, activation='relu'),
    Dense(1)
])

# Trained on mean squared error; MAE is tracked as an additional metric.
lstm_ind.compile(optimizer='adam', loss='mse', metrics=['mae'])

print("\n📋 Model Summary:")
lstm_ind.summary()

# Train with a fresh early-stopping callback on validation loss (patience 10,
# restore_best_weights=True).
print("\n🔸 Training...")
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# validation_split=0.2 holds out 20% of the training sequences for validation.
history_ind = lstm_ind.fit(
    X_train_ind_seq, y_train_ind_seq,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=1
)

# Predict in scaled space, then map predictions and test targets back to
# price units with the indicator target scaler.
y_pred_ind_scaled = lstm_ind.predict(X_test_ind_seq, verbose=0).flatten()
y_pred_ind = scaler_y_ind.inverse_transform(y_pred_ind_scaled.reshape(-1, 1)).flatten()
y_test_ind_actual = scaler_y_ind.inverse_transform(y_test_ind_seq.reshape(-1, 1)).flatten()

metrics_ind = evaluate_model(y_test_ind_actual, y_pred_ind)

# Record the result row for the final comparison table.
all_results.append({
    'Model': 'LSTM',
    'Data': 'With Indicators',
    'Features': len(features_indicators),
    **metrics_ind
})

print(f"\n✅ Training completed!")
print(f"   MAE: {metrics_ind['MAE']:.4f}")
print(f"   RMSE: {metrics_ind['RMSE']:.4f}")
print(f"   R²: {metrics_ind['R2']:.4f}")
print(f"   MAPE: {metrics_ind['MAPE']:.2f}%")

# Relative change in percent; positive means the indicator model is better.
# Relies on metrics_raw still holding the LSTM-without-indicators result, i.e.
# on that cell having been the last one to assign metrics_raw.
print("\n📊 Improvement with Indicators:")
print(f"   MAE: {((metrics_raw['MAE'] - metrics_ind['MAE']) / metrics_raw['MAE'] * 100):.2f}%")
print(f"   RMSE: {((metrics_raw['RMSE'] - metrics_ind['RMSE']) / metrics_raw['RMSE'] * 100):.2f}%")
print(f"   R²: {((metrics_ind['R2'] - metrics_raw['R2']) / abs(metrics_raw['R2']) * 100):.2f}%")
print("=" * 80)

## 10. Final Comparison Results

In [None]:
# Create results DataFrame
# One row per entry in all_results (model, data variant, feature count and
# metrics). Re-running a model cell appends another row for that model.
results_df = pd.DataFrame(all_results)

# Print the full table without the index column.
print("=" * 100)
print("FINAL COMPARISON: WITH vs WITHOUT TECHNICAL INDICATORS")
print("=" * 100)
print(results_df.to_string(index=False))
print("=" * 100)

# Persist the table next to the notebook (relative path).
results_df.to_csv('comparison_results.csv', index=False)
print("\n✅ Results saved to 'comparison_results.csv'")

## 11. Visualization

In [None]:
# One grouped bar chart per metric, laid out on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

metrics = ['MAE', 'RMSE', 'R2', 'MAPE']
# colors[0] is used for "Without Indicators", colors[1] for "With Indicators".
colors = ['#FF6B6B', '#4ECDC4']

for idx, metric in enumerate(metrics):
    # Map the flat metric index onto the (row, column) position in the grid.
    ax = axes[idx // 2, idx % 2]
    
    # Collect this metric's value per model for both data variants.
    models = results_df['Model'].unique()
    without_ind = []
    with_ind = []
    
    for model in models:
        # .values[0] takes the first matching row, so if a model was recorded
        # more than once only its first result is plotted.
        without_ind.append(results_df[(results_df['Model'] == model) & (results_df['Data'] == 'Without Indicators')][metric].values[0])
        with_ind.append(results_df[(results_df['Model'] == model) & (results_df['Data'] == 'With Indicators')][metric].values[0])
    
    # Two bars per model, offset by half a bar width on either side of the tick.
    x = np.arange(len(models))
    width = 0.35
    
    bars1 = ax.bar(x - width/2, without_ind, width, label='Without Indicators', color=colors[0], alpha=0.8)
    bars2 = ax.bar(x + width/2, with_ind, width, label='With Indicators', color=colors[1], alpha=0.8)
    
    ax.set_xlabel('Model', fontsize=12, fontweight='bold')
    ax.set_ylabel(metric, fontsize=12, fontweight='bold')
    ax.set_title(f'{metric} Comparison', fontsize=14, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels(models, rotation=15, ha='right')
    ax.legend()
    ax.grid(axis='y', alpha=0.3)
    
    # Write each bar's value (three decimals) centred at the bar's height.
    for bars in [bars1, bars2]:
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height,
                   f'{height:.3f}',
                   ha='center', va='bottom', fontsize=9)

# Save the figure before showing it.
plt.tight_layout()
plt.savefig('comparison_chart.png', dpi=300, bbox_inches='tight')
plt.show()

print("✅ Comparison chart saved as 'comparison_chart.png'")

## 12. Summary & Insights

In [None]:
print("=" * 100)
print("SUMMARY & KEY INSIGHTS")
print("=" * 100)

# Per-model relative improvement of the indicator variant over the raw one.
for model in results_df['Model'].unique():
    # Select this model's rows for each data variant; .values[0] below reads
    # the first matching row of each selection.
    without = results_df[(results_df['Model'] == model) & (results_df['Data'] == 'Without Indicators')]
    with_ind = results_df[(results_df['Model'] == model) & (results_df['Data'] == 'With Indicators')]
    
    print(f"\n🔹 {model}:")
    print(f"   Features: {without['Features'].values[0]} → {with_ind['Features'].values[0]}")
    
    # Percent change, signed so that a positive value means the indicator
    # variant is better (lower MAE/RMSE, higher R² relative to |raw R²|).
    mae_improve = ((without['MAE'].values[0] - with_ind['MAE'].values[0]) / without['MAE'].values[0] * 100)
    rmse_improve = ((without['RMSE'].values[0] - with_ind['RMSE'].values[0]) / without['RMSE'].values[0] * 100)
    r2_improve = ((with_ind['R2'].values[0] - without['R2'].values[0]) / abs(without['R2'].values[0]) * 100)
    
    print(f"   MAE improvement: {mae_improve:+.2f}%")
    print(f"   RMSE improvement: {rmse_improve:+.2f}%")
    print(f"   R² improvement: {r2_improve:+.2f}%")
    
    # The verdict is based on MAE alone; RMSE and R² are reported but not
    # used for the decision.
    if mae_improve > 0:
        print(f"   ✅ Technical indicators HELPED improve performance!")
    else:
        print(f"   ⚠️ Technical indicators did NOT help (possible overfitting)")

# Fixed closing remarks; this text does not depend on the computed results.
print("\n" + "=" * 100)
print("CONCLUSION:")
print("=" * 100)
print("""\n1. Technical indicators can provide additional context for models
2. More features doesn't always mean better performance
3. Some models (like tree-based) may benefit more from indicators
4. Deep learning models might need more data to leverage all indicators
5. Consider feature selection to remove redundant indicators
""")
print("=" * 100)