# Gold Price VND Prediction
## Dự đoán giá vàng VND cho 7 ngày và 30 ngày tới

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Global plot appearance for every figure below: the matplotlib
# 'seaborn-v0_8-darkgrid' style sheet plus seaborn's 'husl' colour palette.
# Both calls mutate global plotting state, so keep them in this order.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

## 1. Load và Khám phá Dữ liệu

In [None]:
# Load the raw gold-price CSV, parse the month/day/year 'Date' strings into
# timestamps, and order the rows chronologically with a fresh 0..n-1 index.
df = (
    pd.read_csv('../data/vietdataverse_gold_2026-03-01.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], format='%m/%d/%Y'))
    .sort_values('Date')
    .reset_index(drop=True)
)

# Quick sanity report: covered date range and number of rows.
print(f"Dữ liệu từ {df['Date'].min()} đến {df['Date'].max()}")
print(f"Tổng số ngày: {len(df)}")
df.head()

In [None]:
# Two stacked panels: the full price history on top and a zoom on the rows
# dated 2024-01-01 or later underneath.
fig, axes = plt.subplots(2, 1, figsize=(15, 10))
recent_df = df[df['Date'] >= '2024-01-01']

# (axis, data, title, x-axis label, line width) for each panel.
panels = (
    (axes[0], df, 'Giá Vàng VND (2009-2026)', 'Năm', 1),
    (axes[1], recent_df, 'Giá Vàng VND (2024-2026)', 'Tháng', 1.5),
)

for ax, frame, title, x_label, width in panels:
    # The legend label of each line is simply the column name.
    for column in ('Buy Price', 'Sell Price'):
        ax.plot(frame['Date'], frame[column], label=column, linewidth=width)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel(x_label)
    ax.set_ylabel('Giá (VND)')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 2. Feature Engineering

In [None]:
def create_features(df):
    """Return a copy of *df* extended with price, calendar and spread features.

    The input must provide a datetime 'Date' column and numeric 'Buy Price'
    and 'Sell Price' columns; it is not modified.

    Added columns, in this order:
      * 'Avg_Price'                  - mean of buy and sell price
      * 'Lag_{1,2,3,5,7,14,30}'      - average price that many rows earlier
      * 'MA_w' / 'STD_w'             - rolling mean / standard deviation of the
                                       average price for w in 7, 14, 30, 60
      * 'Price_Change'               - row-to-row difference of the average price
      * 'Price_Change_Pct'           - row-to-row percentage change
      * 'Year', 'Month', 'Day', 'DayOfWeek', 'Quarter', 'DayOfYear'
      * 'Spread' / 'Spread_Pct'      - sell minus buy, absolute and as a
                                       percentage of the buy price

    Rows without enough history for a lag or rolling window hold NaN in the
    corresponding columns; dropping them is left to the caller.
    """
    out = df.copy()

    avg_price = (out['Buy Price'] + out['Sell Price']) / 2
    out['Avg_Price'] = avg_price

    # Past values of the average price.
    for lag in (1, 2, 3, 5, 7, 14, 30):
        out[f'Lag_{lag}'] = avg_price.shift(lag)

    # Rolling level and volatility; each window includes the current row.
    for window in (7, 14, 30, 60):
        rolling = avg_price.rolling(window=window)
        out[f'MA_{window}'] = rolling.mean()
        out[f'STD_{window}'] = rolling.std()

    out['Price_Change'] = avg_price.diff()
    out['Price_Change_Pct'] = avg_price.pct_change() * 100

    # Calendar parts of the date.
    dates = out['Date'].dt
    calendar_parts = (
        ('Year', 'year'),
        ('Month', 'month'),
        ('Day', 'day'),
        ('DayOfWeek', 'dayofweek'),
        ('Quarter', 'quarter'),
        ('DayOfYear', 'dayofyear'),
    )
    for column, attribute in calendar_parts:
        out[column] = getattr(dates, attribute)

    spread = out['Sell Price'] - out['Buy Price']
    out['Spread'] = spread
    out['Spread_Pct'] = (spread / out['Buy Price']) * 100

    return out

# Build the feature table, then drop every row that contains a NaN (for
# example the first rows, where lag and rolling features are undefined) and
# renumber what is left from 0.
df_features = create_features(df)
df_clean = df_features.dropna()
df_clean = df_clean.reset_index(drop=True)
print(f"Số dòng sau xử lý: {len(df_clean)}")

## 3. Chuẩn bị Dữ liệu

In [None]:
# Every column except the date and the raw/average prices is a model input.
feature_cols = [col for col in df_clean.columns if col not in ['Date', 'Buy Price', 'Sell Price', 'Avg_Price']]

# The target is the NEXT row's average price, not the current one.
# The features of a row already determine that row's own price exactly
# (Avg_Price == Lag_1 + Price_Change), so predicting the same-row price leaks
# the answer: test metrics look perfect and the recursive forecast, which
# feeds the latest known row to the model, just repeats the last price.
# With a one-step-ahead target, "features of day t -> price of day t+1"
# matches how predict_future uses the model.
next_avg_price = df_clean['Avg_Price'].shift(-1)

# The final row has no next-day price yet, so it cannot be used for training.
X = df_clean[feature_cols].iloc[:-1]
y = next_avg_price.iloc[:-1]

# Chronological 80/20 split (no shuffling): the test set is the most recent
# part of the series.
split_idx = int(len(X) * 0.8)
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

# Both scalers are fitted on the training part only and then applied to the
# test part, so no test-set statistics reach the models.
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)
y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1)).ravel()
y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1)).ravel()

print(f"Train: {X_train.shape[0]}, Test: {X_test.shape[0]}")

## 4. Training Models

In [None]:
def evaluate_model(y_true, y_pred, model_name):
    """Print and return regression metrics for one model.

    Args:
        y_true: Observed target values (array-like or pandas Series).
        y_pred: Predicted values, in the same order as ``y_true``.
        model_name: Label used in the printed line and the returned dict.

    Returns:
        dict with keys 'Model', 'MAE', 'RMSE', 'R2' and 'MAPE'. MAPE is a
        percentage computed over the samples whose true value is non-zero;
        it is NaN when every true value is zero.
    """
    # Work on plain arrays so that all four metrics compare values by
    # position. Series arithmetic would align on the index instead, which can
    # silently disagree with the scikit-learn metrics.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    mae = mean_absolute_error(actual, predicted)
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    r2 = r2_score(actual, predicted)

    # A percentage error is undefined for a zero target; skip those samples
    # rather than letting a division by zero turn the whole metric into inf/NaN.
    nonzero = actual != 0
    if nonzero.any():
        relative_error = (actual[nonzero] - predicted[nonzero]) / actual[nonzero]
        mape = np.mean(np.abs(relative_error)) * 100
    else:
        mape = float('nan')

    print(f"\n{model_name}")
    print(f"MAE: {mae:,.0f} | RMSE: {rmse:,.0f} | R²: {r2:.4f} | MAPE: {mape:.2f}%")
    return {'Model': model_name, 'MAE': mae, 'RMSE': rmse, 'R2': r2, 'MAPE': mape}

# Collects one metrics dict per trained model.
results = []

In [None]:
# Candidate regressors, keyed by the display name used in the comparison table.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42),
    'LightGBM': LGBMRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42, verbose=-1)
}

# Start from an empty list so that re-running this cell does not append a
# second copy of every model's metrics to the comparison table.
results = []

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train_scaled, y_train_scaled)
    pred_scaled = model.predict(X_test_scaled)
    # Convert predictions back to VND so the metrics are in price units.
    pred = scaler_y.inverse_transform(pred_scaled.reshape(-1, 1)).ravel()
    results.append(evaluate_model(y_test, pred, name))

## 5. So sánh Models

In [None]:
# Turn the collected metric dicts into a table ranked by RMSE, lowest first,
# so the first row is the best-scoring model.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('RMSE')

print("\nSO SÁNH MODELS:")
print(results_df.to_string(index=False))
print(f"\nBEST MODEL: {results_df.iloc[0]['Model']}")

In [None]:
# One horizontal bar chart per metric on a 2x2 grid, filled row by row.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# (results_df column, panel title) in display order.
metric_panels = (
    ('MAE', 'MAE'),
    ('RMSE', 'RMSE'),
    ('R2', 'R²'),
    ('MAPE', 'MAPE'),
)
for ax, (column, title) in zip(axes.flat, metric_panels):
    ax.barh(results_df['Model'], results_df[column])
    ax.set_title(title)

plt.tight_layout()
plt.show()

## 6. Dự đoán 7 và 30 ngày

In [None]:
# The first row of the comparison table names the model to forecast with;
# look up its fitted estimator by that name.
best_model_name = results_df['Model'].iloc[0]
best_model = models[best_model_name]

def predict_future(model, df_clean, scaler_X, scaler_y, days=7):
    """Forecast prices recursively, one calendar day at a time.

    Each step feeds the feature row of the latest known day to ``model``,
    appends the predicted day to the price history and re-derives the
    features from that extended history for the next step.

    Args:
        model: Fitted regressor exposing ``predict``.
        df_clean: Feature table holding 'Date', 'Buy Price', 'Sell Price' and
            every column listed in the module-level ``feature_cols``.
        scaler_X: Fitted scaler for the feature columns.
        scaler_y: Fitted scaler for the target, used to map predictions back
            to price units.
        days: Number of days to forecast.

    Returns:
        DataFrame with columns 'Date', 'Predicted_Price', 'Predicted_Buy' and
        'Predicted_Sell', one row per forecast day (empty when ``days`` is 0).

    Raises:
        ValueError: If ``days`` is negative, if ``df_clean`` is empty, or if
            the history is too short to rebuild complete features.
    """
    output_cols = ['Date', 'Predicted_Price', 'Predicted_Buy', 'Predicted_Sell']
    if days < 0:
        raise ValueError("days must be non-negative")
    if days == 0:
        return pd.DataFrame(columns=output_cols)
    if df_clean.empty:
        raise ValueError("df_clean must contain at least one row")

    # Keep the raw price history separate from the derived features. Replacing
    # the history with create_features(...).dropna() on every step would drop
    # the oldest rows each time and shrink the history as the horizon grows.
    history = df_clean[['Date', 'Buy Price', 'Sell Price']].copy()
    last_features = df_clean[feature_cols].iloc[-1:]

    predictions = []
    for step in range(days):
        if step:
            # Features of the day appended in the previous step.
            last_features = create_features(history)[feature_cols].iloc[-1:]
            if last_features.isna().to_numpy().any():
                raise ValueError(
                    "price history is too short to compute every lag/rolling feature"
                )

        last_features_scaled = scaler_X.transform(last_features)
        pred_scaled = model.predict(last_features_scaled)
        pred = scaler_y.inverse_transform(pred_scaled.reshape(-1, 1))[0, 0]

        # Buy/sell quotes are placed 0.5% below/above the predicted average.
        buy_price = pred * 0.995
        sell_price = pred * 1.005
        new_date = history['Date'].iloc[-1] + timedelta(days=1)

        new_row = pd.DataFrame({
            'Date': [new_date],
            'Buy Price': [buy_price],
            'Sell Price': [sell_price],
        })
        history = pd.concat([history, new_row], ignore_index=True)

        predictions.append({
            'Date': new_date,
            'Predicted_Price': pred,
            'Predicted_Buy': buy_price,
            'Predicted_Sell': sell_price,
        })

    return pd.DataFrame(predictions, columns=output_cols)

# Forecast both horizons with the selected model; each call starts again from
# the same feature table.
pred_7days, pred_30days = (
    predict_future(best_model, df_clean, scaler_X, scaler_y, days=horizon)
    for horizon in (7, 30)
)

print("\nDỰ ĐOÁN 7 NGÀY:")
print(pred_7days.to_string(index=False))
print("\nDỰ ĐOÁN 30 NGÀY:")
# Only the first ten rows of the 30-day forecast are printed.
print(pred_30days.head(10).to_string(index=False))

## 7. Lưu kết quả

In [None]:
import joblib
import os

os.makedirs('../models', exist_ok=True)

# Model artefacts: the selected estimator, both fitted scalers and the ordered
# list of feature column names.
model_artifacts = (
    (best_model, '../models/best_model.pkl'),
    (scaler_X, '../models/scaler_X.pkl'),
    (scaler_y, '../models/scaler_y.pkl'),
    (feature_cols, '../models/feature_cols.pkl'),
)
for artifact, target_path in model_artifacts:
    joblib.dump(artifact, target_path)

# Tabular outputs: both forecasts and the model comparison, written without
# the index column.
csv_outputs = (
    (pred_7days, '../data/predictions_7days.csv'),
    (pred_30days, '../data/predictions_30days.csv'),
    (results_df, '../data/model_comparison.csv'),
)
for table, target_path in csv_outputs:
    table.to_csv(target_path, index=False)

print("✓ Đã lưu models và predictions!")