# **STEP 0: IMPORT LIBRARIES**

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer  
from sklearn.preprocessing import OneHotEncoder      
from sklearn.model_selection import KFold   
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from statistics import mean
from sklearn.model_selection import train_test_split
import joblib 
import lightgbm as lgb
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit, StratifiedKFold, StratifiedShuffleSplit
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold, StratifiedKFold

# Additional useful imports
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import learning_curve
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_absolute_error, median_absolute_error
import seaborn as sns
from scipy import stats
import warnings
import os
import yfinance as yf
import ta

# **STEP 1: DEFINE FOREX PAIRS LIST**

In [38]:
# Yahoo Finance ticker symbols of the Forex pairs to process.
# You can update by adding more.
forex_pairs = [
    'EURUSD=X',
    'USDJPY=X',
    'GBPUSD=X',
    'AUDUSD=X',
    'NZDUSD=X',
    'EURJPY=X',
]

# **STEP 2: CRAWL FOREX DATA**

**2.1: Create the output directory**

In [39]:
# Folder that holds the per-pair CSV files.
directory = 'Forex_Data'
# exist_ok=True makes the call a no-op when the folder is already there, so the
# cell is safe to re-run and has no gap between an existence check and creation.
os.makedirs(directory, exist_ok=True)

**2.2: Crawl all FOREX data needed**

In [40]:
# Download the historical data of every Forex pair and save each one to a CSV file
for pair in forex_pairs:
    print(f"Downloading data for {pair}...")
    data = yf.download(pair, period="max", interval='1d')  # Full available history, daily interval

    # Newer yfinance releases return MultiIndex columns (price field, ticker) even
    # for a single ticker. Keep only the price-field level so the CSV gets one flat
    # header row with plain 'Open' / 'Close' / ... column names.
    if isinstance(data.columns, pd.MultiIndex):
        data.columns = data.columns.get_level_values(0)

    data = data.reset_index()  # Reset the index so that Date becomes a normal column

    # Save the data to the CSV file
    file_path = os.path.join(directory, f'{pair}_data.csv')
    data.to_csv(file_path, index=False)
    print(f"Saved data for {pair} at {file_path}")
    print()

Downloading data for EURUSD=X...


[*********************100%***********************]  1 of 1 completed


Saved data for EURUSD=X at Forex_Data\EURUSD=X_data.csv

Downloading data for USDJPY=X...


[*********************100%***********************]  1 of 1 completed


Saved data for USDJPY=X at Forex_Data\USDJPY=X_data.csv

Downloading data for GBPUSD=X...


[*********************100%***********************]  1 of 1 completed


Saved data for GBPUSD=X at Forex_Data\GBPUSD=X_data.csv

Downloading data for AUDUSD=X...


[*********************100%***********************]  1 of 1 completed


Saved data for AUDUSD=X at Forex_Data\AUDUSD=X_data.csv

Downloading data for NZDUSD=X...


[*********************100%***********************]  1 of 1 completed


Saved data for NZDUSD=X at Forex_Data\NZDUSD=X_data.csv

Downloading data for EURJPY=X...


[*********************100%***********************]  1 of 1 completed

Saved data for EURJPY=X at Forex_Data\EURJPY=X_data.csv






**2.3:  MERGE ALL FOREX DATA INTO ONE DATAFRAME**

In [41]:
# Start from an empty DataFrame; the first pair loaded becomes the base table
merged_data = pd.DataFrame()

for pair in forex_pairs:
    close_col = f'{pair}_Close'  # per-pair name for the 'Close' column
    file_path = os.path.join(directory, f'{pair}_data.csv')

    # Read the pair's CSV and rename 'Close' so the pairs stay distinguishable
    data = pd.read_csv(file_path).rename(columns={'Close': close_col})

    # Only the 'Date' and renamed close columns take part in the merge
    pair_close = data[['Date', close_col]]

    # First non-empty table is taken as-is; later ones are outer-joined on 'Date'
    merged_data = (
        pair_close
        if merged_data.empty
        else pd.merge(merged_data, pair_close, on='Date', how='outer')
    )

print(merged_data)

            Date  EURUSD=X_Close  USDJPY=X_Close  GBPUSD=X_Close  \
0     1996-10-30             NaN      114.180000             NaN   
1     1996-11-01             NaN      113.500000             NaN   
2     1996-11-04             NaN      113.879997             NaN   
3     1996-11-05             NaN      114.250000             NaN   
4     1996-11-06             NaN      113.949997             NaN   
...          ...             ...             ...             ...   
7253  2024-09-18        1.112310      142.014999        1.316742   
7254  2024-09-19        1.111482      142.710007        1.320115   
7255  2024-09-20             NaN             NaN        1.328180   
7256  2024-09-21             NaN             NaN        1.331611   
7257  2024-09-22             NaN             NaN             NaN   

      AUDUSD=X_Close  NZDUSD=X_Close  EURJPY=X_Close  
0                NaN             NaN             NaN  
1                NaN             NaN             NaN  
2                N

#   **STEP 3: PRE-PROCESSING DATA:**

**1.1. Load historical Forex data**


**NOTE**:
- **Date Format**: DD/MM/YY. (FOR 1 MINUTE INTERVAL: YYYY-MM-DD HH:MM:SS)
- **Interval**: 1d = 1 day, 1m = 1 minute (YAHOO FINANCE ONLY SUPPLIES 1-MINUTE INTERVAL DATA FOR THE LAST **7 DAYS**) 