# Foreign Exchange Return Forecasting of Neighboring Countries based on Powerful Anchor Countries
# *SIADS 696: Milestone II*

### *By Team #2*

## Overview
- Defining Custom Functions
- Importing Data
- Applying Unsupervised Learning
- Applying Supervised Learning

In [13]:
# Importing Packages
import csv
import re
import warnings

import country_converter as coco
import matplotlib.pyplot as plt
import xgboost as xgb
import yfinance as yf
import pandas as pd
import numpy as np

# Importing partial packages
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import silhouette_score
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from xgboost import XGBClassifier
from xgboost import XGBRegressor

XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/Users/josephhiggins/Documents/siads-696/env/lib/python3.12/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib\n  Referenced from: <B637898E-C0C3-3F93-8C08-800EE41A7A5B> /Users/josephhiggins/Documents/siads-696/env/lib/python3.12/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: tried: '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/usr/local/opt/libomp/lib/libomp.dylib' (no such file)"]


## Defining Custom Functions

In [None]:
# Normalising column labels: lower-case, underscores for spaces, commas removed
def standardize_column_names(dataframe):
    """Rewrite ``dataframe``'s column labels in place and return the same frame.

    Every label is converted with ``str()``, lower-cased, has its spaces
    replaced by underscores and its commas removed.
    """
    normalised = []
    for label in dataframe.columns:
        text = str(label).lower()
        text = text.replace(" ", "_")
        normalised.append(text.replace(",", ""))
    dataframe.columns = normalised

    return dataframe

In [None]:
# Trailing qualifiers that some official country names carry,
# e.g. "Aruba, Kingdom of the Netherlands" -> "Aruba".
# Kept next to `clean` so the helper does not depend on a pattern that is
# only created in a later notebook cell.
_COUNTRY_SUFFIX_RE = re.compile(
    r",\s*(?:Kingdom of the Netherlands|United Kingdom-British Overseas Territory|Republic of the|Union of the|State of the)$",
    re.I,
)
# Trailing ", The" as in "Bahamas, The".
_TRAILING_THE_RE = re.compile(r",\s*The$", re.I)


def clean(country):
    """Strip whitespace and trailing qualifiers from a country name.

    Parameters
    ----------
    country : object
        A country name. Missing values (anything ``pd.isna`` reports as
        missing) are returned unchanged.

    Returns
    -------
    str or the original missing value
        ``str(country)`` stripped of surrounding whitespace, with a trailing
        qualifier such as ", Kingdom of the Netherlands" or ", The" removed
        (both matched case-insensitively).
    """
    if pd.isna(country):
        return country
    country = str(country).strip()
    country = _COUNTRY_SUFFIX_RE.sub("", country)
    return _TRAILING_THE_RE.sub("", country)

In [None]:
def short(country):
    """Return the short country name for ``country``, or ``pd.NA`` if unresolved.

    The name is first passed through ``clean`` and then converted with the
    module-level ``cc`` converter (``to="name_short"``). When the converter
    hands back a list or tuple, its first entry is used. If nothing usable
    comes back (missing, empty, or the literal "not found"), a warning
    naming the original input is issued and ``pd.NA`` is returned.

    NOTE(review): this reads the global ``cc``, which the notebook creates in
    a later cell than the first call to this function — run that cell first.
    """
    if pd.isna(country):
        return pd.NA
    res = cc.convert(names=clean(country), to="name_short")
    if isinstance(res, (list, tuple)):
        res = res[0] if res else pd.NA
    # Test for NA before truthiness: ``not pd.NA`` raises TypeError, so an
    # empty converter result must be caught by ``pd.isna`` first.
    if pd.isna(res) or not res or res == "not found":
        warnings.warn(f"Problem: {country}")
        return pd.NA
    return res

In [None]:
def xxxusd(code):
    """Return both ticker spellings of ``code`` against USD.

    The result is the tuple ``("<code>USD=X", "USD<code>=X")``.
    """
    direct = "{}USD=X".format(code)
    inverse = "USD{}=X".format(code)
    return direct, inverse

## Importing Data

In [None]:
# Real GDP (purchasing power parity) per country.
# Ranking note from the authors: 1 = big country per region.
country_gdp = pd.read_csv(
    filepath_or_buffer="../data/input/real_gdp_purchasing_power_parity_0.csv"
)

# Preview of the first 5 rows
country_gdp.head(n=5)

name                   object
slug                   object
value                  object
date_of_information     int64
ranking                 int64
region                 object
dtype: object

In [None]:
# FRED anchors file: parse the observation date, keep only its year as
# `time_period`, and expose the NYGDPMKTPCDWLD series as `world_gdp`.
df_fred_world_gdp = (
    pd.read_csv("../data/input/fred_anchors_0.csv")
    .assign(
        observation_date=lambda d: pd.to_datetime(d["observation_date"]),
        time_period=lambda d: d["observation_date"].dt.year,
    )
    .drop(columns=["observation_date"])
    .rename(columns={"NYGDPMKTPCDWLD": "world_gdp"})
)

# Preview of the first 5 rows
df_fred_world_gdp.head(n=5)

In [None]:
# Importing the IMF extract stored as imf_gdp_0.csv (six columns only).
# Lines the parser cannot read are skipped silently (on_bad_lines="skip").
# NOTE(review): despite the "gdp" in the file and variable names, the filter
# below keeps the indicator 'US Dollar per domestic currency', i.e. an
# exchange-rate series — confirm the naming.
df_imf_gdp = pd.read_csv(
    "../data/input/imf_gdp_0.csv",
    on_bad_lines="skip",
    sep=",",
    quoting=csv.QUOTE_ALL,
    skipinitialspace=True,
    usecols=[
        "COUNTRY",
        "TIME_PERIOD",
        "TYPE_OF_TRANSFORMATION",
        "FREQUENCY",
        "OBS_VALUE",
        "INDICATOR",
    ],
    engine="python",
)

# Preprocessing: standardise column names, keep annual end-of-period
# observations of the selected indicator, drop rows without a value, and
# reduce the time period to a numeric year.
df_imf_gdp = (
    standardize_column_names(df_imf_gdp)
    .query(
        "indicator == 'US Dollar per domestic currency' "
        "and type_of_transformation == 'End-of-period (EoP)' "
        "and frequency == 'Annual'"
    )
    .dropna(subset=["obs_value"])
    .rename(columns={"time_period": "year"})
    # First four characters of the period string (assumes it starts with the year — TODO confirm)
    .assign(year=lambda d: d["year"].str[:4])
    .sort_values(["country", "year"])
    # The filter columns are constant after the query, so they are dropped
    .drop(columns=["indicator", "type_of_transformation", "frequency"])
    .reset_index(drop=True)
    # Unparseable year strings become NaT/NaN instead of raising
).assign(year=lambda d: pd.to_datetime(d["year"], errors="coerce").dt.year)

# Displaying the first 5 rows of the dataframe
df_imf_gdp.head()

In [None]:
# Importing the IMF trade extract (imf_trade_0.csv), restricted to the
# columns needed downstream
df_imf_trade = pd.read_csv(
    "../data/input/imf_trade_0.csv",
    usecols=[
        "COUNTRY",
        "COUNTERPART_COUNTRY",
        "TIME_PERIOD",
        "OBS_VALUE",
        "TRADE_FLOW",
        "SCALE",
        "UNIT",
    ],
)

# Manual exclusion list: regional aggregates and other entries the authors
# chose to drop (pegged currencies / no true boundary of their own)
ls_0 = [
    "World",
    "Advanced Economies",
    "Latin America and the Caribbean (LAC)",
    "Hong Kong Special Administrative Region, People's Republic of China",
    "Emerging and Developing Europe",
    "Middle East and Central Asia",
    "Emerging Market and Developing Economies",
    "Euro Area (EA)",
    "Emerging and Developing Asia",
]

# Keep a row only when neither the reporter nor the counterpart is excluded
df_imf_trade = df_imf_trade[
    ~(
        df_imf_trade["COUNTRY"].isin(ls_0)
        | df_imf_trade["COUNTERPART_COUNTRY"].isin(ls_0)
    )
]

# Standardise the column names, then order rows by observed value, largest first
df_imf_trade = standardize_column_names(df_imf_trade)
df_imf_trade = df_imf_trade.sort_values("obs_value", ascending=False)

# Preview of the first 5 rows
df_imf_trade.head(n=5)

In [None]:
# Countries grouped under the euro (EUR) anchor.
# NOTE(review): besides euro users, this list contains countries that do not
# use the euro themselves (e.g. Denmark, Morocco, Cameroon) — presumably
# because their currencies are pegged to or managed against it; confirm.
EUR = [
    "Austria",
    "Belgium",
    "Cyprus",
    "Estonia",
    "Finland",
    "France",
    "Germany",
    "Greece",
    "Ireland",
    "Italy",
    "Latvia",
    "Lithuania",
    "Luxembourg",
    "Malta",
    "Netherlands",
    "Portugal",
    "Slovakia",
    "Slovenia",
    "Spain",
    "Andorra",
    "Monaco",
    "San Marino",
    "Vatican City",
    "Saint Barthélemy",
    "Saint Pierre and Miquelon",
    "Kosovo",
    "Montenegro",
    "Bosnia and Herzegovina",
    "Bulgaria",
    "Cape Verde",
    "Cameroon",
    "Central African Republic",
    "Chad",
    "Republic of the Congo",
    "Equatorial Guinea",
    "Gabon",
    "Benin",
    "Burkina Faso",
    "Côte d'Ivoire",
    "Guinea-Bissau",
    "Mali",
    "Niger",
    "Senegal",
    "Togo",
    "French Polynesia",
    "New Caledonia",
    "Wallis and Futuna",
    "Comoros",
    "Croatia",
    "Morocco",
    "São Tomé and Príncipe",
    "Denmark",
    "North Macedonia",
]

# Territories grouped under the British Pound (GBP)
GBP = [
    "Guernsey",
    "Jersey",
    "Isle of Man",
    "Gibraltar",
    "Falkland Islands",
    "Saint Helena",
]

# Other countries excluded by the authors (pegged currencies or no true boundary)
other = [
    "Bhutan",
    "Nepal",
    "North Korea",
    "Afghanistan",
    "Turkmenistan",
    "South Sudan",
    "Guam",
    "Macau",
    "Tuvalu",
    "Kiribati",
    "Palau",
    "Greenland",
    "Maldives",
    "Iraq",
    "Solomon Islands",
    "Brunei Darussalam",
    "Bangladesh",
    "Myanmar",
    "Marshall Islands",
    "Iran",
    "Yemen",
    "Libya",
    "Somalia",
    "Liberia",
    "Sudan",
    "Sierra Leone",
    "Mongolia",
    "Angola",
    "Kyrgyz Republic",
    "Tajikistan",
]

# Cleaning country names: convert every entry to its short name and drop the
# ones `short` could not resolve (it returns pd.NA for those).
# Each entry is converted exactly once; converting it once for the filter and
# again for the value would double both the lookups and the warnings.
EUR = {name for name in map(short, EUR) if pd.notna(name)}
GBP = {name for name in map(short, GBP) if pd.notna(name)}
other = {name for name in map(short, other) if pd.notna(name)}

In [None]:
# Importing cleaned trade and GDP tables from parquet.
# Later cells read the COUNTRY, COUNTERPART_COUNTRY, TIME_PERIOD and
# OBS_VALUE columns of df_trade and the COUNTRY column of df_gdp.
# TODO - where is this coming from? I.e. what code created these files?
df_trade = pd.read_parquet("../data/input/trade_0.parquet")
df_gdp = pd.read_parquet("../data/input/gdp_0.parquet")

## Applying Unsupervised Learning

In [None]:
# Country-name converter and the pattern for trailing qualifiers
# (", Kingdom of the Netherlands", ", Republic of the", ...) used when
# shortening country names.
# NOTE(review): `short` reads `cc` and is already called in an earlier cell
# (where the EUR/GBP/other sets are built) — this cell has to run before it.
cc = coco.CountryConverter()
p = re.compile(
    r",\s*(?:Kingdom of the Netherlands|United Kingdom-British Overseas Territory|Republic of the|Union of the|State of the)$",
    re.I,
)

In [None]:
# Anchors selection: the United States is removed from the GDP table and the
# three anchor currencies are mapped to their member countries.
df_gdp = df_gdp[~(df_gdp["COUNTRY"] == "United States")]
# NOTE(review): not the last expression of the cell, so this preview is not displayed
df_gdp.head()
Anchor = {
    "EUR": EUR,
    "CNY": {"China"},
    "JPY": {"Japan"},
}

# No USD due to many relationship ties with multiple currency.
# Time Period: number of most recent years of trade data to keep
period = 10

# N Neighbor: maximum number of neighbors kept per anchor
n = 10

# Minimum Percentage Volume: minimum trade share (as a fraction) to qualify
v = 0.05

# Exports of goods
# Neighbor mapping: member country -> anchor currency code
neighbor = {x: i for i, y in Anchor.items() for x in y}
# Countries that can never be neighbors: anchor members, GBP territories,
# the manual exclusions and the three large economies themselves
ex = EUR | GBP | other | {"China", "Japan", "United States"}
df_trade = df_trade.dropna(
    subset=["COUNTRY", "COUNTERPART_COUNTRY", "TIME_PERIOD", "OBS_VALUE"]
).copy()

# Keep data period: rows whose period starts with a 4-digit year, limited to
# the last `period` years up to the most recent year present
df_trade = df_trade[df_trade["TIME_PERIOD"].astype(str).str.match(r"^\d{4}")]
df_trade["year"] = df_trade["TIME_PERIOD"].astype(str).str[:4].astype(int)
max_year = int(df_trade["year"].max())
df_trade = df_trade[df_trade["year"].between(max_year - period + 1, max_year)]

# Filter and map potential neighbors: reporters outside the exclusion set,
# with the counterpart translated to its anchor code (NaN when the
# counterpart belongs to no anchor)
df_export = df_trade[~df_trade["COUNTRY"].isin(ex)].copy()
df_export["anchor"] = df_export["COUNTERPART_COUNTRY"].map(neighbor)

# Total exports per country (all counterparts, anchor or not)
etot = (
    df_export.groupby("COUNTRY", as_index=False)["OBS_VALUE"]
    .sum()
    .rename(columns={"OBS_VALUE": "export_total"})
)

# Exports to each anchor
eanch = (
    df_export.dropna(subset=["anchor"])
    .groupby(["COUNTRY", "anchor"], as_index=False)["OBS_VALUE"]
    .sum()
    .rename(columns={"OBS_VALUE": "export_to_anchor"})
)

# Export shares: exports to the anchor divided by the country's total exports
# NOTE(review): a zero export_total yields inf/NaN here — nothing guards it
esh = eanch.merge(etot, on="COUNTRY", how="left").assign(
    export_share=lambda d: d["export_to_anchor"] / d["export_total"]
)

# Best export anchor per country: highest share first, ties broken by volume,
# kept only when the share reaches the minimum `v`
bexp = esh.sort_values(
    ["COUNTRY", "export_share", "export_to_anchor"], ascending=[True, False, False]
).drop_duplicates("COUNTRY")
bexp = bexp[bexp["export_share"] >= v]

# Mirror the trade table: reporter and counterpart labels trade places, so
# each row is seen from the counterpart's side
df_import = df_trade.rename(
    columns={"COUNTRY": "COUNTERPART_COUNTRY", "COUNTERPART_COUNTRY": "COUNTRY"}
)

# Candidate neighbors only, tagged with the anchor of the (new) counterpart
df_import = df_import.loc[~df_import["COUNTRY"].isin(ex)].copy()
df_import["anchor"] = df_import["COUNTERPART_COUNTRY"].map(neighbor)

# Total imports per country
import_totals = df_import.groupby("COUNTRY", as_index=False)["OBS_VALUE"].sum()
import_totals = import_totals.rename(columns={"OBS_VALUE": "import_total"})

# Imports from each anchor
ianch = df_import.dropna(subset=["anchor"])
ianch = ianch.groupby(["COUNTRY", "anchor"], as_index=False)["OBS_VALUE"].sum()
ianch = ianch.rename(columns={"OBS_VALUE": "import_from_anchor"})

# Import shares: imports from the anchor over the country's total imports
ish = ianch.merge(import_totals, on="COUNTRY", how="left")
ish["import_share"] = ish["import_from_anchor"] / ish["import_total"]

# Best import anchor per country, subject to the minimum share `v`
bimp = ish.sort_values(
    by=["COUNTRY", "import_share", "import_from_anchor"],
    ascending=[True, False, False],
).drop_duplicates(subset="COUNTRY")
bimp = bimp.loc[bimp["import_share"] >= v]

In [None]:
# Combine table: one row per (country, anchor) pair seen on either the export
# or the import side
comb = esh.merge(ish, on=["COUNTRY", "anchor"], how="outer", suffixes=("_exp", "_imp"))

# A pair present on only one side leaves the outer merge without the other
# side's per-country total. Those totals do not depend on the anchor, so they
# are restored from the per-country tables before any zero-filling; otherwise
# total_trade_volume is understated and combined_exposure overstated.
comb["export_total"] = comb["export_total"].fillna(
    comb["COUNTRY"].map(etot.set_index("COUNTRY")["export_total"])
)
comb["import_total"] = comb["import_total"].fillna(
    comb["COUNTRY"].map(import_totals.set_index("COUNTRY")["import_total"])
)

# Handle missing value: whatever is still missing really is "no trade"
f = [
    "export_to_anchor",
    "export_total",
    "export_share",
    "import_from_anchor",
    "import_total",
    "import_share",
]
comb[f] = comb[f].fillna(0.0)

# Combined metrics
comb = comb.assign(
    total_trade_with_anchor=lambda d: d["export_to_anchor"] + d["import_from_anchor"],
    total_trade_volume=lambda d: d["export_total"] + d["import_total"],
)

# Exposure = trade with the anchor / total trade; stays 0 when a country has
# no recorded trade at all, which avoids dividing by zero
comb["combined_exposure"] = 0.0
nz = comb["total_trade_volume"] > 0
comb.loc[nz, "combined_exposure"] = (
    comb.loc[nz, "total_trade_with_anchor"] / comb.loc[nz, "total_trade_volume"]
)

# Keep only pairs whose exposure reaches the minimum share `v`
comb_f = comb.loc[comb["combined_exposure"] >= v].copy()

# Rank neighbors per anchor: strongest combined exposure first, with trade
# volume, export share, import share and finally the country name as
# tie-breakers
comb_f = comb_f.sort_values(
    by=[
        "anchor",
        "combined_exposure",
        "total_trade_with_anchor",
        "export_share",
        "import_share",
        "COUNTRY",
    ],
    ascending=[True, False, False, False, False, True],
)

# Rank within each anchor and take top n.
# method="first" breaks exposure ties by row order, i.e. by the sort above.
comb_f["rank_within_anchor"] = comb_f.groupby("anchor")["combined_exposure"].rank(
    method="first", ascending=False
)
comb_top = comb_f.loc[comb_f["rank_within_anchor"] <= n].copy()

# Final neighbor: anchor code -> list of its top-n neighbor countries
neighbors_dict = comb_top.groupby("anchor")["COUNTRY"].apply(list).to_dict()

# Summary results: the top-n table in a fixed column order, sorted by anchor
# and rank
summary = (
    comb_top[
        [
            "anchor",
            "COUNTRY",
            "combined_exposure",
            "export_share",
            "import_share",
            "export_to_anchor",
            "import_from_anchor",
            "export_total",
            "import_total",
            "total_trade_with_anchor",
            "total_trade_volume",
            "rank_within_anchor",
        ]
    ]
    .sort_values(["anchor", "rank_within_anchor"])
    .reset_index(drop=True)
)

# Presentation copy with the three share columns expressed in percent (2 decimals)
res = summary.copy()
for col in ["combined_exposure", "export_share", "import_share"]:
    res[col] = (res[col] * 100).round(2)

# Anchor code -> neighbor countries in rank order
res_sorted = res.sort_values(["anchor", "rank_within_anchor"])
anchor_neighbor = res_sorted.groupby("anchor")["COUNTRY"].apply(list).to_dict()

# Hard-coded removals of specific (anchor, country) pairs, applied only when present.
# NOTE(review): the reason for dropping these three pairs is not recorded here.
for a, c in [("CNY", "Australia"), ("JPY", "Philippines"), ("JPY", "Vietnam")]:
    if a in anchor_neighbor and c in anchor_neighbor[a]:
        anchor_neighbor[a].remove(c)

anchor_neighbor

*Performing Principal Component Analysis (PCA) to reduce the dimensionality of our training data.*

In [None]:
# We identify our anchors: Anchor = {"EUR": EUR, "CNY": {"China"}, "JPY": {"Japan"}}

# We start by pulling the anchor currencies.
# Codes are obtained from the Yahoo Finance symbol list:
# https://finance.yahoo.com/markets/currencies/

# Currency code -> Yahoo Finance ticker, grouped by the anchor each currency
# was assigned to
fx_list = {
    # Anchor currencies
    "CNY": "CNY=X",  # China
    "EUR": "EUR=X",  # Euro
    "JPY": "JPY=X",  # Japan
    # Neutral currencies
    "CAD": "CAD=X",  # Canada
    "BRL": "BRL=X",  # Brazil
    "MXN": "MXN=X",  # Mexico
    "COP": "COP=X",  # Colombia
    "PEN": "PEN=X",  # Peru
    "NOK": "NOK=X",  # Norway
    "ZAR": "ZAR=X",  # South Africa
    "INR": "INR=X",  # India
    "TRY": "TRY=X",  # Turkey
    "EGP": "EGP=X",  # Egypt
    "RUB": "RUB=X",  # Russia
    "ILS": "ILS=X",  # Israel
    # CNY group
    "CDF": "CDF=X",  # DR Congo
    "LAK": "LAK=X",  # Laos
    "TZS": "TZS=X",  # Tanzania
    "CLP": "CLP=X",  # Chile
    "GNF": "GNF=X",  # Guinea
    "PKR": "PKR=X",  # Pakistan
    "PHP": "PHP=X",  # Philippines
    "VND": "VND=X",  # Vietnam
    "MRU": "MRU=X",  # Mauritania
    # EUR group
    "ALL": "ALL=X",  # Albania
    "CZK": "CZK=X",  # Czechia
    "TND": "TND=X",  # Tunisia
    "RON": "RON=X",  # Romania
    "HUF": "HUF=X",  # Hungary
    "PLN": "PLN=X",  # Poland
    "RSD": "RSD=X",  # Serbia
    "SEK": "SEK=X",  # Sweden
    "ISK": "ISK=X",  # Iceland
    "DZD": "DZD=X",  # Algeria
    # JPY group
    "PGK": "PGK=X",  # Papua New Guinea
    "TWD": "TWD=X",  # Taiwan
    "THB": "THB=X",  # Thailand
    "AUD": "AUD=X",  # Australia
    "IDR": "IDR=X",  # Indonesia
    "KRW": "KRW=X",  # South Korea
    "MYR": "MYR=X",  # Malaysia
    "NZD": "NZD=X",  # New Zealand
}

# PERIOD is the look-back window requested from Yahoo Finance; for the time
# being it is 10 years
PERIOD = "10y"

# Exchange rates are analysed on a daily basis
INTERVAL = "1d"
# NOTE(review): the three settings below are not referenced anywhere in this
# cell — confirm whether the later cells are meant to use them
FFill_Limit = 3
N_PCS_To_Cluster = 3
N_Cluster = 3

# Creating a list of tickers from the fx_list dictionary
tickers = list(fx_list.values())

# Creating a DataFrame by downloading historical FX data using yfinance
yfinance_fx_raw = yf.download(
    tickers, period=PERIOD, interval=INTERVAL, auto_adjust=None, progress=False
)

In [None]:
# Count the non-missing closing quotes per ticker, in ascending order, to spot
# sparsely populated ("problem") currencies. Nothing is removed here.
x = yfinance_fx_raw["Close"]
c = x.notna().sum().sort_values()
c

In [None]:
# Build one USD-quoted level series per currency in fx_list and turn the
# levels into daily log returns.
# NOTE(review): no minimum-observation filter is applied here; a currency is
# only left out when neither of its two tickers returns any data.
want = list(fx_list.keys())
invert = {}

# Candidate tickers per currency, computed once: (XXXUSD=X, USDXXX=X)
pairs = {c: xxxusd(c) for c in want}

# Flatten to a single ticker list; dict.fromkeys drops duplicates while
# preserving order
t = list(dict.fromkeys(ticker for pair in pairs.values() for ticker in pair))

# Creating a DataFrame by downloading historical FX data using yfinance
raw = yf.download(
    t, period=PERIOD, interval=INTERVAL, auto_adjust=None, progress=False
)["Close"]

# Prefer the direct XXXUSD quote; fall back to the reciprocal of USDXXX.
# `invert` records which currencies needed the reciprocal.
level = {}
for c, (t1, t2) in pairs.items():
    if t1 in raw.columns and raw[t1].notna().any():
        level[c] = raw[t1]
        invert[c] = False
    elif t2 in raw.columns and raw[t2].notna().any():
        level[c] = 1.0 / raw[t2]
        invert[c] = True

# Creating a DataFrame from the level dictionary and sorting by index
level = pd.DataFrame(level).sort_index()

# Log returns; short gaps are forward-filled up to FFill_Limit rows (the
# notebook-wide setting, instead of a second hard-coded 3) and any row that
# is still incomplete is dropped
lreturn = np.log(level / level.shift(1))
lreturn = lreturn.ffill(limit=FFill_Limit).dropna(how="any")

In [None]:
# Z-score every currency's log-return series (column by column)
scaler = StandardScaler()
X = pd.DataFrame(
    data=scaler.fit_transform(lreturn),
    columns=lreturn.columns,
    index=lreturn.index,
)

# PCA on the transposed matrix: each currency is one sample, projected onto
# two principal components
pca = PCA(n_components=2)
Z = pca.fit_transform(X.T)

# One row per currency with its two component scores
pca_df = pd.DataFrame(Z, columns=["PC1", "PC2"], index=X.columns)

# KMeans on the component scores; the fitted labels become the cluster column
k = 3
km = KMeans(n_clusters=k, random_state=42).fit(pca_df[["PC1", "PC2"]].to_numpy())
pca_df["cluster"] = km.labels_

In [None]:
# Share of variance captured by each principal component, plus the running total
evr = pca.explained_variance_ratio_
print("Explained variance ratio: ", evr, " | Cumalative: ", np.cumsum(evr))

In [None]:
# Anchor currency -> neighbor currencies tied to it economically
g = {
    "CNY": ["CDF", "LAK", "TZS", "CLP", "GNF", "PKR", "PHP", "VND", "MRU"],
    "EUR": ["ALL", "CZK", "TND", "RON", "HUF", "PLN", "RSD", "SEK", "ISK", "DZD"],
    "JPY": ["PGK", "TWD", "THB", "AUD", "IDR", "KRW", "MYR", "NZD"],
}


def assign(cc):
    """Return the anchor group of currency code ``cc``.

    A code belongs to a group when it is the anchor itself or one of the
    anchor's neighbors in ``g``; any other code maps to ``"other"``.
    """
    matches = (
        anchor for anchor, members in g.items() if cc == anchor or cc in members
    )
    return next(matches, "other")


# Tag every currency in the PCA table with its anchor group
pca_df["group"] = pca_df.index.map(assign)

# Scatter plot of the currencies in PC space, one colour per group and a
# text label next to each point
col = {"CNY": "red", "EUR": "blue", "INR": "green", "JPY": "orange", "other": "gray"}
plt.figure(figsize=(12, 8))
for grp, sub in pca_df.groupby("group"):
    plt.scatter(
        x=sub["PC1"],
        y=sub["PC2"],
        s=70,
        c=col.get(grp, "black"),
        alpha=0.85,
        label=f"{grp} (n={len(sub)})",
    )
    for code, pc1, pc2 in zip(sub.index, sub["PC1"], sub["PC2"]):
        plt.annotate(
            code,
            (pc1, pc2),
            xytext=(3, 3),
            textcoords="offset points",
            fontsize=8,
        )

plt.title("Currencies in PCA Space (standardized returns, XXX/USD)")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.legend()
plt.show()

In [None]:
# Calculating Silhouette Score of the KMeans clusters in PC space
# NOTE(review): `score` is stored but never printed or displayed in this cell
score = silhouette_score(pca_df[['PC1','PC2']], pca_df['cluster'])

# Saving the log returns DataFrame to a Parquet file (index is not written)
# NOTE(review): this writes to the working directory, while the later cells
# read '../data/input/fx_log_return.parquet' — confirm the intended location
lreturn.to_parquet("fx_log_return.parquet", index = False)

## Applying Supervised Learning

*Applying the Granger causality test to determine whether a causal relationship exists between the anchor and neighboring currency returns.*

In [None]:
# Reload the saved log returns (one column per currency code)
lreturn = pd.read_parquet('../data/input/fx_log_return.parquet')

# Anchor currency -> neighbor currencies to test against it
# (same grouping as used for the PCA plot)
g = {"CNY": ["CDF", "LAK", "TZS", "CLP", "GNF", "PKR", "PHP", "VND", "MRU"],
     "EUR": ["ALL", "CZK", "TND", "RON", "HUF", "PLN", "RSD", "SEK", "ISK", "DZD"],
     "JPY": ["PGK", "TWD", "THB", "AUD", "IDR", "KRW", "MYR", "NZD"]}

def granger(res, g, ml=5, a=0.05):
    """Run Granger causality tests of each anchor against its neighbors.

    Parameters
    ----------
    res : pandas.DataFrame
        Return series, one column per currency code.
    g : dict
        Anchor code -> iterable of neighbor codes.
    ml : int
        Largest lag tested; lags ``1..ml`` are all evaluated.
    a : float
        Significance level compared against the smallest p-value.

    Returns
    -------
    pandas.DataFrame
        One row per tested (anchor, neighbor) pair with the columns
        ``anchor``, ``neighbor``, ``min_pval`` (rounded to 4 decimals),
        ``best_lag (days)`` and ``significant``. The columns are present even
        when no pair could be tested, so callers can sort/filter an empty
        result. Pairs are skipped when either column is missing from ``res``
        or when the two series share ``ml * 3`` or fewer complete rows.

    Notes
    -----
    NOTE(review): relies on ``grangercausalitytests`` being in scope
    (presumably ``statsmodels.tsa.stattools``; no import is visible in this
    notebook). The frame is passed as [neighbor, anchor]; in statsmodels that
    order tests whether the second column helps predict the first — confirm.
    ``significant`` uses the minimum p-value over all lags without any
    multiple-testing adjustment.
    """
    columns = ["anchor", "neighbor", "min_pval", "best_lag (days)", "significant"]
    rows = []
    for anchor, neighbors in g.items():
        if anchor not in res.columns:
            continue
        for neighbor in neighbors:
            if neighbor not in res.columns:
                continue
            pair = pd.concat([res[neighbor], res[anchor]], axis=1).dropna()
            pair.columns = ["neighbor", "anchor"]
            # Too little history for the requested number of lags
            if len(pair) <= ml * 3:
                continue
            tests = grangercausalitytests(pair, maxlag=ml)
            pvals = [tests[lag][0]["ssr_ftest"][1] for lag in range(1, ml + 1)]
            min_p = float(np.min(pvals))
            rows.append(
                {
                    "anchor": anchor,
                    "neighbor": neighbor,
                    "min_pval": round(min_p, 4),
                    # pvals[0] belongs to lag 1, hence the +1
                    "best_lag (days)": int(np.argmin(pvals) + 1),
                    "significant": (min_p < a),
                }
            )
    # Explicit columns keep the schema stable when `rows` is empty
    return pd.DataFrame(rows, columns=columns)

# Test every anchor/neighbor pair with up to 10 lags at the 5% level and
# order the results by anchor, then by p-value
rdf = (
    granger(lreturn, g, ml=10, a=0.05)
    .sort_values(by=["anchor", "min_pval"])
    .reset_index(drop=True)
)

# Significant pairs only, collected as anchor -> list of neighbors
an = rdf.loc[rdf["significant"]].sort_values(by=["anchor", "min_pval"])
ng = an.groupby("anchor")["neighbor"].apply(list).to_dict()
ng

ImportError: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.

In [None]:
# Table pairing the EUR anchor with each neighbor that passed the Granger test.
# ls_0 is the (anchor, neighbors) pair and ls_1 repeats the anchor once per
# neighbor; ng.get keeps this working (empty table) when no EUR neighbor was
# significant, instead of failing on a missing key.
ls_0 = ("EUR", ng.get("EUR", []))
ls_1 = [ls_0[0]] * len(ls_0[1])

pd.DataFrame({'Anchor': ls_1, 'Neighbor': ls_0[1]})

In [None]:
# CNY/CDF as the worked example: load the two return columns, add the next
# row's CDF return as the prediction target, and drop incomplete rows
df_0 = (
    pd.read_parquet('../data/input/fx_log_return.parquet')[['CNY', 'CDF']]
    .assign(target=lambda d: d['CDF'].shift(-1))
    .dropna()
)
df_0.head(n=5)

In [None]:
# Create lag features: the neighbor's return on each of the previous 5 days.
# The CDF column already holds log returns, so the lags are plain shifts;
# taking pct_change of a return series would produce ratios of returns and
# divide by zero whenever a return is exactly 0.
for i in range(1, 6):
    df_0[f'Return_lag_{i}'] = df_0['CDF'].shift(i)

# Create moving average features (shifted one row so the current row's value
# is not part of its own average)
df_0['MA_10'] = df_0['CDF'].rolling(window=10).mean().shift(1)
df_0['MA_50'] = df_0['CDF'].rolling(window=50).mean().shift(1)

# Drop initial rows with NaNs created by the lags and rolling windows
df_0.dropna(inplace=True)
# Safety net: any remaining infinite value is treated as 0
df_0 = df_0.replace([np.inf, -np.inf], np.nan).fillna(0)
df_0.head()

In [None]:
# Define features (X) and target (y): every column except the target
features = [col for col in df_0.columns if col != 'target']
X = df_0[features]
y = df_0['target']

# Time-based train-test split: first 80% of the rows for training, the rest
# held out (positional slicing, rows are assumed to be in time order)
split_index = int(len(X) * 0.8)
X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

# Named xgb_model so the `xgb` alias of the xgboost module stays intact
xgb_model = XGBRegressor()
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['reg:squarederror'],
              'learning_rate': [.03, .24, .8],
              'max_depth': [5, 6, 7],
              'min_child_weight': [4],
              'subsample': [.2, .5, .8],
              'colsample_bytree': [.7],
              'n_estimators': [2, 5, 10],
              }

# TimeSeriesSplit keeps every validation fold after its training data; a
# plain cv=2 would also train on later rows to score earlier ones.
xgb_grid = GridSearchCV(xgb_model,
                        parameters,
                        cv=TimeSeriesSplit(n_splits=2),
                        n_jobs=5,
                        verbose=True)

xgb_grid.fit(X_train,
             y_train)

# Best cross-validated score (the estimator's default scorer) and its parameters
print(xgb_grid.best_score_)
print(xgb_grid.best_params_)