In [93]:
import yfinance as yf
import pandas as pd
import statsmodels.api as sm

# Step 1: Define tickers: 1 outcome, 1 treatment, relevant FX and ETF confounders
tickers = [
    'USDT-USD',   # outcome
    'VND=X',      # treatment
    # Major currencies
    'EURUSD=X',   # Euro
    'JPY=X',      # Japanese Yen
    'GBPUSD=X',   # British Pound
    'AUDUSD=X',   # Australian Dollar
    'CNY=X',      # Chinese Yuan
    # Popular ETFs
    'SPY',        # S&P 500
    'QQQ',        # Nasdaq
    'EEM',        # Emerging Markets
    'IWM',        # Russell 2000
    'GLD',        # Gold
    'USO',        # Oil
    'CORN',       # Corn
    'SOYB',       # Soybeans
]

df = yf.download(tickers, start='2023-07-25', end='2026-01-01')['Close']

# Rename columns for convenience
rename_map = {
    'USDT-USD': 'USDT',
    'VND=X': 'VND',
    'EURUSD=X': 'EURUSD',
    'JPY=X': 'JPY',
    'GBPUSD=X': 'GBPUSD',
    'AUDUSD=X': 'AUDUSD',
    'CNY=X': 'CNY',
    'SPY': 'SP500',
    'QQQ': 'NASDAQ',
    'EEM': 'EM',
    'IWM': 'RUS2000',
    'GLD': 'Gold',
    'USO': 'Oil',
    'CORN': 'Corn',
    'SOYB': 'Soybean'
}
df.rename(columns=rename_map, inplace=True)
df.dropna(inplace=True)

# Step 2: Compute returns
for col in df.columns:
    df[f'{col}_ret'] = df[col]
df.dropna(inplace=True)

# Step 3: Create lagged returns
for col in [c for c in df.columns if c.endswith('_ret')]:
    df[f'{col}_lag1'] = df[col].shift(1)
df.dropna(inplace=True)

# Step 4: Define variables
y_col = 'USDT_ret'
d_col = 'VND_ret'

# Confounders: lagged returns of all other variables
x_cols = [c for c in df.columns if c.endswith('_lag1') and (c != f'{d_col}_lag1') and (c != f'{y_col}_lag1')]

# Step 5: Prepare design matrix
X = df[[d_col] + x_cols]
X = sm.add_constant(X)
y = df[y_col]

# Step 6: Fit OLS regression
model = sm.OLS(y, X).fit()
print(model.summary())

# Extract treatment coef and variance
beta_ols = model.params[d_col]
var_ols = model.cov_params().loc[d_col, d_col]

print(f"\nCoefficient for {d_col}: {beta_ols}")
print(f"Variance of coefficient for {d_col}: {var_ols}")


  df = yf.download(tickers, start='2023-07-25', end='2026-01-01')['Close']
[*********************100%***********************]  15 of 15 completed

                            OLS Regression Results                            
Dep. Variable:               USDT_ret   R-squared:                       0.207
Model:                            OLS   Adj. R-squared:                  0.185
Method:                 Least Squares   F-statistic:                     9.263
Date:                Thu, 14 Aug 2025   Prob (F-statistic):           2.92e-18
Time:                        00:13:41   Log-Likelihood:                 3184.5
No. Observations:                 512   AIC:                            -6339.
Df Residuals:                     497   BIC:                            -6275.
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const                1.0111      0.005  




In [91]:
import yfinance as yf
import pandas as pd
from doubleml import DoubleMLData, DoubleMLPLR
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# === Step 1: Define tickers ===
tickers = [
    'USDT-USD',   # outcome
    'VND=X',      # treatment (Vietnamese Dong)
    # Commodity ETFs
    'GLD',        # Gold
    # 'SLV',        # Silver
    'CORN',       # Corn
    'USO',        # Crude Oil
]

df = yf.download(tickers, start='2023-07-25', end='2026-01-01')['Close']

# Rename columns for convenience
rename_map = {
    'USDT-USD': 'USDT',
    'VND=X': 'VND',
    'GLD': 'Gold',
    # 'SLV': 'Silver',
    'CORN': 'Corn',
    'USO': 'Oil',
}

df.rename(columns=rename_map, inplace=True)
df.dropna(inplace=True)

# === Step 3: Compute returns ===
for col in df.columns:
    df[f'{col}_ret'] = df[col]

df.dropna(inplace=True)

# === Step 4: Create lagged returns ===
for col in [c for c in df.columns if c.endswith('_ret')]:
    df[f'{col}_lag1'] = df[col].shift(1)

df.dropna(inplace=True)

# === Step 5: Define variables ===
y_col = 'USDT_ret'  # outcome
d_col = 'VND_ret'   # treatment

# Confounders: lagged returns of all except treatment and outcome
x_cols = [c for c in df.columns if c.endswith('_lag1') and c not in (f'{d_col}_lag1', f'{y_col}_lag1')]

print("Number of confounders:", len(x_cols))
print("Confounders:", x_cols)

# === Step 6: Create DoubleML data object ===
dml_data = DoubleMLData(df, y_col=y_col, d_cols=d_col, x_cols=x_cols)

# === Step 7: Define ML learners ===
mlp_m = make_pipeline(StandardScaler(),
                      MLPRegressor(hidden_layer_sizes=(1000), max_iter=500, random_state=42))
mlp_g = make_pipeline(StandardScaler(),
                      MLPRegressor(hidden_layer_sizes=(1000), max_iter=500, random_state=42))

# === Step 8: Initialize and fit DoubleMLPLR model ===
dml_plr = DoubleMLPLR(dml_data, mlp_g, mlp_m, n_folds=5)
dml_plr.fit()

# === Step 9: Print results ===
print("=== DoubleML Results ===")
print("Estimated effect of ZAR returns on USDT returns:", dml_plr.coef)
print("Standard error:", dml_plr.se)
print("t-statistic:", dml_plr.t_stat)
print("p-value:", dml_plr.pval)


  df = yf.download(tickers, start='2023-07-25', end='2026-01-01')['Close']
[*********************100%***********************]  5 of 5 completed
  binary_treat = type_of_target(this_d) == "binary"


Number of confounders: 3
Confounders: ['Corn_ret_lag1', 'Gold_ret_lag1', 'Oil_ret_lag1']




=== DoubleML Results ===
Estimated effect of ZAR returns on USDT returns: [1.32776476e-07]
Standard error: [6.06773267e-08]
t-statistic: [2.18823873]
p-value: [0.02865222]




In [94]:
import numpy as np
from scipy.stats import chi2

# Assume these are computed from your OLS and DML results
beta_ols = model.params[d_col]
var_ols = model.cov_params().loc[d_col, d_col]

beta_dml = dml_plr.coef[0]
var_dml = dml_plr.se[0] ** 2

diff = np.abs(beta_ols - beta_dml)
var_diff = abs(var_ols - var_dml)

if var_diff <= 0:
    raise ValueError("Variance difference is non-positive, Hausman test not valid.")

hausman_stat = diff**2 / var_diff
p_value = 1 - chi2.cdf(hausman_stat, df=1)

print("\n=== Hausman Test ===")
print(f"Hausman test statistic: {hausman_stat:.4f}")
print(f"P-value: {p_value:.4f}")

if p_value < 0.05:
    print("Reject null hypothesis: Systematic difference between OLS and DML estimators.")
else:
    print("Fail to reject null hypothesis: No evidence of systematic difference.")

# Hausman summary table
import pandas as pd

hausman_table = pd.DataFrame({
    "Estimator": ["OLS", "DML", "Hausman Test"],
    "Coefficient": [beta_ols, beta_dml, np.nan],
    "Std. Error": [np.sqrt(var_ols), np.sqrt(var_dml), np.nan],
    "t-stat": [beta_ols/np.sqrt(var_ols), beta_dml/np.sqrt(var_dml), hausman_stat],
    "p-value": [model.pvalues[d_col], getattr(dml_plr, "pval", [np.nan])[0], p_value]
})

print("\n=== Hausman Table ===")
print(hausman_table)


=== Hausman Test ===
Hausman test statistic: 0.0006
P-value: 0.9804
Fail to reject null hypothesis: No evidence of systematic difference.

=== Hausman Table ===
      Estimator   Coefficient    Std. Error    t-stat   p-value
0           OLS  1.359077e-07  1.409730e-07  0.964069  0.335480
1           DML  1.327765e-07  6.067733e-08  2.188239  0.028652
2  Hausman Test           NaN           NaN  0.000606  0.980368
