In [22]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.covariance import LedoitWolf

import pypfopt.objective_functions as objective_functions
from pypfopt.expected_returns import mean_historical_return
from pypfopt.efficient_frontier import EfficientFrontier

In [23]:
sector_tickers = [
    "XLF",  # Financials
    "XLK",  # Technology
    "XLV",  # Health Care
    "XLY",  # Consumer Discretionary
    "XLP",  # Consumer Staples
    "XLE",  # Energy
    "XLI",  # Industrials
    "XLU",  # Utilities
    "XLB",  # Materials
    "XLRE",  # Real Estate
    "XLC",  # Communication Services
]

In [24]:
# import data from csv
df_ret = pd.read_parquet("../data/returns.parquet")
df_prices = pd.read_parquet("../data/prices.parquet")
df_vol = pd.read_parquet("../data/vola.parquet")

In [25]:
ticker_subset = ["XLF", "XLK", "XLV"]
df_ret = df_ret[ticker_subset]
df_prices = df_prices[ticker_subset]
sector_tickers = ticker_subset

In [26]:
# DONE : deal with NaN values for tickers
# XLC was created June 2018
# XLRE was created Oct 2015 ( as split off from XLF )
# so during training, check if any of the tickers are NaN
# if yes then set their weight to 0 and dont include them for cov mat and optimization

# DONE : understand why solver sometimes fails
# maybe write own solver
# maybe eigenvals too small therefore unstable
# why does mu < riskfree rate lead to problems ?

In [27]:
# TODO : why doesnt a different lookback change anything
# the price window is longer ie date index more into the past
# BUT the portfolio return doesnt change ???

In [41]:
# mean variance optimization
# based on paper by Sood et al. (2023):

# Parameters
lookback = 160
initial_cash = 100_000
start_date = pd.to_datetime("2019-01-02")
end_date = pd.to_datetime("2019-01-03")

# define daterange from start to end date
date_range = pd.bdate_range(start=start_date, end=end_date)

# Initialize portfolio
portfolio_value = initial_cash
portfolio_history = []

cash = initial_cash
shares = {t: 0 for t in sector_tickers}

for eval_date in date_range:
    if eval_date not in df_ret.index:
        print(f"Date {eval_date} not in data, skipping.")
        continue

    ret_idx = df_ret.index.get_loc(eval_date)
    prices_idx = df_prices.index.get_loc(eval_date)

    if ret_idx < lookback:
        print(f"Not enough data for {eval_date}, skipping.")
        continue

    return_window = df_ret.iloc[ret_idx - lookback : ret_idx]
    prices_window = df_prices.iloc[prices_idx - lookback : prices_idx]
    print(prices_window.index.min(), 'to', prices_window.index.max())

    nan_tickers = []
    for ticker in sector_tickers:
        if return_window[ticker].isna().any():
            nan_tickers.append(ticker)

    if len(nan_tickers) > 0:
        print(f"NaN values for {eval_date}: {nan_tickers}")
        return_window = return_window.drop(columns=nan_tickers)
        prices_window = prices_window.drop(columns=nan_tickers)
        valid_sectors = [t for t in sector_tickers if t not in nan_tickers]
    else:
        valid_sectors = sector_tickers

    # update portfolio value with current prices
    prices = df_prices.iloc[prices_idx].to_dict()
    if len(portfolio_history) > 0:
        portfolio_value = sum([shares[t] * prices[t] for t in valid_sectors]) + cash

    # Estimate covariance using Ledoit-Wolf shrinkage
    lw = LedoitWolf()
    lw.fit(return_window)
    cov_matrix = lw.covariance_
    print(cov_matrix)

    # Fix potential negative eigenvalues (make PSD)
    eigvals, eigvecs = np.linalg.eigh(cov_matrix)
    eigvals[eigvals < 0] = 0
    cov_psd = eigvecs @ np.diag(eigvals) @ eigvecs.T

    # estimate expected returns over lookback period as simple average
    mu = return_window.mean()
    print(mu)
    # check if all of mu are below 0, if yes report error
    if (mu < 0).all():
        print(f"All expected returns are negative for {eval_date}")
        # continue

    # NOTE on negative expected returns :
    # (https://github.com/robertmartin8/PyPortfolioOpt/issues/88#issuecomment-615533265)
    # you should think about whether your expected returns are a good estimate of future returns
    # if they aren't, then the max_sharpe portfolio will be pretty useless!

    # Calculate optimal weights by optimizing Sharpe ratio
    ef = EfficientFrontier(mu, cov_psd)

    # ALTERNATIVE #1 using max_sharpe
    # try:
    #     weights_raw = ef.max_sharpe(risk_free_rate=0)  # ordered dict
    # except Exception as e:
    #     # error probably because the optimisation problem is not convex
    #     print("= " * 20)
    #     print(f"Error in max_sharpe for {eval_date}: {e}")
    #     print("mu", mu)
    #     print("cov_psd", cov_psd)
    #     print("eigvals", eigvals)
    #     print("= " * 20)
    #     continue

    # ALTERNATIVE #2 ( see https://github.com/robertmartin8/PyPortfolioOpt/issues/88 )
    w_min, w_max = 0, 1
    weights_raw = ef.nonconvex_objective(
        objective_functions.sharpe_ratio,
        objective_args=(ef.expected_returns, ef.cov_matrix),
        weights_sum_to_one=True,
    )
    print(weights_raw)

    # whole shares only
    asset_cash = {t: weights_raw[t] * portfolio_value for t in valid_sectors}
    shares = {t: np.floor(asset_cash[t] / prices[t]) for t in valid_sectors}
    # calc rebalanced weights to compare with DRL agents later
    weights = {t: shares[t] * prices[t] / portfolio_value for t in valid_sectors}
    # rest ist cash
    cash = portfolio_value - np.sum([shares[t] * prices[t] for t in valid_sectors])
    w_c = cash / portfolio_value

    # save portfolio history with all weights
    portfolio_record = {
        "date": eval_date,
        "cash": cash,
        "portfolio_value": portfolio_value,
        "lookback": lookback,
        "w_c": w_c,  # include cash weight
    }
    # add weights for each ticker
    for t in valid_sectors:
        portfolio_record[f"w_{t}"] = weights[t]
    
    portfolio_history.append(portfolio_record)

# Convert to DataFrame
portfolio_df = pd.DataFrame(portfolio_history)
portfolio_df

2018-05-14 00:00:00 to 2018-12-31 00:00:00
[[1.31211544e-04 1.04750214e-04 7.60304744e-05]
 [1.04750214e-04 2.10693167e-04 1.13415197e-04]
 [7.60304744e-05 1.13415197e-04 1.08918244e-04]]
Ticker
XLF   -0.000975
XLK   -0.000679
XLV    0.000332
dtype: float64
OrderedDict([('XLF', 3.2049378106392735e-16), ('XLK', 0.0), ('XLV', 1.0)])
2018-05-15 00:00:00 to 2019-01-02 00:00:00
[[1.31766281e-04 1.04768955e-04 7.51426816e-05]
 [1.04768955e-04 2.10712820e-04 1.13219523e-04]
 [7.51426816e-05 1.13219523e-04 1.10148963e-04]]
Ticker
XLF   -0.000918
XLK   -0.000675
XLV    0.000196
dtype: float64
OrderedDict([('XLF', 0.0), ('XLK', 3.885780586188048e-16), ('XLV', 0.9999999999999998)])


Unnamed: 0,date,cash,portfolio_value,lookback,w_c,w_XLF,w_XLK,w_XLV
0,2019-01-02,16.471977,100000.0,160,0.000165,0.0,0.0,0.999835
1,2019-01-03,16.471977,97969.818573,160,0.000168,0.0,0.0,0.999832


In [29]:
plt.figure(figsize=(15, 5))
sns.lineplot(final_df, x='date', y='portfolio_value', hue='lookback')
plt.show()

NameError: name 'final_df' is not defined

<Figure size 1500x500 with 0 Axes>