In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.covariance import LedoitWolf

import pypfopt.objective_functions as objective_functions
from pypfopt.expected_returns import mean_historical_return
from pypfopt.efficient_frontier import EfficientFrontier

In [3]:
sector_tickers = [
    "XLF",  # Financials
    "XLK",  # Technology
    "XLV",  # Health Care
    "XLY",  # Consumer Discretionary
    "XLP",  # Consumer Staples
    "XLE",  # Energy
    "XLI",  # Industrials
    "XLU",  # Utilities
    "XLB",  # Materials
    "XLRE",  # Real Estate
    "XLC",  # Communication Services
]

In [4]:
# import data from csv
df_ret = pd.read_parquet("../data/returns.parquet")
df_prices = pd.read_parquet("../data/prices.parquet")
df_vol = pd.read_parquet("../data/vola.parquet")

In [None]:
# DONE : deal with NaN values for tickers
# XLC was created June 2018
# XLRE was created Oct 2015 ( as split off from XLF )
# so during training, check if any of the tickers are NaN
# if yes then set their weight to 0 and dont include them for cov mat and optimization

# DONE : understand why solver sometimes fails
# maybe write own solver
# maybe eigenvals too small therefore unstable
# why does mu < riskfree rate lead to problems ?

In [None]:
# mean variance optimization
# based on paper by Sood et al. (2023):

# Parameters
lookback = 60
initial_cash = 100_000
start_date = pd.to_datetime("2019-01-01")
end_date = pd.to_datetime("2020-01-01")

# define daterange from start to end date
date_range = pd.bdate_range(start=start_date, end=end_date)

# Initialize portfolio
portfolio_value = initial_cash
portfolio_history = []

cash = initial_cash
shares = {t: 0 for t in sector_tickers}

for eval_date in date_range:
    if eval_date not in df_ret.index:
        print(f"Date {eval_date} not in data, skipping.")
        continue

    ret_idx = df_ret.index.get_loc(eval_date)
    prices_idx = df_prices.index.get_loc(eval_date)

    if ret_idx < lookback:
        print(f"Not enough data for {eval_date}, skipping.")
        continue

    return_window = df_ret.iloc[ret_idx - lookback : ret_idx]
    prices_window = df_prices.iloc[prices_idx - lookback : prices_idx]

    nan_tickers = []
    for ticker in sector_tickers:
        if return_window[ticker].isna().any():
            nan_tickers.append(ticker)

    if len(nan_tickers) > 0:
        print(f"NaN values for {eval_date}: {nan_tickers}")
        return_window = return_window.drop(columns=nan_tickers)
        prices_window = prices_window.drop(columns=nan_tickers)
        valid_sectors = [t for t in sector_tickers if t not in nan_tickers]
    else:
        valid_sectors = sector_tickers

    # update portfolio value with current prices
    prices = df_prices.iloc[prices_idx].to_dict()
    if len(portfolio_history) > 0:
        portfolio_value = sum([shares[t] * prices[t] for t in valid_sectors]) + cash

    # Estimate covariance using Ledoit-Wolf shrinkage
    lw = LedoitWolf()
    lw.fit(return_window)
    cov_matrix = lw.covariance_

    # Fix potential negative eigenvalues (make PSD)
    eigvals, eigvecs = np.linalg.eigh(cov_matrix)
    eigvals[eigvals < 0] = 0
    cov_psd = eigvecs @ np.diag(eigvals) @ eigvecs.T

    # estimate expected returns over lookback period as simple average
    mu = return_window.mean()
    # check if all of mu are below 0, if yes report error
    if (mu < 0).all():
        print(f"All expected returns are negative for {eval_date}")
        # continue

    # NOTE on negative expected returns :
    # (https://github.com/robertmartin8/PyPortfolioOpt/issues/88#issuecomment-615533265)
    # you should think about whether your expected returns are a good estimate of future returns
    # if they aren't, then the max_sharpe portfolio will be pretty useless!

    # Calculate optimal weights by optimizing Sharpe ratio
    ef = EfficientFrontier(mu, cov_psd)

    # ALTERNATIVE #1 using max_sharpe
    # try:
    #     weights_raw = ef.max_sharpe(risk_free_rate=0)  # ordered dict
    # except Exception as e:
    #     # error probably because the optimisation problem is not convex
    #     print("= " * 20)
    #     print(f"Error in max_sharpe for {eval_date}: {e}")
    #     print("mu", mu)
    #     print("cov_psd", cov_psd)
    #     print("eigvals", eigvals)
    #     print("= " * 20)
    #     continue

    # ALTERNATIVE #2 ( see https://github.com/robertmartin8/PyPortfolioOpt/issues/88 )
    w_min, w_max = 0, 1
    weights_raw = ef.nonconvex_objective(
        objective_functions.sharpe_ratio,
        objective_args=(ef.expected_returns, ef.cov_matrix),
        constraints=[
            {"type": "eq", "fun": lambda w: np.sum(w) - 1}, # sum to 1
            {"type": "ineq", "fun": lambda w: w - w_min,}, # larger than min
            {"type": "ineq", "fun": lambda w: w_max - w}, # smaller than max
        ],
    )

    # whole shares only
    asset_cash = {t: weights_raw[t] * portfolio_value for t in valid_sectors}
    shares = {t: np.floor(asset_cash[t] / prices[t]) for t in valid_sectors}
    # calc rebalanced weights to compare with DRL agents later
    weights = {t: shares[t] * prices[t] / portfolio_value for t in valid_sectors}
    # rest ist cash
    cash = portfolio_value - np.sum([shares[t] * prices[t] for t in valid_sectors])
    w_c = cash / portfolio_value

    # save portfolio history
    portfolio_history.append(
        {"date": eval_date, "cash": cash, "portfolio_value": portfolio_value}
    )

# Convert to DataFrame
portfolio_df = pd.DataFrame(portfolio_history)
portfolio_df

Date 2019-01-01 00:00:00 not in data, skipping.
All expected returns are negative for 2019-01-03 00:00:00
All expected returns are negative for 2019-01-04 00:00:00
All expected returns are negative for 2019-01-07 00:00:00
All expected returns are negative for 2019-01-08 00:00:00
Date 2019-01-21 00:00:00 not in data, skipping.
Date 2019-02-18 00:00:00 not in data, skipping.
Date 2019-04-19 00:00:00 not in data, skipping.
Date 2019-05-27 00:00:00 not in data, skipping.
Date 2019-07-04 00:00:00 not in data, skipping.
Date 2019-09-02 00:00:00 not in data, skipping.
Date 2019-11-28 00:00:00 not in data, skipping.
Date 2019-12-25 00:00:00 not in data, skipping.
Date 2020-01-01 00:00:00 not in data, skipping.


Unnamed: 0,date,cash,portfolio_value
0,2019-01-02,287.006510,100000.000000
1,2019-01-03,347.556498,98172.411026
2,2019-01-04,245.380980,101204.601385
3,2019-01-07,228.984713,101890.654400
4,2019-01-08,279.667929,102960.902676
...,...,...,...
247,2019-12-24,364.952177,127947.972797
248,2019-12-26,347.236746,128495.880484
249,2019-12-27,336.457146,128471.505199
250,2019-12-30,221.936064,127906.519566
