# Index Tracking with Gurobi

This Python notebook is part of the webinar [Proven Techniques for Solving Financial Problems with Gurobi](https://www.gurobi.com/events/proven-techniques-for-solving-financial-problems-with-gurobi/).

The sequence of python code will:
1. Import stock data from yahoo finance
2. Clean up the data and change format
3. Perform an index tracking experiment

## Importing Data from YFinance

- Adjusted Stock price data for SP100 constitutents 
- Data from 2010 to 2022

In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os
from datetime import datetime
from curl_cffi import requests
from io import StringIO

session = requests.Session(impersonate="chrome")
ticker = yf.Ticker('...', session=session)

In [2]:
# Options
FIRST_DATE  = "2020-01-01"
LAST_DATE   = "2025-01-01"
N_PROCESSES = 10
MKT_INDEX   = "^SP100" # ^GSPC for SP500 or ^SP100 
#MKT_INDEX   = "^GSPC"

In [3]:
url = 'https://en.wikipedia.org/wiki/S%26P_100'

header = {
  "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
  "X-Requested-With": "XMLHttpRequest"
}

r = requests.get(url, headers=header)

sp_assets = pd.read_html(StringIO(r.text))[2]

assets = sp_assets['Symbol'].str.replace('.', '-').tolist()

assets.append('^SP100')

data = yf.download(assets, start=FIRST_DATE, end=LAST_DATE)["Close"]

data = data.reset_index()

df = data.melt(id_vars=['Date'], var_name='Ticker', value_name='Price')

df.to_csv("df.csv")

[************          26%                       ]  27 of 102 completed

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  102 of 102 completed

102 Failed downloads:
['TGT', 'COF', 'GOOGL', 'MET', 'FDX', 'BLK', 'COST', 'NEE', 'TMUS', 'PFE', 'T', 'MDT', 'LLY', 'NFLX', 'BRK-B', 'META', 'PYPL', 'COP', 'GD', 'SCHW', 'QCOM', 'BKNG', 'HD', 'JPM', 'RTX', 'AMZN', 'INTC', 'JNJ', 'ADBE', 'KO', 'SBUX', 'CMCSA', 'AAPL', 'AXP', 'NKE', 'AMT', 'DHR', 'BMY', 'AIG', 'NOW', 'MMM', 'MDLZ', 'WMT', 'TSLA', 'MRK', 'CSCO', 'WFC', 'BK', 'LOW', 'USB', 'DE', 'UPS', 'XOM', 'CRM', 'PM', 'INTU', 'MS', 'SPG', 'TMO', 'GOOG', 'TXN', 'ORCL', 'DUK', 'HON', 'LIN', 'AMGN', 'UNH', 'MCD', 'ABBV', 'ACN', 'AVGO', 'MSFT', 'AMD', 'NVDA', 'BAC', '^SP100', 'ABT', 'DIS', 'VZ', 'GM', 'BA', 'EMR', 'ISRG', 'LMT', 'MA', 'PEP', 'MO', 'CVS', 'CVX', 'UBER', 'CAT', 'PLTR', 'UNP', 'IBM', 'PG', 'V', 'SO', 'C', 'GE', 'GILD', 'GS', 'CL']: AttributeError("'str' object has no attribute 'name'")


In [4]:
df.dropna(inplace=True)

## Cleaning and Splitting the Data

In [None]:
from sklearn.model_selection import train_test_split

%load_ext autoreload
%autoreload 2

THRESH_VALID_DATA = 0.95 # defines where to cut stocks with missing data
PERC_SIZE_TRAIN = 0.75   # defines the size of train dataset (in %)

df_ret, df_train, df_test  = clean_data(
    df_prices, 
    MKT_INDEX,
    thresh_valid_data = THRESH_VALID_DATA,
    size_train = PERC_SIZE_TRAIN
)

df_train.to_parquet("data/ret-data-cleaned-TRAIN.parquet")
df_test.to_parquet("data/ret-data-cleaned-TEST.parquet")

In [5]:
df_wide = df.pivot(index = 'Date', columns = 'Ticker', values = 'Price')

In [None]:
#uma forma de padronizar os dados, nao sei se eh necessario
#std_df_wide = (df_wide-df_wide.mean())/df_wide.std()

## Unconstrained Index Tracking

$
\begin{array}{llll}
  & \min              & \frac{1}{T} \; \sum_{t = 1}^{T} \left(\sum_{i = 1}^{I} \; w_{i} \: \times \: r_{i,t} - R_{t}\right)^2 \\
  & \text{subject to} &   \sum_{i = 1}^{I} w_{i}  = 1  \\
  &                   & w_i \geq 0 \\
\end{array}
$



$
\begin{array}{lll}
& where: \\
& \\
& w_i  &: \text{Weight of asset i in index} \\
& R_{t} &: \text{Returns of tracked index (e.g. SP500) at time t} \\
& r_{i,j} &: \text{Return of asset i at time t}
\end{array}
$

In [6]:
import gurobipy as gp
from random import sample, seed

seed(20220209) # reproducibility

mkt_index = "^SP100"
n_assets = 20

# data from main notebook
r_it = df_wide

r_it.dropna(inplace=True)

r_mkt = r_it[mkt_index]

r_it = r_it.drop(mkt_index, axis = 1)

tickers = list(r_it.columns)

sampled_tickers = sample(tickers, n_assets)

r_it = r_it[sampled_tickers]

print(r_it.head())

KeyError: '^SP100'

# Setup opt problem and solve

In [9]:
# Create an empty model
m = gp.Model('gurobi_index_tracking')

# PARAMETERS 

# w_i: the i_th stock gets a weight w_i
w = pd.Series(m.addVars(sampled_tickers, 
                         lb = 0,
                         ub = 1,
                         vtype = gp.GRB.CONTINUOUS), 
               index=sampled_tickers)

# CONSTRAINTS

# sum(w_i) = 1: portfolio budget constrain (long only)
m.addConstr(w.sum() == 1, 'port_budget')

m.update()

# eps_t = R_{i,t}*w - R_{M,t}
my_error = r_it.dot(w) - r_mkt

# set objective function
m.setObjective(
    gp.quicksum(my_error.pow(2)), 
    gp.GRB.MINIMIZE)     

# Optimize model
m.setParam('OutputFlag', 0)
m.optimize()

w_hat  = [i.X for i in m.getVars()]

print(f"Solution:") 

for i, i_ticker in enumerate(sampled_tickers):
    print(f"{i_ticker}:\t {w_hat[i]*100:.2f}%")

# check constraints
print(f"\nchecking constraints:")
print(f"sum(w) = {np.sum(w_hat)}")

Restricted license - for non-production use only - expires 2026-11-23
Solution:
RTX:	 0.00%
ABT:	 0.00%
AMD:	 0.00%
MS:	 0.00%
MA:	 0.00%
INTU:	 0.00%
SBUX:	 0.00%
HD:	 0.00%
LLY:	 0.00%
MDLZ:	 0.00%
PFE:	 0.00%
SPG:	 0.00%
UNH:	 0.00%
GD:	 0.00%
MO:	 0.00%
AVGO:	 0.00%
BLK:	 100.00%
IBM:	 0.00%
GOOG:	 0.00%
BAC:	 0.00%

checking constraints:
sum(w) = 1.0000000000001181


In [None]:
# check out of sample plot
import matplotlib.pyplot as plt

# aqui coloquei qualquer coisa so pra testar
df_test = df_wide

print(df_test.columns)
print(sampled_tickers)
df_test_mkt = df_test[mkt_index]

r_hat = df_test[sampled_tickers].dot(w_hat)

cumret_r = np.cumprod(1+ r_hat)
cumret_mkt = np.cumprod(1+ df_test_mkt)

fig, ax = plt.subplots()
ax.plot(cumret_mkt.index,
        cumret_mkt, 
       label = mkt_index)

ax.plot(cumret_r.index,
        cumret_r,
       label = f"ETF ({n_assets} assets)")

ax.legend()
ax.set_title(f'ETF and {mkt_index}')
ax.set_xlabel('')
ax.set_ylabel('Cumulative Returns')

plt.xticks(rotation = 90)

plt.show()

NameError: name 'df_wide' is not defined

## Constrained Index Tracking

$
\begin{array}{llll}
  & \min              & \frac{1}{T} \; \sum_{t = 1}^{T} \left(\sum_{i = 1}^{I} \; w_{i} \: \times \: r_{i,t} - R_{t}\right)^2 \\
  & \text{subject to} &   \sum_{i = 1}^{I} w_{i}  = 1  \\
  &                   &   \sum_{i = 1}^{I} z_{i} \leq K \\
  &                   & w_i \geq 0 \\
  &                   & z_i \in {0, 1}
\end{array}
$

  

$
\begin{array}{lllll}
& where: \\
& \\
& w_i  &: \text{Weight of asset i in index} \\
& z_i &: \text{Binary variable (0, 1) that decides wheter asset i is in portfolio} \\
& R_{t} &: \text{Returns of tracked index (e.g. SP500) at time t} \\
& r_{i,j} &: \text{Return of asset i at time t}
\end{array}
$

In [30]:
# Create an empty model
m = gp.Model('gurobi_index_tracking')

# PARAMETERS 

max_assets = 10

# w_i: the i_th stock gets a weight w_i
w = pd.Series(m.addVars(sampled_tickers, 
                         lb = 0,
                         ub = 0.2,
                         vtype = gp.GRB.CONTINUOUS), 
               index=sampled_tickers)

# [NEW] z_i: the i_th stock gets a binary z_i
z = pd.Series(m.addVars(sampled_tickers,
                        vtype = gp.GRB.BINARY),
                index=sampled_tickers)

# CONSTRAINTS

# sum(w_i) = 1: portfolio budget constrain (long only)
m.addConstr(w.sum() == 1, 'port_budget')

# [NEW]  w_i <= z_i: restrictions of values of w_i so take it chose particular tickers
for i_ticker in sampled_tickers:
    m.addConstr(w[i_ticker] <= z[i_ticker], 
                f'dummy_restriction_{i_ticker}')

# [NEW] sum(z_i) <= max_assets: number of assets constraint
m.addConstr(z.sum() <= max_assets, 'max_assets_restriction')

m.update()

# eps_t = R_{i,t}*w - R_{M,t}
my_error = r_it.dot(w) - r_mkt

# set objective function
m.setObjective(
    gp.quicksum(my_error.pow(2)), 
    gp.GRB.MINIMIZE)     

# Optimize model
m.setParam('OutputFlag', 0)
m.setParam('TimeLimit', 60*5) # in secs
#m.setParam('MIPGap', 0.05) # in secs
m.optimize()

params = [i.X for i in m.getVars()]

n_assets = len(sampled_tickers)
w_hat = params[0:n_assets]
z_hat = params[n_assets:]
MIPGap = m.getAttr('MIPGap')
status = m.getAttr("Status")

print(f"Solution for w:") 

for i, i_ticker in enumerate(sampled_tickers):
    print(f"{i_ticker}:\t {w_hat[i]*100:.2f}%")

# check constraints
print(f"\nchecking constraints:")
print(f"sum(w) = {np.sum(w_hat)}")
print(f"sum(z) = {np.sum(z_hat)}")
print(f"w <= z = {w_hat <= z_hat}")
print(f"MIPGap={MIPGap}")
print(f"Status={status}")

Solution for w:
QCOM:	 2.82%
ABT:	 0.00%
AMD:	 8.52%
MRK:	 0.00%
LOW:	 0.00%
INTU:	 8.67%
RTX:	 0.00%
HD:	 0.00%
LIN:	 0.00%
MCD:	 0.00%
PEP:	 0.00%
SO:	 6.76%
TXN:	 0.69%
GD:	 0.00%
MMM:	 0.00%
AVGO:	 20.00%
BLK:	 18.60%
XOM:	 0.00%
IBM:	 13.94%
GOOG:	 20.00%

checking constraints:
sum(w) = 1.0
sum(z) = 9.0
w <= z = True
MIPGap=0.0
Status=2


In [31]:
# check out of sample plot
import matplotlib.pyplot as plt

df_test = pd.read_parquet("data/ret-data-cleaned-TEST.parquet")

print(df_test.columns)
print(sampled_tickers)
df_test_mkt = df_test[mkt_index]

r_hat = df_test[sampled_tickers].dot(w_hat)

cumret_r = np.cumprod(1+ r_hat)
cumret_mkt = np.cumprod(1+ df_test_mkt)

fig, ax = plt.subplots()
ax.plot(cumret_mkt.index,
        cumret_mkt, 
       label = mkt_index)

ax.plot(cumret_r.index,
        cumret_r,
       label = f"ETF ({n_assets} assets)")

ax.legend()
ax.set_title(f'ETF and {mkt_index}')
ax.set_xlabel('')
ax.set_ylabel('Cumulative Returns')

plt.xticks(rotation = 90)

plt.show()

ImportError: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.