# Setup

In [34]:
import talib as ta
import pandas as pd
import numpy as np
import yfinance as yf
import matplotlib.pyplot as plt
import pickle
from tiingo import TiingoClient
import financedatabase as fd
import datetime
import financetoolkit
from utils.api_keys import API_KEYS
import zipfile
import requests

# Tiingo

We use Tiingo's [list of supported tickers](https://www.tiingo.com/documentation/end-of-day#:~:text=supported_tickers.zip) to select ETFs that are still listed on one of the following exchanges:
*   Nasdaq
*   NYSE
*   BATS

The ETFs must have started trading on or before 1 January 2018 and still be active.

This selection of ETFs resulted in a list of 1789 ETFs.

In [9]:
# URL of Tiingo's supported-tickers archive and the local path it is saved to
tiingo_zip_url = 'https://apimedia.tiingo.com/docs/tiingo/daily/supported_tickers.zip'
tiingo_zip_path = 'supported_tickers.zip'

# Download the zip file. Fail fast on an HTTP error so that an error page is
# never written to disk as if it were the archive, and bound the wait with a
# timeout so the cell cannot hang indefinitely.
response = requests.get(tiingo_zip_url, timeout=60)
response.raise_for_status()
with open(tiingo_zip_path, 'wb') as f:
    f.write(response.content)

# Unzip the archive into the current directory
with zipfile.ZipFile(tiingo_zip_path, 'r') as zip_ref:
    zip_ref.extractall('.')

# Read the CSV directly from the archive (pandas infers zip compression from
# the '.zip' extension)
supported_tickers_df = pd.read_csv(tiingo_zip_path)

# Keep only ETFs
etfs_df = supported_tickers_df[supported_tickers_df['assetType'] == 'ETF']
# Keep only tickers that are not delisted: endDate on or after 2025-04-23
# NOTE(review): the cutoff is hard-coded -- presumably the date the file was
# downloaded for this study; confirm before re-running with a fresh file.
etfs_df = etfs_df[etfs_df['endDate'] >= '2025-04-23']
# Keep only tickers whose startDate is on or before 2018-01-01
etfs_df = etfs_df[etfs_df['startDate'] <= '2018-01-01']
# Keep only tickers traded on NYSE (incl. ARCA and MKT), NASDAQ or BATS
etfs_df = etfs_df[etfs_df['exchange'].isin(['NYSE', 'NYSE ARCA', 'NYSE MKT', 'NASDAQ', 'BATS'])]

# Extract the selected tickers as a plain list
etf_tickers = etfs_df['ticker'].tolist()
# Display the first 10 tickers
print("First 10 ETF tickers:")
print(etf_tickers[:10])


First 10 ETF tickers:
['AADR', 'AAVM', 'AAXJ', 'ABFL', 'ABLG', 'ACP', 'ACSI', 'ACV', 'ACWI', 'ACWV']


In [16]:
# Number of ETF tickers that passed the filters above
len(etf_tickers)

1789

# Sector identification with Finance Database

We will use the [Finance Database](https://github.com/JerBouma/FinanceDatabase) package to identify the sector and industry of the ETFs, because Tiingo provides this information only for stocks.

In [22]:
# Initialize the financedatabase ETFs dataset
etfs = fd.ETFs()
# Table of ETF metadata; in the output below it is indexed by `symbol` with
# name, currency, summary, category_group, category, family and exchange columns
etfs_data = etfs.data
# Display the table as the cell output
etfs_data

Unnamed: 0_level_0,name,currency,summary,category_group,category,family,exchange
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
^ACWI,ISHARES TRUST,USD,The iShares MSCI ACWI ETF seeks to track the i...,Financials,Developed Markets,BlackRock Asset Management,NIM
^ADFI-IV,NFIELD DYNAMIC FIXED INCOME ETF,USD,The NFIELD DYNAMIC FIXED INCOME ETF (ADFI) is ...,Fixed Income,Corporate Bonds,,ASE
^ADRE,INVESCO ACTIVELY M,USD,The Invesco Active REIT ETF is an actively man...,Real Estate,REITs,Invesco Investment Management,NIM
^ARB-EU,ALTSHARES MERGER ARBITRAGE ETF,USD,The ALTSHARES MERGER ARBITRAGE ETF seeks capit...,Alternatives,,AltShares,ASE
^ARB-IV,ALTSHARES MERGER ARBITRAGE ETF,USD,The ALTSHARES MERGER ARBITRAGE ETF seeks capit...,Alternatives,,AltShares,ASE
...,...,...,...,...,...,...,...
VGFPF,Vanguard Funds Public Limited Company - Vangua...,,The Vanguard S&P 500 UCITS ETF (USD) Accumulat...,Equities,,,
VFDEF,Vanguard Funds Public Limited Company - Vangua...,,VFDEF is an exchange-traded fund (ETF) that ai...,Equities,,,
WSDMF,WisdomTree Issuer ICAV - WisdomTree Europe Equ...,,The WisdomTree Issuer ICAV - WisdomTree Europe...,Equities,,,
WDSSF,WisdomTree Issuer ICAV - WisdomTree US Quality...,,The WisdomTree Issuer ICAV - WisdomTree US Qua...,Equities,,,


In [30]:

# Build dictionary for sector, industry and category
etf_sector_info = {}

# For each selected ticker, look up its name, sector and industry in financedatabase
enriched_etfs = []
for ticker in etf_tickers:
    # financedatabase may list the symbol as-is or with a suffix (e.g. '.US')
    if ticker in etfs_data.index:
        etf_info = etfs_data.loc[ticker]
    else:
        alt_ticker = ticker + '.US'
        # Membership has to be tested against the index: `x in DataFrame`
        # checks the column labels, so the fallback could never match before.
        etf_info = etfs_data.loc[alt_ticker] if alt_ticker in etfs_data.index else None

    if etf_info is not None:
        # Same record layout for both lookup paths; missing fields become 'N/A'
        enriched_etfs.append({
            'ticker': ticker,
            'name': etf_info.get('name', 'N/A'),
            'sector': etf_info.get('category_group', 'N/A'),
            'industry': etf_info.get('category', 'N/A')
        })
    else:
        # Ticker not found in financedatabase, add a placeholder
        enriched_etfs.append({
            'ticker': ticker,
            'name': 'N/A',
            'sector': 'N/A',
            'industry': 'N/A',
            'category': 'N/A'
        })

# Build a DataFrame from the enriched ETFs list
enriched_etfs_df = pd.DataFrame(enriched_etfs)

# Keep only ETFs whose sector or industry contains 'Technology'
# (na=False treats missing values as non-matching)
tech_etfs_df = enriched_etfs_df[
    (enriched_etfs_df['sector'].str.contains('Technology', na=False)) |
    (enriched_etfs_df['industry'].str.contains('Technology', na=False))
].reset_index(drop=True)

# Display the result
print(f"Tech ETFs found: {len(tech_etfs_df)}")
tech_etfs_df.head()

Tech ETFs found: 75


Unnamed: 0,ticker,name,sector,industry,category
0,ARKK,ARK Innovation ETF,Information Technology,Factors,
1,ARKQ,ARK Autonomous Technology & Robotics ETF,Industrials,Technology,
2,ARKW,ARK Next Generation Internet ETF,Information Technology,Technology,
3,CIBR,First Trust NASDAQ Cybersecurity ETF,Information Technology,Technology,
4,CQQQ,Invesco China Technology ETF,Information Technology,Emerging Markets,


In [32]:
# Plain list of the tickers selected as technology ETFs
tech_etfs_list = tech_etfs_df['ticker'].tolist()

# Importing Data from Tiingo

With our asset universe of 75 ETFs constructed, we will use the Tiingo API to download the historical data for these ETFs.

In [35]:
# Build the Tiingo client from the API key stored in the project's key file
tiingo_api = API_KEYS['tiingo']
config = {'session': True, 'api_key': tiingo_api}
tiingo_client = TiingoClient(config)

# Date range and sampling frequency of the requested price history
start_date = '2018-01-01'
end_date = '2025-01-01'
frequency = 'daily'

# Download one price DataFrame per ETF; a failed ticker is reported and skipped
etf_data = {}
for ticker in tech_etfs_list:
    try:
        data = tiingo_client.get_dataframe(
            ticker,
            startDate=start_date,
            endDate=end_date,
            frequency=frequency,
        )
    except Exception as e:
        print(f"Error fetching data for {ticker}: {e}")
    else:
        etf_data[ticker] = data

# Persist the downloaded data to a pickle file
with open('etf_data.pkl', 'wb') as f:
    pickle.dump(etf_data, f)


In [78]:
# Reload the downloaded price history from the pickle file
with open('etf_data.pkl', 'rb') as f:
    etf_historical = pickle.load(f)
# Show the first 5 rows of the first ETF stored in the dictionary
list(etf_historical.values())[0].head()

Unnamed: 0_level_0,close,high,low,open,volume,adjClose,adjHigh,adjLow,adjOpen,adjVolume,divCash,splitFactor
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2018-01-02 00:00:00+00:00,38.58,38.59,37.247,37.34,420381,36.133634,36.143,34.88516,34.972263,420381,0.0,1.0
2018-01-03 00:00:00+00:00,38.74,39.053,38.73,38.95,667492,36.283489,36.576641,36.274123,36.480173,667492,0.0,1.0
2018-01-04 00:00:00+00:00,38.87,39.19,38.48,39.1,358595,36.405245,36.704954,36.039975,36.620661,358595,0.0,1.0
2018-01-05 00:00:00+00:00,39.5,39.5,39.07,39.26,361640,36.995297,36.995297,36.592563,36.770515,361640,0.0,1.0
2018-01-08 00:00:00+00:00,39.23,39.23,38.32,39.01,421467,36.742418,36.742418,35.890121,36.536368,421467,0.0,1.0


# Adding the Technical Indicators

We will populate the data with technical analysis features as in Roychoudhury (2021).

To limit the number of features, we compute the technical indicators over Fibonacci periods.
We also add volatility ratios to the data.

In [None]:
def add_technical_indicators_MAX(df):
    """
    Adds technical indicators to a price DataFrame using Fibonacci periods.

    All indicator columns are collected in a dict and joined to the prices in
    a single concat, which limits DataFrame fragmentation.

    Parameters
    ----------
    df : pandas.DataFrame
        Price data with at least the 'close', 'high', 'low' and 'volume'
        columns.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame holding the columns of ``df`` followed by the
        indicator columns: RSI, MFI, ADX, MACD / MACD_signal / MACD_hist,
        SlowK / SlowD, SMA_n / EMA_n, ATR_n, VolRatio_s_l, Return_nd,
        AvgReturn_nd, OBV and Volume_SMA_13. ``df`` itself is not modified.
    """
    df = df.copy()
    results = {}

    high, low, close, volume = df['high'], df['low'], df['close'], df['volume']

    # Fibonacci periods for technical indicators
    fib_periods = [1, 2, 3, 5, 8, 13, 21, 34, 55, 89]
    # Period 1 is only used for the plain N-day return; the moving averages,
    # ATR and average returns are computed for periods >= 2 only
    windowed_periods = [period for period in fib_periods if period >= 2]

    atr_cache = {}

    def _atr(period):
        """Return the ATR series for `period`, computing it at most once."""
        if period not in atr_cache:
            atr_cache[period] = ta.ATR(high, low, close, timeperiod=period)
        return atr_cache[period]

    # --- BASE INDICATORS ---
    results['RSI'] = ta.RSI(close, timeperiod=14)
    results['MFI'] = ta.MFI(high, low, close, volume, timeperiod=13)
    results['ADX'] = ta.ADX(high, low, close, timeperiod=13)

    # --- OSCILLATORS AND TREND ---
    # MACD with Fibonacci periods
    macd, macdsignal, macdhist = ta.MACD(close, fastperiod=13, slowperiod=21, signalperiod=8)
    results['MACD'] = macd
    results['MACD_signal'] = macdsignal
    results['MACD_hist'] = macdhist

    # Stochastic oscillator
    slowk, slowd = ta.STOCH(high, low, close, fastk_period=13, slowk_period=3, slowd_period=3)
    results['SlowK'] = slowk
    results['SlowD'] = slowd

    # --- MOVING AVERAGES ---
    # SMA and EMA on Fibonacci periods
    for period in windowed_periods:
        results[f'SMA_{period}'] = ta.SMA(close, timeperiod=period)
        results[f'EMA_{period}'] = ta.EMA(close, timeperiod=period)

    # --- VOLATILITY ---
    # ATR on Fibonacci periods
    for period in windowed_periods:
        results[f'ATR_{period}'] = _atr(period)

    # Volatility ratios (short term / long term); the ATR series computed
    # above are reused instead of being recomputed for every pair
    volatility_pairs = [(3, 13), (5, 21), (8, 34), (13, 55), (21, 89)]
    for short_period, long_period in volatility_pairs:
        short_atr = _atr(short_period)
        long_atr = _atr(long_period)
        with np.errstate(divide='ignore', invalid='ignore'):  # Ignore division by zero
            results[f'VolRatio_{short_period}_{long_period}'] = np.where(long_atr != 0, short_atr / long_atr, np.nan)

    # --- RETURNS ---
    # Daily returns, used as the input of the rolling averages
    returns = close.pct_change()

    for period in fib_periods:
        # Return over the last N days
        results[f'Return_{period}d'] = close.pct_change(periods=period)

        # Average daily return over the last N days (skipped for a window of 1)
        if period > 1:
            results[f'AvgReturn_{period}d'] = returns.rolling(window=period).mean()

    # --- VOLUME ---
    results['OBV'] = ta.OBV(close, volume)
    results['Volume_SMA_13'] = ta.SMA(volume, timeperiod=13)

    # Join all results in a single operation (avoids fragmentation)
    indicators_df = pd.DataFrame(results, index=df.index)
    result_df = pd.concat([df, indicators_df], axis=1)

    return result_df

In [68]:
def add_technical_indicators(df):
    """
    Append a reduced set of technical indicators to a price DataFrame.

    The feature set is deliberately small (intended for clustering): RSI, ADX,
    MACD and its signal line, SMAs and N-day returns over a few Fibonacci
    periods, two ATRs, one short/long ATR ratio and OBV.

    Reads the 'close', 'high', 'low' and 'volume' columns of ``df`` and returns
    a new DataFrame made of the original columns followed by the indicators.
    """
    prices = df.copy()
    close = prices['close']
    high = prices['high']
    low = prices['low']
    volume = prices['volume']

    # Reduced set of Fibonacci periods shared by the SMAs and the returns
    key_periods = (2, 5, 13, 21, 55)

    # MACD with a single parameter set; the histogram output is not kept
    macd_line, macd_signal_line, _ = ta.MACD(close, fastperiod=12, slowperiod=26, signalperiod=9)

    # Short-term over long-term ATR; NaN wherever the long-term ATR equals zero
    atr_short = ta.ATR(high, low, close, timeperiod=5)
    atr_long = ta.ATR(high, low, close, timeperiod=21)
    with np.errstate(divide='ignore', invalid='ignore'):
        vol_ratio_5_21 = np.where(atr_long != 0, atr_short / atr_long, np.nan)

    # Insertion order below fixes the order of the indicator columns
    features = {
        'RSI': ta.RSI(close, timeperiod=14),
        'ADX': ta.ADX(high, low, close, timeperiod=14),
        'MACD': macd_line,
        'MACD_signal': macd_signal_line,
    }
    features.update({f'SMA_{n}': ta.SMA(close, timeperiod=n) for n in key_periods})
    features['ATR_13'] = ta.ATR(high, low, close, timeperiod=13)
    features['ATR_55'] = ta.ATR(high, low, close, timeperiod=55)
    features['VolRatio_5_21'] = vol_ratio_5_21
    features.update({f'Return_{n}d': close.pct_change(periods=n) for n in key_periods})
    features['OBV'] = ta.OBV(close, volume)

    # Single concat so the frame is assembled in one operation
    return pd.concat([prices, pd.DataFrame(features, index=prices.index)], axis=1)

In [79]:
# Compute the technical indicators for every ETF loaded from the pickle
etf_data_with_indicators = {}
for ticker in etf_historical:
    data = etf_historical[ticker]
    # New DataFrame: original price columns plus the indicator columns
    enriched_data = add_technical_indicators(data)
    etf_data_with_indicators[ticker] = enriched_data

# Persist the enriched data to its own pickle file
with open('etf_data_with_indicators.pkl', 'wb') as f:
    pickle.dump(etf_data_with_indicators, f)

In [80]:
# Load the enriched data (prices + technical indicators) from the pickle file
with open('etf_data_with_indicators.pkl', 'rb') as f:
    etf_historical_with_indicators = pickle.load(f)
# NOTE(review): this displays the whole dict, i.e. the repr of every ETF's
# DataFrame; showing a single DataFrame's .head() would keep the output short.
etf_historical_with_indicators



{'ARKK':                            close    high     low   open   volume   adjClose  \
 date                                                                          
 2018-01-02 00:00:00+00:00  38.58  38.590  37.247  37.34   420381  36.133634   
 2018-01-03 00:00:00+00:00  38.74  39.053  38.730  38.95   667492  36.283489   
 2018-01-04 00:00:00+00:00  38.87  39.190  38.480  39.10   358595  36.405245   
 2018-01-05 00:00:00+00:00  39.50  39.500  39.070  39.26   361640  36.995297   
 2018-01-08 00:00:00+00:00  39.23  39.230  38.320  39.01   421467  36.742418   
 ...                          ...     ...     ...    ...      ...        ...   
 2024-12-24 00:00:00+00:00  60.96  60.990  59.440  59.63  4801695  60.960000   
 2024-12-26 00:00:00+00:00  60.96  61.135  60.040  60.72  6888985  60.960000   
 2024-12-27 00:00:00+00:00  59.27  60.600  58.440  60.54  9592628  59.270000   
 2024-12-30 00:00:00+00:00  57.61  58.230  56.900  58.01  8456884  57.610000   
 2024-12-31 00:00:00+00:00  56.7