In [9]:
import yfinance as yf
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import linkage, fcluster


def fetch_price_data(tickers, start_date, end_date):
    data = yf.download(tickers, start=start_date, end=end_date)
    return data['Adj Close']


def calculate_daily_returns(prices):
    returns = prices.pct_change(fill_method=None).dropna()
    return returns


def compute_correlation_matrix(returns):
    correlation_matrix = returns.corr()
    return correlation_matrix


def select_least_correlated_assets(correlation_matrix, num_assets):
    
    distance_matrix = np.sqrt(0.5 * (1 - correlation_matrix))
    # Ensure the distance matrix contains only finite values
    distance_matrix = distance_matrix.replace([np.inf, -np.inf], np.nan).dropna()
    clusters = linkage(distance_matrix, method='ward')
    cluster_labels = fcluster(clusters, num_assets, criterion='maxclust')
    
    selected_assets = []
    for i in range(1, num_assets + 1):
        
        cluster_assets = correlation_matrix.columns[cluster_labels == i]
        selected_assets.append(cluster_assets[0])
    
    return selected_assets


def main():
    tickers = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'TSLA', 'META', 'BRK-B', 'V', 'JNJ', 'WMT']  # Example tickers
    start_date = '2020-01-01'
    end_date = '2024-01-01'
    num_assets = 5  
    
    prices = fetch_price_data(tickers, start_date, end_date)
    
    print("\n")
    print("Returns")
    returns = calculate_daily_returns(prices)
    print(returns)
   
    print("\n")
    print("correlation_matrix")
    correlation_matrix = compute_correlation_matrix(returns)
    print(correlation_matrix)


    print("\n")
    selected_assets = select_least_correlated_assets(correlation_matrix, num_assets)
    
    print("Selected Assets:", selected_assets)

if __name__ == "__main__":
    main()


[*********************100%%**********************]  10 of 10 completed



Returns
Ticker          AAPL      AMZN     BRK-B     GOOGL       JNJ      META  \
Date                                                                     
2020-01-03 -0.009722 -0.012139 -0.009676 -0.005231 -0.011578 -0.005291   
2020-01-06  0.007968  0.014886  0.003581  0.026654 -0.001248  0.018834   
2020-01-07 -0.004703  0.002092 -0.004714 -0.001932  0.006107  0.002164   
2020-01-08  0.016086 -0.007809  0.000310  0.007118 -0.000137  0.010138   
2020-01-09  0.021241  0.004799  0.011770  0.010498  0.002966  0.014311   
...              ...       ...       ...       ...       ...       ...   
2023-12-22 -0.005547 -0.002730  0.000927  0.007620  0.004004 -0.001977   
2023-12-26 -0.002841 -0.000065  0.001010  0.000212  0.004374  0.004075   
2023-12-27  0.000518 -0.000456  0.000336 -0.008126  0.001345  0.008455   
2023-12-28  0.002226  0.000261  0.001737 -0.000997  0.001471  0.001369   
2023-12-29 -0.005424 -0.009388 -0.002545 -0.003851  0.001022 -0.012168   

Ticker          MSFT      T


  clusters = linkage(distance_matrix, method='ward')
