In [3]:
import pandas as pd 
import yfinance as yf 
from scipy.stats import spearmanr
import networkx as nx 
import numpy as np 

url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

#read wikipedia S&P500 table into PD dataframe 
df_sp500 = pd.read_html(url)[0][['Symbol', 'Security', 'GICS Sector']] 
#ticker DF 
sp500_tickers = df_sp500['Symbol'].tolist() 

#read changes table into DF 
#sp500_changes = pd.read_html(url)[1]
#problematic 
# drop rows where at least one element missing
#changes = df_changes[['Date', 'Added', 'Removed']].dropna() 
#filter out reasons column 
#changes = changes[~changes['Date'].str.contains('Reason')]
#filter out note column 
#changes = changes[~changes['Date'].str.contains('Note')] 

#leaving these fuckers out (Berkshire Hathaway - 2010, Brown-Forman - 1982, AirBNB - 2023....so on)
#issues: these have not been in the dataset for the full time period (1258 days) so i need to leave them out until ifigure something out 
exclude = ['BRK.B', 'BF.B', 'ABNB', 'CARR', 'CEG', 'CTVA', 'DOW', 'FOXA', 'FOX', 'GEHC', 'KVUE', 'OTIS', 'UBER', 'VLTO'] 
sp500_tickers = [ticker for ticker in sp500_tickers if ticker not in exclude]

#dict for individual stock DF storage 
stockdataframes = {} 

start_dte = "2019-01-01"
end_dte = "2023-12-31"

#ADD INCLUSION AND EXCLUSION DATE AND FILTER BY THAT FOR NEWER STOCKS AND REMOVED ONES 

#iterate through all tickers+ get historical data 
for ticker in sp500_tickers: 
    #added = row['Added'] 
    #removed = row['Removed'] 
    
    try:
        stock = yf.Ticker(ticker) 
        stock_df = stock.history(start=start_dte, end=end_dte)[['Close']]
        
        #debug 
        #print(f"Debug - Ticker: {ticker}, DataFrame shape: {stock_df.shape}")
        
        #merge with wiki data - check if not empty first
        if 'Close' in stock_df.columns and not stock_df.empty: 
            stock_df['Symbol'] = ticker
            stock_df['Security'] = df_sp500[df_sp500['Symbol'] == ticker]['Security'].iloc[0]
            stock_df['GICS Sector'] = df_sp500[df_sp500['Symbol'] == ticker]['GICS Sector'].iloc[0]
            #merged_stock_df = pd.merge(stock_df, df_sp500[df_sp500['Symbol'] == ticker], left_index=True, right_index=True)
        
            #store merged df in dictionary 
            stockdataframes[ticker] = stock_df #used o be merged_stock_dataframe
        
            #print(f"Data fetched for {ticker}")
        else:
            print(f"Failed to fetch data for {ticker}: {e}")
    except Exception as e: 
        print(f"Failed to fetch data for {ticker}: {e}")

#merge all stock dfs into one 
full_df = pd.concat(stockdataframes.values(), keys=stockdataframes.keys())

full_df = full_df.reset_index(level=0) 

#ptiny 
print(full_df.head(450)) 
display(full_df) 
#full_df.to_csv('SP500.csv', index=True)


                          level_0       Close Symbol Security  GICS Sector
Date                                                                      
2019-01-02 00:00:00-05:00     MMM  153.602432    MMM       3M  Industrials
2019-01-03 00:00:00-05:00     MMM  147.818665    MMM       3M  Industrials
2019-01-04 00:00:00-05:00     MMM  153.900070    MMM       3M  Industrials
2019-01-07 00:00:00-05:00     MMM  153.546112    MMM       3M  Industrials
2019-01-08 00:00:00-05:00     MMM  154.189621    MMM       3M  Industrials
...                           ...         ...    ...      ...          ...
2020-10-07 00:00:00-04:00     MMM  142.362335    MMM       3M  Industrials
2020-10-08 00:00:00-04:00     MMM  143.405533    MMM       3M  Industrials
2020-10-09 00:00:00-04:00     MMM  144.765091    MMM       3M  Industrials
2020-10-12 00:00:00-04:00     MMM  144.209305    MMM       3M  Industrials
2020-10-13 00:00:00-04:00     MMM  142.482040    MMM       3M  Industrials

[450 rows x 5 columns]


Unnamed: 0_level_0,level_0,Close,Symbol,Security,GICS Sector
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-01-02 00:00:00-05:00,MMM,153.602432,MMM,3M,Industrials
2019-01-03 00:00:00-05:00,MMM,147.818665,MMM,3M,Industrials
2019-01-04 00:00:00-05:00,MMM,153.900070,MMM,3M,Industrials
2019-01-07 00:00:00-05:00,MMM,153.546112,MMM,3M,Industrials
2019-01-08 00:00:00-05:00,MMM,154.189621,MMM,3M,Industrials
...,...,...,...,...,...
2023-12-22 00:00:00-05:00,ZTS,194.538773,ZTS,Zoetis,Health Care
2023-12-26 00:00:00-05:00,ZTS,195.057587,ZTS,Zoetis,Health Care
2023-12-27 00:00:00-05:00,ZTS,196.454422,ZTS,Zoetis,Health Care
2023-12-28 00:00:00-05:00,ZTS,196.713837,ZTS,Zoetis,Health Care


In [6]:
#granger causality 
from statsmodels.tsa.stattools import grangercausalitytests
from scipy.stats import spearmanr

def calculate_granger_causality(data1, data2, max_lag=1, significance_level=0.05):
    # Perform Granger causality test
    granger_result = grangercausalitytests(np.column_stack((data1, data2)), max_lag, verbose=False)
    
    # Extract p-values
    p_values = [granger_result[i+1][0]['ssr_ftest'][1] for i in range(max_lag)]
    
    # Check if any p-value is below significance level
    return any(p < significance_level for p in p_values)

stocks_by_symbol = {} 

for symbol, data in full_df.groupby('Symbol'):
    data.reset_index(inplace=True)
    stocks_by_symbol[symbol] = data[['Date','Close']]   

stock1_data = stocks_by_symbol['MMM']['Close']
stock2_data = stocks_by_symbol['ZTS']['Close']

# Calculate Granger causality
granger_result = calculate_granger_causality(stock1_data, stock2_data, max_lag=1, significance_level=0.05)
print("Granger causality between MMM and ZTS:", granger_result)


Granger causality between MMM and ZTS: False




In [2]:
#fixing monthly data missing issue 
import pandas as pd 

filepath = 'SP500.csv' 
full_df = pd.read_csv(filepath) 

# Convert date to datetime
full_df['Date'] = pd.to_datetime(full_df['Date'], errors='coerce')

# Drop rows with NaT (Not a Time) values
full_df = full_df.dropna(subset=['Date'])

# Print the data type of the 'Date' column to ensure it's datetime-like
print(full_df['Date'].dtype)
print(full_df['Date'].head())

#non_date_values = full_df['Date'][pd.to_datetime(full_df['Date'], errors='coerce').isna()]
#print(non_date_values)

#find last trade days 
def last_trade_day(year, month): 
    lastday = pd.Timestamp(year, month, 1) + pd.offsets.MonthEnd(0) 
    while lastday.weekday() >= 5: #weekend days 
        lastday -= pd.Timedelta(days=1)
    return lastday 

# Create a new column 'Last_Trade_Day' with accurate last trading days
full_df['Last_Trade_Day'] = [last_trade_day(date.year, date.month) for date in full_df['Date']]

# Filter rows where 'Date' equals 'Last_Trade_Day'
last_day_df = full_df[full_df['Date'] == full_df['Last_Trade_Day']]

# Reset index
last_day_df = last_day_df.reset_index(drop=True)

print(len(last_day_df))
print(last_day_df.head())

#func call to each row 
#lastdays = [last_trade_day(year, month) for year, month in zip(full_df['Date'].dt.year, full_df['Date'].dt.month)]
#lastdays = [last_trade_day(date.year, date.month) for date in full_df['Last_Trade_Day']]

#add accurate last days to df 
#full_df['Last_Trade_Day'] = lastdays 

#filtering 
#last_day_df = full_df[full_df['Date'] == full_df['Last_Trading_Day']]

#reset index 

#print(len(last_day_df))
#print(last_day_df.head())




object
0    2019-01-02 00:00:00-05:00
1    2019-01-03 00:00:00-05:00
2    2019-01-04 00:00:00-05:00
3    2019-01-07 00:00:00-05:00
4    2019-01-08 00:00:00-05:00
Name: Date, dtype: object
0
Empty DataFrame
Columns: [Date, level_0, Close, Symbol, Security, GICS Sector, Last_Trade_Day]
Index: []


In [2]:
"""
import matplotlib.pyplot as plt 

monthly_df = full_df.resample('M').last() 

# Calculate Spearman correlation coefficient
corr_matrix = monthly_df.pivot(columns='Symbol', values='Close').pct_change().corr(method='spearman').dropna()

#create graph 
G = nx.Graph() 

for stock in sp500_tickers: 
    G.add_node(stock) 
    
#add links 
for i in range(len(sp500_tickers)): 
    for j in range(i + 1, len(sp500_tickers)):
        stock1 = sp500_tickers[i]
        stock2 = sp500_tickers[j]
        
        # Check if the stock symbols are present in the correlation matrix
        if stock1 in corr_matrix.index and stock2 in corr_matrix.columns:
            correlation_coefficient = corr_matrix.loc[stock1, stock2]
            if not np.isnan(correlation_coefficient):
                G.add_edge(stock1, stock2, weight=correlation_coefficient)

#visualization 
pos = nx.spring_layout(G, seed=42)
# Draw only edges with significant correlation
edges = [(source, target) for source, target, data in G.edges(data=True) if abs(data['weight']) > 0.5]
nx.draw_networkx_nodes(G, pos, node_size=30)
nx.draw_networkx_labels(G, pos, font_size=8)
nx.draw_networkx_edges(G, pos, edgelist=edges, edge_color='gray')

labels = nx.get_edge_attributes(G, 'weight')
nx.draw_networkx_edge_labels(G, pos, edge_labels=labels)

plt.show()
"""

2

In [None]:
#REF 
#import pandas as pd 
#import yfinance as yf 

#url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

#read tables into PD dataframe 
#dfs = pd.read_html(url) 

#get first table (index 0) + first 3 cols 
#df = dfs[0].iloc[:, :3] 
#print(df) 