In [None]:
import requests
import zipfile
import io
# Source archive (zipped CSV of S&P 500 prices) and the directory it is unpacked into.
data_zip = 'https://github.com/umsi-amadaman/stocks/raw/main/sp500_5year_archive.zip'
extract_dir = '/content/sp500_5year_archive/'

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(data_zip, timeout=60)

# Fail here, with the HTTP status in the message, instead of silently skipping
# the extraction and hitting a missing-file error in a later cell.
response.raise_for_status()

# The archive is unpacked straight from memory; nothing but its contents is written to disk.
with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
    zip_ref.extractall(extract_dir)

In [None]:
### If you already know pandas, everything from here on is easy. If you don't, the standard-library csv module below reads the file into plain Python lists.

import csv

csv_file_path = '/content/sp500_5year_archive/sp500_stocks_last5.csv'  # the path to your CSV file

# Every row of the file (header row included) ends up here as a list of strings.
csv_data = []

# newline='' leaves line-ending handling to the csv module.
with open(csv_file_path, 'r', newline='') as file:
    # csv.reader yields one list per row; extend() collects them all in file order.
    csv_data.extend(csv.reader(file))


In [None]:
csv_data[0]

['Date', 'Symbol', 'Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume']

In [None]:
csv_data[1]

['2019-01-02',
 'MMM',
 '153.6024169921875',
 '190.9499969482422',
 '190.99000549316406',
 '186.6999969482422',
 '187.82000732421875',
 '2475200']

In [None]:
# Keep only the first three fields of every row (header row included):
# the date, the ticker symbol and the adjusted close.
csv_data_minimal = []
for row in csv_data:
    csv_data_minimal.append([row[0], row[1], row[2]])

In [None]:
csv_data_minimal[:5]

[['Date', 'Symbol', 'Adj Close'],
 ['2019-01-02', 'MMM', '153.6024169921875'],
 ['2019-01-03', 'MMM', '147.81869506835938'],
 ['2019-01-04', 'MMM', '153.9000701904297'],
 ['2019-01-07', 'MMM', '153.54615783691406']]

### General Approaches for Creating Edges in Networks

| Relationship Type         | Metric/Method                        | Description / Use Cases                                                  |
|---------------------------|--------------------------------------|--------------------------------------------------------------------------|
| **Similarity**            | Correlation, Cosine similarity       | Similar attributes, behaviors, or profiles (e.g., users, documents)      |
|                           | Mutual information, Distance correlation | Detects shared patterns, even nonlinear ones                            |
|                           | Jaccard similarity                   | For sets — overlap of attributes or neighbors                            |
|                           | Structural similarity (e.g., SimRank) | Nodes are similar if they're linked to similar others                    |
| **Co-occurrence**         | Raw count, PMI, TF-IDF-weighted      | Nodes appearing together in time, space, or documents                    |
|                           | Binarized co-presence                | Did A and B co-occur at all?                                             |
| **Distance / Proximity**  | Euclidean, geodesic, spatial metrics | Physical, semantic, or embedding-space distance                          |
|                           | DTW, Levenshtein                     | For time series or strings                                               |
| **Causality / Influence** | Granger causality, Transfer entropy  | Does one entity predict or influence another?                            |
|                           | Causal graphs / DAGs                 | From interventions or time series                                        |
| **Flow / Movement**       | Traffic, migration, citations        | Directed edges based on flow quantity or frequency                       |
|                           | Money, goods, information            | Supply chains, neural activity, communication networks                   |
| **Interaction / Behavior**| Shared activity, transactions        | Users buying the same item, replying to the same tweet                   |
|                           | Communication frequency              | Messaging, emails, chats                                                 |
| **Structural / Social**   | Shared group membership              | Organizations, clubs, classes                                            |
|                           | Common neighbors, triadic closure    | Used in link prediction or friendship inference                          |
| **Probabilistic / Model-based** | Graphical models, Bayesian networks | Learn edges from statistical dependencies                               |
|                           | Network inference                    | From observed node states (e.g. Ising models, GLMs)                      |
| **Semantic / Conceptual** | Ontology links, knowledge graphs     | Edges encode logical or hierarchical relationships                       |


In [None]:
# Nested lookup built with plain dicts: data_wide[date][symbol] -> price.
# Prices are stored exactly as read from the CSV, i.e. as strings.
data_wide = {}

# Skip the first row: it is the header ('Date', 'Symbol', 'Adj Close'), not an
# observation, and would otherwise create a bogus data_wide['Date'] entry.
for date, stock, price in csv_data_minimal[1:]:
  # setdefault creates the per-date dict the first time a date is seen.
  data_wide.setdefault(date, {})[stock] = price




In [None]:
import pandas as pd

# Long format: one row per (date, symbol) observation. The first element of
# csv_data_minimal is the header row, so it is sliced off and replaced by the
# explicit column names.
df_long = pd.DataFrame(csv_data_minimal[1:], columns=['Date', 'Symbol', 'Price'])

# The csv module returns every field as a string. Convert prices to numbers here
# so the pivot and correlation cells that follow work on numeric data;
# values that cannot be parsed become NaN.
df_long['Price'] = pd.to_numeric(df_long['Price'], errors='coerce')

In [None]:
df_long.head()

In [None]:
# Wide format: one row per date, one column per ticker symbol, cells hold the price.
df_wide = df_long.pivot(
    index='Date',
    columns='Symbol',
    values='Price',
)

In [None]:
df_wide.head()

In [None]:
df_wide.corr()

In [None]:
# Convert every column to numbers; values that cannot be parsed become NaN.
# Rebind the name rather than assigning through .iloc[:, :]: slice assignment
# writes into the existing columns and can leave them with object dtype,
# whereas the frame returned by apply() carries proper numeric dtypes.
df_wide = df_wide.apply(pd.to_numeric, errors='coerce')

In [None]:
# Pairwise correlation between ticker columns; 'pearson' is DataFrame.corr's
# default method, spelled out here so the metric is visible to the reader.
corr_matrix = df_wide.corr(method='pearson')

# Preview the first rows of the symbol-by-symbol matrix.
corr_matrix.head()

In [None]:
[origin for origin in corr_matrix.index][:3]

In [None]:
[dest for dest in corr_matrix.columns][:3]

In [None]:
corr_matrix.loc['A', 'A']

In [None]:
# Minimum absolute correlation for a pair of stocks to be listed.
threshold = 0.7

# List every ordered pair (origin, destination, correlation) whose correlation
# is strong in either direction. abs() has to wrap the correlation itself:
# wrapping the comparison (abs(x > threshold)) only takes the absolute value
# of a boolean, which silently drops all strong negative correlations.
[
    (origin, destination, corr_matrix.loc[origin, destination])
    for origin in corr_matrix.index
    for destination in corr_matrix.columns
    if origin != destination
    and abs(corr_matrix.loc[origin, destination]) > threshold
]

In [None]:
import networkx as nx

# Minimum absolute correlation for two stocks to be connected.
threshold = 0.7

# One (origin, destination, weight) triple per ordered pair of distinct stocks
# whose correlation is strong in either direction. abs() has to wrap the
# correlation itself: wrapping the comparison (abs(x > threshold)) only takes
# the absolute value of a boolean and drops every strong negative correlation.
# Weights keep their sign, so G can contain negative-weight edges.
edges = [
    (origin, destination, corr_matrix.loc[origin, destination])
    for origin in corr_matrix.index
    for destination in corr_matrix.columns
    if origin != destination
    and abs(corr_matrix.loc[origin, destination]) > threshold
]


# Undirected graph: the (A, B) and (B, A) triples collapse into a single edge.
G = nx.Graph()

G.add_weighted_edges_from(edges)


In [None]:
import matplotlib.pyplot as plt

# Fixed seed so the force-directed layout is the same on every run.
pos = nx.spring_layout(G, seed=42)

# Styling shared by the nodes and their labels.
draw_options = dict(
    with_labels=True,
    node_size=500,
    node_color='skyblue',
    font_size=10,
    font_color='black',
)

plt.figure(figsize=(10, 10))
nx.draw_networkx(G, pos, **draw_options)
plt.show()

In [None]:
# Pairs of stocks correlated below this value are connected.
threshold = -0.7

# Collect one (origin, destination, weight) triple per ordered pair of
# distinct stocks whose correlation falls below the threshold.
edgesN = []
for origin in corr_matrix.index:
    for destination in corr_matrix.columns:
        if origin == destination:
            continue
        weight = corr_matrix.loc[origin, destination]
        if weight < threshold:
            edgesN.append((origin, destination, weight))


# Undirected graph of the strongly negative relationships.
Gneg = nx.Graph()

Gneg.add_weighted_edges_from(edgesN)

In [None]:
# Fixed seed so the force-directed layout is the same on every run.
negative = nx.spring_layout(Gneg, seed=42)

# Styling shared by the nodes and their labels.
draw_options = dict(
    with_labels=True,
    node_size=500,
    node_color='skyblue',
    font_size=10,
    font_color='black',
)

plt.figure(figsize=(10, 10))
nx.draw_networkx(Gneg, negative, **draw_options)
plt.show()

In [None]:
import matplotlib.cm as cm
import matplotlib.colors as mcolors


# Correlations above +threshold go to the positive graph, below -threshold to the negative one.
threshold = 0.8

# Single pass over every ordered pair of distinct stocks; each qualifying pair
# is recorded as an (i, j, correlation) triple in the matching list.
pos_edges = []
neg_edges = []
for i in corr_matrix.index:
    for j in corr_matrix.columns:
        if i == j:
            continue
        r = corr_matrix.loc[i, j]
        if r > threshold:
            pos_edges.append((i, j, r))
        elif r < -threshold:
            neg_edges.append((i, j, r))

# Positive graph
G_pos = nx.Graph()
G_pos.add_weighted_edges_from(pos_edges)

# Negative graph
G_neg = nx.Graph()
G_neg.add_weighted_edges_from(neg_edges)

# Circular layouts space the nodes out evenly
pos_layout = nx.circular_layout(G_pos)
neg_layout = nx.circular_layout(G_neg)

# One colormap per correlation direction
pos_cmap = cm.Blues
neg_cmap = cm.Reds

def get_edge_colors(graph, cmap):
    """Map each edge's absolute weight to a colour taken from ``cmap``.

    Parameters
    ----------
    graph
        Object whose ``edges(data="weight")`` yields ``(u, v, weight)`` triples.
    cmap
        Callable that turns a normalised value into a colour.

    Returns
    -------
    list
        One colour per edge, in the order ``graph.edges`` yields them.
        An empty list when the graph has no edges.
    """
    weights = [abs(weight) for _, _, weight in graph.edges(data="weight")]
    if not weights:
        # min()/max() raise ValueError on an empty sequence; a graph with no
        # edges (e.g. no pair passed the threshold) simply has no colours.
        return []
    # Scale so the weakest edge sits at the low end of the colormap and the
    # strongest at the high end.
    norm = mcolors.Normalize(vmin=min(weights), vmax=max(weights))
    return [cmap(norm(weight)) for weight in weights]


# Draw the negative-correlation graph: nodes, then labels, then edges shaded by |weight|.
plt.figure(figsize=(10, 10))
nx.draw_networkx_nodes(G_neg, neg_layout, node_color="lightcoral", node_size=500)
nx.draw_networkx_labels(G_neg, neg_layout, font_color="black", font_size=10)
neg_edge_colors = get_edge_colors(G_neg, neg_cmap)
nx.draw_networkx_edges(G_neg, neg_layout, width=1.5, edge_color=neg_edge_colors)
plt.title("Negative Correlation Network")
plt.show()


In [None]:

# Draw the positive-correlation graph: nodes, then labels, then edges shaded by |weight|.
plt.figure(figsize=(10, 10))
nx.draw_networkx_nodes(G_pos, pos_layout, node_color="skyblue", node_size=500)
nx.draw_networkx_labels(G_pos, pos_layout, font_color="black", font_size=10)
pos_edge_colors = get_edge_colors(G_pos, pos_cmap)
nx.draw_networkx_edges(G_pos, pos_layout, width=1.5, edge_color=pos_edge_colors)
plt.title("Positive Correlation Network")
plt.show()


In [None]:
import matplotlib.cm as cm
import matplotlib.colors as mcolors


# Correlations above +threshold go to the positive graph, below -threshold to the negative one.
threshold = 0.8

# (i, j, correlation) for every ordered pair of distinct stocks correlated above the threshold.
pos_edges = [
    (i, j, corr_matrix.loc[i, j])
    for i in corr_matrix.index
    for j in corr_matrix.columns
    if i != j and corr_matrix.loc[i, j] > threshold
]

# Same, for pairs correlated below the negated threshold.
neg_edges = [
    (i, j, corr_matrix.loc[i, j])
    for i in corr_matrix.index
    for j in corr_matrix.columns
    if i != j and corr_matrix.loc[i, j] < -threshold
]

G_pos = nx.Graph() #positive graph
G_pos.add_weighted_edges_from(pos_edges)

G_neg = nx.Graph() # negative graph
G_neg.add_weighted_edges_from(neg_edges)

# Force-directed (spring) layouts this time. The seed matches the earlier
# spring-layout cells so the figure is identical on every run.
pos_layout = nx.spring_layout(G_pos, seed=42)
neg_layout = nx.spring_layout(G_neg, seed=42)

pos_cmap = cm.Blues # color by correlation direction
neg_cmap = cm.Reds

def get_edge_colors(graph, cmap):
    """Map each edge's absolute weight to a colour taken from ``cmap``.

    Parameters
    ----------
    graph
        Object whose ``edges(data="weight")`` yields ``(u, v, weight)`` triples.
    cmap
        Callable that turns a normalised value into a colour.

    Returns
    -------
    list
        One colour per edge, in the order ``graph.edges`` yields them.
        An empty list when the graph has no edges.
    """
    weights = [abs(weight) for _, _, weight in graph.edges(data="weight")]
    if not weights:
        # min()/max() raise ValueError on an empty sequence; a graph with no
        # edges (e.g. no pair passed the threshold) simply has no colours.
        return []
    # Scale so the weakest edge sits at the low end of the colormap and the
    # strongest at the high end.
    norm = mcolors.Normalize(vmin=min(weights), vmax=max(weights))
    return [cmap(norm(weight)) for weight in weights]


# Draw the negative-correlation graph: nodes, then labels, then edges shaded by |weight|.
plt.figure(figsize=(10, 10))
nx.draw_networkx_nodes(G_neg, neg_layout, node_color="lightcoral", node_size=500)
nx.draw_networkx_labels(G_neg, neg_layout, font_color="black", font_size=10)
neg_edge_colors = get_edge_colors(G_neg, neg_cmap)
nx.draw_networkx_edges(G_neg, neg_layout, width=1.5, edge_color=neg_edge_colors)
plt.title("Negative Correlation Network")
plt.show()


### Alternative Metrics for Edges

| Category                   | Metric/Method                        | Description                                                                 |
|---------------------------|--------------------------------------|-----------------------------------------------------------------------------|
| Linear & Nonlinear        | Correlation / Partial Correlation    | Measures linear association; partial removes influence of other variables  |
|                           | Mutual Information                   | Captures any dependency, including nonlinear relationships                 |
|                           | Distance Correlation                 | Detects both linear and nonlinear dependence                               |
| Temporal Relationships    | Granger Causality                    | Tests whether one time series predicts another; produces directed edges    |
|                           | Transfer Entropy                     | Nonlinear, model-free info transfer between time series                    |
| Price Behavior Similarity | Euclidean / DTW Distance             | Compares time series directly or via shape alignment (DTW = Dynamic Time Warping) |
|                           | Cosine Similarity                    | Measures angle between return vectors (directional similarity)             |
|                           | PCA-Based Similarity                 | Stocks with similar loadings on principal components                       |
| Model-Based               | Regression Coefficients (e.g. LASSO) | Edges based on non-zero coefficients in multivariate models                |
|                           | Graphical Lasso                      | Sparse precision matrix; partial correlations with regularization          |
| Structural/Fundamental    | Sector Co-membership                 | Binary edges based on shared industry/sector                               |
|                           | Supply Chain Links                   | Edges based on supplier-customer relationships                             |
|                           | News Co-mentions                     | Connect stocks appearing together in financial news                        |
|                           | Social Media Co-movement             | Based on sentiment or co-occurrence on platforms like Twitter              |
|                           | Shared Ownership / Boards            | Links based on common institutional investors or board members             |
