In [None]:
import yfinance as yf
import pandas as pd
import concurrent.futures
import networkx as nx
import plotly.graph_objs as go
from sklearn.cluster import SpectralClustering
from time import sleep
from random import randint
import matplotlib.pyplot as plt 
from sklearn.linear_model import LinearRegression

# Download the list of S&P 500 companies from Wikipedia
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
tables = pd.read_html(url)
# The constituents table is taken to be the first table on the page.
sp500_table = tables[0]
# Wikipedia writes share classes with a dot (e.g. BRK.B, BF.B) whereas Yahoo
# Finance expects a dash (BRK-B, BF-B); without this conversion those tickers
# fail every download attempt.
sp500_symbols = (
    sp500_table['Symbol']
    .astype(str)
    .str.strip()
    .str.replace('.', '-', regex=False)
    .tolist()
)
selected_symbols = sp500_symbols

# Download Historical Data
def fetch_data(symbol, retries=5, start='2020-01-01', end='2024-07-31'):
    """Download adjusted close and volume history for one ticker, with retries.

    Parameters
    ----------
    symbol : str
        Ticker symbol passed to ``yf.download``.
    retries : int
        Maximum number of download attempts.
    start, end : str
        Date range forwarded to ``yf.download``.

    Returns
    -------
    tuple
        ``(symbol, data)`` where ``data`` is a DataFrame with the columns
        ``'Adj Close'`` and ``'Volume'``, or ``(symbol, None)`` when every
        attempt failed.
    """
    for attempt in range(retries):
        try:
            # auto_adjust=False is requested explicitly: newer yfinance releases
            # default to auto-adjusted prices and then omit 'Adj Close'.
            raw = yf.download(symbol, start=start, end=end,
                              auto_adjust=False, progress=False)
            # Newer releases return (field, ticker) MultiIndex columns even for a
            # single symbol; keep only the field level so callers get plain Series.
            # NOTE(review): assumes the field name is level 0 (default group_by).
            if raw.columns.nlevels > 1:
                raw.columns = raw.columns.get_level_values(0)
            data = raw[['Adj Close', 'Volume']]
            if data.empty:
                raise ValueError(f"No data found for {symbol}")
            return symbol, data
        except Exception as e:
            print(f"Attempt {attempt+1} failed for {symbol}: {e}")
            # Back off only when another attempt will follow.
            if attempt < retries - 1:
                sleep(randint(1, 3))
    return symbol, None

# Use threading to download data for each stock
# NOTE(review): confirm that yf.download is safe to call from several threads
# at once for the installed yfinance version.
data_frames = {}
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    futures = {executor.submit(fetch_data, symbol): symbol for symbol in selected_symbols}
    for future in concurrent.futures.as_completed(futures):
        # fetch_data always returns a (symbol, data) pair; data is None on failure.
        symbol, data = future.result()
        if data is not None:
            data_frames[symbol] = data

if not data_frames:
    raise RuntimeError("No price data could be downloaded for any symbol.")

# Combine all data into a single DataFrame
# Iterate in selected_symbols order so the column order does not depend on which
# download thread happened to finish first.
valid_data = {symbol: data_frames[symbol] for symbol in selected_symbols if symbol in data_frames}
prices = pd.DataFrame({symbol: data['Adj Close'] for symbol, data in valid_data.items()})
volumes = pd.DataFrame({symbol: data['Volume'] for symbol, data in valid_data.items()})

prices = prices.dropna(axis=1, how='all')
volumes = volumes.dropna(axis=1, how='all')

# fill_method=None: do not forward-fill gaps before differencing (the implicit
# 'pad' default is deprecated in recent pandas and fabricates zero returns).
# NOTE(review): dropna() removes every date on which ANY ticker is missing, so a
# single recently listed ticker shortens the sample for all tickers. The result
# is kept NaN-free because the regression cell requires it; consider dropping
# short-history tickers first.
returns = prices.pct_change(fill_method=None).dropna()

avg_corr = returns.corr()


In [None]:
# --- Plotting the Network --- #
# Per-ticker traded notional (price x volume), averaged over every date in the sample.
notional_values = prices * volumes
notional_values = notional_values.mean(axis=0)

# Linearly rescale the averages into the 10..50 range used as marker sizes.
min_notional, max_notional = notional_values.min(), notional_values.max()
notional_span = max_notional - min_notional
normalized_notional = 10 + (40 * (notional_values - min_notional) / notional_span)

# Keep the raw and the rescaled values side by side, indexed by ticker.
notional_df = pd.DataFrame(
    {
        'notional_value': notional_values,
        'normalized_notional': normalized_notional,
    }
)

# Plotting function
def plot_avg_corr_network(correlations, notional_df, threshold=0.4, n_clusters=5, max_connections=6, seed=None):
    """Draw an interactive Plotly network of positively correlated stocks.

    Each ticker becomes a node sized by ``notional_df['normalized_notional']``
    and coloured by its spectral-clustering label. Every ticker is linked to at
    most ``max_connections`` of its strongest peers whose correlation exceeds
    ``threshold``; the edge width equals the correlation.

    Parameters
    ----------
    correlations : pandas.DataFrame
        Square correlation matrix with tickers as both index and columns.
    notional_df : pandas.DataFrame
        Indexed by ticker; must contain a ``'normalized_notional'`` column.
    threshold : float
        Minimum correlation for an edge to be drawn.
    n_clusters : int
        Number of spectral clusters used for node colours.
    max_connections : int
        Maximum number of edges initiated from each ticker.
    seed : int or None
        Forwarded to ``nx.spring_layout`` and ``SpectralClustering`` so the
        picture can be reproduced; ``None`` leaves both unseeded.

    Raises
    ------
    ValueError
        If fewer than ``n_clusters`` tickers have usable correlations.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # Drop unusable tickers from rows AND columns together so the matrix stays
    # square and row order matches column order (cluster labels rely on this).
    tickers = [t for t in correlations.columns
               if t in correlations.index and correlations[t].notna().any()]
    if len(tickers) < n_clusters:
        raise ValueError(
            f'Need at least {n_clusters} tickers with valid correlations, got {len(tickers)}.')
    # Missing pairs are treated as "no relationship"; negative correlations are
    # zeroed because the matrix doubles as a non-negative affinity matrix.
    correlations = correlations.loc[tickers, tickers].fillna(0).clip(lower=0)

    G = nx.Graph()
    for stock in tickers:
        G.add_node(stock, notional_value=notional_df.loc[stock, 'normalized_notional'])

    for stock in tickers:
        ranked = correlations[stock].drop(stock).sort_values(ascending=False)
        strongest = ranked[ranked > threshold].head(max_connections)
        for peer, weight in strongest.items():
            G.add_edge(stock, peer, weight=float(weight))

    pos = nx.spring_layout(G, k=0.1, seed=seed)

    clustering = SpectralClustering(
        n_clusters=n_clusters,
        affinity='precomputed',
        assign_labels='discretize',
        random_state=seed,
    ).fit(correlations.to_numpy())
    # Map labels by ticker instead of relying on positional alignment.
    cluster_of = dict(zip(correlations.index, clustering.labels_))

    # One trace per edge because every edge has its own line width.
    edge_traces = []
    for start, end, attrs in G.edges(data=True):
        x0, y0 = pos[start]
        x1, y1 = pos[end]
        edge_traces.append(go.Scatter(
            x=[x0, x1, None], y=[y0, y1, None],
            line=dict(width=attrs['weight'], color='#888'),
            hoverinfo='none',
            mode='lines'))

    nodes = list(G.nodes())
    node_x = [pos[node][0] for node in nodes]
    node_y = [pos[node][1] for node in nodes]
    node_color = [int(cluster_of[node]) for node in nodes]
    node_size = [G.nodes[node]['notional_value'] for node in nodes]

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers+text',
        text=[f'{node}' for node in nodes],
        textposition="bottom center",
        hoverinfo='text',
        marker=dict(
            showscale=True,
            colorscale='Rainbow',
            size=node_size,
            color=node_color,
            # Nested title dict replaces the 'titleside' shorthand, which newer
            # Plotly releases reject.
            colorbar=dict(
                thickness=15,
                title=dict(text='Cluster Group', side='right'),
                xanchor='left'
            ),
            line_width=2))

    fig = go.Figure(data=edge_traces + [node_trace],
                    layout=go.Layout(
                        # Nested font dict replaces the 'titlefont_size' shorthand.
                        title=dict(text='Stock Correlation Network', font=dict(size=16)),
                        width=1800,
                        height=1400,
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=20, l=5, r=5, t=40),
                        xaxis=dict(showgrid=False, zeroline=False),
                        yaxis=dict(showgrid=False, zeroline=False))
                    )
    fig.show()

# Render the network for the correlation matrix computed above, using the
# function's default threshold, cluster count and per-ticker connection cap.
plot_avg_corr_network(avg_corr, notional_df)

In [None]:
# --- Ranks Top 10 Highest Correlated Stocks with a Given Stock --- #

# Ensure ticker data is present
# Strip stray whitespace, and fall back to the upper-case spelling so that input
# such as "aapl " still matches; an exact match always takes precedence.
ticker = input("Enter the ticker symbol: ").strip()
if ticker not in avg_corr.columns and ticker.upper() in avg_corr.columns:
    ticker = ticker.upper()
if ticker not in avg_corr.columns:
    raise ValueError(f'{ticker} data is not available in the dataset.')

# Find the highest correlated stocks
target_stock = ticker
correlations_with_target = avg_corr[target_stock]

# Sort the correlations in descending order and exclude the target stock itself
highest_correlations = correlations_with_target.sort_values(ascending=False).drop(target_stock)

# Display the top 10 highest correlated stocks
top_10_highest_correlated_stocks = highest_correlations.head(10)
print(f'Top 10 highest correlated stocks with {ticker}:')
print(top_10_highest_correlated_stocks)

In [None]:
# --- Create a Synthetic Version of a Stock --- #

def recreate_ticker(target_ticker, returns, exclude_tickers=[], num_tickers=10, display_plot=True):
    """Approximate a ticker's returns as a linear combination of correlated peers.

    The ``num_tickers`` columns of ``returns`` with the largest absolute
    correlation to ``target_ticker`` (after removing ``exclude_tickers``) are
    used as regressors in a ``LinearRegression`` fitted on the target's returns.

    Parameters
    ----------
    target_ticker : str
        Column of ``returns`` to replicate.
    returns : pandas.DataFrame
        Return series, one column per ticker. The regression requires the
        selected columns to be free of missing values.
    exclude_tickers : iterable of str
        Tickers that must not be used as regressors. It is never modified.
    num_tickers : int
        Number of regressors to select.
    display_plot : bool
        When true, plot the real and the fitted series with matplotlib.

    Returns
    -------
    tuple
        ``(ticker_weights, y_pred)``: a DataFrame with ``'Ticker'`` and
        ``'Weight'`` columns holding the fitted coefficients (the intercept is
        not included), and the array returned by ``model.predict``.

    Raises
    ------
    KeyError
        If ``target_ticker`` is not a column of ``returns``.
    ValueError
        If no candidate regressor is left after the exclusions.
    """
    y = returns[target_ticker]

    # Private copy: the caller's list (and the shared default) stay untouched.
    excluded = set(exclude_tickers or ())

    # Filter BEFORE ranking so exclusions do not eat into the num_tickers budget.
    candidates = [col for col in returns.columns
                  if col != target_ticker and col not in excluded]
    if not candidates:
        raise ValueError(f'No candidate tickers left to recreate {target_ticker}.')

    # Only correlations against the target are needed, not the full matrix.
    correlations = returns[candidates].corrwith(y)

    # Select the top num_tickers based on absolute correlation
    top_tickers = correlations.abs().sort_values(ascending=False).head(num_tickers).index.tolist()

    X = returns[top_tickers]

    # Fit the linear regression model
    model = LinearRegression()
    model.fit(X, y)

    ticker_weights = pd.DataFrame({'Ticker': X.columns, 'Weight': model.coef_})

    # Recreate the target ticker
    y_pred = model.predict(X)

    if display_plot:
        plt.figure(figsize=(10, 6))
        plt.plot(returns.index, y, label=target_ticker)
        plt.plot(returns.index, y_pred, label='Synthetic ' + target_ticker)
        plt.title(f'Real vs Synthetic {target_ticker}')
        plt.legend()
        plt.show()

    return ticker_weights, y_pred

# Example usage: rebuild one ticker from its peers and show the fitted weights.
target_ticker = 'TSLA'  # ticker to replicate
exclude_tickers = ['GOOGL', 'MSFT']  # tickers that must not be used as regressors
ticker_weights, synthetic_returns = recreate_ticker(
    target_ticker,
    returns,
    exclude_tickers=exclude_tickers,
)

print(ticker_weights)