In [4]:
import tslearn

In [5]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import silhouette_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from scipy.stats import zscore
from tslearn.metrics import dtw
from sklearn.linear_model import Ridge
from statsmodels.tsa.arima.model import ARIMA


In [6]:

# Load datasets
companies_df = pd.read_csv('sp500_companies.csv')
index_df = pd.read_csv('sp500_index.csv')
stocks_df = pd.read_csv('sp500_stocks.csv')

# Data preparation for clustering, outliers, and classification
financial_features = companies_df[['Marketcap', 'Ebitda', 'Revenuegrowth', 'Currentprice']].dropna()
financial_features_scaled = StandardScaler().fit_transform(financial_features)



In [9]:
# --- Clustering ---
# 1. Sector and Industry-based Clustering
encoder = OneHotEncoder()
categorical_features = encoder.fit_transform(companies_df[['Sector', 'Industry']].fillna('Unknown')).toarray()
sector_kmeans = KMeans(n_clusters=5, random_state=42)
sector_clusters = sector_kmeans.fit_predict(categorical_features)
print(f"Sector and Industry-based Clustering - Silhouette Score: {silhouette_score(categorical_features, sector_clusters):.2f}")

# 2. Financial Metric-based Clustering
financial_kmeans = KMeans(n_clusters=5, random_state=42)
financial_clusters = financial_kmeans.fit_predict(financial_features_scaled)
print(f"Financial Metric-based Clustering - Silhouette Score: {silhouette_score(financial_features_scaled, financial_clusters):.2f}")

# 3. Time-series Similarity Clustering
# Compute pairwise DTW distances for a subset of stocks
time_series = stocks_df.pivot(index='Date', columns='Symbol', values='Adj Close').fillna(0).iloc[:, :10]
dtw_distances = np.zeros((time_series.shape[1], time_series.shape[1]))
for i in range(time_series.shape[1]):
    for j in range(i, time_series.shape[1]):
        dtw_distances[i, j] = dtw(time_series.iloc[:, i].values, time_series.iloc[:, j].values)
        dtw_distances[j, i] = dtw_distances[i, j]
print(f"Time-series Similarity Clustering - Pairwise DTW distances computed for 10 stocks.")



[WinError 2] The system cannot find the file specified
  File "c:\Users\coboz\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\coboz\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\coboz\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\coboz\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


Sector and Industry-based Clustering - Silhouette Score: 0.21
Financial Metric-based Clustering - Silhouette Score: 0.41
Time-series Similarity Clustering - Pairwise DTW distances computed for 10 stocks.


In [12]:
# --- Outlier Detection ---
# 1. Revenue Growth Outliers
revenue_growth_outliers = financial_features[np.abs(zscore(financial_features['Revenuegrowth'])) > 3]
print(f"Revenue Growth Outliers Detected: {len(revenue_growth_outliers)}")

# 2. Price Movement Outliers
# Ensure the index alignment while detecting price movement outliers
stocks_df['Daily Return'] = stocks_df.groupby('Symbol')['Adj Close'].pct_change(fill_method=None)

# Drop rows with NaN in 'Daily Return' for z-score computation
daily_returns_clean = stocks_df.dropna(subset=['Daily Return'])

# Calculate z-scores on the 'Daily Return' column
daily_returns_clean['Return ZScore'] = zscore(daily_returns_clean['Daily Return'])

# Detect outliers (absolute z-score greater than 3)
price_outliers = daily_returns_clean[np.abs(daily_returns_clean['Return ZScore']) > 3]

print(f"Price Movement Outliers Detected: {len(price_outliers)}")




Revenue Growth Outliers Detected: 8
Price Movement Outliers Detected: 9548


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  daily_returns_clean['Return ZScore'] = zscore(daily_returns_clean['Daily Return'])


In [13]:
# --- Feature Selection ---
# 1. Correlation Analysis
correlations = financial_features.corrwith(index_df['S&P500'])
print("Feature Correlations with S&P 500 Index:")
print(correlations)

# 2. PCA
pca = PCA(n_components=2)
pca_result = pca.fit_transform(financial_features_scaled)
print(f"PCA Explained Variance Ratio: {pca.explained_variance_ratio_}")



Feature Correlations with S&P 500 Index:
Marketcap       -0.042492
Ebitda          -0.045430
Revenuegrowth   -0.115122
Currentprice    -0.122366
dtype: float64
PCA Explained Variance Ratio: [0.47386769 0.25514919]


In [15]:
# --- Classification ---
# 1. Sector Prediction
# Ensure X and y_sector are aligned by dropping rows with missing values in the relevant columns
aligned_data = companies_df.dropna(subset=['Marketcap', 'Ebitda', 'Revenuegrowth', 'Currentprice', 'Sector'])

# Extract features (X) and target labels (y_sector)
X = aligned_data[['Marketcap', 'Ebitda', 'Revenuegrowth', 'Currentprice']].values
y_sector = (aligned_data['Sector'] == 'Technology').astype(int)

# Train the Logistic Regression model
classifier = LogisticRegression(random_state=42)
classifier.fit(X, y_sector)

# Print the accuracy of the model
accuracy = classifier.score(X, y_sector)
print(f"Sector Prediction Accuracy: {accuracy:.2f}")



Sector Prediction Accuracy: 0.80


In [17]:

# --- Hyperparameter Tuning ---
# 1. Optimization for Classification Models
param_grid = {'max_depth': [5, 10, 15], 'n_estimators': [50, 100, 150]}
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3)
grid_search.fit(X, y_sector)
print(f"Best Hyperparameters for Classification: {grid_search.best_params_}")

# 2. Time-series Forecasting Tuning (ARIMA)
from statsmodels.tsa.stattools import adfuller

# Select data for a single stock (AAPL) and drop NaN values
symbol_series = stocks_df[stocks_df['Symbol'] == 'AAPL']['Adj Close'].dropna()

# Check data sufficiency
if len(symbol_series) < 50:
    print("Insufficient data for ARIMA modeling. Consider using a stock with more observations.")
else:
    # Check stationarity using Augmented Dickey-Fuller test
    adf_test = adfuller(symbol_series)
    print(f"ADF Statistic: {adf_test[0]}, p-value: {adf_test[1]}")

    if adf_test[1] > 0.05:
        print("Data is non-stationary. Applying differencing...")
        symbol_series = symbol_series.diff().dropna()  # Apply differencing

    # Fit ARIMA model with adjusted parameters if necessary
    try:
        arima_model = ARIMA(symbol_series, order=(1, 1, 1))  # Adjusted order for simplicity
        arima_results = arima_model.fit()
        print(f"ARIMA Model AIC: {arima_results.aic}")
    except Exception as e:
        print(f"ARIMA modeling failed: {e}")



Best Hyperparameters for Classification: {'max_depth': 5, 'n_estimators': 100}
Insufficient data for ARIMA modeling. Consider using a stock with more observations.


In [9]:
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt

# Select time-series data for ARIMA (e.g., AAPL's adjusted close prices)
symbol_series = stocks_df[stocks_df['Symbol'] == 'AAPL']['Adj Close'].dropna()

# Fit ARIMA model (order can be tuned as needed)
arima_model = ARIMA(symbol_series, order=(5, 1, 0))  # p=5, d=1, q=0
arima_results = arima_model.fit()

# Extract residuals and forecast
residuals = arima_results.resid
forecast = arima_results.forecast(steps=10)  # Forecast next 10 steps

# Plot residuals and forecast
plt.figure(figsize=(10, 5))
plt.subplot(2, 1, 1)
plt.title("ARIMA Residuals")
plt.plot(residuals, label="Residuals")
plt.legend()

plt.subplot(2, 1, 2)
plt.title("ARIMA Forecast")
plt.plot(symbol_series[-50:], label="Actual")
plt.plot(forecast, label="Forecast", linestyle="--")
plt.legend()
plt.tight_layout()
plt.show()


LinAlgError: Schur decomposition solver error.

In [8]:
from arch import arch_model

# Fit GARCH model (e.g., AAPL's adjusted close prices)
garch_model = arch_model(symbol_series, vol='Garch', p=1, q=1)
garch_results = garch_model.fit(disp="off")

# Extract conditional volatility
volatility = garch_results.conditional_volatility

# Plot conditional volatility
plt.figure(figsize=(10, 5))
plt.title("GARCH Conditional Volatility")
plt.plot(volatility, label="Volatility")
plt.legend()
plt.show()


ValueError: first_obs and last_obs produce in an empty array.