In [1]:
print("\nS&P 500 Time Series Analysis")
print(" In this notebook, its a  comprehensive exploratory data analysis (EDA) and time series modeling on the S&P 500 index over the last 25 years using Python. with the use yfinance to fetch real financial data, conduct visualizations with plotly and seaborn, apply statistical tests, and build a simple regression model for forecasting.")


S&P 500 Time Series Analysis
 In this notebook, its a  comprehensive exploratory data analysis (EDA) and time series modeling on the S&P 500 index over the last 25 years using Python. with the use yfinance to fetch real financial data, conduct visualizations with plotly and seaborn, apply statistical tests, and build a simple regression model for forecasting.


In [2]:
!pip install yfinance plotly seaborn scikit-learn statsmodels



In [3]:
# importing libraries and modules for use 
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

from statsmodels.tsa.stattools import adfuller, acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

plt.style.use('seaborn-v0_8')
sns.set_palette("Set2")

In [4]:
#  Exploratory Data Analysis (EDA)
# overall overviews
print("=== Data Info ===")
print(sp500.info())
print("\n=== Missing Values ===")
print(sp500.isnull().sum())

=== Data Info ===


NameError: name 'sp500' is not defined

In [7]:
# summary statistics
sp500.describe()

NameError: name 'sp500' is not defined

In [6]:
# Visualization of  Closing Price Over Time with Plotly)
fig = px.line(sp500, x=sp500.index, y="Adj Close",
              title="S&P 500 Adjusted Close Price (1999–2024)",
              labels={"Adj Close": "Price (USD)"},
              color_discrete_sequence=["#1f77b4"])
fig.update_layout(hovermode="x unified")
fig.show()

NameError: name 'px' is not defined

In [None]:
print("Insight: Long-term upward trend with visible cycles ; dot-com bubble, 2008 crash, post-2020 surge")

In [None]:
#Volume Over Time
fig_vol = px.line(sp500, x=sp500.index, y="Volume",
                  title="S&P 500 Trading Volume Over Time",
                  labels={"Volume": "Volume"},
                  color_discrete_sequence=["#d62728"])
fig_vol.show()

In [None]:
print("Insight: Volume increased significantly after 2010 — more institutional and algorithmic trading")

In [None]:
print("\n Distribution of Daily Returns")
# Calculation of  daily returns
sp500['Return'] = sp500['Adj Close'].pct_change()

# Drop NaN
returns = sp500['Return'].dropna()

# Histogram + KDE
plt.figure(figsize=(10, 6))
sns.histplot(returns, bins=100, kde=True, color='skyblue')
plt.title("Distribution of Daily Returns (S&P 500)")
plt.xlabel("Daily Return")
plt.ylabel("Frequency")
plt.axvline(returns.mean(), color='red', linestyle='--', label=f'Mean: {returns.mean():.4f}')
plt.axvline(0, color='black', linestyle='-', alpha=0.5)
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
print("Insight: Near-normal distribution but with fat tails — extreme events more common than normal theory predicts. ")

In [None]:
print("Rolling Statistics: Mean and Volatility")
window = 252  # ~1 year of trading days

sp500['Rolling_Mean'] = sp500['Adj Close'].rolling(window).mean()
sp500['Rolling_Volatility'] = returns.rolling(window).std() * np.sqrt(252)  # Annualized

# Plot
fig, ax1 = plt.subplots(figsize=(12, 6))

color = 'tab:blue'
ax1.set_xlabel('Date')
ax1.set_ylabel('Price', color=color)
ax1.plot(sp500.index, sp500['Adj Close'], label='Adj Close', color=color, alpha=0.6)
ax1.plot(sp500.index, sp500['Rolling_Mean'], label='1Y Rolling Mean', color='blue', linewidth=2)
ax1.tick_params(axis='y', labelcolor=color)

ax2 = ax1.twinx()
color = 'tab:red'
ax2.set_ylabel('Volatility (Annualized)', color=color)
ax2.plot(sp500.index, sp500['Rolling_Volatility'], color='red', linestyle='--', label='1Y Volatility')
ax2.tick_params(axis='y', labelcolor=color)

fig.tight_layout()
plt.title("S&P 500: Price, Trend, and Volatility")
fig.legend(loc="upper left", bbox_to_anchor=(0.1, 0.9))
plt.show()

In [None]:
print("Insight: Volatility spikes during crises (2000, 2008, 2020). Trend-following behavior evident.")

In [None]:
# Statistical Analysis
# Stationarity Test: Augmented Dickey-Fuller (ADF)
# stationary time series has constant mean/variance 

result = adfuller(sp500['Adj Close'].dropna())
print('=== ADF Test on Raw Prices ===')
print(f'ADF Statistic: {result[0]:.6f}')
print(f'p-value: {result[1]:.6f}')
print("Critical Values:")
for key, value in result[4].items():
    print(f'\t{key}: {value:.3f}')

if result[1] <= 0.05:
    print("✅ Reject Null Hypothesis → Series is stationary")
else:
    print("❌ Cannot reject Null Hypothesis → Series is non-stationary")

In [None]:

# Autocorrelation & Partial Autocorrelation (ACF/PACF)
print("\nAutocorrelation & Partial Autocorrelation (ACF/PACF)")
fig, ax = plt.subplots(2, 2, figsize=(14, 8))

# ACF of Returns
plot_acf(returns.dropna(), lags=40, ax=ax[0,0], title="ACF of Daily Returns")
# PACF of Returns
plot_pacf(returns.dropna(), lags=40, ax=ax[0,1], title="PACF of Daily Returns")

# ACF of Squared Returns (volatility clustering)
plot_acf(returns.dropna()**2, lags=40, ax=ax[1,0], title="ACF of Squared Returns")
# PACF of Squared Returns
plot_pacf(returns.dropna()**2, lags=40, ax=ax[1,1], title="PACF of Squared Returns")

plt.tight_layout()
plt.show()

In [None]:
print("Insight: Little autocorrelation in returns (efficient market hypothesis). -- Strong autocorrelation in squared returns → volatility clustering (GARCH effects).")

In [None]:
print("\nRegression Analysis")

print ("Feature Engineering")
# Create features
sp500['Lag_1'] = sp500['Log_Return'].shift(1)
sp500['Lag_2'] = sp500['Log_Return'].shift(2)
sp500['Volatility_5D'] = sp500['Log_Return'].rolling(5).std()
sp500['MA_10'] = sp500['Adj Close'].rolling(10).mean()
sp500['MA_50'] = sp500['Adj Close'].rolling(50).mean()
sp500['MA_Ratio'] = sp500['MA_10'] / sp500['MA_50']  # Momentum signal

# Target: next day's return
sp500['Target'] = sp500['Log_Return'].shift(-1)

# Drop NaNs
data_for_reg = sp500[['Lag_1', 'Lag_2', 'Volatility_5D', 'MA_Ratio', 'Target']].dropna()

In [None]:
print("Train-Test Split")
# Use 80% for training
split_index = int(len(data_for_reg) * 0.8)
train = data_for_reg.iloc[:split_index]
test = data_for_reg.iloc[split_index:]

X_train = train[['Lag_1', 'Lag_2', 'Volatility_5D', 'MA_Ratio']]
y_train = train['Target']

X_test = test[['Lag_1', 'Lag_2', 'Volatility_5D', 'MA_Ratio']]
y_test = test['Target']

In [None]:
print("Fiting a  Linear Regression Model")
model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Evaluate
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"✅ Regression Results:")
print(f"RMSE: {rmse:.6f}")
print(f"R² Score: {r2:.4f}")

# Feature importance
features = X_train.columns
coefficients = model.coef_
coef_df = pd.DataFrame({'Feature': features, 'Coefficient': coefficients})
print("\nFeature Coefficients:")
print(coef_df)

In [None]:
print("Insight: R² is likely very low (e.g., < 0.05) — hard to predict stock returns.
Lag_1 may have small negative coefficient (mean reversion).
MA_Ratio might be positive (momentum effect).")

In [None]:
print("Predicted vs Actual Plot")

plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5, color='purple')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel("Actual Returns")
plt.ylabel("Predicted Returns")
plt.title("Actual vs Predicted Log Returns (Linear Regression)")
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
#Key Insights & Summary Report
print(" S&P 500 Time Series Analysis (1999–2024) – Junior Data Scientist Report")
print("="*70)
print("1. 📈 Long-Term Trend: Strong upward growth (~7–8% annualized), with major drawdowns in 2000–02, 2008, 2020.")
print("2. 📉 Volatility: Non-constant; clusters during crises (volatility clustering).")
print("3. 📊 Returns Distribution: Approximately normal but with fat tails — extreme moves more likely.")
print("4.  Stationarity: Prices non-stationary; log returns are stationary.")
print("5.  Autocorrelation: Returns uncorrelated (efficient market), but volatility is persistent.")
print("6.  Seasonality: Weak seasonal pattern — possible Santa Claus rally?")
print("7.  Regression: Poor predictive power (low R²), consistent with market efficiency.")
print("8.  Takeaway: Buy-and-hold has worked well; timing the market is extremely difficult.")

In [None]:
 print("Interactive Candlestick Chart for the  Last 2 years for clarity")
recent_data = sp500.tail(500)

fig = go.Figure(data=[go.Candlestick(x=recent_data.index,
                open=recent_data['Open'],
                high=recent_data['High'],
                low=recent_data['Low'],
                close=recent_data['Close'],
                name="S&P 500")])

fig.update_layout(title="S&P 500 Recent Candlestick Chart",
                  xaxis_title="Date",
                  yaxis_title="Price (USD)",
                  xaxis_rangeslider_visible=True)

fig.show()