# Gaussian Copula Model for Securities
This notebook demonstrates how to build a Gaussian copula model based on n number of securities and their market data.

In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm, multivariate_normal
from sklearn.preprocessing import StandardScaler
import yfinance as yf

# make numpy display all the columns in one line
np.set_printoptions(linewidth=np.inf)
# do the same for pandas print
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', 10)

np.random.seed(42)

## Step 1: Load and Preprocess Market Data
Load the market data for n securities and preprocess it.

In [4]:
# Download market data for 10 stocks for the year 2024
tickers = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'META', 'TSLA', 'NVDA', 'JPM', 'V', 'DIS']
data = yf.download(tickers, start='2024-01-01', end='2024-12-31')['Close']

returns = data.pct_change().dropna()
scaler = StandardScaler()
scaled_returns = scaler.fit_transform(returns)

[*********************100%***********************]  10 of 10 completed


## Method 1: Using library functions of joint normal samples
Calculate the covariance matrix of the scaled returns.

In [12]:
# cov_matrix of scaled returns is approximately the correlation matrix of original returns
cov_matrix = np.cov(scaled_returns, rowvar=False)
print('Covariance Matrix:\n', cov_matrix)

Covariance Matrix:
 [[1.00401606 0.3377596  0.05977322 0.36020063 0.01526369 0.226193   0.47151427 0.25500373 0.32957531 0.14681625]
 [0.3377596  1.00401606 0.1201376  0.54110954 0.24043504 0.58102271 0.68920676 0.43802133 0.31837241 0.25164645]
 [0.05977322 0.1201376  1.00401606 0.0804788  0.25850803 0.05545685 0.13900552 0.08253018 0.13952951 0.14967616]
 [0.36020063 0.54110954 0.0804788  1.00401606 0.14247483 0.3895367  0.5738031  0.31428636 0.25176369 0.24587224]
 [0.01526369 0.24043504 0.25850803 0.14247483 1.00401606 0.0742511  0.16163963 0.08655234 0.23466584 0.38116962]
 [0.226193   0.58102271 0.05545685 0.3895367  0.0742511  1.00401606 0.57252185 0.382839   0.14982836 0.17130895]
 [0.47151427 0.68920676 0.13900552 0.5738031  0.16163963 0.57252185 1.00401606 0.45953112 0.30147817 0.30490809]
 [0.25500373 0.43802133 0.08253018 0.31428636 0.08655234 0.382839   0.45953112 1.00401606 0.21435166 0.1115175 ]
 [0.32957531 0.31837241 0.13952951 0.25176369 0.23466584 0.14982836 0.301478

Generate the joint distribution using the covariance matrix.

In [14]:
mean = np.zeros(cov_matrix.shape[0])
joint_dist = multivariate_normal(mean, cov_matrix)

Construct a synthetic dataset that has the same underlying distribution and covariance matrix.

In [40]:
n_samples = 10000  # Number of samples in synthetic dataset
synthetic_data = joint_dist.rvs(size=n_samples)
synthetic_data = scaler.inverse_transform(synthetic_data)
synthetic_df = pd.DataFrame(synthetic_data, columns=data.columns)
print(synthetic_df.head())

print('Synthetic Data Statistics:')
print(synthetic_df.describe())
print('Original Data Statistics:')
print(returns.describe())

print('Synthetic Data Correlation Matrix:')
print(synthetic_df.corr())
print('Original Data Correlation Matrix:')
print(returns.corr())

Ticker      AAPL      AMZN       DIS     GOOGL       JPM      META      MSFT      NVDA      TSLA         V
0       0.006601 -0.012489  0.008346  0.011800  0.001861  0.032840 -0.003180  0.018115  0.014310 -0.000829
1       0.013440  0.010878 -0.002040  0.021606 -0.002456  0.008639 -0.001355  0.034944  0.088596  0.018492
2      -0.004394  0.008118  0.023525  0.013281  0.012918 -0.013053  0.004810  0.050919  0.021797 -0.001113
3      -0.024162  0.009082 -0.006436 -0.020394 -0.003079  0.002657 -0.005385  0.009613  0.049833 -0.015989
4      -0.002437 -0.005164 -0.024765  0.015048  0.001723 -0.003944 -0.015304 -0.023720 -0.031644  0.004201
Synthetic Data Statistics:
Ticker          AAPL          AMZN           DIS         GOOGL           JPM          META          MSFT          NVDA          TSLA             V
count   10000.000000  10000.000000  10000.000000  10000.000000  10000.000000  10000.000000  10000.000000  10000.000000  10000.000000  10000.000000
mean        0.001404      0.001696   

## Method 2: Using manual steps of Gaussian copula matrix

The covariance matrix $ \Sigma $ is defined as:
$$ \Sigma = \frac{1}{n-1} \sum_{i=1}^{n} (X_i - \mu)(X_i - \mu)^T $$
where $ X_i $ is the vector of returns, and $ \mu $ is the mean vector.

In [23]:
cor_matrix = np.corrcoef(returns, rowvar=False) # cov_matrix of scaled returns is approximately the correlation matrix of original returns
print('\nCorrelation Matrix:\n', cor_matrix)
cov_matrix = np.cov(returns, rowvar=False)
print('\nCovariance Matrix:\n', cov_matrix)


Correlation Matrix:
 [[1.         0.33640856 0.05953413 0.35875982 0.01520264 0.22528823 0.46962821 0.25398372 0.32825701 0.14622898]
 [0.33640856 1.         0.11965705 0.53894511 0.2394733  0.57869862 0.68644993 0.43626925 0.31709892 0.25063987]
 [0.05953413 0.11965705 1.         0.08015688 0.257474   0.05523502 0.1384495  0.08220006 0.13897139 0.14907745]
 [0.35875982 0.53894511 0.08015688 1.         0.14190493 0.38797855 0.57150789 0.31302922 0.25075663 0.24488875]
 [0.01520264 0.2394733  0.257474   0.14190493 1.         0.0739541  0.16099307 0.08620613 0.23372718 0.37964494]
 [0.22528823 0.57869862 0.05523502 0.38797855 0.0739541  1.         0.57023177 0.38130764 0.14922904 0.17062372]
 [0.46962821 0.68644993 0.1384495  0.57150789 0.16099307 0.57023177 1.         0.457693   0.30027226 0.30368846]
 [0.25398372 0.43626925 0.08220006 0.31302922 0.08620613 0.38130764 0.457693   1.         0.21349426 0.11107143]
 [0.32825701 0.31709892 0.13897139 0.25075663 0.23372718 0.14922904 0.3002

In [24]:
eigenvalues, eigenvectors = np.linalg.eig(cor_matrix)
print('\nEigenvalues:', eigenvalues)
print('\nEigenvectors:', eigenvectors)


Eigenvalues: [3.65199894 1.40376315 0.27995585 0.36577314 0.95636467 0.87191844 0.72650695 0.53998454 0.61820265 0.58553167]

Eigenvectors: [[ 2.92966928e-01  1.46176639e-01  1.95861032e-01  1.45522649e-01 -6.15142042e-01 -1.66119518e-01 -2.75513977e-01 -4.96989589e-01 -2.76063646e-01 -1.67783250e-01]
 [ 4.31219938e-01  9.60603341e-02  5.60773081e-01 -6.11467816e-01  1.80295144e-01  4.73544083e-02  2.29200722e-02 -7.61537743e-02  2.74370381e-01 -1.04797993e-02]
 [ 1.21675177e-01 -4.48897937e-01  4.86418565e-02  8.24825235e-03  1.93813801e-01 -7.95286796e-01 -2.71091425e-01  1.11528781e-01 -4.57846919e-02  1.52254341e-01]
 [ 3.71376914e-01  1.09457365e-01  5.44633097e-02  3.41985372e-01 -2.43028213e-02  1.13469750e-01 -3.70616384e-01  5.98502467e-01  2.06157543e-01 -4.22669345e-01]
 [ 1.76241264e-01 -6.11415049e-01 -9.61139287e-02  1.62919548e-01  1.89802422e-01  1.84560148e-01  1.93181009e-01 -4.07851683e-01  1.91627303e-01 -4.98858357e-01]
 [ 3.51470662e-01  2.46321926e-01  4.1318502

To understand the structure of the covariance matrix, we perform eigen decomposition.

The eigen decomposition of the covariance matrix $ \Sigma $ is given by:
$$ \Sigma = EDE^T $$
where $ E $ is the matrix of eigenvectors, and $ D $ is the diagonal matrix of eigenvalues.

The transformation matrix $ T $ is then constructed as:
$$ T = D^{1/2} E^T $$
This matrix $ T $ is used to transform standard normal samples into samples that follow the desired covariance structure.

In [29]:
# make diagonal matrix from eigenvalues
D = np.diag(eigenvalues)
# square root of D
D_sqrt = np.sqrt(D)
# Transpose of eigenvectors
E = eigenvectors.T
# Transformation matrix
T = np.dot(D_sqrt, E)

We generate synthetic data by drawing samples from the multivariate normal distribution. This synthetic data retains the same dependency structure as the original data, making it useful for various financial analyses.

In [46]:
# Generate synthetic data
n_samples = 1000000
normal_sample = np.random.randn(n_samples, len(data.columns))
synthetic_data_manual = np.dot(normal_sample, T)
synthetic_data_manual = scaler.inverse_transform(synthetic_data_manual)
synthetic_df_manual = pd.DataFrame(synthetic_data_manual, columns=data.columns)
print(synthetic_df_manual.head())

print('\nSynthetic Data Statistics:')
print(synthetic_df_manual.describe())
print('\nOriginal Data Statistics:')
print(returns.describe())

print('\nSynthetic Data Correlation Matrix:')
print(synthetic_df_manual.corr())
print('\nOriginal Data Correlation Matrix:')
print(returns.corr())

Ticker      AAPL      AMZN       DIS     GOOGL       JPM      META      MSFT      NVDA      TSLA         V
0       0.034209 -0.013546 -0.002125 -0.010370  0.017371 -0.064480 -0.010584 -0.002158  0.067413  0.008777
1       0.016143  0.031007  0.034425  0.018615  0.008627  0.035118  0.020787 -0.009670  0.006205  0.024310
2       0.015717  0.030187  0.024171  0.026641  0.011984  0.022619  0.003938  0.012696  0.073056  0.012231
3      -0.000728  0.018875 -0.010180 -0.014817 -0.028274  0.011467  0.006200 -0.010251  0.052031 -0.005931
4      -0.008731 -0.014903 -0.017098 -0.010983  0.003809 -0.022630  0.005039  0.017494  0.024203 -0.002607

Synthetic Data Statistics:
Ticker            AAPL            AMZN             DIS           GOOGL             JPM            META            MSFT            NVDA            TSLA               V
count   1000000.000000  1000000.000000  1000000.000000  1000000.000000  1000000.000000  1000000.000000  1000000.000000  1000000.000000  1000000.000000  1000000.000

Directly using the covariance matrix instead of correlation matrix so that results are not required to be normalized.

In [49]:
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)
print('\nEigenvalues:', eigenvalues)
print('\nEigenvectors:', eigenvectors)
# make diagonal matrix from eigenvalues
D = np.diag(eigenvalues)
# square root of D
D_sqrt = np.sqrt(D)
# Transpose of eigenvectors
E = eigenvectors.T
# Transformation matrix
T = np.dot(D_sqrt, E)
n_samples = 1000000
normal_sample = np.random.randn(n_samples, len(data.columns))
synthetic_data_manual = np.dot(normal_sample, T)
synthetic_df_manual = pd.DataFrame(synthetic_data_manual, columns=data.columns)


print('\nSynthetic Data Covariance Matrix:')
print(synthetic_df_manual.cov())
print('\nOriginal Data Covariance Matrix:')
print(returns.cov())


Eigenvalues: [2.05410947e-03 1.11333593e-03 5.14466695e-04 3.03219474e-04 2.47863502e-04 1.91788125e-04 5.05665112e-05 8.32205442e-05 1.40351139e-04 1.20808156e-04]

Eigenvectors: [[ 0.14429263 -0.02411793  0.09517642 -0.06220645 -0.36934801 -0.36683897  0.20595361  0.1626669  -0.75569197  0.23395133]
 [ 0.24572039 -0.17982166  0.33719506  0.08065048 -0.13458124  0.15714164  0.2791197  -0.01748361 -0.10045101 -0.80992046]
 [ 0.06782927  0.01261285  0.03787692  0.75647716  0.23161875 -0.59125868  0.03493855 -0.01076871  0.12147699 -0.0471206 ]
 [ 0.19601209 -0.13313235  0.32290846  0.08514735 -0.65102689 -0.01464735  0.09036034  0.1229737   0.54408027  0.29828456]
 [ 0.09112794  0.03678232  0.07558278  0.54067717  0.0704008   0.65676811 -0.07483791  0.37959418 -0.24357469  0.21674061]
 [ 0.24373062 -0.32370459  0.58927834 -0.25425579  0.57374819 -0.05623282  0.07622896  0.07810924  0.02355012  0.28124744]
 [ 0.170835   -0.1333235   0.21077243  0.03542477 -0.15189046 -0.05749448 -0.8947