In [2]:
import pandas as pd
import numpy as np
import os

In [6]:
# ---- Select Dataset Source ----
# Supported options: 'NIFTY50', 'NASDAQ', 'Crypto'
dataset_option = 'NIFTY50'  # Change as needed

# Lookup table: dataset option -> CSV file holding that dataset.
dataset_files = {
    'NIFTY50': 'NIFTY50_Combined.csv',
    'NASDAQ': 'nasdaq_dataset.csv',
    'Crypto': 'crypto_dataset.csv',
}

# Fail fast on an unsupported option instead of continuing without a path.
if dataset_option not in dataset_files:
    raise ValueError('Invalid dataset option selected!')
DATA_PATH = dataset_files[dataset_option]

print(f"Using dataset: {DATA_PATH}")

Using dataset: NIFTY50_Combined.csv


In [7]:

# Top-level folder for every file this notebook writes.
RESULTS_DIR = "results"
# exist_ok=True makes re-running this cell a no-op when the folder is already there.
os.makedirs(name=RESULTS_DIR, exist_ok=True)

In [18]:
# Each dataset option gets its own folder under RESULTS_DIR; create it if missing.
dataset_results_dir = os.path.join(RESULTS_DIR, dataset_option)
os.makedirs(dataset_results_dir, exist_ok=True)

In [19]:
# Read the CSV selected via DATA_PATH into a DataFrame.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

In [20]:
# Show which columns the loaded CSV provides before reshaping it.
dataset_columns = df.columns
print("Columns in dataset:", dataset_columns)

Columns in dataset: Index(['Date', 'Company', 'Open', 'High', 'Low', 'Close', 'Volume'], dtype='object')


In [21]:
# 2. Pivot table: one row per Date, one column per Company, values = Close price
#    (the "Close" column is what is read here, not an adjusted close).
price_df = df.pivot(index="Date", columns="Company", values="Close")

In [22]:
# Preview the first five rows of the wide price table (dates x companies).
price_df.head(n=5)

Company,Adani Enterprises,Adani Green Energy,Adani Ports,Apollo Hospitals,Asian Paints,Axis Bank,Bajaj Auto,Bajaj Finance,Bajaj Finserv,Bharat Petroleum,...,State Bank of India,Sun Pharmaceutical,Tata Consultancy Services,Tata Motors,Tata Steel,Tech Mahindra,Titan Company,UPL,UltraTech Cement,Wipro
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-01-02,3835.772705,1888.699951,809.38147,4417.438965,2960.424316,939.112061,3354.876953,646.237976,1549.948853,147.562225,...,578.999573,968.350952,3121.404541,389.017822,110.620438,930.023743,2541.474365,703.149963,6901.092285,188.113312
2023-01-03,3825.537354,1892.949951,807.560547,4453.686035,2941.96582,959.757446,3380.925781,651.414795,1560.431519,148.888229,...,579.18866,980.10321,3169.161865,388.131012,109.878334,943.197937,2588.871826,700.569153,6877.147949,190.027725
2023-01-04,3821.642822,1860.5,797.27478,4396.5625,2930.890869,954.920288,3335.070068,647.938904,1545.306396,148.645126,...,572.379089,975.246887,3172.319824,379.952606,107.373726,939.973511,2572.973877,696.96582,6888.702148,186.51004
2023-01-05,3824.388916,1860.0,806.723877,4392.347656,2918.746826,947.041077,3399.183594,601.500549,1466.436157,151.319229,...,572.284607,986.999329,3168.922607,381.233521,108.115829,932.004517,2537.859131,701.494385,6890.324707,185.169952
2023-01-06,3818.846924,1838.25,793.435974,4351.09375,2893.535889,937.416687,3418.990234,589.702332,1427.999512,152.357956,...,568.075989,979.714783,3073.647461,376.405304,107.28096,908.512146,2514.086182,699.400513,6865.9375,182.824799


In [23]:
# 3. Calculate daily returns (percentage change)
# fill_method=None: do not forward-fill missing prices before differencing.
# Relying on the implicit 'pad' default is deprecated in recent pandas and
# turns a missing price into a fabricated 0% return. With no filling, a
# missing price yields NaN returns, and dropna() then removes every row that
# contains a NaN (including the first row, which has no previous day).
returns_df = price_df.pct_change(fill_method=None).dropna()

In [24]:
# 4. Expected return per company: arithmetic mean of its daily returns
#    (averaged down the rows, one value per column; no annualisation applied).
expected_returns = returns_df.mean(axis=0)

In [25]:
# 5. Pairwise covariance of the daily returns between every pair of companies.
#    min_periods=None keeps the default: no minimum observation count is enforced.
cov_matrix = returns_df.cov(min_periods=None)

In [26]:
# Persist the three artefacts into the dataset-specific results folder.
# Path components are handed to os.path.join individually (no hard-coded "/"),
# and the folder is created here as well so this cell does not rely on a
# different cell having been run first.
output_dir = os.path.join(RESULTS_DIR, dataset_option)
os.makedirs(output_dir, exist_ok=True)

returns_df.to_csv(os.path.join(output_dir, "daily_returns.csv"))
expected_returns.to_csv(os.path.join(output_dir, "expected_returns.csv"))
cov_matrix.to_csv(os.path.join(output_dir, "cov_matrix.csv"))

In [27]:

# Short run summary: confirmation line, a peek at the mean returns,
# and the dimensions of the covariance matrix.
print("✅ Preprocessing done. Files saved in /results")

expected_preview = expected_returns.head()
print("Expected Returns (first few):\n", expected_preview)

cov_shape = cov_matrix.shape
print("Covariance Matrix shape:", cov_shape)

✅ Preprocessing done. Files saved in /results
Expected Returns (first few):
 Company
Adani Enterprises    -0.000039
Adani Green Energy   -0.000508
Adani Ports           0.001241
Apollo Hospitals      0.001112
Asian Paints         -0.000478
dtype: float64
Covariance Matrix shape: (50, 50)
