# ============================================================================
# Data Acquisition – Synthetic Dataset
# ============================================================================


This notebook is responsible for generating the market data used throughout
the project.

Since full intraday historical options chain data is not freely available,
I chose to generate a synthetic but statistically realistic dataset. 
The goal of this project is to demonstrate the complete quantitative research workflow, data engineering, feature creation, regime detection, strategy design, and
machine learning rather than focusing on data scraping.

The dataset is generated in a reproducible manner and saved locally so that
all subsequent notebooks can consume the same data.

The following datasets are created:
1. NIFTY Spot – 5-minute OHLCV data
2. NIFTY Futures – 5-minute OHLCV with open interest
3. NIFTY Options Chain – ATM and ATM ±2 strikes (Call & Put)



In [1]:

# ============================================================================
# Cell 2: Imports
# ============================================================================

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

print("✅ Imports successful")



✅ Imports successful


In [2]:


# ============================================================================
# Cell 3: Configuration
# ============================================================================

# Data directories
DATA_DIR = Path('../data')
RAW_DIR = DATA_DIR / 'raw'
RAW_DIR.mkdir(parents=True, exist_ok=True)

# Date range
START_DATE = datetime(2024, 1, 1, 9, 15)
END_DATE = datetime(2025, 1, 14, 15, 30)
BASE_PRICE = 18000

print(f"Period: {START_DATE.date()} to {END_DATE.date()}")
print(f"Base Price: ₹{BASE_PRICE:,}")



Period: 2024-01-01 to 2025-01-14
Base Price: ₹18,000


In [3]:
# ============================================================================
# Cell 4: Generate Timestamps
# ============================================================================

# Create 5-minute timestamps for market hours (9:15 AM - 3:30 PM)
timestamps = pd.date_range(start=START_DATE, end=END_DATE, freq='5min')

# Filter for market hours only
timestamps = timestamps[
    ((timestamps.hour == 9) & (timestamps.minute >= 15)) |
    ((timestamps.hour > 9) & (timestamps.hour < 15)) |
    ((timestamps.hour == 15) & (timestamps.minute <= 30))
]

print(f"Total timestamps: {len(timestamps):,}")
print(f"First: {timestamps[0]}")
print(f"Last: {timestamps[-1]}")



Total timestamps: 28,880
First: 2024-01-01 09:15:00
Last: 2025-01-14 15:30:00


In [4]:
# ============================================================================
# Cell 5: Generate NIFTY Spot Data
# ============================================================================

print("\n" + "="*70)
print("GENERATING NIFTY SPOT DATA")
print("="*70)

n = len(timestamps)
np.random.seed(42)  # For reproducibility

# Generate realistic price movements
returns = np.random.normal(0.0001, 0.01, n)
prices = BASE_PRICE * (1 + returns).cumprod()

# Add upward trend (realistic for Indian markets)
trend = np.linspace(0, 0.15, n)
prices = prices * (1 + trend)

# Generate OHLC
noise = np.random.normal(0, 0.002, n)
spot_df = pd.DataFrame({
    'timestamp': timestamps,
    'open': prices * (1 + noise),
    'high': prices * (1 + np.abs(np.random.normal(0, 0.003, n))),
    'low': prices * (1 - np.abs(np.random.normal(0, 0.003, n))),
    'close': prices,
    'volume': np.random.randint(500000, 2000000, n)
})

# Ensure OHLC validity
spot_df['high'] = spot_df[['open', 'high', 'low', 'close']].max(axis=1)
spot_df['low'] = spot_df[['open', 'high', 'low', 'close']].min(axis=1)

# Save
spot_file = RAW_DIR / 'nifty_spot_5min.csv'
spot_df.to_csv(spot_file, index=False)

print(f"✅ Generated {len(spot_df):,} rows")
print(f"✅ Saved: {spot_file}")
print(f"\nSample Data:")
print(spot_df.head())




GENERATING NIFTY SPOT DATA
✅ Generated 28,880 rows
✅ Saved: ..\data\raw\nifty_spot_5min.csv

Sample Data:
            timestamp          open          high           low         close  \
0 2024-01-01 09:15:00  18031.061554  18096.399679  18002.290282  18091.208548   
1 2024-01-01 09:20:00  18059.899742  18132.221053  18059.171267  18068.097832   
2 2024-01-01 09:25:00  18192.682828  18226.495859  18154.706775  18187.024105   
3 2024-01-01 09:30:00  18469.180687  18469.180687  18337.545500  18465.932526   
4 2024-01-01 09:35:00  18420.133196  18546.840175  18384.642231  18424.636212   

    volume  
0   764010  
1  1428462  
2  1610771  
3  1415442  
4  1960923  


In [5]:
# ============================================================================
# Cell 6: Generate NIFTY Futures Data
# ============================================================================

print("\n" + "="*70)
print("GENERATING NIFTY FUTURES DATA")
print("="*70)

futures_df = spot_df.copy()

# Add futures basis (cost of carry)
for idx, row in futures_df.iterrows():
    days_to_expiry = (30 - row['timestamp'].day) % 30 + 1
    basis = 0.002 * (days_to_expiry / 30)
    basis += np.random.normal(0, 0.0005)
    
    futures_df.loc[idx, 'open'] = row['open'] * (1 + basis)
    futures_df.loc[idx, 'high'] = row['high'] * (1 + basis)
    futures_df.loc[idx, 'low'] = row['low'] * (1 + basis)
    futures_df.loc[idx, 'close'] = row['close'] * (1 + basis)

# Add Open Interest
base_oi = 2000000
oi_trend = np.linspace(base_oi, base_oi * 2, n)
monthly_cycle = np.sin(np.arange(n) * 2 * np.pi / (78 * 20)) * 500000
futures_df['open_interest'] = (oi_trend + monthly_cycle).astype(int)

# Add expiry dates
def get_last_thursday(year, month):
    if month == 12:
        last_day = datetime(year + 1, 1, 1) - timedelta(days=1)
    else:
        last_day = datetime(year, month + 1, 1) - timedelta(days=1)
    while last_day.weekday() != 3:
        last_day -= timedelta(days=1)
    return last_day

futures_df['expiry'] = futures_df['timestamp'].apply(
    lambda x: get_last_thursday(x.year, x.month)
)

# Save
futures_file = RAW_DIR / 'nifty_futures_5min.csv'
futures_df.to_csv(futures_file, index=False)

print(f"✅ Generated {len(futures_df):,} rows")
print(f"✅ Saved: {futures_file}")
print(f"\nSample Data:")
print(futures_df.head())




GENERATING NIFTY FUTURES DATA
✅ Generated 28,880 rows
✅ Saved: ..\data\raw\nifty_futures_5min.csv

Sample Data:
            timestamp          open          high           low         close  \
0 2024-01-01 09:15:00  18065.417927  18130.880548  18036.591835  18125.679525   
1 2024-01-01 09:20:00  18086.376577  18158.803915  18085.647034  18094.586686   
2 2024-01-01 09:25:00  18225.315076  18259.188757  18187.270904  18219.646202   
3 2024-01-01 09:30:00  18496.344977  18496.344977  18364.516183  18493.092039   
4 2024-01-01 09:35:00  18460.035558  18587.017014  18424.467711  18464.548329   

    volume  open_interest     expiry  
0   764010        2000000 2024-01-25  
1  1428462        2002083 2024-01-25  
2  1610771        2004166 2024-01-25  
3  1415442        2006249 2024-01-25  
4  1960923        2008332 2024-01-25  


In [6]:
# ============================================================================
# Cell 7: Generate NIFTY Options Chain
# ============================================================================

print("\n" + "="*70)
print("GENERATING NIFTY OPTIONS CHAIN")
print("="*70)

all_options = []

for idx, row in spot_df.iterrows():
    if idx % 1000 == 0:
        print(f"Progress: {idx:,}/{n:,} ({idx/n*100:.1f}%)", end='\r')
    
    spot_price = row['close']
    atm_strike = round(spot_price / 50) * 50
    timestamp = row['timestamp']
    
    # Generate for ATM ±2 strikes
    for i in range(-2, 3):
        strike = atm_strike + i * 50
        moneyness = (spot_price - strike) / strike
        
        # Base IV
        base_iv = 0.15 + abs(i) * 0.025
        iv_noise = np.random.normal(0, 0.01)
        
        # Days to expiry
        days_to_expiry = (futures_df.loc[idx, 'expiry'] - timestamp).days
        time_value = base_iv * spot_price * np.sqrt(days_to_expiry / 365) * 0.4
        
        # CALL OPTION
        call_iv = max(base_iv + iv_noise, 0.05)
        call_intrinsic = max(spot_price - strike, 0)
        call_ltp = max(call_intrinsic + time_value + np.random.normal(0, 2), 0.05)
        
        all_options.append({
            'timestamp': timestamp,
            'strike': strike,
            'option_type': 'CE',
            'ltp': call_ltp,
            'iv': call_iv,
            'open_interest': int(np.random.randint(50000, 500000) * (1 + abs(moneyness))),
            'volume': int(np.random.randint(5000, 50000) * (1 + abs(moneyness))),
            'strike_position': i
        })
        
        # PUT OPTION
        put_iv = max(base_iv + iv_noise + 0.01, 0.05)
        put_intrinsic = max(strike - spot_price, 0)
        put_ltp = max(put_intrinsic + time_value + np.random.normal(0, 2), 0.05)
        
        all_options.append({
            'timestamp': timestamp,
            'strike': strike,
            'option_type': 'PE',
            'ltp': put_ltp,
            'iv': put_iv,
            'open_interest': int(np.random.randint(50000, 500000) * (1 + abs(moneyness))),
            'volume': int(np.random.randint(5000, 50000) * (1 + abs(moneyness))),
            'strike_position': i
        })

options_df = pd.DataFrame(all_options)

# Save
options_file = RAW_DIR / 'nifty_options_5min.csv'
options_df.to_csv(options_file, index=False)

print(f"\n✅ Generated {len(options_df):,} rows")
print(f"✅ Saved: {options_file}")
print(f"\nSample Data:")
print(options_df.head(10))




GENERATING NIFTY OPTIONS CHAIN
Progress: 28,000/28,880 (97.0%)
✅ Generated 288,800 rows
✅ Saved: ..\data\raw\nifty_options_5min.csv

Sample Data:
            timestamp  strike option_type         ltp        iv  \
0 2024-01-01 09:15:00   18000          CE  457.198333  0.189037   
1 2024-01-01 09:15:00   18000          PE  362.492068  0.199037   
2 2024-01-01 09:15:00   18050          CE  359.544778  0.165595   
3 2024-01-01 09:15:00   18050          PE  318.958470  0.175595   
4 2024-01-01 09:15:00   18100          CE  269.261553  0.143144   
5 2024-01-01 09:15:00   18100          PE  280.293934  0.153144   
6 2024-01-01 09:15:00   18150          CE  316.406965  0.184701   
7 2024-01-01 09:15:00   18150          PE  373.968038  0.194701   
8 2024-01-01 09:15:00   18200          CE  361.024833  0.187750   
9 2024-01-01 09:15:00   18200          PE  473.408012  0.197750   

   open_interest  volume  strike_position  
0         215696   28706               -2  
1         179565   19637   

In [7]:
# ============================================================================
# Cell 8: Summary
# ============================================================================

print("\n" + "="*70)
print("DATA GENERATION COMPLETE!")
print("="*70)

print(f"\nGenerated Files:")
print(f"1. Spot:    {len(spot_df):,} rows")
print(f"2. Futures: {len(futures_df):,} rows")
print(f"3. Options: {len(options_df):,} rows")

print(f"\nData Period: {timestamps.min()} to {timestamps.max()}")
print(f"Total Days: {(timestamps.max() - timestamps.min()).days}")

print("\n✅ Ready for next step: Data Cleaning")


DATA GENERATION COMPLETE!

Generated Files:
1. Spot:    28,880 rows
2. Futures: 28,880 rows
3. Options: 288,800 rows

Data Period: 2024-01-01 09:15:00 to 2025-01-14 15:30:00
Total Days: 379

✅ Ready for next step: Data Cleaning


In [8]:
import os

# Ensure directories exist
os.makedirs("data/raw", exist_ok=True)

# Save datasets
spot_df.to_csv("data/raw/nifty_spot_5min.csv", index=False)
futures_df.to_csv("data/raw/nifty_futures_5min.csv", index=False)
options_df.to_csv("data/raw/nifty_options_5min.csv", index=False)

print("Datasets saved to data/raw/")


Datasets saved to data/raw/


In [9]:
import os
os.listdir("data/raw")


['nifty_futures_5min.csv', 'nifty_options_5min.csv', 'nifty_spot_5min.csv']