In [4]:
import pandas as pd
import numpy as np
import pathlib

# Paths
RAW = pathlib.Path("../data/raw/sample_data.csv")
RAW.parent.mkdir(parents=True, exist_ok=True)

# Generate synthetic dataset
np.random.seed(42)  # for reproducibility
n_rows = 500  # larger dataset

df = pd.DataFrame({
    "date": pd.date_range("2024-01-01", periods=n_rows, freq="D"),
    "price": np.random.normal(loc=100, scale=10, size=n_rows),   # random prices around 100
    "volume": np.random.randint(500, 2000, size=n_rows),         # random volume between 500–2000
    "returns": np.random.normal(loc=0.001, scale=0.02, size=n_rows)  # daily returns
})

# Inject missing values randomly
for col in ["price", "volume", "returns"]:
    missing_idx = np.random.choice(n_rows, size=int(n_rows * 0.1), replace=False)  # 10% missing
    df.loc[missing_idx, col] = np.nan

# Save to raw directory
df.to_csv(RAW, index=False)
print(f"✅ Sample dataset with {n_rows} rows saved -> {RAW}")

df.head(500)


✅ Sample dataset with 500 rows saved -> ../data/raw/sample_data.csv


Unnamed: 0,date,price,volume,returns
0,2024-01-01,,1805.0,0.020714
1,2024-01-02,98.617357,1843.0,-0.010889
2,2024-01-03,106.476885,,0.005350
3,2024-01-04,115.230299,814.0,0.039917
4,2024-01-05,97.658466,1376.0,0.014832
...,...,...,...,...
495,2025-05-10,105.389100,1958.0,-0.004644
496,2025-05-11,89.627538,,0.018306
497,2025-05-12,98.096613,1365.0,
498,2025-05-13,91.243817,1023.0,0.016462
