In [5]:
import os
from dotenv import load_dotenv
from pathlib import Path
from datetime import datetime
import pandas as pd
import yfinance as yf

def load_stock_data(symbol="TSLA", period="3mo", interval="1d"):
    """
    Download price history for one ticker and return a tidy two-column frame.

    Parameters
    ----------
    symbol : str
        Ticker forwarded to ``yf.download``.
    period : str
        Look-back window forwarded to ``yf.download``.
    interval : str
        Bar size forwarded to ``yf.download``.

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` (datetime) and ``adj_close`` (float). ``adj_close``
        holds "Adj Close" when the download provides it, otherwise "Close".

    Raises
    ------
    ValueError
        If the download has no rows, no timestamp column, or neither an
        "Adj Close" nor a "Close" column.
    """
    df = yf.download(symbol, period=period, interval=interval, auto_adjust=False).reset_index()

    # If columns are multi-index, keep only the first level (the field name)
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = [c[0] if isinstance(c, tuple) else c for c in df.columns]

    # A failed or empty download would otherwise flow through and be saved
    # as an empty file further down the notebook.
    if df.empty:
        raise ValueError(
            f"No data returned for {symbol!r} (period={period!r}, interval={interval!r})"
        )

    # yfinance labels the timestamp index "Date" for daily-or-longer bars and
    # "Datetime" for intraday bars; accept either so `interval` is usable.
    date_col = next((c for c in ("Date", "Datetime") if c in df.columns), None)
    if date_col is None:
        raise ValueError("No 'Date' or 'Datetime' column found in downloaded data!")

    # Prefer the adjusted close, fall back to the raw close
    price_col = next((c for c in ("Adj Close", "Close") if c in df.columns), None)
    if price_col is None:
        raise ValueError("No 'Close' or 'Adj Close' column found in downloaded data!")

    df = df[[date_col, price_col]].rename(columns={date_col: "date", price_col: "adj_close"})

    # Fix data types (unparseable values become NaT / NaN)
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    df["adj_close"] = pd.to_numeric(df["adj_close"], errors="coerce").astype(float)

    return df

# Load DATA_DIR_RAW / DATA_DIR_PROCESSED from a local .env file, if one exists.
load_dotenv()

# os.getenv returns None for an unset variable and Path(None) raises an opaque
# TypeError, so fall back to the project-relative directories this notebook
# has been run with.
RAW = Path(os.getenv("DATA_DIR_RAW", "data/raw"))
PROCESSED = Path(os.getenv("DATA_DIR_PROCESSED", "data/processed"))

RAW.mkdir(parents=True, exist_ok=True)
PROCESSED.mkdir(parents=True, exist_ok=True)

SYMBOL = "TSLA"
# load_stock_data already returns `date` as datetime and `adj_close` as float,
# so no further dtype coercion is needed here.
df_api = load_stock_data(SYMBOL)

print(f"✅ Downloaded {SYMBOL} data:")
print(df_api.head())

# Minute-resolution stamp keeps successive runs from overwriting each other.
ts = datetime.now().strftime("%Y%m%d-%H%M")
csv_path = RAW / f"{SYMBOL}_{ts}.csv"
parquet_path = PROCESSED / f"{SYMBOL}_{ts}.parquet"

df_api.to_csv(csv_path, index=False)
df_api.to_parquet(parquet_path, index=False, engine="fastparquet")

print(f"✅ Saved {csv_path} and {parquet_path}")

[*********************100%***********************]  1 of 1 completed


✅ Downloaded TSLA data:
        date   adj_close
0 2025-05-20  343.820007
1 2025-05-21  334.619995
2 2025-05-22  341.040009
3 2025-05-23  339.339996
4 2025-05-27  362.890015
✅ Saved data/raw/TSLA_20250820-1041.csv and data/processed/TSLA_20250820-1041.parquet


In [None]:
# Read both freshly written files back so the round trip can be checked.
# The CSV needs `parse_dates` because CSV text carries no dtype information.
df_csv, df_parquet = (
    pd.read_csv(csv_path, parse_dates=["date"]),
    pd.read_parquet(parquet_path),
)

def validate(df):
    """
    Check that `df` has a fully populated datetime `date` column and a fully
    populated float `adj_close` column.

    Columns with another dtype are coerced in place on `df` before checking
    (unparseable values become NaT / NaN and then fail the check).

    Raises
    ------
    KeyError
        If `date` or `adj_close` is missing from `df`.
    AssertionError
        If either column contains missing values after coercion.
    """
    missing = [col for col in ("date", "adj_close") if col not in df.columns]
    if missing:
        raise KeyError(f"missing required column(s): {missing}")

    if not pd.api.types.is_datetime64_any_dtype(df["date"]):
        df["date"] = pd.to_datetime(df["date"], errors="coerce")
    if not pd.api.types.is_float_dtype(df["adj_close"]):
        df["adj_close"] = pd.to_numeric(df["adj_close"], errors="coerce").astype(float)

    # Raise explicitly rather than using `assert`: the checks then survive
    # `python -O` and report which column failed and by how much.
    for col in ("date", "adj_close"):
        n_bad = int(df[col].isna().sum())
        if n_bad:
            raise AssertionError(f"column '{col}' has {n_bad} missing/unparseable value(s)")

    print("✅ Validation passed")

# Run the same checks on the CSV copy first, then on the Parquet copy.
for _reloaded in (df_csv, df_parquet):
    validate(_reloaded)

✅ Validation passed
✅ Validation passed


In [None]:
def write_df(df, path):
    """
    Write `df` to `path` as CSV or Parquet, chosen by the file extension.

    Parent directories are created if needed. The extension is matched
    case-insensitively. The index is never written.

    Raises
    ------
    ImportError
        If the target is Parquet and neither `fastparquet` nor `pyarrow`
        can be imported.
    ValueError
        If the extension is neither `.csv` nor `.parquet`.
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    suffix = path.suffix.lower()
    if suffix == ".csv":
        df.to_csv(path, index=False)
    elif suffix == ".parquet":
        # Keep fastparquet as the first choice, but honour the error message's
        # promise that pyarrow is an acceptable alternative.
        import_error = None
        for engine in ("fastparquet", "pyarrow"):
            try:
                df.to_parquet(path, index=False, engine=engine)
                break
            except ImportError as err:
                import_error = err
        else:
            # Chain the underlying error so the failing import stays visible.
            raise ImportError(
                "⚠️ Parquet support requires `pyarrow` or `fastparquet`. Please install one."
            ) from import_error
    else:
        raise ValueError("Unsupported file format")
    print(f"✅ Saved {path}")

def read_df(path):
    """
    Read a CSV or Parquet file into a DataFrame, chosen by the file extension.

    The extension is matched case-insensitively. CSV files are read with the
    `date` column parsed as datetimes, so the file must contain that column.

    Raises
    ------
    FileNotFoundError
        If `path` does not exist.
    ValueError
        If the extension is neither `.csv` nor `.parquet`.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"❌ File not found: {path}")

    suffix = path.suffix.lower()
    if suffix == ".csv":
        # CSV stores no dtypes, so the date column has to be parsed explicitly.
        return pd.read_csv(path, parse_dates=["date"])
    if suffix == ".parquet":
        return pd.read_parquet(path)
    raise ValueError("Unsupported file format")

In [None]:
# Exercise write_df / read_df end to end with "_test" copies of the download.
test_csv = RAW / f"{SYMBOL}_{ts}_test.csv"
test_parquet = PROCESSED / f"{SYMBOL}_{ts}_test.parquet"

# Write both formats first, CSV before Parquet.
for _target in (test_csv, test_parquet):
    write_df(df_api, _target)

# Read each file back through the matching helper branch.
df1, df2 = read_df(test_csv), read_df(test_parquet)

# Both reloaded frames must pass the same dtype / null checks.
for _loaded in (df1, df2):
    validate(_loaded)

✅ Saved data/raw/TSLA_20250819-0957_test.csv
✅ Saved data/processed/TSLA_20250819-0957_test.parquet
✅ Validation passed
✅ Validation passed
