In [19]:
import os, numpy as np, pandas as pd, datetime as dt, pathlib
from dotenv import load_dotenv

load_dotenv()

# Generate synthetic financial dataset
np.random.seed(42)
n = 100

df = pd.DataFrame({
    'customer_id': range(1, n+1),
    'income': np.random.normal(60000, 15000, n).astype(int),
    'credit_score': np.random.normal(680, 50, n).astype(int),
    'monthly_spend': np.random.normal(2000, 600, n).astype(int),
    'region': np.random.choice(['North','South','East','West'], n),
})
df

Unnamed: 0,customer_id,income,credit_score,monthly_spend,region
0,1,67450,609,2214,North
1,2,57926,658,2336,West
2,3,69715,662,2649,South
3,4,82845,639,2632,North
4,5,56487,671,1173,North
...,...,...,...,...,...
95,96,38047,699,1584,South
96,97,64441,635,2539,East
97,98,63915,687,2184,South
98,99,60076,682,2487,North


In [9]:
path = r"C:\Users\Kartavya\bootcamp_Kartavya_Joshi\homework\homework\homework5"
os.chdir(path)
os.getcwd()

'C:\\Users\\Kartavya\\bootcamp_Kartavya_Joshi\\homework\\homework\\homework5'

In [91]:
RAW_DIR=pathlib.Path((os.getenv("DATA_DIR_RAW","data/raw")))
PROCESS_DIR=pathlib.Path(os.getenv("PROCESS_DIR_RAW","data/processed"))
NOTE_DIR=pathlib.Path(os.getenv("NOTE_DIR_RAW","notebook"))
SRC_DIR=pathlib.Path(os.getenv("SRC_DIR_RAW","src"))
RAW_DIR.mkdir(parents=True, exist_ok=True)
print("RAW_DIR:",RAW_DIR.resolve())
PROCESS_DIR.mkdir(parents=True, exist_ok=True)
print("PROCESS_DIR:",PROCESS_DIR.resolve())
NOTE_DIR.mkdir(parents=True, exist_ok=True)
print("RAW_DIR:",NOTE_DIR.resolve())
SRC_DIR.mkdir(parents=True, exist_ok=True)
print("RAW_DIR:",SRC_DIR.resolve())

RAW_DIR: C:\Users\Kartavya\bootcamp_Kartavya_Joshi\homework\homework\homework5\data\raw
PROCESS_DIR: C:\Users\Kartavya\bootcamp_Kartavya_Joshi\homework\homework\homework5\data\processed
RAW_DIR: C:\Users\Kartavya\bootcamp_Kartavya_Joshi\homework\homework\homework5\notebook
RAW_DIR: C:\Users\Kartavya\bootcamp_Kartavya_Joshi\homework\homework\homework5\src


In [51]:
# defining timestamp function
def ts():
    return dt.datetime.now().strftime('%Y%m%d-%H%M%S')

# saving file with timestamp
raw_path= RAW_DIR/ f"income_{ts()}.csv"
parq_path = PROCESS_DIR/ f"incomeparquet_{ts()}.csv"

df.to_csv(raw_path, index=False)
print("Saved CSV →", raw_path)

try:
    df.to_parquet(parq_path)  # uses installed engine if available
    print("Saved Parquet →", parq_path)
except Exception as e:
    print("Parquet save failed (engine missing?). Skipping Parquet demo.")
    print("Error:", e)

Saved CSV → data\raw\income_20250820-105051.csv
Saved Parquet → data\processed\incomeparquet_20250820-105051.csv


Reloading and Validation

In [81]:
def reload_validate(data: pd.DataFrame, rel: pd.DataFrame, cols=('customer_id','income','credit_score','monthly_spend','region')):
    checks = {
        'shape_equal': data.shape == rel.shape,
        'cols_present': all(c in rel.columns for c in cols)
    }
    if 'customer_id' in rel.columns:
        checks['customer_id_is_numeric'] = pd.api.types.is_numeric_dtype(rel['customer_id'])
    if 'region' in rel.columns:
        checks['region'] = pd.api.types.is_string_dtype(rel['region'])
    return checks

df_csv = pd.read_csv(raw_path)
print('CSV validation:', reload_validate(df, df_csv))

if parq_path.exists():
    try:
        df_parq = pd.read_parquet(parq_path)
        print('Parquet validation:', reload_validate(df, df_parq))
    except Exception as e:
        print('Parquet read failed:', e)
else:
    print('Parquet file not present (skipped earlier).')

CSV validation: {'shape_equal': True, 'cols_present': True, 'customer_id_is_numeric': True, 'region': True}
Parquet validation: {'shape_equal': True, 'cols_present': True, 'customer_id_is_numeric': True, 'region': True}


In [None]:
Utility fx: write_df and read_df

In [87]:
from typing import Union

def ensure_dir(path: pathlib.Path):
    path.parent.mkdir(parents=True, exist_ok=True)

def detect_format(path: Union[str, pathlib.Path]):
    suf = str(path).lower()
    if suf.endswith('.csv'): return 'csv'
    if suf.endswith('.parquet') or suf.endswith('.pq') or suf.endswith('.parq'): return 'parquet'
    raise ValueError('Unsupported format for: ' + str(path))

def write_df(df: pd.DataFrame, path: Union[str, pathlib.Path]):
    path = pathlib.Path(path)
    ensure_dir(path)
    fmt = detect_format(path)
    if fmt == 'csv':
        df.to_csv(path, index=False)
    elif fmt == 'parquet':
        try:
            df.to_parquet(path)
        except Exception as e:
            print("Parquet engine not available. Install pyarrow or fastparquet")
            #raise RuntimeError('Parquet engine not available. Install pyarrow or fastparquet.') from e
    return path

def read_df(path: Union[str, pathlib.Path]):
    path = pathlib.Path(path)
    fmt = detect_format(path)
    if fmt == 'csv':
        return pd.read_csv(path, parse_dates=['date']) if 'date' in pd.read_csv(path, nrows=0).columns else pd.read_csv(path)
    elif fmt == 'parquet':
        try:
            return pd.read_parquet(path)
        except Exception as e:
            raise RuntimeError('Parquet engine not available. Install pyarrow or fastparquet.') from e

# Demo utility usage
csv2 = RAW_DIR / f"prices_util_{ts()}.csv"
pq2  = PROCESS_DIR / f"prices_util_{ts()}.parquet"
write_df(df, csv2)
df2 = read_df(csv2)
print('Reloaded CSV via util, shape:', df2.shape)

try:
    write_df(df, pq2)
    df3 = read_df(pq2)
    print('Reloaded Parquet via util, shape:', df3.shape)
except RuntimeError as e:
    print('Parquet util demo skipped:', e)

Reloaded CSV via util, shape: (100, 5)
Reloaded Parquet via util, shape: (100, 5)
