In [1]:
from pathlib import Path
import numpy as np
import pandas as pd

# Relative location of the raw starter CSV; it is resolved against the
# notebook's current working directory when the file is read.
DATA_PATH = Path("../data/raw/starter_data.csv")
# Load the whole CSV into the working DataFrame used by the cells below.
df = pd.read_csv(DATA_PATH)
df.head()  # last expression of the cell: display the first rows as a preview

Unnamed: 0,category,value,date
0,A,10,2025-08-01
1,B,15,2025-08-02
2,A,12,2025-08-03
3,B,18,2025-08-04
4,C,25,2025-08-05


In [2]:
# Pull the "value" column out of the frame as a plain NumPy array.
vals = df["value"].to_numpy()

# Elementwise arithmetic and whole-array reductions on that array.
vals_times2 = 2 * vals
vals_mean = np.mean(vals)
vals_std = np.std(vals)

# Z-score the values and store them as a new column. When the standard
# deviation is exactly zero, divide by 1 instead to avoid a division by zero.
z_divisor = 1 if vals_std == 0 else vals_std
df["value_z"] = (vals - vals_mean) / z_divisor
df.head()


Unnamed: 0,category,value,date,value_z
0,A,10,2025-08-01,-1.085271
1,B,15,2025-08-02,-0.371277
2,A,12,2025-08-03,-0.799674
3,B,18,2025-08-04,0.05712
4,C,25,2025-08-05,1.056712


In [3]:
# Parse the "date" column into pandas datetimes, then write it back in place.
parsed_dates = pd.to_datetime(df["date"])
df["date"] = parsed_dates
# Show the column dtypes to confirm the conversion.
df.dtypes

category            object
value                int64
date        datetime64[ns]
value_z            float64
dtype: object

In [4]:
# Named aggregations over "value": output column -> (source column, reducer).
value_stats = {
    "count": ("value", "size"),
    "mean_value": ("value", "mean"),
    "min_value": ("value", "min"),
    "max_value": ("value", "max"),
}

# One row per category (kept as a regular column), ordered by category.
by_category = df.groupby("category", as_index=False)
summary = by_category.agg(**value_stats).sort_values("category")
summary


Unnamed: 0,category,count,mean_value,min_value,max_value
0,A,4,11.5,10,13
1,B,3,15.666667,14,18
2,C,3,27.666667,25,30


In [5]:
# Destination for the per-category summary table.
OUT_PATH = Path("../data/processed/summary.csv")

# Create the target directory (and any missing parents) if it is not there yet.
out_dir = OUT_PATH.parent
out_dir.mkdir(parents=True, exist_ok=True)

# Write the summary without the DataFrame index, then show where it landed.
summary.to_csv(OUT_PATH, index=False)
OUT_PATH.resolve()


PosixPath('/Users/ivysingal/bootcamp_ivy_singal/data/processed/summary.csv')

In [6]:
def summarize_by_category(frame: pd.DataFrame) -> pd.DataFrame:
    """Summarize the ``value`` column for each ``category``.

    The ``date`` column is run through ``pd.to_datetime`` on a separate copy
    of the data, so the caller's frame is not modified. The parsed dates do
    not feed into the aggregates, but a missing ``date`` column or a value
    that cannot be parsed still raises.

    Parameters
    ----------
    frame : pd.DataFrame
        Input data containing ``category``, ``value`` and ``date`` columns.

    Returns
    -------
    pd.DataFrame
        One row per category with the columns ``category``, ``count``
        (group size), ``mean_value``, ``min_value`` and ``max_value``,
        sorted by ``category``.
    """
    # assign() returns a new frame, leaving the caller's data untouched.
    with_parsed_dates = frame.assign(date=pd.to_datetime(frame["date"]))

    # Output column -> (source column, reducer).
    value_stats = {
        "count": ("value", "size"),
        "mean_value": ("value", "mean"),
        "min_value": ("value", "min"),
        "max_value": ("value", "max"),
    }

    grouped = with_parsed_dates.groupby("category", as_index=False)
    return grouped.agg(**value_stats).sort_values("category")

# Sanity check: the function must reproduce the inline `summary` table that
# was computed from the same `df`. assert_frame_equal raises an
# AssertionError describing the first difference if the two ever diverge.
summary_from_function = summarize_by_category(df)
pd.testing.assert_frame_equal(summary_from_function, summary)
summary_from_function

Unnamed: 0,category,count,mean_value,min_value,max_value
0,A,4,11.5,10,13
1,B,3,15.666667,14,18
2,C,3,27.666667,25,30
