In [8]:
import numpy as np 
# Vectorised scaling: the multiplication is applied to the whole array at once.
arr = np.array([1, 2, 3, 4, 5])
arr2 = 10 * arr
print(f"result: {arr2}")

# The same computation on a plain Python list, one element at a time.
py_list = [1, 2, 3, 4, 5]
scaled = [10 * item for item in py_list]
print(f"result: {scaled}")

# Larger input for an optional speed comparison; the timing magics are left
# commented out so the cell stays fast on a full re-run.
large = np.arange(1_000_000)
# %timeit large * 10
# %timeit [x * 10 for x in large]

import os
# Show where the kernel is running and what sits next to the notebook, so the
# relative paths used further down can be sanity-checked.
print(f"Current working directory: {os.getcwd()}")
print(f"Files here: {os.listdir(os.curdir)}")

import pandas as pd
from pathlib import Path

# Location of the raw input, relative to the notebook's working directory.
csv_path = Path("data/starter_data.csv")
# Validate with an explicit raise rather than `assert`: assertions are removed
# when Python runs with -O, and FileNotFoundError is the conventional error for
# a missing input. `is_file()` also rejects a directory that happens to have
# this name, which `exists()` would let through.
if not csv_path.is_file():
    raise FileNotFoundError("data/starter_data.csv not found, please check the path")

# Load the table, then preview the first rows and the column dtypes / null counts.
df = pd.read_csv(csv_path)
print(df.head())
df.info()

# Numeric overview: one row per statistic (count/mean/std/min/25%/50%/75%/max).
desc = df.describe()
display(desc)

# The first object-dtype column is used as the grouping key.
cat_cols = df.select_dtypes(include="object").columns
if cat_cols.empty:
    raise ValueError("The category column (of type object) was not found. Please manually specify the grouping column name.")
category_col = cat_cols[0]
print(f"Grouping according to this column {category_col}")

# Mean and row count of every numeric column within each group.
num_cols = df.select_dtypes(include="number").columns
grouped = df.groupby(category_col)[num_cols].agg(["mean", "count"])
display(grouped.head())

# Make sure the output folder exists; re-running the cell is harmless because
# an already existing directory is accepted.
out_dir = Path("data/processed")
out_dir.mkdir(parents=True, exist_ok=True)

# Write both summary tables as CSV, keeping their index as the first column(s).
desc_path, grouped_path = (
    out_dir / file_name
    for file_name in ("summary_describe.csv", "summary_grouped.csv")
)
desc.to_csv(desc_path, index=True)
grouped.to_csv(grouped_path, index=True)
print(f"saved： {desc_path} {grouped_path}")

!pip install matplotlib
import matplotlib.pyplot as plt

# Bar chart of the per-category mean of the first numeric column, saved as PNG.
value_col = num_cols[0] if len(num_cols) > 0 else None
# Compare against None explicitly: a valid column label can be falsy (for
# example the integer label 0 or an empty string) and must not be mistaken for
# "no numeric column".
if value_col is not None:
    means = df.groupby(category_col)[value_col].mean().sort_values()
    # Draw on a dedicated figure/axes instead of pyplot's implicit current
    # axes, so the saved image contains only this chart.
    fig, ax = plt.subplots()
    means.plot(kind="bar", ax=ax, title=f"{category_col} vs mean({value_col})")
    ax.set_xlabel(category_col)
    ax.set_ylabel(f"mean({value_col})")
    fig.tight_layout()
    fig_path = out_dir / "basic_plot.png"
    fig.savefig(fig_path, dpi=150)
    # Release the figure once it is on disk.
    plt.close(fig)
    print("image saved：", fig_path)
else:
    print("column not found, skip mapping")

import pandas as pd

def get_summary_stats(df: pd.DataFrame, category_col=None):
    """
    Summarise a DataFrame numerically and per category.

    Parameters
    ----------
    df : pd.DataFrame
        Input table.
    category_col : column label, optional
        Column to group by. When omitted (``None``), the first column of
        dtype ``object`` is used.

    Returns
    -------
    desc : pd.DataFrame
        Summary of ``df`` as produced by ``df.describe()``.
    grouped : pd.DataFrame
        ``mean`` and ``count`` of every numeric column within each group of
        ``category_col``.

    Raises
    ------
    ValueError
        If ``category_col`` is omitted and ``df`` has no object-dtype column,
        or if an explicitly given ``category_col`` is not a column of ``df``.
    """
    desc = df.describe()

    if category_col is None:
        # Auto-detect: fall back to the first object-dtype column.
        cat_cols = df.select_dtypes(include=["object"]).columns
        if len(cat_cols) == 0:
            raise ValueError("No categorical (object) columns found.")
        category_col = cat_cols[0]
    elif category_col not in df.columns:
        raise ValueError(f"Column {category_col!r} not found in DataFrame.")

    num_cols = df.select_dtypes(include="number").columns
    # An explicitly chosen grouping column may itself be numeric; aggregating
    # it against its own groups is meaningless, so leave it out.
    num_cols = num_cols[num_cols != category_col]
    grouped = df.groupby(category_col)[num_cols].agg(["mean", "count"])
    return desc, grouped

# Run the helper on the loaded data and preview the first rows of both tables.
desc2, grouped2 = get_summary_stats(df)
display(*(table.head() for table in (desc2, grouped2)))

result: [10 20 30 40 50]
result: [10, 20, 30, 40, 50]
Current working directory: /Users/liuphoebe/Desktop/bootcamp4/bootcamp_Kexu_Liu/homework
Files here: ['.DS_Store', 'stage03.ipynb', 'stage05.ipynb', 'data', 'stage06.ipynb', 'stage04.ipynb']
  category  value        date
0        A     10  2025-08-01
1        B     15  2025-08-02
2        A     12  2025-08-03
3        B     18  2025-08-04
4        C     25  2025-08-05
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  10 non-null     object
 1   value     10 non-null     int64 
 2   date      10 non-null     object
dtypes: int64(1), object(2)
memory usage: 372.0+ bytes


Unnamed: 0,value
count,10.0
mean,17.6
std,7.381659
min,10.0
25%,12.25
50%,14.5
75%,23.25
max,30.0


Grouping according to this column category


Unnamed: 0_level_0,value,value
Unnamed: 0_level_1,mean,count
category,Unnamed: 1_level_2,Unnamed: 2_level_2
A,11.5,4
B,15.666667,3
C,27.666667,3


saved： data/processed/summary_describe.csv data/processed/summary_grouped.csv
image saved： data/processed/basic_plot.png


Unnamed: 0,value
count,10.0
mean,17.6
std,7.381659
min,10.0
25%,12.25


Unnamed: 0_level_0,value,value
Unnamed: 0_level_1,mean,count
category,Unnamed: 1_level_2,Unnamed: 2_level_2
A,11.5,4
B,15.666667,3
C,27.666667,3
