# Baby Names - Data Wrangling

**Goal: plot a timeline of name frequencies**

Data from: <https://www.ssa.gov/oact/babynames/limits.html>

In [None]:
import os

import pandas as pd

In [None]:
# Load each "yob*" file found in names/ into its own DataFrame,
# in sorted file-name order.
data = []
for fn in sorted(os.listdir("names")):
    if not fn.startswith("yob"):
        continue  # ignore anything that is not a baby-name file
    df = pd.read_csv(f"names/{fn}", names=["name", "sex", "count"])
    # the four characters after the "yob" prefix are stored as a *string*;
    # int(fn[3:7]) would store a number instead
    df["year"] = fn[3:7]
    data.append(df)

In [None]:
# one DataFrame was appended per matching file, so this is the file count
len(data)  # number of files

In [None]:
# stack the per-file frames on top of each other (row-wise) into one table
df = pd.concat(objs=data, axis="index")

In [None]:
# (rows, columns) of the combined table
df.shape

In [None]:
# preview the first rows of the combined table
df.head()

In [None]:
# example: merge the DataFrames of two files horizontally (side by side).
# An inner join keeps only the names present in both frames.
# NOTE(review): if a name occurs under both sexes, joining on "name" alone
# pairs every combination of those rows - confirm that is intended.
a = data[0].merge(data[1], on="name", how="inner")
a.shape

In [None]:
# preview the merged result
a.head()

### Sort

In [None]:
# sort_values(..., inplace=True) returns None, so chaining .head() onto it
# raises AttributeError. Sort in place first, then preview separately.
df.sort_values(by="name", ascending=False, inplace=True)
df.head()

In [None]:
# multi-key sort: name ascending, then year descending within each name;
# returns a sorted copy, df itself is left unchanged
df.sort_values(by=["name", "year"], ascending=[True, False]).head()

### Edit columns

In [None]:
# convert the year column to integers in a new column
# (the method is astype; the previous spelling "astyape" raised AttributeError)
df["year_int"] = df["year"].astype(int)

In [None]:
# to_datetime parses date/time strings into pandas datetime values
pd.to_datetime(["2025-01-02 11:00"]) # or e.g. "January 2nd, 2025"

In [None]:
# Parse the year column as a calendar year (January 1st of that year).
# The explicit format avoids relying on format inference and keeps the result
# correct even if "year" holds integers, which to_datetime would otherwise
# interpret as nanoseconds since the epoch.
df["year_ts"] = pd.to_datetime(df["year"], format="%Y")

In [None]:
def square(x):
    """Return *x* raised to the power of two."""
    return pow(x, 2)

In [None]:
# apply() calls the given function on each value of the column
df["square"] = df["year_int"].apply(square)  # square is passed uncalled, as a function object

In [None]:
# preview the table including the newly added columns
df.head()

### Plot a timeline

In [None]:
# select all rows whose name equals 'Zzyzx' using a query expression
df.query("name == 'Zzyzx'")

In [None]:
# order rows by timestamp, in place (returns None, so nothing is displayed)
df.sort_values(by="year_ts", inplace=True)

In [None]:
# timeline for a single name: keep the rows that are both "Ronaldo" and male,
# index them by timestamp, and draw the yearly counts as a line
is_ronaldo = df["name"] == "Ronaldo"
is_male = df["sex"] == "M"
name = df[is_ronaldo & is_male].set_index("year_ts")
name["count"].plot()