In [1]:
import os

# Remember the directory the kernel started in, so that re-running this cell
# always lands in the same place. A bare relative chdir("../..") would climb
# two more levels on every re-execution and break the relative paths below.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, "..", ".."))

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [5]:
# Output folder for every CSV / figure written by this notebook, and the
# combined names file that the analysis reads.
OUT_DIR = "artifacts"
COMBINED_CSV_GZ = "data/processed/ssa_baby_names_1880_2024.csv.gz"

# Create the output folder up front; an already-existing folder is fine.
os.makedirs(name=OUT_DIR, exist_ok=True)

In [6]:
# Read the combined names file into a single DataFrame.
df = pd.read_csv(filepath_or_buffer=COMBINED_CSV_GZ)

In [8]:
# Sanity check: first five rows, last five rows, and (rows, columns).
df.head(n=5), df.tail(n=5), df.shape

(        Name Sex  Count  Year
 0       Mary   F   7065  1880
 1       Anna   F   2604  1880
 2       Emma   F   2003  1880
 3  Elizabeth   F   1939  1880
 4     Minnie   F   1746  1880,
             Name Sex  Count  Year
 2149472    Zylyn   M      5  2024
 2149473  Zymiere   M      5  2024
 2149474   Zypher   M      5  2024
 2149475     Zyre   M      5  2024
 2149476   Zyrell   M      5  2024,
 (2149477, 4))

### 1. Diversity: Top 10 female names over time

In [14]:
# Rows whose Sex column equals "F".
female = df.loc[df["Sex"].eq("F")]

In [15]:
def share_top10(group):
    """Return the fraction of ``group["Count"]`` held by its ten largest values.

    ``group`` is a DataFrame with a numeric "Count" column. With ten rows or
    fewer the result is 1.0, since every row is in the top ten.
    """
    counts = group["Count"]
    top_ten_total = counts.nlargest(10).sum()
    return top_ten_total / counts.sum()

In [16]:
# Per-year share of female births captured by the ten most common names.
# Only the "Count" column is passed to `share_top10`: selecting it explicitly
# keeps the grouping column ("Year") out of the applied frame, which avoids
# the pandas deprecation warning about apply operating on grouping columns.
top10_share_f = (
    female.groupby("Year")[["Count"]]
    .apply(share_top10)
    .reset_index(name="Top10Share")
)
top10_share_f.to_csv(os.path.join(OUT_DIR, "top10_share_female_by_year.csv"), index=False)

  top10_share_f = female.groupby("Year").apply(share_top10).reset_index(name="Top10Share")


In [17]:
# Line chart of the yearly top-10 share; written to disk rather than shown inline.
fig, ax = plt.subplots()
ax.plot(top10_share_f["Year"], top10_share_f["Top10Share"])
ax.set_title("Share of Top 10 Girls' Names Over Time (1880–2024)")
ax.set_xlabel("Year")
ax.set_ylabel("Share")
fig.tight_layout()
fig.savefig(os.path.join(OUT_DIR, "top10_share_female_line.png"))
plt.close(fig)

### 2. Volatile Names

In [18]:
# Total count per name over the whole table (all rows, both sexes).
name_totals = df.groupby("Name")["Count"].sum()

# "Popular" means strictly more than 50,000 in total.
popular_names = name_totals.loc[name_totals > 50000].index

# One row per (Name, Year) for the popular names, sexes summed together.
trends = (
    df.loc[df["Name"].isin(popular_names)]
    .groupby(["Name", "Year"], as_index=False)["Count"]
    .sum()
)

In [19]:
# Lowest and highest yearly count per name. The minimum is taken over the
# years that actually have a row in `trends`; absent years are not treated as 0.
volatility = (
    trends.groupby("Name")["Count"]
    .agg(min="min", max="max")
    .reset_index()
)

# Relative drop from peak to trough: (max - min) / max, in [0, 1).
volatility["volatility"] = (
    volatility["max"].sub(volatility["min"]).div(volatility["max"])
)

In [20]:
# Ten names with the highest volatility score, highest first.
volatile_top10 = volatility.sort_values(by="volatility", ascending=False).iloc[:10]
volatile_top10.to_csv(
    os.path.join(OUT_DIR, "most_volatile_names.csv"),
    index=False,
)

In [21]:
# Absolute peak-to-trough distance in counts, added as an extra column.
volatile_top10 = volatile_top10.assign(
    amplitude=volatile_top10["max"] - volatile_top10["min"]
)

# Horizontal bar chart of that amplitude, one bar per name.
fig, ax = plt.subplots()
ax.barh(volatile_top10["Name"], volatile_top10["amplitude"])
ax.set_title("Most Volatile Popular Names")
ax.set_xlabel("Peak - Trough (counts)")
fig.tight_layout()
fig.savefig(os.path.join(OUT_DIR, "most_volatile_names_barh.png"))
plt.close(fig)

In [24]:
# Yearly series for each of the most volatile names (input for the line plots)

volatile_names = list(volatile_top10["Name"])

# One row per (Name, Year), sexes summed, ordered by name then year.
sel = (
    df.loc[df["Name"].isin(volatile_names)]
    .groupby(["Name", "Year"], as_index=False)["Count"]
    .sum()
    .sort_values(["Name", "Year"])
)

In [25]:
# All volatile names on one set of axes, raw yearly counts

fig, ax = plt.subplots(figsize=(10, 6))
for label, series in sel.groupby("Name"):
    ax.plot(series["Year"], series["Count"], label=label, linewidth=1.6)
ax.set_title("Most Volatile Popular Names — Yearly Counts")
ax.set_xlabel("Year")
ax.set_ylabel("Count")
ax.legend(ncol=2, fontsize=9)
fig.tight_layout()
fig.savefig(os.path.join(OUT_DIR, "volatile_names_combined_counts.png"))
plt.close(fig)

In [26]:
# Same curves, each divided by that name's own maximum so every peak is 1.0;
# this makes the shapes comparable regardless of absolute size.
sel_norm = sel.assign(
    norm=sel["Count"] / sel.groupby("Name")["Count"].transform("max")
)

fig, ax = plt.subplots(figsize=(10, 6))
for label, series in sel_norm.groupby("Name"):
    ax.plot(series["Year"], series["norm"], label=label, linewidth=1.6)
ax.set_title("Most Volatile Popular Names — Normalized to Peak = 1.0")
ax.set_xlabel("Year")
ax.set_ylabel("Normalized Popularity")
ax.legend(ncol=2, fontsize=9)
fig.tight_layout()
fig.savefig(os.path.join(OUT_DIR, "volatile_names_combined_normalized.png"))
plt.close(fig)

In [27]:
# One figure per volatile name, saved as volatile_<name>.png
for label, series in sel.groupby("Name"):
    fig, ax = plt.subplots(figsize=(8, 4.8))
    ax.plot(series["Year"], series["Count"], linewidth=1.8)
    ax.set_title(f"{label}: Yearly Counts")
    ax.set_xlabel("Year")
    ax.set_ylabel("Count")
    fig.tight_layout()
    # File name is the lower-cased name with spaces turned into underscores.
    slug = label.lower().replace(' ', '_')
    fig.savefig(os.path.join(OUT_DIR, f"volatile_{slug}.png"))
    plt.close(fig)

### 3. Gender Neutral Names

In [22]:
# Total count per name and sex: one row per name, one column per Sex value,
# with 0 where a name never appears for that sex.
gender_counts = df.groupby(["Name", "Sex"])["Count"].sum().unstack(fill_value=0)

# Capture the per-sex columns BEFORE adding derived columns. Summing the whole
# row after "ratio" has been added would fold the ratio into "total".
sex_columns = list(gender_counts.columns)
per_sex = gender_counts[sex_columns]

# Balance between the sexes: smaller total / larger total (1.0 = perfectly even).
gender_counts["ratio"] = per_sex.min(axis=1) / per_sex.max(axis=1)
# Overall count for the name across the sex columns only.
gender_counts["total"] = per_sex.sum(axis=1)

# Keep names whose ratio is at least 0.45 and report the 20 largest by total.
neutral_names = (
    gender_counts[gender_counts["ratio"] >= 0.45]
    .sort_values("total", ascending=False)
    .head(20)
    .reset_index()
)
neutral_names.to_csv(os.path.join(OUT_DIR, "gender_neutral_names_top20.csv"), index=False)

### 4. Pop Culture Trends and Spikes

In [29]:
# Names to track in this section; each is looked up case-insensitively later.
pop_names = [
    "Arya",
    "Kobe",
    "Elsa",
    "Khaleesi",
    "Moana",
    "Kylo",
    "Rihanna",
    "Beyonce",
    "Zendaya",
    "Draco",
    "Hermione",
    "Messi",
    "Ronaldo",
]


In [30]:
def get_name_trend(name, data=None):
    """Return the yearly total count for one name, matched case-insensitively.

    Parameters
    ----------
    name : str
        Name to look up. Both this value and the "Name" column are
        lower-cased before comparison.
    data : pandas.DataFrame, optional
        Table with "Name", "Year" and "Count" columns. When omitted, the
        notebook-level ``df`` is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        Columns "Year", "Count" (summed over all matching rows of that year)
        and "Name" (the ``name`` argument exactly as passed in). The frame is
        empty when nothing matches.
    """
    source = df if data is None else data
    matches = source[source["Name"].str.lower() == name.lower()]
    return matches.groupby("Year")["Count"].sum().reset_index().assign(Name=name)

In [31]:
# Stack the per-name yearly trends into one long table and save it.
tr_pop = pd.concat(map(get_name_trend, pop_names), ignore_index=True)
tr_pop.to_csv(
    os.path.join(OUT_DIR, "pop_culture_trends_extended.csv"),
    index=False,
)

In [34]:
# All tracked names on one chart
fig, ax = plt.subplots(figsize=(12, 7))
for label, series in tr_pop.groupby("Name"):
    # Names that never exceed 50 in any single year are left off the chart.
    if not series["Count"].max() > 50:
        continue
    ax.plot(series["Year"], series["Count"], label=label)
ax.set_title("Pop Culture & Celebrity Name Spikes")
ax.set_xlabel("Year")
ax.set_ylabel("Count")
ax.legend(ncol=2, fontsize=9)
fig.tight_layout()
fig.savefig(os.path.join(OUT_DIR, "pop_culture_trends_extended.png"))
plt.close(fig)

In [35]:
# One figure per tracked name, saved as pop_<name>.png
for label, series in tr_pop.groupby("Name"):
    # Same rarity filter as the combined chart: peak year must exceed 50.
    if not series["Count"].max() > 50:
        continue
    fig, ax = plt.subplots(figsize=(8, 4.5))
    ax.plot(series["Year"], series["Count"], linewidth=1.8)
    ax.set_title(f"{label}: Popularity Over Time")
    ax.set_xlabel("Year")
    ax.set_ylabel("Count")
    fig.tight_layout()
    slug = label.lower().replace(' ', '_')
    fig.savefig(os.path.join(OUT_DIR, f"pop_{slug}.png"))
    plt.close(fig)

### 5. Iconic names of all time

In [36]:
# Hand-picked names, grouped by the era they are meant to represent.
iconic_names = (
    ["Mary", "John", "William", "James"]            # early classics
    + ["Robert", "Linda", "David", "Susan"]         # mid century
    + ["Michael", "Jennifer", "Jessica", "Ashley"]  # late 20th century
    + ["Olivia", "Emma", "Liam", "Noah"]            # modern
)

In [37]:
# Yearly count for each iconic name, sexes summed: one row per (Name, Year).
iconic_df = (
    df.loc[df["Name"].isin(iconic_names)]
    .groupby(["Name", "Year"], as_index=False)["Count"]
    .sum()
)

In [38]:
# Save the iconic-name trends table without the row index.
iconic_df.to_csv(
    path_or_buf=os.path.join(OUT_DIR, "iconic_names_trends_expanded.csv"),
    index=False,
)

In [39]:
# Every iconic name on a single chart.
fig, ax = plt.subplots(figsize=(12, 7))
for label, series in iconic_df.groupby("Name"):
    ax.plot(series["Year"], series["Count"], label=label)
ax.set_title("Generational Shifts in Iconic Names (1880–2024)")
ax.set_xlabel("Year")
ax.set_ylabel("Count")
ax.legend(ncol=2, fontsize=9)
fig.tight_layout()
fig.savefig(os.path.join(OUT_DIR, "iconic_names_trends_expanded.png"))
plt.close(fig)

In [40]:
# One figure per iconic name, saved as iconic_<name>.png

# Largest single-year count across all iconic names; used as the common upper
# y-limit so the individual charts can be compared at a glance.
ymax = iconic_df["Count"].max()

for label, series in iconic_df.groupby("Name"):
    fig, ax = plt.subplots(figsize=(8, 4.5))
    ax.plot(series["Year"], series["Count"], linewidth=1.8)
    ax.set_title(f"{label}: Popularity Over Time")
    ax.set_xlabel("Year")
    ax.set_ylabel("Count")
    ax.set_ylim(0, ymax)
    fig.tight_layout()
    fig.savefig(os.path.join(OUT_DIR, f"iconic_{label.lower()}.png"))
    plt.close(fig)

### 6. Top 10 names per year

In [42]:
import bar_chart_race as bcr

In [43]:
# How many names to keep per year.
topN = 10

# Yearly count per name with both sexes summed: columns Year, Name, Count.
df_top = df.groupby(['Year', 'Name'], as_index=False)['Count'].sum()

In [44]:
# The `topN` highest-count names of every year.
# Sorting by (Year ascending, Count descending) and taking the first `topN`
# rows of each year replaces `groupby(...).apply(nlargest)`, which operates on
# the grouping column and triggers a pandas deprecation warning. The
# multi-column sort is stable, so rows with equal counts keep their original
# relative order, as `nlargest` (keep="first") does.
top_by_year = (
    df_top.sort_values(['Year', 'Count'], ascending=[True, False])
    .groupby('Year')
    .head(topN)
    .reset_index(drop=True)
)

# Every name that made the top list in at least one year.
names_keep = top_by_year['Name'].unique()

  .apply(lambda g: g.nlargest(topN, 'Count'))


In [45]:

# Keep only names that were ever in a yearly top list ...
df_filtered = df_top.loc[df_top['Name'].isin(names_keep)]

# ... and reshape to wide form: one row per year, one column per name,
# with 0 for years in which a name has no row.
pivoted = (
    df_filtered
    .pivot(index='Year', columns='Name', values='Count')
    .fillna(0)
)

In [47]:

# Animated bar chart race of the yearly leaders.
# The GIF is written under OUT_DIR like every other artifact (the path used to
# hard-code "artifacts/"), and the bar count and title follow `topN` so they
# cannot drift from the selection made above.
bcr.bar_chart_race(
    df=pivoted,
    filename=os.path.join(OUT_DIR, 'name_evolution_race.gif'),
    orientation='h',
    n_bars=topN,
    title=f'Top {topN} Baby Names in the U.S. (1880–2024)',
    period_length=400,  # value passed through to bar_chart_race unchanged
    figsize=(8,5)
)

  df_values.iloc[:, 0] = df_values.iloc[:, 0].fillna(method='ffill')
  fig.canvas.print_figure(io.BytesIO())
  fig.canvas.print_figure(io.BytesIO())
  fig.canvas.print_figure(io.BytesIO())
  fig.canvas.print_figure(io.BytesIO())
  fig.canvas.print_figure(io.BytesIO())
  fig.canvas.print_figure(io.BytesIO())
  fig.canvas.print_figure(io.BytesIO())
  fig.canvas.print_figure(io.BytesIO())
  fig.canvas.print_figure(io.BytesIO())
  fig.canvas.print_figure(io.BytesIO())
  fig.canvas.print_figure(io.BytesIO())
  fig.canvas.print_figure(io.BytesIO())
  fig.canvas.print_figure(io.BytesIO())
  fig.canvas.print_figure(io.BytesIO())
  fig.canvas.print_figure(io.BytesIO())
  fig.canvas.print_figure(io.BytesIO())
  fig.canvas.print_figure(io.BytesIO())
  fig.canvas.print_figure(io.BytesIO())
  fig.canvas.print_figure(io.BytesIO())
  fig.canvas.print_figure(io.BytesIO())
  ax.set_yticklabels(self.df_values.columns)
  ax.set_xticklabels([max_val] * len(ax.get_xticks()))
MovieWriter imagemagick unavail

In [None]:
# Decade-level totals per name

# Tag each row with the first year of its decade (e.g. 1987 -> 1980).
names = df.assign(Decade=df["Year"] // 10 * 10)

# Count per (Decade, Name), sexes summed.
decade_counts = names.groupby(["Decade", "Name"], as_index=False)["Count"].sum()

In [49]:
# How many names to keep per decade.
TOP_N = 8

# The TOP_N highest-count names of every decade.
# Sorting by (Decade ascending, Count descending) and taking the first TOP_N
# rows per decade avoids `groupby(...).apply(nlargest)`, which operates on the
# grouping column and triggers a pandas deprecation warning. The multi-column
# sort is stable, so equal counts keep their original relative order.
top_by_decade = (
    decade_counts.sort_values(["Decade", "Count"], ascending=[True, False])
    .groupby("Decade")
    .head(TOP_N)
    .reset_index(drop=True)
)

# Every name that was a decade leader at least once.
keep = top_by_decade["Name"].unique()

  .apply(lambda g: g.nlargest(TOP_N, "Count"))


In [50]:
# Restrict to names that were ever a decade leader, then go wide:
# rows = decades (ascending), columns = names, missing combinations = 0.
filtered = decade_counts.loc[decade_counts["Name"].isin(keep)]
wide = filtered.pivot(index="Decade", columns="Name", values="Count")
pivoted = wide.sort_index().fillna(0)

In [51]:
# Largest cell of the pivot table plus 10%, as a plain Python float.
fixed_max = 1.1 * float(pivoted.to_numpy().max())

In [52]:

# Animated bar chart race of the decade leaders; all options are passed
# straight through to bar_chart_race.
bcr.bar_chart_race(
    # data and destination
    df=pivoted,
    filename=f"{OUT_DIR}/name_evolution_race_decades.gif",
    title='Top 8 Baby Names in the U.S. by Decade (1880s–2020s)',
    # bars
    orientation='h',
    sort='desc',
    n_bars=TOP_N,
    bar_size=.92,
    fixed_max=fixed_max,
    # timing
    period_length=900,
    steps_per_period=10,
    interpolate_period=True,
    # period label, formatted like "1980s"
    period_fmt='{x:.0f}s',
    period_label={'x': .99, 'y': .12, 'ha': 'right', 'size': 14},
    # text and resolution
    shared_fontdict={'size': 12},
    tick_label_size=11,
    dpi=140,
)

  ax.set_yticklabels(self.df_values.columns)
  ax.set_xticklabels([max_val] * len(ax.get_xticks()))
MovieWriter imagemagick unavailable; using Pillow instead.


### 7. Name Diversity Explosion

In [53]:
# Yearly diversity metrics
#
# Collapse the sexes first so every metric is measured per *name*. The row-level
# table has one row per (name, sex), so counting rows with Count > 100 would
# count a name twice when both sexes exceed 100 and miss names that only pass
# 100 combined, while "UniqueNames" was already per name.
name_year = df.groupby(["Year", "Name"], as_index=False)["Count"].sum()
year_key = name_year["Year"]

# Share of each year's total carried by each name, and its entropy term
# -p * log2(p). Shares are strictly positive because every row here is a sum
# of existing counts.
share = name_year["Count"] / name_year.groupby("Year")["Count"].transform("sum")
entropy_terms = -(share * np.log2(share))

# Built from vectorised group reductions instead of `groupby.apply` returning
# a pd.Series: no deprecation warning about grouping columns, and the two
# count columns stay integers instead of being coerced to float.
diversity = (
    pd.DataFrame({
        "UniqueNames": name_year.groupby("Year")["Name"].size(),
        "CommonNames100+": name_year["Count"].gt(100).groupby(year_key).sum(),
        "ShannonEntropy": entropy_terms.groupby(year_key).sum(),
    })
    .rename_axis("Year")
    .reset_index()
)

diversity.to_csv(f"{OUT_DIR}/name_diversity.csv", index=False)

  .apply(lambda g: pd.Series({


In [54]:
# Distinct names per year versus the "CommonNames100+" series
fig, ax = plt.subplots(figsize=(10,6))
years = diversity['Year']
ax.plot(years, diversity['UniqueNames'], label='All unique names', alpha=0.6)
ax.plot(years, diversity['CommonNames100+'], label='Names with >100 births')
ax.set_title("The Explosion of Baby Name Diversity in the U.S. (1880–2024)")
ax.set_xlabel("Year")
ax.set_ylabel("Number of Names")
ax.legend()
fig.tight_layout()
fig.savefig(f"{OUT_DIR}/name_diversity_explosion.png")
plt.close(fig)

In [55]:
# Yearly Shannon entropy (in bits), as a measure of how evenly counts are spread
fig, ax = plt.subplots(figsize=(10,6))
ax.plot(diversity['Year'], diversity['ShannonEntropy'], color="purple")
ax.set_title("Shannon Entropy of Baby Names (1880–2024)")
ax.set_xlabel("Year")
ax.set_ylabel("Entropy (bits)")
fig.tight_layout()
fig.savefig(f"{OUT_DIR}/name_entropy.png")
plt.close(fig)