# ML US baby names

## 2. Compute features

In [1]:
from constants import *
# Echo the configuration values used by this notebook so the run is self-describing.
for label, value in (
    ("YEAR_START:", YEAR_START),
    ("YEAR_END:", YEAR_END),
    ("COMPUTE_YEARS_BACKWARDS:", COMPUTE_YEARS_BACKWARDS),
    ("RAW_CSV:", RAW_CSV),
    ("RAW_CSV_COUNTRY:", RAW_CSV_COUNTRY),
    ("FEATURES_CSV:", FEATURES_CSV),
):
    print(label, value)

YEAR_START: 1960
YEAR_END: 2021
COMPUTE_YEARS_BACKWARDS: 5
RAW_CSV: data_raw_year_state.csv
RAW_CSV_COUNTRY: data_raw_country.csv
FEATURES_CSV: data_features_%s.csv


In [2]:
import pandas

In [3]:
# Load the raw per-year, per-state name counts; read_csv raises if the file is missing.
print("Reading raw data, if it exists...", end="\n\n")

df = pandas.read_csv(RAW_CSV)

print("Done.")

df

Reading raw data, if it exists...

Done.


Unnamed: 0,year,state,m_name,m_num,f_name,f_num
0,1960,AK,David,152,Mary,79
1,1960,AK,Michael,140,Linda,56
2,1960,AK,Robert,136,Karen,53
3,1960,AK,John,125,Debra,50
4,1960,AK,James,124,Susan,50
...,...,...,...,...,...,...
316195,2021,WY,Jacob,6,Gracelynn,5
316196,2021,WY,Jasper,6,Hadlee,5
316197,2021,WY,Luca,6,Isla,5
316198,2021,WY,Odin,6,Kendall,5


### Yearly totals per name

In [4]:
print("Computing yearly totals per name...")
print()

# Sum the per-state counts of every male name into one national total per
# (year, name). reset_index() turns the group keys back into ordinary columns,
# so the frame does not need to be re-read from disk just to flatten its index.
df_country = (
    df.groupby(["year", "m_name"])["m_num"]
    .sum()
    .reset_index()
)

# Persist the totals; with index=False the file holds exactly the columns
# year, m_name, m_num.
df_country.to_csv(RAW_CSV_COUNTRY, index=False)

print("Done.")
print('Data saved to "%s".' % RAW_CSV_COUNTRY)

df_country

Computing yearly totals per name...

Done.
Data saved to "data_raw_country.csv".


Unnamed: 0,year,m_name,m_num
0,1960,Aaron,38
1,1960,Alan,8221
2,1960,Albert,3440
3,1960,Alex,50
4,1960,Alfred,630
...,...,...,...
16450,2021,Yosef,115
16451,2021,Zachary,498
16452,2021,Zane,13
16453,2021,Zayden,27


### Normalized rankings for each name per year and state

In [5]:
print("Normalizing the ranking of each name per year and state...", end="\n\n")

# Sorted distinct values of each key column; these drive every loop below.
years, states, names = (
    sorted(set(df[column])) for column in ("year", "state", "m_name")
)

def minmaxZeroOne(pdcolumn: pandas.Series, forced_min = None):
    """Min-max scale a column so that its maximum maps to 1.

    Args:
        pdcolumn: Numeric series to scale.
        forced_min: Value to use as the lower bound of the scale instead of
            the column's own minimum. When None (the default) the column
            minimum is used, so the smallest value maps to 0.

    Returns:
        A series computed as (value - lower) / (max - lower).

    Note:
        There is no guard against a zero range (max equal to the lower
        bound); the division is left to pandas in that case.
    """
    # Honour the caller-supplied lower bound rather than assuming it is 0.
    column_min = pdcolumn.min() if forced_min is None else forced_min
    return (pdcolumn - column_min) / (pdcolumn.max() - column_min)
    
def minmaxMinusOnePlusOne(pdcolumn: pandas.Series):
    """Min-max scale a column so its minimum maps to -1 and its maximum to +1.

    Args:
        pdcolumn: Numeric series to scale.

    Returns:
        A series computed as (value - min) / (max - min) * 2 - 1.
    """
    lowest = pdcolumn.min()
    highest = pdcolumn.max()
    shifted = pdcolumn - lowest
    span = highest - lowest
    return shifted / span * 2 - 1

def normalize_names_year_state(df_raw, year, state, field_to_normalize, new_field):
    """Return the rows for one year and state with a normalized column added.

    Args:
        df_raw: Frame with at least "year" and "state" columns plus
            ``field_to_normalize``.
        year: Value matched against the "year" column.
        state: Value matched against the "state" column.
        field_to_normalize: Name of the column to scale.
        new_field: Name of the column that receives the scaled values.

    Returns:
        A copy of the matching rows with ``new_field`` set to
        ``field_to_normalize`` scaled by ``minmaxZeroOne`` with a forced
        minimum of 0. ``df_raw`` itself is not modified.
    """
    in_year = df_raw["year"] == year
    in_state = df_raw["state"] == state
    subset = df_raw[in_year & in_state].copy()
    subset[new_field] = minmaxZeroOne(subset[field_to_normalize], 0)
    return subset

# features[year][name] maps a feature label (e.g. "<state>_0") to its value.
# Every (year, name) pair gets an entry so the final table is rectangular.
features = {year: {name: {} for name in names} for year in years}

for year in years:
    print("%i " % year, end="")
    for state in states:
        new_field = state + "_0"

        # The helper works on a filtered copy, so the raw frame is left untouched
        # (no placeholder column needs to be created on df beforehand).
        norm = normalize_names_year_state(df, year, state, "m_num", new_field)
        norm = dict(norm.set_index("m_name")[new_field])

        # Names that do not appear in this year/state default to 0.0.
        for name in names:
            features[year][name][new_field] = norm.get(name, 0.0)

        print(".", end="")

    print()

# Flatten the nested dict into one row per (year, name).
tabular = [
    {"year": year, "name": name, **name_features}
    for year, per_name in features.items()
    for name, name_features in per_name.items()
]

df_state_0 = pandas.DataFrame(tabular)
df_state_0.to_csv(FEATURES_CSV % "0", index=False)

print()
print("Done.")
print('Data saved to "%s".' % (FEATURES_CSV % "0"))

df_state_0

Normalizing the ranking of each name per year and state...

1960 ...................................................
1961 ...................................................
1962 ...................................................
1963 ...................................................
1964 ...................................................
1965 ...................................................
1966 ...................................................
1967 ...................................................
1968 ...................................................
1969 ...................................................
1970 ...................................................
1971 ...................................................
1972 ...................................................
1973 ...................................................
1974 ...................................................
1975 ...................................................
1976 .......................

Unnamed: 0,year,name,AK_0,AL_0,AR_0,AZ_0,CA_0,CO_0,CT_0,DC_0,...,SD_0,TN_0,TX_0,UT_0,VA_0,VT_0,WA_0,WI_0,WV_0,WY_0
0,1960,Aaron,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1960,Abel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1960,Abigail,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1960,Abraham,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1960,Ace,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53687,2021,Zackary,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53688,2021,Zander,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53689,2021,Zane,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53690,2021,Zayden,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
import math

print("Computing differences with previous years...")

# For every look-back distance, add "<state>_<back>" features holding the change
# of the normalized "<state>_0" value relative to `back` years earlier.
for back in range(1, COMPUTE_YEARS_BACKWARDS + 1):
    print()
    print("Going back %i year%s..." % (back, "s" if back > 1 else ""))

    for year in years:
        previous_year = year - back

        # Skip years whose reference year is before the configured start, or is
        # absent from the data (which would otherwise raise a KeyError).
        if previous_year < YEAR_START or previous_year not in features:
            continue

        print("%i " % year, end="")

        current = features[year]
        previous = features[previous_year]

        for state in states:
            label_current = "%s_%i" % (state, 0)
            label_back = "%s_%i" % (state, back)

            for name in names:
                current[name][label_back] = \
                    current[name][label_current] - previous[name][label_current]

            print(".", end="")

        print()

# Flatten the nested dict into one row per (year, name). Years that were skipped
# above lack some "<state>_<back>" keys, which pandas fills with NaN.
tabular = [
    {"year": year, "name": name, **name_features}
    for year, per_name in features.items()
    for name, name_features in per_name.items()
]

df_features = pandas.DataFrame(tabular)
df_features.to_csv(FEATURES_CSV % COMPUTE_YEARS_BACKWARDS, index=False)

print()
print("Done.")
print('Data saved to "%s".' % (FEATURES_CSV % COMPUTE_YEARS_BACKWARDS))

df_features

Computing differences with previous years...

Going back 1 year...
1961 ...................................................
1962 ...................................................
1963 ...................................................
1964 ...................................................
1965 ...................................................
1966 ...................................................
1967 ...................................................
1968 ...................................................
1969 ...................................................
1970 ...................................................
1971 ...................................................
1972 ...................................................
1973 ...................................................
1974 ...................................................
1975 ...................................................
1976 ...................................................
1977 ................

1989 ...................................................
1990 ...................................................
1991 ...................................................
1992 ...................................................
1993 ...................................................
1994 ...................................................
1995 ...................................................
1996 ...................................................
1997 ...................................................
1998 ...................................................
1999 ...................................................
2000 ...................................................
2001 ...................................................
2002 ...................................................
2003 ...................................................
2004 ...................................................
2005 ...................................................
2006 ..........................

2021 ...................................................

Done.
Data saved to "data_features_5.csv".


Unnamed: 0,year,name,AK_0,AL_0,AR_0,AZ_0,CA_0,CO_0,CT_0,DC_0,...,SD_5,TN_5,TX_5,UT_5,VA_5,VT_5,WA_5,WI_5,WV_5,WY_5
0,1960,Aaron,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,,,,,,,,,,
1,1960,Abel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,,,,,,,,,,
2,1960,Abigail,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,,,,,,,,,,
3,1960,Abraham,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,,,,,,,,,,
4,1960,Ace,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53687,2021,Zackary,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53688,2021,Zander,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53689,2021,Zane,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53690,2021,Zayden,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Display the national yearly totals per male name (one row per year and name).
df_country

Unnamed: 0,year,m_name,m_num
0,1960,Aaron,38
1,1960,Alan,8221
2,1960,Albert,3440
3,1960,Alex,50
4,1960,Alfred,630
...,...,...,...
16450,2021,Yosef,115
16451,2021,Zachary,498
16452,2021,Zane,13
16453,2021,Zayden,27


In [8]:
# Keep, for every year, only the row with the highest national count:
# order by count descending, retain the first row seen per year, then
# put the result back in chronological order.
df_max_per_year = (
    df_country
    .sort_values("m_num", ascending=False)
    .drop_duplicates(subset='year', keep="first")
    .sort_values("year")
)
df_max_per_year

Unnamed: 0,year,m_name,m_num
57,1960,David,85926
422,1961,Michael,86919
673,1962,Michael,85037
924,1963,Michael,83778
1179,1964,Michael,82636
...,...,...,...
15268,2017,Liam,18838
15528,2018,Liam,19940
15812,2019,Liam,20578
16084,2020,Liam,19777


In [9]:
print("Computing winning names each year...")

# Label each (year, name) row with whether that name is the top name of the
# FOLLOWING year: 1 for the winner, 0 otherwise, None when the following year
# has no data.
for year, per_name in features.items():
    winners = df_max_per_year.loc[df_max_per_year["year"] == year + 1, "m_name"]
    max_name = winners.iloc[0] if len(winners) > 0 else None

    if max_name is None:
        for name_features in per_name.values():
            name_features["next_max"] = None
        continue

    for name_features in per_name.values():
        name_features["next_max"] = 0
    per_name[max_name]["next_max"] = 1

# Flatten the nested dict into one row per (year, name).
tabular = []
for year, per_name in features.items():
    for name, name_features in per_name.items():
        row = {"year": year, "name": name}
        row.update(name_features)
        tabular.append(row)

df_features = pandas.DataFrame(tabular)

# Move the label column to the far right of the table.
ordered_columns = [col for col in df_features.columns if col != 'next_max']
ordered_columns.append('next_max')
df_features = df_features.reindex(columns=ordered_columns)
df_features.to_csv(FEATURES_CSV % "all", index=False)

print()
print("Done.")
print('Data saved to "%s".' % (FEATURES_CSV % "all"))

df_features

Computing winning names each year...

Done.
Data saved to "data_features_all.csv".


Unnamed: 0,year,name,AK_0,AL_0,AR_0,AZ_0,CA_0,CO_0,CT_0,DC_0,...,TN_5,TX_5,UT_5,VA_5,VT_5,WA_5,WI_5,WV_5,WY_5,next_max
0,1960,Aaron,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,,,,,,,,,,0.0
1,1960,Abel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,,,,,,,,,,0.0
2,1960,Abigail,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,,,,,,,,,,0.0
3,1960,Abraham,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,,,,,,,,,,0.0
4,1960,Ace,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,,,,,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53687,2021,Zackary,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
53688,2021,Zander,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
53689,2021,Zane,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
53690,2021,Zayden,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
