# ML US baby names

## 2. Compute features

In [1]:
from constants import *
# Echo every setting brought in from `constants` so the run records its configuration.
for _label, _value in (
    ("YEAR_START:", YEAR_START),
    ("YEAR_END:", YEAR_END),
    ("COMPUTE_YEARS_BACKWARDS:", COMPUTE_YEARS_BACKWARDS),
    ("CSV_RAW:", CSV_RAW),
    ("CSV_COUNTRY:", CSV_COUNTRY),
    ("CSV_FEATURES_CURRENT:", CSV_FEATURES_CURRENT),
    ("CSV_FEATURES_PAST:", CSV_FEATURES_PAST),
    ("CSV_FEATURES_LABEL:", CSV_FEATURES_LABEL),
):
    print(_label, _value)

YEAR_START: 1960
YEAR_END: 2021
COMPUTE_YEARS_BACKWARDS: 5
CSV_RAW: data_raw.csv
CSV_COUNTRY: data_country.csv
CSV_FEATURES_CURRENT: data_features_1_current.csv
CSV_FEATURES_PAST: data_features_2_past.csv
CSV_FEATURES_LABEL: data_features_3_label.csv


In [2]:
import pandas

In [3]:
# Load the raw table; there is no fallback in this cell if CSV_RAW cannot be read.
print("Reading raw data, if it exists...", end="\n\n")

df = pandas.read_csv(CSV_RAW)

print("Done.")

# Bare expression: shows a preview of the loaded frame as the cell output.
df

Reading raw data, if it exists...

Done.


Unnamed: 0,year,state,m_name,m_num,f_name,f_num
0,1960,AK,David,152,Mary,79
1,1960,AK,Michael,140,Linda,56
2,1960,AK,Robert,136,Karen,53
3,1960,AK,John,125,Debra,50
4,1960,AK,James,124,Susan,50
...,...,...,...,...,...,...
316195,2021,WY,Jacob,6,Gracelynn,5
316196,2021,WY,Jasper,6,Hadlee,5
316197,2021,WY,Luca,6,Isla,5
316198,2021,WY,Odin,6,Kendall,5


### Yearly totals per name

In [4]:
print("Computing yearly totals per name...", end="\n\n")

# Add up `m_num` over every row sharing the same (year, m_name) pair.
yearly_totals = df.groupby(["year", "m_name"])[["m_num"]].sum()

# Persist the totals, then read the file back so that `year` and `m_name`
# are ordinary columns rather than index levels.
yearly_totals.to_csv(CSV_COUNTRY)
df_country = pandas.read_csv(CSV_COUNTRY)

print("Done.")
print('Data saved to "%s".' % CSV_COUNTRY)

df_country

Computing yearly totals per name...

Done.
Data saved to "data_country.csv".


Unnamed: 0,year,m_name,m_num
0,1960,Aaron,38
1,1960,Alan,8221
2,1960,Albert,3440
3,1960,Alex,50
4,1960,Alfred,630
...,...,...,...
16450,2021,Yosef,115
16451,2021,Zachary,498
16452,2021,Zane,13
16453,2021,Zayden,27


### Normalized rankings for each name per year and state

In [5]:
print("Extracting lists of years, states and names...")

# One sorted, de-duplicated list per key column of the raw frame.
years, states, names = (
    sorted(set(df[column])) for column in ("year", "state", "m_name")
)

print("Done.")

Extracting lists of years, states and names...
Done.


In [6]:
print("Creating structure to store features...")

# features[year][name] -> dict of feature values, empty until later cells fill it.
features = {
    year: {
        name: {}
        for name in set(df_country.loc[df_country["year"] == year, "m_name"])
    }
    for year in set(df_country["year"])
}

print("Done.")

Creating structure to store features...
Done.


In [7]:
# Progress banner followed by one blank line.
print("Normalizing the ranking of each name per year and state...", end="\n\n")

def minmaxZeroOne(pdcolumn: pandas.Series, forced_min = None):
    """Rescale a column linearly so that its maximum maps to 1.

    Args:
        pdcolumn: numeric pandas Series to rescale.
        forced_min: value that should map to 0. When None (the default),
            the column's own minimum is used instead.

    Returns:
        A Series holding (value - lower) / (max - lower), where `lower` is
        `forced_min` when given and the column minimum otherwise.
    """
    # Use the lower bound the caller asked for rather than a hard-coded 0,
    # so any non-None `forced_min` is honoured (0 behaves as it always did).
    column_min = pdcolumn.min() if forced_min is None else forced_min
    return (pdcolumn - column_min) / (pdcolumn.max() - column_min)
    
def minmaxMinusOnePlusOne(pdcolumn: pandas.Series):
    """Rescale a column linearly so its minimum maps to -1 and its maximum to +1."""
    lowest = pdcolumn.min()
    spread = pdcolumn.max() - lowest
    # First bring the values into [0, 1], then stretch and shift to [-1, 1].
    unit_scaled = (pdcolumn - lowest) / spread
    return unit_scaled * 2 - 1

def normalize_names_year_state(df_raw, year, state, field_to_normalize, new_field):
    """Return the rows of one year and state with an added normalized column.

    Args:
        df_raw: frame with at least `year`, `state` and `field_to_normalize` columns.
        year: value matched against the `year` column.
        state: value matched against the `state` column.
        field_to_normalize: name of the column to rescale.
        new_field: name of the column that receives the rescaled values.

    Returns:
        A copy of the matching rows (the input frame is not modified) where
        `new_field` holds `minmaxZeroOne(<field_to_normalize>, 0)`.
    """
    in_year = df_raw["year"] == year
    in_state = df_raw["state"] == state
    subset = df_raw.loc[in_year & in_state].copy()
    subset[new_field] = minmaxZeroOne(subset[field_to_normalize], 0)
    return subset

# Fill `features[year][name]["<state>_0"]` either from the cached CSV or, when the
# cache cannot be used, by normalizing the raw counts per (year, state).
try:
    df_feat_current = pandas.read_csv(CSV_FEATURES_CURRENT)

    # Stage every cached row before touching `features`, so that a stale or
    # malformed cache cannot leave the structure half-populated.
    staged_rows = []
    for record in df_feat_current.to_dict("records"):
        year = record.pop("year")
        name = record.pop("name")
        # features[year] raises KeyError when the cache mentions an unknown year.
        staged_rows.append((features[year], name, record))

    for year_features, name, record in staged_rows:
        year_features[name] = record
except (OSError, KeyError, ValueError) as load_error:
    # OSError: file missing/unreadable; ValueError: empty or unparsable CSV
    # (pandas parser errors derive from it); KeyError: cache does not match
    # the current `features` layout. Anything else is a real bug and propagates.
    print("Could not load CSV. Computing...")
    print("Reason: %r" % (load_error,))

    for year in years:
        print("%i " % year, end="")
        year_features = features[year]

        for state in states:
            new_field = state + "_0"
            # The helper works on a filtered copy and creates `new_field` itself,
            # so the shared raw frame `df` is left untouched.
            norm = normalize_names_year_state(df, year, state, "m_num", new_field)
            norm = dict(norm.set_index("m_name")[new_field])

            # Names without a row for this state default to 0.0 ...
            for name in year_features:
                year_features[name][new_field] = 0.0

            # ... and names that do have one get their normalized value.
            for name, value in norm.items():
                year_features[name][new_field] = value

            print(".", end="")

        print()

    # Flatten the nested dict into one record per (year, name).
    tabular = [
        {"year": year, "name": name, **features[year][name]}
        for year in features
        for name in features[year]
    ]

    df_feat_current = pandas.DataFrame(tabular)
    df_feat_current.to_csv(CSV_FEATURES_CURRENT, index=False)

    print()
    print('Data saved to "%s".' % (CSV_FEATURES_CURRENT))

print("Done.")

df_feat_current

Normalizing the ranking of each name per year and state...

Done.


Unnamed: 0,year,name,AK_0,AL_0,AR_0,AZ_0,CA_0,CO_0,CT_0,DC_0,...,SD_0,TN_0,TX_0,UT_0,VA_0,VT_0,WA_0,WI_0,WV_0,WY_0
0,1960,Francis,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.047653,0.056954,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.064777,0.000000,0.000000,0.000000,0.000000
1,1960,Charlie,0.0,0.046014,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,1960,Rory,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,1960,Louis,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.084477,0.041060,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.061644
4,1960,Greg,0.0,0.049759,0.063731,0.071629,0.085869,0.099220,0.031047,0.041060,...,0.090909,0.000000,0.000000,0.082218,0.000000,0.000000,0.117176,0.075032,0.062159,0.157534
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16450,2021,Mason,0.5,0.435294,0.575000,0.368910,0.344655,0.392749,0.485944,0.213333,...,0.454545,0.532710,0.283402,0.318792,0.464765,0.451613,0.467662,0.514881,0.712963,0.387097
16451,2021,Lane,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.272727,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.290323
16452,2021,Jasper,0.0,0.000000,0.343750,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.212121,0.221963,0.000000,0.000000,0.000000,0.225806,0.000000,0.000000,0.240741,0.193548
16453,2021,Karson,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [8]:
import math

# For every year, state and name, store the change of the "<state>_0" feature
# relative to 1..COMPUTE_YEARS_BACKWARDS years earlier under "<state>_<back>".
print("Computing differences with previous years...")

for back in range(1, COMPUTE_YEARS_BACKWARDS + 1):
    print()
    print("Going back %i year%s..." % (back, "s" if back > 1 else ""))

    for year in years:
        year_back = year - back

        # Nothing to compare against before YEAR_START: the "<state>_<back>"
        # keys are simply not created for this year.
        if year_back < YEAR_START:
            continue

        print(".", end="")

        for state in states:
            label_current = "%s_%i" % (state, 0)
            label_back = "%s_%i" % (state, back)

            for name in features[year]:
                try:
                    if name not in features[year_back]:
                        # Name absent in the earlier year: the difference is the
                        # current value itself (i.e. the earlier value counts as 0).
                        features[year][name][label_back] = features[year][name][label_current]
                    else:
                        features[year][name][label_back] = \
                            features[year][name][label_current] - features[year_back][name][label_current]
                except Exception:
                    # Print the context using lookups that cannot fail themselves,
                    # so the exception that propagates is the original one.
                    current_entry = features[year][name]
                    earlier_entry = features.get(year_back, {}).get(name, {})
                    print()
                    print("year:", year)
                    print("year_back:", year_back)
                    print("state:", state)
                    print("name:", name)
                    print("label_current:", label_current)
                    print("label_back:", label_back)
                    print("features[year][name]:", current_entry)
                    print("features[year][name][label_back]:", current_entry.get(label_back))
                    print("features[year][name][label_current]:", current_entry.get(label_current))
                    print("features[year_back][name][label_current]:", earlier_entry.get(label_current))
                    raise

    print()

# Flatten the nested dict into one record per (year, name).
tabular = [
    {"year": year, "name": name, **features[year][name]}
    for year in features
    for name in features[year]
]

df_feat_current_past = pandas.DataFrame(tabular)
df_feat_current_past.to_csv(CSV_FEATURES_PAST, index=False)

print()
print("Done.")
print('Data saved to "%s".' % CSV_FEATURES_PAST)

df_feat_current_past

Computing differences with previous years...

Going back 1 year...
.............................................................

Going back 2 years...
............................................................

Going back 3 years...
...........................................................

Going back 4 years...
..........................................................

Going back 5 years...
.........................................................

Done.
Data saved to "data_features_2_past.csv".


Unnamed: 0,year,name,AK_0,AL_0,AR_0,AZ_0,CA_0,CO_0,CT_0,DC_0,...,SD_5,TN_5,TX_5,UT_5,VA_5,VT_5,WA_5,WI_5,WV_5,WY_5
0,1960,Tommy,0.000000,0.113430,0.144844,0.064607,0.000000,0.000000,0.000000,0.000000,...,,,,,,,,,,
1,1960,Barry,0.000000,0.100054,0.085747,0.000000,0.000000,0.000000,0.037545,0.041060,...,,,,,,,,,,
2,1960,Neil,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,,,,,,,,,,
3,1960,Daryl,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,,,,,,,,,,
4,1960,Eric,0.138158,0.069021,0.053302,0.102528,0.185629,0.158305,0.100361,0.176159,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16450,2021,Kainoa,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
16451,2021,Sawyer,0.000000,0.279412,0.312500,0.000000,0.000000,0.000000,0.000000,0.000000,...,-0.363636,0.075656,0.0,-0.104561,-0.187719,-0.156486,-0.338235,-0.179498,-0.090090,0.233520
16452,2021,Simon,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.266667,...,0.000000,0.000000,0.0,0.000000,0.000000,0.225806,0.000000,0.000000,0.000000,0.000000
16453,2021,Bridger,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,-0.181529,0.000000,0.000000,0.000000,0.000000,0.000000,-0.176718


In [9]:
# Keep, for every year, only the row with the largest `m_num`:
# sort descending, drop all but the first row per year, then order by year.
df_max_per_year = (
    df_country
    .sort_values("m_num", ascending=False)
    .drop_duplicates(subset='year', keep="first")
    .sort_values("year")
)
df_max_per_year

Unnamed: 0,year,m_name,m_num
57,1960,David,85926
422,1961,Michael,86919
673,1962,Michael,85037
924,1963,Michael,83778
1179,1964,Michael,82636
...,...,...,...
15268,2017,Liam,18838
15528,2018,Liam,19940
15812,2019,Liam,20578
16084,2020,Liam,19777


In [10]:
# Label each (year, name) with `next_max`: 1 when the name is the top name of
# the following year, 0 otherwise, and None when the following year has no data.
print("Computing winning names each year...")

for year in features:
    next_year = year + 1
    max_df = df_max_per_year[df_max_per_year["year"] == next_year]
    max_name = max_df.iloc[0]["m_name"] if len(max_df) > 0 else None

    # Every name starts with the default label for this year.
    default_label = None if max_name is None else 0
    for name in features[year]:
        features[year][name]["next_max"] = default_label

    if max_name is None:
        continue

    if max_name in features[year]:
        features[year][max_name]["next_max"] = 1
    else:
        # The following year's top name has no entry in this year, so there is
        # nothing to flag; report it instead of failing with a KeyError.
        print('Warning: "%s" is the top name of %i but is absent from %i; no positive label set.'
              % (max_name, next_year, year))

# Flatten the nested dict into one record per (year, name).
tabular = [
    {"year": year, "name": name, **features[year][name]}
    for year in features
    for name in features[year]
]

df_features = pandas.DataFrame(tabular)
# Move the label to the last column.
df_features = df_features.reindex(columns = [col for col in df_features.columns if col != 'next_max'] + ['next_max'])
df_features.to_csv(CSV_FEATURES_LABEL, index=False)

print()
print("Done.")
print('Data saved to "%s".' % (CSV_FEATURES_LABEL))

df_features

Computing winning names each year...

Done.
Data saved to "data_features_3_label.csv".


Unnamed: 0,year,name,AK_0,AL_0,AR_0,AZ_0,CA_0,CO_0,CT_0,DC_0,...,TN_5,TX_5,UT_5,VA_5,VT_5,WA_5,WI_5,WV_5,WY_5,next_max
0,1960,Tommy,0.000000,0.113430,0.144844,0.064607,0.000000,0.000000,0.000000,0.000000,...,,,,,,,,,,0.0
1,1960,Barry,0.000000,0.100054,0.085747,0.000000,0.000000,0.000000,0.037545,0.041060,...,,,,,,,,,,0.0
2,1960,Neil,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,,,,,,,,,,0.0
3,1960,Daryl,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,,,,,,,,,,0.0
4,1960,Eric,0.138158,0.069021,0.053302,0.102528,0.185629,0.158305,0.100361,0.176159,...,,,,,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16450,2021,Kainoa,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,
16451,2021,Sawyer,0.000000,0.279412,0.312500,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.075656,0.0,-0.104561,-0.187719,-0.156486,-0.338235,-0.179498,-0.090090,0.233520,
16452,2021,Simon,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.266667,...,0.000000,0.0,0.000000,0.000000,0.225806,0.000000,0.000000,0.000000,0.000000,
16453,2021,Bridger,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,-0.181529,0.000000,0.000000,0.000000,0.000000,0.000000,-0.176718,
