In [10]:
import pandas as pd
import os
import requests, zipfile, io, shutil
if os.path.exists("names"):
    shutil.rmtree("names")
os.mkdir("names")
source = "https://www.ssa.gov/oact/babynames/names.zip"
req = requests.get(source, stream=True)
z = zipfile.ZipFile(io.BytesIO(req.content))
z.extractall("names")

combination.ipynb  gender_compendium.csv  [0m[01;34mnames[0m/


# Part 1: Reading files

## Schema: 
- NAME (index): (total Men), (total Women)

In [11]:
dataframes = []
for filename in os.listdir('names'):
    if not filename.endswith(".txt"):
        continue
    df = pd.read_csv("names/"+filename, header=None, names=["name", "sex", "count"])
    dataframes.append(df)

df = pd.concat(dataframes)
df = df.groupby(["name", "sex"], as_index=False).sum()

df = df.pivot(index='name', columns='sex', values='count').fillna(0)
df.columns = ["F", "M"]
df.reset_index(inplace=True)
df = df[["name", "M", "F"]]
df = df.astype({"M": int, "F": int})

df.to_csv("gender_compendium.csv", index=False, header=False)

In [None]:
def build_compendium():
    if os.path.exists("names"):
        shutil.rmtree("names")
    os.mkdir("names")
    source = "https://www.ssa.gov/oact/babynames/names.zip"
    req = requests.get(source, stream=True)
    z = zipfile.ZipFile(io.BytesIO(req.content))
    z.extractall("names")
    
    dataframes = []
    for filename in os.listdir('names'):
        if not filename.endswith(".txt"):
            continue
        df = pd.read_csv("names/"+filename, header=None, names=["name", "sex", "count"])
        dataframes.append(df)

        df = pd.concat(dataframes)
        df = df.groupby(["name", "sex"], as_index=False).sum()

        df = df.pivot(index='name', columns='sex', values='count').fillna(0)
        df.columns = ["F", "M"]
        df.reset_index(inplace=True)
        df = df[["name", "M", "F"]]
        df = df.astype({"M": int, "F": int})

        df.to_csv("gender_compendium.csv", index=False, header=False)

In [17]:
import pandas as pd
import os
import requests, zipfile, io, shutil

class Genderizer:
    df: pd.DataFrame = None
    
    def __init__(self, path: str = "gender_compendium.csv"):
        if not os.path.exists(path):
            self.df = self.build_compendium()
        else:
            self.df = pd.read_csv(path)

    def build_compendium(self) -> pd.DataFrame:
        if os.path.exists("names"):
            shutil.rmtree("names")
        os.mkdir("names")
        source = "https://www.ssa.gov/oact/babynames/names.zip"
        req = requests.get(source, stream=True)
        z = zipfile.ZipFile(io.BytesIO(req.content))
        z.extractall("names")

        dataframes = []
        for filename in os.listdir('names'):
            if not filename.endswith(".txt"):
                continue
            df = pd.read_csv("names/"+filename, header=None, names=["name", "sex", "count"])
            dataframes.append(df)

            df = pd.concat(dataframes)
            df = df.groupby(["name", "sex"], as_index=False).sum()

            df = df.pivot(index='name', columns='sex', values='count').fillna(0)
            df.columns = ["F", "M"]
            df.reset_index(inplace=True)
            df = df[["name", "M", "F"]]
            df = df.astype({"M": int, "F": int})

            df.to_csv("gender_compendium.csv", index=False)
            shutil.rmtree("names")
            return df

    def gender(self, name: str) -> dict[str, int]:
        name = name.capitalize()
        row = self.df[self.df["name"] == name]
        if row.empty:
            return self.request_gender(name)
        m = int(row["M"].values[0])
        f = int(row["F"].values[0])
        if m > f:
            return {"gender": "M", "prob": m/(m+f), "count": m+f}
        elif f > m:
            return {"gender": "F", "prob": f/(m+f), "count": m+f}
        else :
            return {"gender": "U", "prob": 0.5, "count": m+f}

    def request_gender(self, name: str) -> dict[str, int]:
        name = name.capitalize()
        url = f"https://api.genderize.io?name={name}"
        response = requests.get(url)
        data = response.json()
        gender = "M" if data["gender"] == "male" else "F"
        return {"gender": gender, "prob": data["probability"], "count": data["count"]}
        
    def genders(self, names: list[str]) -> dict[str, dict[str, int]]:
        return {name: self.gender(name) for name in names}

In [20]:
gender = Genderizer()

print(gender.gender("mary"))
print(gender.genders(["mary", "www"]))

{'gender': 'F', 'prob': 0.9969630412972152, 'count': 73758}
{'mary': {'gender': 'F', 'prob': 0.9969630412972152, 'count': 73758}, 'www': {'gender': 'M', 'prob': 0.63, 'count': 2956}}
