In [1]:
import pandas as pd
import numpy as np

pd.options.display.max_rows = 7
np.random.seed(451)

In [2]:
df = pd.read_csv('baby-names.csv')
df.head()

Unnamed: 0,year,name,percent,sex
0,1880,John,0.081541,boy
1,1880,William,0.080511,boy
2,1880,James,0.050057,boy
3,1880,Charles,0.045167,boy
4,1880,George,0.043292,boy


In [3]:
df.shape

(258000, 4)

In [4]:
df = df.sample(10_000)

## Preprocessing

In [5]:
df.sex.value_counts()

sex
girl    5034
boy     4966
Name: count, dtype: int64

In [6]:
df.groupby(['name', 'sex'])['percent'].count()[('George')]
#df.groupby(['name', 'sex'])['percent'].count()[('George')][('girl')] # George and girl become indexes after grouping

sex
boy     7
girl    1
Name: percent, dtype: int64

In [7]:
df

Unnamed: 0,year,name,percent,sex
29125,1909,Orville,0.001233,boy
71943,1951,Brendan,0.000029,boy
96033,1976,Shawn,0.006190,boy
...,...,...,...,...
175976,1926,Mariam,0.000048,girl
97444,1977,Mickey,0.000189,boy
221409,1972,Carey,0.000298,girl


In [8]:
df = df.set_index(['name', 'sex'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,year,percent
name,sex,Unnamed: 2_level_1,Unnamed: 3_level_1
Orville,boy,1909,0.001233
Brendan,boy,1951,0.000029
Shawn,boy,1976,0.006190
...,...,...,...
Mariam,girl,1926,0.000048
Mickey,boy,1977,0.000189
Carey,girl,1972,0.000298


In [9]:
groups = df.groupby(['name', 'sex'])
groups.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,year,percent
name,sex,Unnamed: 2_level_1,Unnamed: 3_level_1
Orville,boy,1909,0.001233
Brendan,boy,1951,0.000029
Shawn,boy,1976,0.006190
...,...,...,...
Mariam,girl,1926,0.000048
Mickey,boy,1977,0.000189
Carey,girl,1972,0.000298


In [10]:
counts = groups['percent'].count().sort_values(ascending=False)
counts

name     sex 
Antonio  boy     12
Eddie    boy     11
Kathryn  girl    11
                 ..
Elmo     boy      1
Eloy     boy      1
Zula     girl     1
Name: percent, Length: 3837, dtype: int64

In [11]:
#df[df.index.duplicated(keep='first')]

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1,3),
    use_idf=False 
    )

vectorizer

In [13]:
counts.index.values

array([('Antonio', 'boy'), ('Eddie', 'boy'), ('Kathryn', 'girl'), ...,
       ('Elmo', 'boy'), ('Eloy', 'boy'), ('Zula', 'girl')], dtype=object)

In [14]:
df = pd.DataFrame([list(tup) for tup in counts.index.values],
                 columns=['name','sex'])
df['count'] = counts.values
df

Unnamed: 0,name,sex,count
0,Antonio,boy,12
1,Eddie,boy,11
2,Kathryn,girl,11
...,...,...,...
3834,Elmo,boy,1
3835,Eloy,boy,1
3836,Zula,girl,1


In [15]:
#split into train and test
df['istrain'] = np.random.rand(len(df))<.9
df

Unnamed: 0,name,sex,count,istrain
0,Antonio,boy,12,False
1,Eddie,boy,11,True
2,Kathryn,girl,11,True
...,...,...,...,...
3834,Elmo,boy,1,True
3835,Eloy,boy,1,True
3836,Zula,girl,1,True


In [16]:
df.index = pd.MultiIndex.from_tuples(
    zip(df['name'], df['sex']), names=['name_', 'sex_']
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,name,sex,count,istrain
name_,sex_,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Antonio,boy,Antonio,boy,12,False
Eddie,boy,Eddie,boy,11,True
Kathryn,girl,Kathryn,girl,11,True
...,...,...,...,...,...
Elmo,boy,Elmo,boy,1,True
Eloy,boy,Eloy,boy,1,True
Zula,girl,Zula,girl,1,True


In [17]:
df_most_common = {}

for name, group in df.groupby('name'):
    row_dict = group.iloc[group['count'].argmax()].to_dict()
    df_most_common[(name, row_dict['sex'])] = row_dict
    
df_most_common = pd.DataFrame(df_most_common).T
#df_most_common[df_most_common['name']=='Antonio']

In [18]:
df_most_common['istest'] = ~df_most_common['istrain'].astype(bool)
df_most_common

Unnamed: 0,Unnamed: 1,name,sex,count,istrain,istest
Abbey,girl,Abbey,girl,2,True,False
Abbie,girl,Abbie,girl,4,True,False
Abby,girl,Abby,girl,4,True,False
...,...,...,...,...,...,...
Zona,girl,Zona,girl,4,True,False
Zora,girl,Zora,girl,2,True,False
Zula,girl,Zula,girl,1,True,False


In [19]:
df['istest'] = df_most_common['istest']
df['istest'] = df['istest'].fillna(False)
df['istrain'] = ~df['istest']
istrain = df['istrain']

df['istrain'].sum()/len(df)

  df['istest'] = df['istest'].fillna(False)


0.9150377899400574

In [20]:
df['istest'].sum()/len(df)

0.08496221005994266

In [21]:
(df['istrain'].sum() + df['istest'].sum())/len(df)

1.0

In [22]:
unique_names = df['name'][istrain].unique()

vectorizer.fit(unique_names)
vecs = vectorizer.transform(df['name'])
vecs

<3837x2638 sparse matrix of type '<class 'numpy.float64'>'
	with 53038 stored elements in Compressed Sparse Row format>

In [23]:
vecs = pd.DataFrame(vecs.toarray())
vecs.columns = vectorizer.get_feature_names_out()
vecs.index = df.index
vecs.iloc[:,:7]

Unnamed: 0_level_0,Unnamed: 1_level_0,a,aa,aak,ab,aba,abb,abd
name_,sex_,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Antonio,boy,0.213201,0.0,0.0,0.0,0.0,0.0,0.0
Eddie,boy,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
Kathryn,girl,0.235702,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
Elmo,boy,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
Eloy,boy,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
Zula,girl,0.333333,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1,3),
    use_idf=False,
    lowercase=False
    )

vectorizer.fit(unique_names)
vecs = vectorizer.transform(df['name'])
vecs = pd.DataFrame(vecs.toarray())
vecs.columns = vectorizer.get_feature_names_out()
vecs.index = df.index
vecs.iloc[:,:7]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,Ab,Abb,Abd,Abe,Abi,Abn
name_,sex_,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Antonio,boy,0.213201,0.0,0.0,0.0,0.0,0.0,0.0
Eddie,boy,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
Kathryn,girl,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
Elmo,boy,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
Eloy,boy,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
Zula,girl,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


## Building neural network (Only forward pass)

In [26]:
import torch

class LogisticRegressionNN(torch.nn.Module):
    def __init__(self, num_features, num_outputs=1):
        super().__init__()
        self.linear = torch.nn.Linear(num_features, num_outputs)

    def forward(self, X):
        return torch.sigmoid(self.linear(X))

model = LogisticRegressionNN(num_features=vecs.shape[1], num_outputs=1)
model

LogisticRegressionNN(
  (linear): Linear(in_features=3395, out_features=1, bias=True)
)

In [27]:
loss_func_train = torch.nn.BCELoss(
    weight=torch.Tensor(df[['count']][istrain].values))

loss_func_test = torch.nn.BCELoss(
    weight=torch.Tensor(df[['count']][~istrain].values))

loss_func_train

BCELoss()

In [28]:
from torch.optim import SGD

hyperparams = {'momentum': 0.001, 'lr': 0.02}
optimizer = SGD(
    model.parameters(), **hyperparams)

optimizer

SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    fused: None
    lr: 0.02
    maximize: False
    momentum: 0.001
    nesterov: False
    weight_decay: 0
)

In [29]:
X = vecs.values
y = (df[['sex']] == 'girl').values

X_train = torch.Tensor(X[istrain])
X_test = torch.Tensor(X[~istrain])
y_train = torch.Tensor(y[istrain])
y_test = torch.Tensor(y[~istrain])

In [31]:
from tqdm import tqdm
num_epochs = 200
pbar_epochs = tqdm(range(num_epochs), desc='Epoch:', total=num_epochs)

for epoch in pbar_epochs:
    optimizer.zero_grad()
    outputs = model(X_train)
    loss_train = loss_func_train(outputs, y_train)
    loss_train.backward()
    optimizer.step()

Epoch:: 100%|███████████████████████████████████████████████████████████████████████| 200/200 [00:01<00:00, 106.47it/s]


In [32]:
#utility functions for monitoring neuron over time

def make_array(x):
    if hasattr(x, 'detach'):
        return torch.squeeze(x).detach().numpy()
    return x

In [33]:
def measure_binary_accuracy(y_pred, y):
    y_pred = make_array(y_pred).round()
    y = make_array(y).round()
    num_correct = (y_pred==y).sum()
    return num_correct/len(y)

In [35]:
# rerun the training using the above utility functions

for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(X_train)
    loss_train = loss_func_train(outputs, y_train)
    loss_train.backward()
    optimizer.step()
    outputs_test = model(X_test)
    loss_test = loss_func_test(outputs_test, y_test).item()
    accuracy_test = measure_binary_accuracy(outputs_test, y_test)
    if epoch%20==19:
        print(f"Epoch {epoch}:"
              f" loss_train/test: {loss_train.item():.4f}/{loss_test:.4f}",
        f" accuracy_test: {accuracy_test:.4f}")

Epoch 19: loss_train/test: 1.7729/1.7738  accuracy_test: 0.6779
Epoch 39: loss_train/test: 1.7700/1.7708  accuracy_test: 0.6840
Epoch 59: loss_train/test: 1.7672/1.7678  accuracy_test: 0.6871
Epoch 79: loss_train/test: 1.7645/1.7649  accuracy_test: 0.6871
Epoch 99: loss_train/test: 1.7617/1.7621  accuracy_test: 0.6933
Epoch 119: loss_train/test: 1.7590/1.7592  accuracy_test: 0.6840
Epoch 139: loss_train/test: 1.7563/1.7564  accuracy_test: 0.6840
Epoch 159: loss_train/test: 1.7537/1.7536  accuracy_test: 0.6871
Epoch 179: loss_train/test: 1.7511/1.7508  accuracy_test: 0.6871
Epoch 199: loss_train/test: 1.7485/1.7481  accuracy_test: 0.6902


In [36]:
# test on unseen samples

X = vectorizer.transform(
    ['John', 'Greg', 'Vishvesh',
     'Ruby', 'Carlana', 'Sarah']
)

model(torch.Tensor(X.todense()))

tensor([[0.4768],
        [0.4858],
        [0.5001],
        [0.4829],
        [0.5494],
        [0.5401]], grad_fn=<SigmoidBackward0>)