In [144]:
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import seaborn as sns

#nltk.download('names')
from nltk.corpus import names

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [325]:
names_dict = {'name':names.words()}
names_df = pd.DataFrame(data= names_dict)

In [326]:
def label_names(row):
    if row in names.words('female.txt'):
        return "female"
    if row in names.words('male.txt'):
        return "male"
    else: return -1
    
def get_last_letter(row):
    return row[-1]

In [327]:
names_df['label'] = names_df.name.apply(lambda x: label_names(x))

In [328]:
names_df['last_letter'] = names_df.name.apply(lambda x: get_last_letter(x))
names_df = names_df.sample(frac=1).reset_index(drop=True)

In [329]:
print(f"Min length name: {names_df.name.apply(len).min()}",
      f"Average length name: {names_df.name.apply(len).mean()}",
      f"Median length name: {names_df.name.apply(len).median()}",
      f"Max length name: {names_df.name.apply(len).max()}",
      sep="\n")

Min length name: 2
Average length name: 6.03285498489426
Median length name: 6.0
Max length name: 15


## Base Line Model

The book uses a Naive Bayes classifer for determining name-gender congruence. The model shown in the book achieved a 0.782 accuracy. This is what we intend to beat. 

We will leverage the fact that there is a diversity of name length to create character n-grams of the names and assign count scores to those character n-grams. Above we see that the minimum length name is 2 characters and maximum is 15 characters. We will build n-grams to count from 0 - 15. After several tests we found that this combination is likely to be the best for model accuaracy and precision. 

In [330]:
char_vectorizer = CountVectorizer(analyzer='char', ngram_range=(0, 15))
X = char_vectorizer.fit_transform(names_df.name)

In [331]:
# collapse to sparse matrix
X = X.tocsc()

In [332]:
train_dev = X[500:]
test = X[:500]

In [333]:
X_train, X_dev, y_train, y_dev = train_test_split(train_dev, names_df['label'][500:],
                                                    test_size=0.07, random_state=42)

In [334]:
naive_model = MultinomialNB().fit(X_train, y_train)

In [335]:
print(classification_report(y_dev, naive_model.predict(X_dev)))

              precision    recall  f1-score   support

      female       0.84      0.95      0.89       340
        male       0.87      0.65      0.75       182

    accuracy                           0.84       522
   macro avg       0.85      0.80      0.82       522
weighted avg       0.85      0.84      0.84       522



In [336]:
print(confusion_matrix(y_dev, naive_model.predict(X_dev)))

[[322  18]
 [ 63 119]]


In [337]:
svm_model = SVC().fit(X_train, y_train)

In [338]:
print(classification_report(y_dev, svm_model.predict(X_dev)))

              precision    recall  f1-score   support

      female       0.82      0.95      0.88       340
        male       0.87      0.61      0.72       182

    accuracy                           0.83       522
   macro avg       0.85      0.78      0.80       522
weighted avg       0.84      0.83      0.82       522



In [339]:
print(confusion_matrix(y_dev, svm_model.predict(X_dev)))

[[324  16]
 [ 71 111]]


#### Logistic Regression is the best model

In [340]:
logit_model = LogisticRegression(max_iter=3000).fit(X_train, y_train)

In [341]:
print(classification_report(y_dev, logit_model.predict(X_dev)))

              precision    recall  f1-score   support

      female       0.87      0.96      0.91       340
        male       0.91      0.74      0.81       182

    accuracy                           0.88       522
   macro avg       0.89      0.85      0.86       522
weighted avg       0.88      0.88      0.88       522



In [342]:
print(confusion_matrix(y_dev, logit_model.predict(X_dev)))

[[326  14]
 [ 48 134]]


The Logit also minimizes False Negtives/False Positive ratio.

In [343]:
gmb_model = GradientBoostingClassifier().fit(X_train, y_train)

In [344]:
print(classification_report(y_dev, gmb_model.predict(X_dev)))

              precision    recall  f1-score   support

      female       0.76      0.96      0.85       340
        male       0.86      0.43      0.58       182

    accuracy                           0.78       522
   macro avg       0.81      0.70      0.71       522
weighted avg       0.79      0.78      0.75       522



In [345]:
print(confusion_matrix(y_dev, gmb_model.predict(X_dev)))

[[327  13]
 [103  79]]


## Development Results

We see that the best model is the most simple. The logistic regression model significantly better, with the SVM classifier a close second place. Whereas the, more complex, and generally superior Tree Boosted model was equal to the base model. If constrained to a Navie Bayesian model only, our NB model trained on character n-grams demonstrated better results (83% accuracy) as compared to the base model. 

## Test Set Results

Fit the best development model to the entire training set and evaluate on the test set. 

In [346]:
final_model = LogisticRegression(max_iter=3000).fit(train_dev, names_df.label[500:])

In [347]:
print(classification_report(names_df.label[:500], final_model.predict(test)))

              precision    recall  f1-score   support

      female       0.88      0.95      0.91       346
        male       0.87      0.69      0.77       154

    accuracy                           0.87       500
   macro avg       0.87      0.82      0.84       500
weighted avg       0.87      0.87      0.87       500



In [348]:
print(confusion_matrix(names_df.label[:500], final_model.predict(test)))

[[330  16]
 [ 47 107]]


Consistent results from the development set. Great!