In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

from collections import defaultdict
from math import log

# Считывание данных

In [2]:
data = pd.read_csv("names.csv")
data

Unnamed: 0,year,name,percent,sex
0,1880,John,0.081541,boy
1,1880,William,0.080511,boy
2,1880,James,0.050057,boy
3,1880,Charles,0.045167,boy
4,1880,George,0.043292,boy
...,...,...,...,...
257995,2008,Carleigh,0.000128,girl
257996,2008,Iyana,0.000128,girl
257997,2008,Kenley,0.000127,girl
257998,2008,Sloane,0.000127,girl


# Разделение на тестовую и обучающую выборку

In [3]:
data_train, data_test = train_test_split(data, test_size=0.3)

In [4]:
data_test

Unnamed: 0,year,name,percent,sex
20617,1900,Ole,0.000105,boy
68470,1948,Gail,0.000113,boy
1289,1881,Abram,0.000277,boy
128774,2008,Marcelo,0.000131,boy
58287,1938,Garry,0.000324,boy
...,...,...,...,...
120277,2000,Randy,0.000544,boy
180063,1931,Audrey,0.003431,girl
191479,1942,Nona,0.000170,girl
207347,1958,Cherie,0.000333,girl


In [6]:
data_train

Unnamed: 0,year,name,percent,sex
192636,1943,Pattie,0.000099,girl
157814,1908,Jackie,0.000065,girl
225107,1976,Ann,0.001549,girl
146510,1897,Delphia,0.000133,girl
100303,1980,Toby,0.000350,boy
...,...,...,...,...
222215,1973,Darlene,0.000748,girl
222722,1973,Adrian,0.000128,girl
233502,1984,Christopher,0.000218,girl
73951,1953,Oren,0.000029,boy


# Функция для наивной байесовской классификации

In [5]:
def naive_bayesian_train(samples):
    classes, freq = defaultdict(lambda: 0), defaultdict(lambda: 0)

    for feats, label in samples:
        classes[label] += 1
        for feat in feats:
            freq[label, feat] += 1

    for label, feat in freq:
        freq[label, feat] /= classes[label]
    for c in classes:
        classes[c] /= len(samples)

    return classes, freq


def classify(classifier, feats):
    classes, prob = classifier
    return min(classes.keys(),
               key=lambda cl: -log(classes[cl]) \
                              + sum(-log(prob.get((cl, feat))) if (cl, feat) in prob else 0 for feat in feats))

In [6]:
def get_features(sample: str):
    """
    Функция для превращения слова в признак.
    Признак: последняя буква.
    """
    return sample[-1]  # get last letter

# Применение функции

In [7]:
# обучение
samples = [[get_features(name), lable] for name, lable in zip(data_train['name'], data_train['sex'])]
np.array(samples) # (преобразую в np.array для нормального вывода на github)

array([['a', 'girl'],
       ['a', 'girl'],
       ['s', 'boy'],
       ...,
       ['a', 'girl'],
       ['n', 'girl'],
       ['s', 'boy']], dtype='<U4')

In [8]:
classifier = naive_bayesian_train(samples)
classifier

(defaultdict(<function __main__.naive_bayesian_train.<locals>.<lambda>()>,
             {'girl': 0.5002436323366556, 'boy': 0.4997563676633444}),
 defaultdict(<function __main__.naive_bayesian_train.<locals>.<lambda>()>,
             {('girl', 'a'): 0.38399893739484636,
              ('boy', 's'): 0.07875376706257756,
              ('boy', 'e'): 0.138439549725226,
              ('girl', 'n'): 0.08002745063313557,
              ('boy', 'l'): 0.08334071973054423,
              ('boy', 'n'): 0.21367000531820599,
              ('girl', 'e'): 0.30212299654653324,
              ('boy', 'a'): 0.015744105655025705,
              ('boy', 'd'): 0.07288158127991491,
              ('boy', 'x'): 0.0045093954972522604,
              ('girl', 'o'): 0.004958824050296644,
              ('girl', 'y'): 0.07828964845479501,
              ('boy', 'y'): 0.11064084382201737,
              ('girl', 'i'): 0.02776055963871425,
              ('boy', 'r'): 0.061225846481120366,
              ('boy', 'h'): 0.02684

In [9]:
# проверка
names = ['Max', 'Nikita', 'Vika']
for name in names:
    print(f"{name} is a {classify(classifier, get_features(name))}")

Max is a boy
Nikita is a girl
Vika is a girl


In [10]:
# посмотрим на результат
data_test['predicted_sex'] = [classify(classifier, get_features(name)) for name in data_test['name']]
data_test[['name', 'sex', 'predicted_sex']]

Unnamed: 0,name,sex,predicted_sex
20617,Ole,boy,girl
68470,Gail,boy,boy
1289,Abram,boy,boy
128774,Marcelo,boy,boy
58287,Garry,boy,boy
...,...,...,...
120277,Randy,boy,boy
180063,Audrey,girl,boy
191479,Nona,girl,girl
207347,Cherie,girl,girl


# Проверка через метрики

In [11]:
# точность (accuracy)

accuracy_naive = accuracy_score(data_test['sex'], data_test['predicted_sex'])
print(f"{accuracy_naive = }")

accuracy_naive = 0.773811369509044


In [12]:
# полнота (recall) вычисленная вручную

recall = [0, 0]

# насколько верно он определяет мужские имена
sex = 'boy'
recall[0] = data_test.loc[data_test['sex'].str.contains(sex) & data_test['predicted_sex'].str.contains(sex)].shape[0] / \
            data_test.loc[data_test['sex'].str.contains(sex)].shape[0]

# насколько верно он определяет мужские имена
sex = 'girl'
recall[1] = data_test.loc[data_test['sex'].str.contains(sex) & data_test['predicted_sex'].str.contains(sex)].shape[0] / \
            data_test.loc[data_test['sex'].str.contains(sex)].shape[0]

print(f"{recall = }")

recall = [0.8319997935164154, 0.7154904801324503]


In [14]:
# полнота (recall) через sklearn

print(f"recall_score = {recall_score(data_test['sex'], data_test['predicted_sex'], average=None)}")

recall_score = [0.83199979 0.71549048]


In [15]:
# precision

print(f"precision_score = {precision_score(data_test['sex'], data_test['predicted_sex'], average=None)}")

precision_score = [0.74561099 0.80949454]


# Изменим get_features

In [19]:
def get_features(sample: str):
    """
    Функция для превращения слова в признак.
    Признак: последние 2 буквы.
    """
    return sample[-2:]


samples = [[get_features(name), lable] for name, lable in zip(data_train['name'], data_train['sex'])]
classifier = naive_bayesian_train(samples)
data_test['predicted_sex'] = [classify(classifier, get_features(name)) for name in data_test['name']]
accuracy = accuracy_score(data_test['sex'], data_test['predicted_sex'])
print(f"{accuracy = }")

accuracy = 0.6994186046511628


In [20]:
def get_features(sample: str):
    """
    Функция для превращения слова в признак.
    Признак: Первая + последняя буквы.
    """
    return sample[1] + sample[-1]  # get last letter


samples = [[get_features(name), lable] for name, lable in zip(data_train['name'], data_train['sex'])]
classifier = naive_bayesian_train(samples)
data_test['predicted_sex'] = [classify(classifier, get_features(name)) for name in data_test['name']]
accuracy = accuracy_score(data_test['sex'], data_test['predicted_sex'])
print(f"{accuracy = }")

accuracy = 0.7071447028423773


При обоих изменениях точность упала. Вывод: признаки влияют на результат. В нашем случае самый удачный признак — последняя буква

# Изменим classify

In [23]:
def get_features(sample: str):
    """
    Функция для превращения слова в признак.
    Признак: последняя буква.
    """
    return sample[-1]  # get last letter

In [24]:
def classify(classifier, feats):
    """
    Функция для классификации
    """
    classes, prob = classifier
    return max(classes.keys(),
               key=lambda cl: classes[cl] \
                              + sum(prob.get((cl, feat)) if (cl, feat) in prob else 0 for feat in feats))


samples = [[get_features(name), lable] for name, lable in zip(data_train['name'], data_train['sex'])]
classifier = naive_bayesian_train(samples)
data_test['predicted_sex'] = [classify(classifier, get_features(name)) for name in data_test['name']]
accuracy = accuracy_score(data_test['sex'], data_test['predicted_sex'])
print(f"{accuracy = }")

accuracy = 0.7767312661498708


In [27]:
def classify(classifier, feats):
    """
    Функция для классификации
    """
    classes, prob = classifier
    return max(classes.keys(),
               key=lambda cl: classes[cl] \
                              + sum(prob.get((cl, feat)) if (cl, feat) in prob else 1 for feat in feats))


samples = [[get_features(name), lable] for name, lable in zip(data_train['name'], data_train['sex'])]
classifier = naive_bayesian_train(samples)
data_test['predicted_sex'] = [classify(classifier, get_features(name)) for name in data_test['name']]
accuracy = accuracy_score(data_test['sex'], data_test['predicted_sex'])
print(f"{accuracy = }")

accuracy = 0.7734237726098191


Видно, что незначительно изменилась. Как будто бы можно считать, что не изменилась.

# sklearn.naive_bayes

In [28]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

data = pd.read_csv('names.csv')
X, y = data['name'], data['sex']

# превратим имена в последнюю букву
X_numbers = np.array([ord(x[-1]) for x in X])

# разделим данные
X_train, X_test, y_train, y_test = train_test_split(X_numbers, y, test_size=0.2)

# гауссовский метод
clf = GaussianNB()
clf.fit(X_train.reshape(-1, 1), y_train)
y_pred_gauss = clf.predict(X_test.reshape(-1, 1))

# мультиноминальный
clf = MultinomialNB()
clf.fit(X_train.reshape(-1, 1), y_train)
y_pred_multi = clf.predict(X_test.reshape(-1, 1))

# точность
accuracy_gauss = accuracy_score(y_test, y_pred_gauss)
accuracy_multi = accuracy_score(y_test, y_pred_multi)
print(f"Наивный: {accuracy_naive}")
print(f"Гаусс: {accuracy_gauss}")
print(f"Мультиномиальный: {accuracy_multi}")

Наивный: 0.773811369509044
Гаусс: 0.7325775193798449
Мультиномиальный: 0.498953488372093


Видно, что наиболее точный метод — наивный байесовский, который написан руками. Гауссовский показал себя похуже. У мультиномиального точность сопоставима с угадыванием.