In [52]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

from collections import defaultdict
from math import log

# Считывание данных

In [53]:
data = pd.read_csv("names.csv")
data

Unnamed: 0,year,name,percent,sex
0,1880,John,0.081541,boy
1,1880,William,0.080511,boy
2,1880,James,0.050057,boy
3,1880,Charles,0.045167,boy
4,1880,George,0.043292,boy
...,...,...,...,...
257995,2008,Carleigh,0.000128,girl
257996,2008,Iyana,0.000128,girl
257997,2008,Kenley,0.000127,girl
257998,2008,Sloane,0.000127,girl


# Разделение на тестовую и обучающую выборку

In [54]:
data_train, data_test = train_test_split(data, test_size=0.3)

In [55]:
data_test

Unnamed: 0,year,name,percent,sex
29023,1909,Louis,0.006961,boy
126206,2006,Drew,0.000850,boy
24532,1904,Randall,0.000130,boy
157246,1908,Winnie,0.000556,girl
103451,1983,Ty,0.000181,boy
...,...,...,...,...
144116,1895,Ora,0.001683,girl
147429,1898,Lovie,0.000190,girl
136863,1887,Almira,0.000051,girl
117714,1997,Fred,0.000111,boy


In [56]:
data_train

Unnamed: 0,year,name,percent,sex
207691,1958,Lu,0.000099,girl
3475,1883,Rex,0.000133,boy
169573,1920,Lavina,0.000126,girl
177910,1928,Maryellen,0.000055,girl
248771,1999,Kacey,0.000147,girl
...,...,...,...,...
230747,1981,Brandon,0.000123,girl
116181,1996,Darius,0.000883,boy
193489,1944,Bennie,0.000158,girl
210220,1961,Dianna,0.000733,girl


# Функция для наивной байесовской классификации

In [75]:
def naive_bayesian_train(samples):
    classes, freq = defaultdict(lambda: 0), defaultdict(lambda: 0)

    for feats, label in samples:
        classes[label] += 1
        for feat in feats:
            freq[label, feat] += 1

    for label, feat in freq:
        freq[label, feat] /= classes[label]
    for c in classes:
        classes[c] /= len(samples)

    return classes, freq


def classify(classifier, feats):
    classes, prob = classifier
    return min(classes.keys(),
               key=lambda cl: -log(classes[cl]) \
                              + sum(-log(prob.get((cl, feat))) if (cl, feat) in prob else 0 for feat in feats))

In [58]:
def get_features(sample: str):
    """
    Функция для превращения слова в признак.
    Признак: последняя буква.
    """
    return sample[-1]  # get last letter

# Применение функции

In [59]:
# обучение
samples = [[get_features(name), lable] for name, lable in zip(data_train['name'], data_train['sex'])]
samples

[['u', 'girl'],
 ['x', 'boy'],
 ['a', 'girl'],
 ['n', 'girl'],
 ['y', 'girl'],
 ['i', 'girl'],
 ['n', 'boy'],
 ['t', 'boy'],
 ['e', 'girl'],
 ['n', 'boy'],
 ['e', 'boy'],
 ['a', 'girl'],
 ['s', 'girl'],
 ['a', 'girl'],
 ['n', 'girl'],
 ['e', 'girl'],
 ['l', 'girl'],
 ['e', 'boy'],
 ['a', 'girl'],
 ['e', 'girl'],
 ['d', 'boy'],
 ['a', 'girl'],
 ['k', 'boy'],
 ['s', 'boy'],
 ['a', 'girl'],
 ['n', 'boy'],
 ['h', 'girl'],
 ['e', 'boy'],
 ['y', 'boy'],
 ['a', 'girl'],
 ['a', 'girl'],
 ['n', 'girl'],
 ['a', 'girl'],
 ['a', 'girl'],
 ['d', 'boy'],
 ['s', 'boy'],
 ['y', 'boy'],
 ['n', 'girl'],
 ['n', 'boy'],
 ['e', 'girl'],
 ['s', 'girl'],
 ['e', 'girl'],
 ['a', 'girl'],
 ['e', 'girl'],
 ['l', 'boy'],
 ['n', 'girl'],
 ['h', 'girl'],
 ['a', 'girl'],
 ['l', 'boy'],
 ['k', 'boy'],
 ['n', 'girl'],
 ['o', 'boy'],
 ['n', 'boy'],
 ['e', 'girl'],
 ['e', 'boy'],
 ['y', 'girl'],
 ['y', 'boy'],
 ['o', 'boy'],
 ['a', 'girl'],
 ['a', 'boy'],
 ['y', 'girl'],
 ['r', 'boy'],
 ['o', 'boy'],
 ['n', 'boy'],
 ['n

In [60]:
classifier = naive_bayesian_train(samples)
classifier

(defaultdict(<function __main__.naive_bayesian_train.<locals>.<lambda>()>,
             {'girl': 0.4993466223698782, 'boy': 0.5006533776301219}),
 defaultdict(<function __main__.naive_bayesian_train.<locals>.<lambda>()>,
             {('girl', 'u'): 0.0019626976558515005,
              ('boy', 'x'): 0.00442389789643655,
              ('girl', 'a'): 0.3842008383047615,
              ('girl', 'n'): 0.08033753964205717,
              ('girl', 'y'): 0.07799782661728505,
              ('girl', 'i'): 0.027876959925484023,
              ('boy', 'n'): 0.2147913026167356,
              ('boy', 't'): 0.05265544471233604,
              ('girl', 'e'): 0.3016344725111441,
              ('boy', 'e'): 0.13832422747682985,
              ('girl', 's'): 0.022421325763456124,
              ('girl', 'l'): 0.0357721052981748,
              ('boy', 'd'): 0.07276206065164016,
              ('boy', 'k'): 0.02210842973744166,
              ('boy', 's'): 0.07737397420867527,
              ('girl', 'h'): 0.02479

In [61]:
# проверка
names = ['Max', 'Nikita', 'Vika']
for name in names:
    print(f"{name} is a {classify(classifier, get_features(name))}")

Max is a boy
Nikita is a girl
Vika is a girl


In [62]:
data_test['predicted_sex'] = [classify(classifier, get_features(name)) for name in data_test['name']]
data_test[['name', 'sex', 'predicted_sex']]

Unnamed: 0,name,sex,predicted_sex
29023,Louis,boy,boy
126206,Drew,boy,boy
24532,Randall,boy,boy
157246,Winnie,girl,girl
103451,Ty,boy,boy
...,...,...,...
144116,Ora,girl,girl
147429,Lovie,girl,girl
136863,Almira,girl,girl
117714,Fred,boy,boy


# Проверка через метрики

In [63]:
# точность (accuracy)

accuracy_naive = accuracy_score(data_test['sex'], data_test['predicted_sex'])
accuracy_naive

0.7745219638242894

In [64]:
# полнота (recall) вычисленная вручную

recall = [0, 0]

# насколько верно он определяет мужские имена
sex = 'boy'
recall[0] = data_test.loc[data_test['sex'].str.contains(sex) & data_test['predicted_sex'].str.contains(sex)].shape[0] / \
            data_test.loc[data_test['sex'].str.contains(sex)].shape[0]

# насколько верно он определяет мужские имена
sex = 'girl'
recall[1] = data_test.loc[data_test['sex'].str.contains(sex) & data_test['predicted_sex'].str.contains(sex)].shape[0] / \
            data_test.loc[data_test['sex'].str.contains(sex)].shape[0]

recall

[0.833756674096729, 0.7156473800814055]

In [65]:
# полнота (recall) через sklearn

list(recall_score(data_test['sex'], data_test['predicted_sex'], average=None))

[0.833756674096729, 0.7156473800814055]

In [66]:
# precision

list(precision_score(data_test['sex'], data_test['predicted_sex'], average=None))

[0.7445262232097394, 0.8124232321459905]

# Изменим get_features

In [67]:
def get_features(sample: str):
    """
    Функция для превращения слова в признак.
    Признак: последние 2 буквы.
    """
    return sample[-2:]


samples = [[get_features(name), lable] for name, lable in zip(data_train['name'], data_train['sex'])]
classifier = naive_bayesian_train(samples)
data_test['predicted_sex'] = [classify(classifier, get_features(name)) for name in data_test['name']]
accuracy = accuracy_score(data_test['sex'], data_test['predicted_sex'])
accuracy

0.7009173126614987

In [73]:
def get_features(sample: str):
    """
    Функция для превращения слова в признак.
    Признак: Первая + последняя буквы.
    """
    return sample[1] + sample[-1]  # get last letter


samples = [[get_features(name), lable] for name, lable in zip(data_train['name'], data_train['sex'])]
classifier = naive_bayesian_train(samples)
data_test['predicted_sex'] = [classify(classifier, get_features(name)) for name in data_test['name']]
accuracy = accuracy_score(data_test['sex'], data_test['predicted_sex'])
accuracy

0.6761111111111111

При обоих изменениях точность упала

# Изменим classify

In [76]:
def get_features(sample: str):
    """
    Функция для превращения слова в признак.
    Признак: последняя буква.
    """
    return sample[-1]  # get last letter

In [81]:
def classify(classifier, feats):
    """
    Функция для классификации
    """
    classes, prob = classifier
    return max(classes.keys(),
               key=lambda cl: classes[cl] \
                              + sum(prob.get((cl, feat)) if (cl, feat) in prob else 0 for feat in feats))


samples = [[get_features(name), lable] for name, lable in zip(data_train['name'], data_train['sex'])]
classifier = naive_bayesian_train(samples)
data_test['predicted_sex'] = [classify(classifier, get_features(name)) for name in data_test['name']]
accuracy = accuracy_score(data_test['sex'], data_test['predicted_sex'])
accuracy

0.7775193798449612

Видно, что незначительно увеличилась. Как будто бы можно считать, что не изменилась

# sklearn.naive_bayes

In [82]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

data = pd.read_csv('names.csv')
X, y = data['name'], data['sex']

# превратим имена в последнюю букву
X_numbers = np.array([ord(x[-1]) for x in X])

# разделим данные
X_train, X_test, y_train, y_test = train_test_split(X_numbers, y, test_size=0.2)

# гауссовский метод
gnb = GaussianNB()
gnb.fit(X_train.reshape(-1, 1), y_train)
y_pred_gauss = gnb.predict(X_test.reshape(-1, 1))

# мультиноминальный
mnb = MultinomialNB()
mnb.fit(X_train.reshape(-1, 1), y_train)
y_pred_multi = gnb.predict(X_test.reshape(-1, 1))

# точность
accuracy_gauss = accuracy_score(y_test, y_pred_gauss)
accuracy_multi = accuracy_score(y_test, y_pred_multi)
print(f"Наивный: {accuracy_naive}")
print(f"Гаусс: {accuracy_gauss}")
print(f"Мультиномиальный: {accuracy_multi}")

Наивный: 0.7745219638242894
Гаусс: 0.7315116279069768
Мультиномиальный: 0.7315116279069768


Видно, что наиболее точный метод — наивный байесовский, который написан руками