In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

from collections import defaultdict
from math import log

# Считывание данных

In [3]:
data = pd.read_csv("names.csv")
data

Unnamed: 0,year,name,percent,sex
0,1880,John,0.081541,boy
1,1880,William,0.080511,boy
2,1880,James,0.050057,boy
3,1880,Charles,0.045167,boy
4,1880,George,0.043292,boy
...,...,...,...,...
257995,2008,Carleigh,0.000128,girl
257996,2008,Iyana,0.000128,girl
257997,2008,Kenley,0.000127,girl
257998,2008,Sloane,0.000127,girl


# Разделение на тестовую и обучающую выборку

In [4]:
data_train, data_test = train_test_split(data, test_size=0.3)

In [5]:
data_test

Unnamed: 0,year,name,percent,sex
18246,1898,Carroll,0.000386,boy
203093,1954,Michelle,0.002225,girl
66537,1946,Rolland,0.000086,boy
197916,1948,Marilou,0.000052,girl
135351,1886,Eda,0.000241,girl
...,...,...,...,...
90598,1970,Domingo,0.000096,boy
253949,2004,Tracy,0.000126,girl
218355,1969,Annmarie,0.000371,girl
214148,1965,Elaine,0.001367,girl


In [6]:
data_train

Unnamed: 0,year,name,percent,sex
192636,1943,Pattie,0.000099,girl
157814,1908,Jackie,0.000065,girl
225107,1976,Ann,0.001549,girl
146510,1897,Delphia,0.000133,girl
100303,1980,Toby,0.000350,boy
...,...,...,...,...
222215,1973,Darlene,0.000748,girl
222722,1973,Adrian,0.000128,girl
233502,1984,Christopher,0.000218,girl
73951,1953,Oren,0.000029,boy


# Функция для наивной байесовской классификации

In [7]:
def naive_bayesian_train(samples):
    classes, freq = defaultdict(lambda: 0), defaultdict(lambda: 0)

    for feats, label in samples:
        classes[label] += 1
        for feat in feats:
            freq[label, feat] += 1

    for label, feat in freq:
        freq[label, feat] /= classes[label]
    for c in classes:
        classes[c] /= len(samples)

    return classes, freq


def classify(classifier, feats):
    classes, prob = classifier
    return min(classes.keys(),
               key=lambda cl: -log(classes[cl]) \
                              + sum(-log(prob.get((cl, feat))) if (cl, feat) in prob else 0 for feat in feats))

In [8]:
def get_features(sample: str):
    """
    Функция для превращения слова в признак.
    Признак: последняя буква.
    """
    return sample[-1]  # get last letter

# Применение функции

In [9]:
# обучение
samples = [[get_features(name), lable] for name, lable in zip(data_train['name'], data_train['sex'])]
np.array(samples)

array([['e', 'girl'],
       ['e', 'girl'],
       ['n', 'girl'],
       ...,
       ['r', 'girl'],
       ['n', 'boy'],
       ['n', 'boy']], dtype='<U4')

In [96]:
classifier = naive_bayesian_train(samples)
classifier

(defaultdict(<function __main__.naive_bayesian_train.<locals>.<lambda>()>,
             {'boy': 0.49988372093023253, 'girl': 0.5001162790697674}),
 defaultdict(<function __main__.naive_bayesian_train.<locals>.<lambda>()>,
             {('boy', 'n'): 0.21466786295816304,
              ('boy', 'o'): 0.06299360870191296,
              ('girl', 'a'): 0.3844953000963231,
              ('girl', 'e'): 0.3006609758527917,
              ('boy', 't'): 0.053766656697570865,
              ('boy', 'i'): 0.00559377042280043,
              ('girl', 'h'): 0.025531161081033205,
              ('girl', 'n'): 0.0803246199665637,
              ('boy', 'e'): 0.13867012261987838,
              ('boy', 'h'): 0.02626302905437588,
              ('girl', 'y'): 0.0781988684801984,
              ('boy', 'y'): 0.11094495951439427,
              ('girl', 'r'): 0.012023781844753712,
              ('boy', 'd'): 0.07219840716002614,
              ('boy', 'm'): 0.01663731321791336,
              ('girl', 'l'): 0.0349309

In [97]:
# проверка
names = ['Max', 'Nikita', 'Vika']
for name in names:
    print(f"{name} is a {classify(classifier, get_features(name))}")

Max is a boy
Nikita is a girl
Vika is a girl


In [98]:
data_test['predicted_sex'] = [classify(classifier, get_features(name)) for name in data_test['name']]
data_test[['name', 'sex', 'predicted_sex']]

Unnamed: 0,name,sex,predicted_sex
129389,Antonia,girl,girl
144396,Eddie,girl,girl
63818,Ian,boy,boy
200861,Phoebe,girl,girl
61723,Arther,boy,boy
...,...,...,...
177529,Isabell,girl,boy
122941,Darrin,boy,boy
158518,Ramona,girl,girl
159336,Joyce,girl,girl


# Проверка через метрики

In [99]:
# точность (accuracy)

accuracy_naive = accuracy_score(data_test['sex'], data_test['predicted_sex'])
accuracy_naive

0.7762532299741602

In [100]:
# полнота (recall) вычисленная вручную

recall = [0, 0]

# насколько верно он определяет мужские имена
sex = 'boy'
recall[0] = data_test.loc[data_test['sex'].str.contains(sex) & data_test['predicted_sex'].str.contains(sex)].shape[0] / \
            data_test.loc[data_test['sex'].str.contains(sex)].shape[0]

# насколько верно он определяет мужские имена
sex = 'girl'
recall[1] = data_test.loc[data_test['sex'].str.contains(sex) & data_test['predicted_sex'].str.contains(sex)].shape[0] / \
            data_test.loc[data_test['sex'].str.contains(sex)].shape[0]

recall

[0.8336303297952016, 0.7188138266242664]

In [101]:
# полнота (recall) через sklearn

list(recall_score(data_test['sex'], data_test['predicted_sex'], average=None))

[0.8336303297952016, 0.7188138266242664]

In [102]:
# precision

list(precision_score(data_test['sex'], data_test['predicted_sex'], average=None))

[0.7479782180512108, 0.8118849467075485]

# Изменим get_features

In [103]:
def get_features(sample: str):
    """
    Функция для превращения слова в признак.
    Признак: последние 2 буквы.
    """
    return sample[-2:]


samples = [[get_features(name), lable] for name, lable in zip(data_train['name'], data_train['sex'])]
classifier = naive_bayesian_train(samples)
data_test['predicted_sex'] = [classify(classifier, get_features(name)) for name in data_test['name']]
accuracy = accuracy_score(data_test['sex'], data_test['predicted_sex'])
accuracy

0.7014082687338501

In [104]:
def get_features(sample: str):
    """
    Функция для превращения слова в признак.
    Признак: Первая + последняя буквы.
    """
    return sample[1] + sample[-1]  # get last letter


samples = [[get_features(name), lable] for name, lable in zip(data_train['name'], data_train['sex'])]
classifier = naive_bayesian_train(samples)
data_test['predicted_sex'] = [classify(classifier, get_features(name)) for name in data_test['name']]
accuracy = accuracy_score(data_test['sex'], data_test['predicted_sex'])
accuracy

0.7095865633074936

При обоих изменениях точность упала

# Изменим classify

In [105]:
def get_features(sample: str):
    """
    Функция для превращения слова в признак.
    Признак: последняя буква.
    """
    return sample[-1]  # get last letter

In [106]:
def classify(classifier, feats):
    """
    Функция для классификации
    """
    classes, prob = classifier
    return max(classes.keys(),
               key=lambda cl: classes[cl] \
                              + sum(prob.get((cl, feat)) if (cl, feat) in prob else 0 for feat in feats))


samples = [[get_features(name), lable] for name, lable in zip(data_train['name'], data_train['sex'])]
classifier = naive_bayesian_train(samples)
data_test['predicted_sex'] = [classify(classifier, get_features(name)) for name in data_test['name']]
accuracy = accuracy_score(data_test['sex'], data_test['predicted_sex'])
accuracy

0.7799354005167959

Видно, что незначительно увеличилась. Как будто бы можно считать, что не изменилась

# sklearn.naive_bayes

In [107]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

data = pd.read_csv('names.csv')
X, y = data['name'], data['sex']

# превратим имена в последнюю букву
X_numbers = np.array([ord(x[-1]) for x in X])

# разделим данные
X_train, X_test, y_train, y_test = train_test_split(X_numbers, y, test_size=0.2)

# гауссовский метод
gnb = GaussianNB()
gnb.fit(X_train.reshape(-1, 1), y_train)
y_pred_gauss = gnb.predict(X_test.reshape(-1, 1))

# мультиноминальный
mnb = MultinomialNB()
mnb.fit(X_train.reshape(-1, 1), y_train)
y_pred_multi = gnb.predict(X_test.reshape(-1, 1))

# точность
accuracy_gauss = accuracy_score(y_test, y_pred_gauss)
accuracy_multi = accuracy_score(y_test, y_pred_multi)
print(f"Наивный: {accuracy_naive}")
print(f"Гаусс: {accuracy_gauss}")
print(f"Мультиномиальный: {accuracy_multi}")

Наивный: 0.7762532299741602
Гаусс: 0.7308914728682171
Мультиномиальный: 0.7308914728682171


Видно, что наиболее точный метод — наивный байесовский, который написан руками