In [122]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import sklearn

# Считывание данных

In [123]:
data = pd.read_csv("names.csv")
data

Unnamed: 0,year,name,percent,sex
0,1880,John,0.081541,boy
1,1880,William,0.080511,boy
2,1880,James,0.050057,boy
3,1880,Charles,0.045167,boy
4,1880,George,0.043292,boy
...,...,...,...,...
257995,2008,Carleigh,0.000128,girl
257996,2008,Iyana,0.000128,girl
257997,2008,Kenley,0.000127,girl
257998,2008,Sloane,0.000127,girl


# Разделение на тестовую и обучающую выборку

In [124]:
data_train, data_test = train_test_split(data, test_size=0.3)

In [125]:
data_test

Unnamed: 0,year,name,percent,sex
89322,1969,Julio,0.000315,boy
132088,1883,Mollie,0.002507,girl
28844,1908,Ellie,0.000066,boy
52027,1932,Carl,0.006245,boy
149312,1900,Linnie,0.000355,girl
...,...,...,...,...
150925,1901,Daphne,0.000051,girl
76898,1956,Rich,0.000032,boy
74374,1954,Dominic,0.000172,boy
68797,1948,Bonnie,0.000040,boy


In [126]:
data_train

Unnamed: 0,year,name,percent,sex
171539,1922,Theodora,0.000138,girl
234864,1985,Janel,0.000100,girl
24756,1904,Vernie,0.000079,boy
97962,1977,Domenic,0.000046,boy
84775,1964,Merlin,0.000047,boy
...,...,...,...,...
12315,1892,Porter,0.000266,boy
52846,1932,Edison,0.000053,boy
145989,1896,Pollie,0.000048,girl
83431,1963,Willis,0.000165,boy


# Функция для наивной байесовской классификации

In [127]:
from collections import defaultdict
from math import log


def train(samples):
    """
    Функция для обучения модели
    """
    classes, freq = defaultdict(lambda: 0), defaultdict(lambda: 0) # классы и частоты

    # заполняем словари из классов и частот
    for feats, label in samples: # идем по признакам (feats) и полам (label)
        classes[label] += 1  # count classes frequencies
        for feat in feats:
            freq[label, feat] += 1  # count features frequencies

    # превращаем count в количество
    for label, feat in freq:  # normalize features frequencies
        freq[label, feat] /= classes[label]
    for c in classes:  # normalize classes frequencies
        classes[c] /= len(samples)

    return classes, freq  # return P(C) and P(O|C)


def classify(classifier, feats):
    """
    Функция для классификации
    """
    classes, prob = classifier
    return min(classes.keys(),  # calculate argmin(-log(P(C|O))) -> argmax(P(C|O))
           key=lambda cl: -log(classes[cl]) \
                          + sum(-log(prob.get((cl, feat))) if (cl, feat) in prob else 0 for feat in feats))

def get_features(sample: str):
    """
    Функция для превращения слова в признак.
    Признак: последняя буква.
    """
    return sample[-1]  # get last letter

# Применение функции

In [128]:
# обучение
samples = [[get_features(name), lable] for name, lable in zip(data_train['name'], data_train['sex'])]
samples

[['a', 'girl'],
 ['l', 'girl'],
 ['e', 'boy'],
 ['c', 'boy'],
 ['n', 'boy'],
 ['n', 'boy'],
 ['d', 'boy'],
 ['n', 'boy'],
 ['l', 'girl'],
 ['r', 'girl'],
 ['a', 'girl'],
 ['a', 'boy'],
 ['d', 'boy'],
 ['a', 'girl'],
 ['x', 'boy'],
 ['l', 'girl'],
 ['n', 'boy'],
 ['e', 'girl'],
 ['e', 'girl'],
 ['a', 'girl'],
 ['e', 'girl'],
 ['a', 'girl'],
 ['l', 'boy'],
 ['y', 'boy'],
 ['a', 'girl'],
 ['a', 'girl'],
 ['a', 'girl'],
 ['h', 'boy'],
 ['m', 'boy'],
 ['l', 'boy'],
 ['n', 'girl'],
 ['e', 'girl'],
 ['o', 'boy'],
 ['h', 'girl'],
 ['n', 'girl'],
 ['e', 'girl'],
 ['a', 'girl'],
 ['y', 'boy'],
 ['e', 'girl'],
 ['h', 'girl'],
 ['t', 'boy'],
 ['l', 'boy'],
 ['o', 'boy'],
 ['e', 'boy'],
 ['r', 'boy'],
 ['a', 'girl'],
 ['e', 'girl'],
 ['a', 'girl'],
 ['h', 'girl'],
 ['t', 'boy'],
 ['a', 'girl'],
 ['i', 'boy'],
 ['y', 'boy'],
 ['n', 'boy'],
 ['a', 'girl'],
 ['k', 'boy'],
 ['a', 'girl'],
 ['a', 'girl'],
 ['y', 'girl'],
 ['n', 'boy'],
 ['e', 'boy'],
 ['s', 'boy'],
 ['i', 'girl'],
 ['d', 'boy'],
 ['h', 

In [129]:
classifier = train(samples)
classifier

(defaultdict(<function __main__.train.<locals>.<lambda>()>,
             {'girl': 0.49962901439645624, 'boy': 0.5003709856035438}),
 defaultdict(<function __main__.train.<locals>.<lambda>()>,
             {('girl', 'a'): 0.3849146099542296,
              ('girl', 'l'): 0.03525317788392273,
              ('boy', 'e'): 0.13735102415704847,
              ('boy', 'c'): 0.009339692586895659,
              ('boy', 'n'): 0.21506744718757953,
              ('boy', 'd'): 0.0727145971427623,
              ('girl', 'r'): 0.011813859674398502,
              ('boy', 'a'): 0.015824360662631272,
              ('boy', 'x'): 0.004293602753217436,
              ('girl', 'e'): 0.3027717132313012,
              ('boy', 'l'): 0.08320515232330386,
              ('boy', 'y'): 0.11029468721989222,
              ('boy', 'h'): 0.026281717883740747,
              ('boy', 'm'): 0.016123142297520113,
              ('girl', 'n'): 0.07941662141345184,
              ('boy', 'o'): 0.06316465081279671,
              ('

In [130]:
# проверка
names = ['Max', 'Nikita', 'Vika']
for name in names:
    print(f"{name} is a {classify(classifier, get_features(name))}")

Max is a boy
Nikita is a girl
Vika is a girl


In [131]:
data_test['class'] = [classify(classifier, get_features(name)) for name in data_test['name']]
data_test[['name', 'sex', 'class']]

Unnamed: 0,name,sex,class
89322,Julio,boy,boy
132088,Mollie,girl,girl
28844,Ellie,boy,girl
52027,Carl,boy,boy
149312,Linnie,girl,girl
...,...,...,...
150925,Daphne,girl,girl
76898,Rich,boy,boy
74374,Dominic,boy,boy
68797,Bonnie,boy,girl


# Точность

## Через частоту

In [132]:
# вручную
accuracy = sum([sex == cl for sex, cl in zip(data_test['sex'], data_test['class'])]) / data_test.shape[0]
accuracy

0.7715503875968992

In [133]:
accuracy = sklearn.metrics.accuracy_score(data_test['sex'], data_test['class'])
accuracy_naive = sklearn.metrics.accuracy_score(data_test['sex'], data_test['class'])
accuracy

0.7715503875968992

## Через полноту

In [134]:
# вручную

recall = [0, 0]

# насколько верно он определяет мужские имена
sex = 'boy'
recall[0] = data_test.loc[data_test['sex'].str.contains(sex) & data_test['class'].str.contains(sex)].shape[0] / data_test.loc[data_test['sex'].str.contains(sex)].shape[0]

# насколько верно он определяет мужские имена
sex = 'girl'
recall[1] = data_test.loc[data_test['sex'].str.contains(sex) & data_test['class'].str.contains(sex)].shape[0] / data_test.loc[data_test['sex'].str.contains(sex)].shape[0]

recall

[0.8302228664613155, 0.7130807129775324]

In [135]:
# автоматически

sklearn.metrics.recall_score(data_test['sex'], data_test['class'], average=None),

(array([0.83022287, 0.71308071]),)

# Изменим get_features

In [136]:
def get_features(sample: str):
    """
    Функция для превращения слова в признак.
    Признак: последние 2 буквы.
    """
    return sample[-2:]  # get last letter

samples = [[get_features(name), lable] for name, lable in zip(data_train['name'], data_train['sex'])]
classifier = train(samples)
data_test['class'] = [classify(classifier, get_features(name)) for name in data_test['name']]
accuracy = sklearn.metrics.accuracy_score(data_test['sex'], data_test['class'])
accuracy

0.6977131782945737

In [137]:
def get_features(sample: str):
    """
    Функция для превращения слова в признак.
    Признак: Первая + последняя буквы.
    """
    return sample[1] + sample[-1]  # get last letter

samples = [[get_features(name), lable] for name, lable in zip(data_train['name'], data_train['sex'])]
classifier = train(samples)
data_test['class'] = [classify(classifier, get_features(name)) for name in data_test['name']]
accuracy = sklearn.metrics.accuracy_score(data_test['sex'], data_test['class'])
accuracy

0.7053617571059432

# Изменим classify

In [138]:
def classify(classifier, feats):
    """
    Функция для классификации
    """
    classes, prob = classifier
    return max(classes.keys(),
           key=lambda cl: log(classes[cl]) \
                          + sum(prob.get((cl, feat)) if (cl, feat) in prob else 0 for feat in feats))

samples = [[get_features(name), lable] for name, lable in zip(data_train['name'], data_train['sex'])]
classifier = train(samples)
data_test['class'] = [classify(classifier, get_features(name)) for name in data_test['name']]
accuracy = sklearn.metrics.accuracy_score(data_test['sex'], data_test['class'])
accuracy

0.6738372093023256

Видно, что точность упала

# sklearn.naive_bayes

In [139]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

data = pd.read_csv('names.csv')
X, y = data['name'], data['sex']

# превратим имена в последнюю букву
X_numbers = np.array([ord(x[-1]) for x in X])

# разделим данные
X_train, X_test, y_train, y_test = train_test_split(X_numbers, y, test_size=0.2)

# гауссовский метод
gnb = GaussianNB()
gnb.fit(X_train.reshape(-1, 1), y_train)
y_pred_gauss = gnb.predict(X_test.reshape(-1, 1))

# мультиноминальный
mnb = MultinomialNB()
mnb.fit(X_train.reshape(-1, 1), y_train)
y_pred_multi = gnb.predict(X_test.reshape(-1, 1))

# точность
accuracy_gauss = sklearn.metrics.accuracy_score(y_test, y_pred_gauss)
accuracy_multi = sklearn.metrics.accuracy_score(y_test, y_pred_multi)
print(f"Наивный: {accuracy_naive}")
print(f"Гаусс: {accuracy_gauss}")
print(f"Мультиномиальный: {accuracy_multi}")

Наивный: 0.7715503875968992
Гаусс: 0.7329263565891473
Мультиномиальный: 0.7329263565891473
