In [274]:
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn

# Считывание данных

In [275]:
data = pd.read_csv("names.csv")
data

Unnamed: 0,year,name,percent,sex
0,1880,John,0.081541,boy
1,1880,William,0.080511,boy
2,1880,James,0.050057,boy
3,1880,Charles,0.045167,boy
4,1880,George,0.043292,boy
...,...,...,...,...
257995,2008,Carleigh,0.000128,girl
257996,2008,Iyana,0.000128,girl
257997,2008,Kenley,0.000127,girl
257998,2008,Sloane,0.000127,girl


# Разделение на тестовую и обучающую выборку

In [276]:
data_train, data_test = train_test_split(data, test_size=0.3)

In [277]:
data_test

Unnamed: 0,year,name,percent,sex
190170,1941,Lynn,0.001012,girl
162345,1913,Molly,0.000301,girl
214684,1965,Flora,0.000113,girl
221982,1972,Julianna,0.000076,girl
161221,1912,Jewel,0.000709,girl
...,...,...,...,...
189383,1940,Cleo,0.000255,girl
216614,1967,Kerrie,0.000139,girl
83674,1963,Carmine,0.000065,boy
182476,1933,Merle,0.000174,girl


In [278]:
data_train

Unnamed: 0,year,name,percent,sex
39244,1919,Murray,0.000457,boy
144187,1895,Selma,0.000842,girl
166943,1917,Thomas,0.000049,girl
196026,1947,Gloria,0.006936,girl
198301,1949,Gertrude,0.000373,girl
...,...,...,...,...
436,1880,Merle,0.000144,boy
59761,1939,Theo,0.000057,boy
116578,1996,Vicente,0.000157,boy
249707,2000,Rebeca,0.000165,girl


# Функция для наивной байесовской классификации

In [279]:
from collections import defaultdict
from math import log


def train(samples):
    """
    Функция для обучения модели
    """
    classes, freq = defaultdict(lambda: 0), defaultdict(lambda: 0) # классы и частоты

    # заполняем словари из классов и частот
    for feats, label in samples: # идем по признакам (feats) и полам (label)
        classes[label] += 1  # count classes frequencies
        for feat in feats:
            freq[label, feat] += 1  # count features frequencies

    # превращаем count в количество
    for label, feat in freq:  # normalize features frequencies
        freq[label, feat] /= classes[label]
    for c in classes:  # normalize classes frequencies
        classes[c] /= len(samples)

    return classes, freq  # return P(C) and P(O|C)


def classify(classifier, feats):
    """
    Функция для классификации
    """
    classes, prob = classifier
    return min(classes.keys(),  # calculate argmin(-log(P(C|O))) -> argmax(P(C|O))
           key=lambda cl: -log(classes[cl]) \
                          + sum(-log(prob.get((cl, feat))) if (cl, feat) in prob else 0 for feat in feats))

def get_features(sample: str):
    """
    Функция для превращения слова в признак.
    Признак: последняя буква.
    """
    return sample[-1]  # get last letter

# Применение функции

In [280]:
# обучение
samples = [[get_features(name), lable] for name, lable in zip(data_train['name'], data_train['sex'])]
samples

[['y', 'boy'],
 ['a', 'girl'],
 ['s', 'girl'],
 ['a', 'girl'],
 ['e', 'girl'],
 ['t', 'boy'],
 ['a', 'girl'],
 ['a', 'girl'],
 ['a', 'girl'],
 ['n', 'girl'],
 ['y', 'girl'],
 ['a', 'girl'],
 ['n', 'boy'],
 ['e', 'girl'],
 ['y', 'boy'],
 ['a', 'girl'],
 ['e', 'girl'],
 ['l', 'boy'],
 ['h', 'girl'],
 ['e', 'girl'],
 ['x', 'boy'],
 ['e', 'girl'],
 ['e', 'boy'],
 ['e', 'girl'],
 ['r', 'boy'],
 ['t', 'girl'],
 ['a', 'girl'],
 ['s', 'girl'],
 ['n', 'boy'],
 ['n', 'boy'],
 ['d', 'boy'],
 ['e', 'boy'],
 ['a', 'girl'],
 ['r', 'boy'],
 ['a', 'girl'],
 ['n', 'boy'],
 ['l', 'girl'],
 ['d', 'boy'],
 ['a', 'girl'],
 ['y', 'girl'],
 ['y', 'boy'],
 ['y', 'boy'],
 ['o', 'girl'],
 ['e', 'girl'],
 ['m', 'boy'],
 ['s', 'boy'],
 ['d', 'boy'],
 ['r', 'boy'],
 ['h', 'boy'],
 ['a', 'girl'],
 ['y', 'boy'],
 ['k', 'boy'],
 ['a', 'girl'],
 ['e', 'girl'],
 ['y', 'girl'],
 ['l', 'boy'],
 ['e', 'boy'],
 ['e', 'boy'],
 ['a', 'girl'],
 ['e', 'boy'],
 ['s', 'boy'],
 ['n', 'boy'],
 ['r', 'boy'],
 ['r', 'girl'],
 ['r', 

In [281]:
classifier = train(samples)
classifier

(defaultdict(<function __main__.train.<locals>.<lambda>()>,
             {'boy': 0.4998449612403101, 'girl': 0.5001550387596899}),
 defaultdict(<function __main__.train.<locals>.<lambda>()>,
             {('boy', 'y'): 0.11165145338532435,
              ('girl', 'a'): 0.38392303604640865,
              ('girl', 's'): 0.022362943937649454,
              ('girl', 'e'): 0.30275219201133646,
              ('boy', 't'): 0.05229750088621057,
              ('girl', 'n'): 0.08024089983172439,
              ('girl', 'y'): 0.07858028518288902,
              ('boy', 'n'): 0.21423032612548742,
              ('boy', 'l'): 0.08299361928394186,
              ('girl', 'h'): 0.024787441324949075,
              ('boy', 'x'): 0.004375664657922723,
              ('boy', 'e'): 0.1392126019142148,
              ('boy', 'r'): 0.060461715703651185,
              ('girl', 't'): 0.010971127446638916,
              ('boy', 'd'): 0.0722040056717476,
              ('girl', 'l'): 0.03469577539633336,
              

In [282]:
# проверка
names = ['Max', 'Nikita', 'Vika']
for name in names:
    print(f"{name} is a {classify(classifier, get_features(name))}")

Max is a boy
Nikita is a girl
Vika is a girl


In [283]:
data_test['class'] = [classify(classifier, get_features(name)) for name in data_test['name']]
data_test[['name', 'sex', 'class']]

Unnamed: 0,name,sex,class
190170,Lynn,girl,boy
162345,Molly,girl,boy
214684,Flora,girl,girl
221982,Julianna,girl,girl
161221,Jewel,girl,boy
...,...,...,...
189383,Cleo,girl,boy
216614,Kerrie,girl,girl
83674,Carmine,boy,girl
182476,Merle,girl,girl


# Точность

## Через частоту

In [284]:
# вручную
accuracy = sum([sex == cl for sex, cl in zip(data_test['sex'], data_test['class'])]) / data_test.shape[0]
accuracy

0.7752454780361757

In [287]:
accuracy = sklearn.metrics.accuracy_score(data_test['sex'], data_test['class'])
accuracy

0.7752454780361757

## Через полноту

In [294]:
# вручную

recall = [0, 0]

# насколько верно он определяет мужские имена
sex = 'boy'
recall[0] = data_test.loc[data_test['sex'].str.contains(sex) & data_test['class'].str.contains(sex)].shape[0] / data_test.loc[data_test['sex'].str.contains(sex)].shape[0]

# насколько верно он определяет мужские имена
sex = 'girl'
recall[1] = data_test.loc[data_test['sex'].str.contains(sex) & data_test['class'].str.contains(sex)].shape[0] / data_test.loc[data_test['sex'].str.contains(sex)].shape[0]

recall

[0.8344866763065483, 0.7159184940008275]

In [295]:
# автоматически

sklearn.metrics.recall_score(data_test['sex'], data_test['class'], average=None),

(array([0.83448668, 0.71591849]),)

# Изменим get_features

In [299]:
def get_features(sample: str):
    """
    Функция для превращения слова в признак.
    Признак: последние 2 буквы.
    """
    return sample[-2:]  # get last letter

samples = [[get_features(name), lable] for name, lable in zip(data_train['name'], data_train['sex'])]
classifier = train(samples)
data_test['class'] = [classify(classifier, get_features(name)) for name in data_test['name']]
accuracy = sklearn.metrics.accuracy_score(data_test['sex'], data_test['class'])
accuracy

0.7007364341085272

In [300]:
def get_features(sample: str):
    """
    Функция для превращения слова в признак.
    Признак: Первая + последняя буквы.
    """
    return sample[1] + sample[-1]  # get last letter

samples = [[get_features(name), lable] for name, lable in zip(data_train['name'], data_train['sex'])]
classifier = train(samples)
data_test['class'] = [classify(classifier, get_features(name)) for name in data_test['name']]
accuracy = sklearn.metrics.accuracy_score(data_test['sex'], data_test['class'])
accuracy

0.7107235142118863

# Изменим classify

In [304]:
def classify(classifier, feats):
    """
    Функция для классификации
    """
    classes, prob = classifier
    return max(classes.keys(),
           key=lambda cl: log(classes[cl]) \
                          + sum(prob.get((cl, feat)) if (cl, feat) in prob else 0 for feat in feats))

samples = [[get_features(name), lable] for name, lable in zip(data_train['name'], data_train['sex'])]
classifier = train(samples)
data_test['class'] = [classify(classifier, get_features(name)) for name in data_test['name']]
accuracy = sklearn.metrics.accuracy_score(data_test['sex'], data_test['class'])
accuracy

0.674031007751938

Видно, что точность упала

# sklearn.naive_bayes

In [310]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
data_train['class'] = gnb.fit(data_train['name'], data_train['sex']).predict(data_train['name'])
print("Number of mislabeled points out of a total %d points : %d" % (data_train['name'].shape[0], (data_train['sex'] != data_train['class']).sum()))

ValueError: could not convert string to float: 'Murray'