In [116]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Считывание данных

In [117]:
# Load the names table from the CSV file in the working directory; the
# preview below shows the columns year, name, percent and sex.
data = pd.read_csv("names.csv")
data

Unnamed: 0,year,name,percent,sex
0,1880,John,0.081541,boy
1,1880,William,0.080511,boy
2,1880,James,0.050057,boy
3,1880,Charles,0.045167,boy
4,1880,George,0.043292,boy
...,...,...,...,...
257995,2008,Carleigh,0.000128,girl
257996,2008,Iyana,0.000128,girl
257997,2008,Kenley,0.000127,girl
257998,2008,Sloane,0.000127,girl


# Разделение на тестовую и обучающую выборку

In [118]:
# Hold out 30% of the rows for evaluation. A fixed random_state makes the
# shuffle, and therefore every downstream number, reproducible on re-run.
data_train, data_test = train_test_split(data, test_size=0.3, random_state=42)

In [119]:
# Preview the held-out test split.
data_test

Unnamed: 0,year,name,percent,sex
112312,1992,Geoffrey,0.000403,boy
205089,1956,Helen,0.002561,girl
76029,1956,Anthony,0.007251,boy
24535,1904,Valentine,0.000130,boy
51612,1931,Ken,0.000089,boy
...,...,...,...,...
24723,1904,Alois,0.000079,boy
244051,1995,Brooke,0.003318,girl
255149,2006,Liliana,0.001088,girl
88590,1968,Efrain,0.000093,boy


In [120]:
# Preview the training split.
data_train

Unnamed: 0,year,name,percent,sex
169852,1920,Elinore,0.000058,girl
108381,1988,Clifton,0.000259,boy
146973,1897,Silvia,0.000048,girl
167513,1918,Marianne,0.000149,girl
14175,1894,Adam,0.000689,boy
...,...,...,...,...
110530,1990,Dana,0.000161,boy
110617,1990,Leonel,0.000124,boy
215180,1966,Shelia,0.001028,girl
173796,1924,Savannah,0.000068,girl


# Функция для наивной байесовской классификации

In [121]:
from collections import defaultdict
from math import log


def train(samples):
    """
    Fit a naive Bayes model on labelled samples.

    Args:
        samples: sized collection of ``(feats, label)`` pairs, where ``feats``
            is an iterable of features observed for one object and ``label``
            is that object's class.

    Returns:
        A pair ``(classes, freq)`` of defaultdicts:
        ``classes[label]`` is the share of samples carrying ``label``, and
        ``freq[label, feat]`` is the number of times ``feat`` occurred with
        ``label`` divided by the number of samples carrying ``label``.
    """
    # Both tables start as raw counters and are normalised in place below.
    label_stats = defaultdict(lambda: 0)
    pair_stats = defaultdict(lambda: 0)

    # Counting pass: one increment per sample and per (label, feature) hit.
    for sample in samples:
        feats, label = sample
        label_stats[label] += 1
        for feat in feats:
            pair_stats[label, feat] += 1

    # Pair counts are normalised first because they need the raw label counts.
    for key in pair_stats:
        pair_stats[key] /= label_stats[key[0]]
    # Then label counts become shares of the whole sample set.
    for label in label_stats:
        label_stats[label] /= len(samples)

    return label_stats, pair_stats


def classify(classifier, feats, unseen_prob=1e-7):
    """
    Pick the most probable class for a feature sequence with naive Bayes.

    Args:
        classifier: ``(classes, prob)`` pair; ``classes`` maps
            label -> P(label) and ``prob`` maps
            (label, feature) -> P(feature | label).
        feats: iterable of features of the object being classified.
        unseen_prob: probability used for a (label, feature) pair that is
            missing from ``prob``. Must be greater than 0.

    Returns:
        The label with the smallest value of
        ``-log P(label) + sum(-log P(feat | label))``, which is the label
        with the largest posterior probability.

    Raises:
        ValueError: if ``classes`` is empty (raised by ``min``).
    """
    classes, prob = classifier
    # feats is scanned once per class, so materialise one-shot iterables.
    feats = list(feats)

    def cost(label):
        # A pair never seen in training must be penalised, not ignored:
        # contributing 0 here would mean P(feat | label) = 1 and would make
        # the class that never saw the feature the most attractive one.
        return -log(classes[label]) + sum(
            -log(prob.get((label, feat), unseen_prob)) for feat in feats
        )

    return min(classes.keys(), key=cost)

def get_features(sample: str) -> str:
    """
    Turn a name into the feature sequence used by the classifier.

    The only feature is the last letter of the name.

    Args:
        sample: the name to extract features from.

    Returns:
        A one-character string holding the last letter of ``sample``, or an
        empty string when ``sample`` is empty, so an empty name yields no
        features instead of raising ``IndexError``.
    """
    # Slicing, unlike indexing, tolerates an empty string.
    return sample[-1:]

# Применение функции

In [122]:
# Training: pair each training name's features with its sex label, then fit.
samples = [
    [get_features(name), label]
    for name, label in zip(data_train['name'], data_train['sex'])
]
classifier = train(samples)
classifier

(defaultdict(<function __main__.train.<locals>.<lambda>()>,
             {'girl': 0.49861018826135106, 'boy': 0.5013898117386489}),
 defaultdict(<function __main__.train.<locals>.<lambda>()>,
             {('girl', 'e'): 0.30026985307998977,
              ('boy', 'n'): 0.21403408024207352,
              ('girl', 'a'): 0.38456840164799166,
              ('boy', 'm'): 0.01652107652041391,
              ('boy', 'h'): 0.026857792846020474,
              ('girl', 'y'): 0.07945674021921398,
              ('boy', 'y'): 0.11100926549679187,
              ('boy', 's'): 0.07856346147474903,
              ('boy', 'r'): 0.060728208412938564,
              ('girl', 'n'): 0.08031183022576598,
              ('boy', 'e'): 0.13838610285916225,
              ('girl', 'd'): 0.007895701229330698,
              ('boy', 'l'): 0.08320173162085455,
              ('boy', 'd'): 0.07292023279698734,
              ('boy', 'o'): 0.062163863458161696,
              ('boy', 'i'): 0.005621141677065963,
              

In [123]:
# Sanity check on a few hand-picked names.
names = ['Max', 'Nikita', 'Vika']
for name in names:
    predicted_sex = classify(classifier, get_features(name))
    print(f"{name} is a {predicted_sex}")

Max is a boy
Nikita is a girl
Vika is a girl


In [124]:
# Predict a class for every test name. `assign` builds a new frame instead of
# writing a column into the split returned by train_test_split, which can
# trigger pandas' SettingWithCopyWarning and mutates a frame shown earlier.
# 'class' is a Python keyword, hence the ** dict form.
data_test = data_test.assign(
    **{'class': [classify(classifier, get_features(name)) for name in data_test['name']]}
)
data_test[['name', 'sex', 'class']]

Unnamed: 0,name,sex,class
112312,Geoffrey,boy,boy
205089,Helen,girl,boy
76029,Anthony,boy,boy
24535,Valentine,boy,girl
51612,Ken,boy,boy
...,...,...,...
24723,Alois,boy,boy
244051,Brooke,girl,girl
255149,Liliana,girl,girl
88590,Efrain,boy,boy


# Точность

In [125]:
# Accuracy: share of test rows whose predicted class equals the true sex.
# The element-wise Series comparison replaces the Python-level zip loop;
# float() keeps the result a plain Python float. An empty test frame gives
# NaN here rather than a ZeroDivisionError.
accuracy = float((data_test['sex'] == data_test['class']).mean())
accuracy

0.7761498708010336