In [1]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [10]:
# Load the names dataset from names.csv (path is relative to the notebook's
# working directory) and show the resulting frame as the cell output.
data = pd.read_csv("names.csv")
data

Unnamed: 0,year,name,percent,sex
0,1880,John,0.081541,boy
1,1880,William,0.080511,boy
2,1880,James,0.050057,boy
3,1880,Charles,0.045167,boy
4,1880,George,0.043292,boy
...,...,...,...,...
257995,2008,Carleigh,0.000128,girl
257996,2008,Iyana,0.000128,girl
257997,2008,Kenley,0.000127,girl
257998,2008,Sloane,0.000127,girl


In [12]:
# Split the data set into a training part and a testing part (the split rule
# is up to you - for example, 70% of the data for training and 30% for testing).
# A fixed random_state makes the sample reproducible across kernel restarts.
# NOTE(review): the 70% sample is bound to the name `test`, although the task
# text describes the 70% portion as the training set - confirm the intended
# naming. The complementary 30% is presumably `data.drop(test.index)`, assuming
# the index labels of `data` are unique.
test = data.sample(frac=0.7, random_state=42)
test

Unnamed: 0,year,name,percent,sex
212735,1963,Candi,0.000098,girl
165951,1916,Claribel,0.000048,girl
192728,1943,Katheryn,0.000079,girl
234366,1985,Clarissa,0.000344,girl
8111,1888,Warren,0.001139,boy
...,...,...,...,...
159720,1910,Zada,0.000083,girl
194008,1945,Judith,0.014996,girl
122964,2002,Dontae,0.000080,boy
172519,1923,Adell,0.000146,girl


In [32]:
from collections import defaultdict
from math import log


def train(samples):
    """Estimate naive Bayes parameters from labelled samples.

    Args:
        samples: iterable of ``(feats, label)`` pairs. ``feats`` is itself an
            iterable of hashable features (a one-character string yields one
            feature). Any iterable is accepted, including generators: the
            samples are consumed in a single pass.

    Returns:
        A tuple ``(classes, freq)`` of ``defaultdict`` objects:
        ``classes[label]`` is the share of samples carrying ``label`` (P(C)),
        and ``freq[label, feat]`` is the number of times ``feat`` occurred
        with ``label`` divided by the number of samples of that label
        (P(O|C)). Both are empty when ``samples`` is empty.
    """
    classes, freq = defaultdict(int), defaultdict(int)  # class and feature counters
    total = 0  # counted here instead of len(samples) so iterators work too

    # Fill the counters: one pass over (features, label) pairs.
    for feats, label in samples:
        total += 1
        classes[label] += 1  # count class occurrences
        for feat in feats:
            freq[label, feat] += 1  # count feature occurrences per class

    # Turn counts into relative frequencies. Feature counts must be normalized
    # first, while `classes` still holds raw counts.
    for label, feat in freq:
        freq[label, feat] /= classes[label]
    for label in classes:
        classes[label] /= total

    return classes, freq  # P(C) and P(O|C)


def classify(classifier, feats, unseen_prob=1e-7):
    """Pick the most probable class for ``feats`` under a naive Bayes model.

    Args:
        classifier: tuple ``(classes, prob)`` where ``classes`` maps each
            label to P(C) and ``prob`` maps ``(label, feat)`` to P(O|C).
        feats: iterable of features of the object being classified.
        unseen_prob: probability used for a ``(label, feat)`` pair that is
            absent from ``prob``. Without it ``prob.get`` returns ``None``
            (``dict.get`` does not trigger a defaultdict factory) and
            ``log`` raises ``TypeError``.

    Returns:
        The label minimizing ``-log P(C) - sum(log P(O|C))``, which is the
        label maximizing the posterior.

    Raises:
        ValueError: if ``classes`` is empty (raised by ``min``) or if a
            stored probability is 0 (raised by ``log``).
    """
    classes, prob = classifier

    def cost(cl):
        # Negative log of P(C) * prod(P(O|C)); summing logs avoids underflow.
        return -log(classes[cl]) + sum(
            -log(prob.get((cl, feat), unseen_prob)) for feat in feats
        )

    return min(classes.keys(), key=cost)  # argmin(-log P(C|O)) == argmax P(C|O)


def get_features(sample: str) -> str:
    """Return the feature string of a name: its last letter.

    Slicing (``[-1:]``) rather than indexing keeps the result identical for
    non-empty names while returning ``""`` (no features) for an empty name
    instead of raising ``IndexError``.
    """
    return sample[-1:]  # last letter, or "" when the name is empty

In [39]:
# Toy labelled names: (name, label) pairs with "F"/"M" labels.
samples = [
    ("Anna", "F"),
    ("Michael", "M"),
    ("Jane", "F"),
    ("Ilya", "M"),
    ("Anthony", "M"),
]
# Replace every name with its extracted features, keeping the label.
features = [(get_features(name), sex) for name, sex in samples]
# Fit the classifier on the (features, label) pairs.
classifier = train(features)

In [40]:
# Display the raw (name, label) pairs.
samples

[('Anna', 'F'),
 ('Michael', 'M'),
 ('Jane', 'F'),
 ('Ilya', 'M'),
 ('Anthony', 'M')]

In [41]:
# Display the (feature, label) pairs produced by get_features.
features

[('a', 'F'), ('l', 'M'), ('e', 'F'), ('a', 'M'), ('y', 'M')]

In [42]:
# Display the value returned by train(features).
classifier

(defaultdict(<function __main__.train.<locals>.<lambda>()>,
             {'F': 0.4, 'M': 0.6}),
 defaultdict(<function __main__.train.<locals>.<lambda>()>,
             {('F', 'a'): 0.5,
              ('M', 'l'): 0.3333333333333333,
              ('F', 'e'): 0.5,
              ('M', 'a'): 0.3333333333333333,
              ('M', 'y'): 0.3333333333333333}))