# Naive Bayes

In [1]:
import numpy as np

## Datos

In [2]:
# Toy training corpus: each entry is a (document text, class label) pair.
training = [
    ('chinese beijing chinese', 'zh'),
    ('chinese chinese shangai', 'zh'),
    ('chinese macao', 'zh'),
    ('tokyo japan chinese', 'ja'),
]

In [3]:
# Split the (text, label) pairs into two parallel lists: documents and their classes.
X_train = [text for text, _label in training]
y_train = [label for _text, label in training]

In [4]:
X_train

['chinese beijing chinese',
 'chinese chinese shangai',
 'chinese macao',
 'tokyo japan chinese']

In [5]:
# Class labels, in the order used when looping over the classes.
classes = ['zh', 'ja']

In [6]:
# Vocabulary: every distinct word of the training documents, in order of first appearance.
# Deriving it from X_train (instead of hard-coding the list) keeps the vocabulary and its
# size in sync with the training data; dict.fromkeys de-duplicates while preserving
# insertion order, so the result equals the previous literal list.
features = list(dict.fromkeys(word for text in X_train for word in text.split()))

## Priors

Probabilidad de cada clase usando máxima verosimilitud.

In [7]:
from collections import Counter
# Count how many training documents carry each class label.
class_counts = Counter(y_train)
class_counts

Counter({'zh': 3, 'ja': 1})

In [8]:
# Maximum-likelihood prior: fraction of the training documents that belong to each class.
n_docs = len(y_train)
prior_prob = {label: class_counts[label] / n_docs for label in classes}

In [9]:
prior_prob

{'zh': 0.75, 'ja': 0.25}

## Feature Probs

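Probabilidad de cada feature para cada clase usando máxima verosimilitud y suavizado "add-one": $P(w \mid c) = \dfrac{\mathrm{count}(w, c) + 1}{\sum_{w'} \mathrm{count}(w', c) + |V|}$, donde $|V|$ es el tamaño del vocabulario.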

In [10]:
X_train[0].split()

['chinese', 'beijing', 'chinese']

In [11]:
# Concatenate the tokens of every 'zh' training document into a single "mega document".
# A nested comprehension flattens in linear time, whereas sum(list_of_lists, []) builds
# a new, longer list on every addition (quadratic in the number of tokens).
zh_mega_doc = [word for text, cls in training if cls == 'zh' for word in text.split()]

In [12]:
zh_mega_doc

['chinese',
 'beijing',
 'chinese',
 'chinese',
 'chinese',
 'shangai',
 'chinese',
 'macao']

In [13]:
# Word frequencies inside the 'zh' mega document.
zh_counts = Counter(zh_mega_doc)
zh_counts

Counter({'chinese': 5, 'beijing': 1, 'shangai': 1, 'macao': 1})

In [14]:
# Vocabulary size |V|: the add-one smoothing term added to every class denominator.
V = len(features)

# cond_prob[c][f] = P(f | c) = (count(f, c) + 1) / (tokens in class c + |V|)
cond_prob = {}
for c in classes:
    # "Mega document" of class c: the tokens of all its training documents.
    # The nested comprehension flattens in linear time; sum(lists, []) is quadratic.
    mega_doc = [word for text, cls in training if cls == c for word in text.split()]
    counts = Counter(mega_doc)

    # The smoothed denominator is identical for every feature of this class.
    denom = len(mega_doc) + V

    # Counter yields 0 for words never seen in the class, so the +1 keeps
    # every conditional probability strictly positive.
    cond_prob[c] = {f: (counts[f] + 1) / denom for f in features}

In [15]:
cond_prob

{'zh': {'chinese': 0.42857142857142855,
  'beijing': 0.14285714285714285,
  'shangai': 0.14285714285714285,
  'macao': 0.14285714285714285,
  'tokyo': 0.07142857142857142,
  'japan': 0.07142857142857142},
 'ja': {'chinese': 0.2222222222222222,
  'beijing': 0.1111111111111111,
  'shangai': 0.1111111111111111,
  'macao': 0.1111111111111111,
  'tokyo': 0.2222222222222222,
  'japan': 0.2222222222222222}}

In [16]:
# Sanity check for P('chinese' | 'zh'): (5 + 1) / (8 + 6) = 6/14 = 3/7.
3/7

0.42857142857142855

## Predict

Dado un documento $d$, calculamos la probabilidad de cada clase $c$ para ese documento: $P(c \mid d) \propto P(c) \prod_{w \in d} P(w \mid c)$.

In [17]:
# Test document, split on whitespace into a list of word tokens.
doc = 'chinese chinese chinese tokyo japan'.split()

In [18]:
# Unnormalised score of class 'zh': its prior times P(word | zh) for every token of the document.
zh_likelihoods = cond_prob['zh']
zh_prob = prior_prob['zh']
for word in doc:
    zh_prob *= zh_likelihoods[word]

In [19]:
# Unnormalised score of class 'ja': its prior times P(word | ja) for every token of the document.
ja_likelihoods = cond_prob['ja']
ja_prob = prior_prob['ja']
for word in doc:
    ja_prob *= ja_likelihoods[word]

In [20]:
zh_prob, ja_prob

(0.00030121377997263036, 0.00013548070246744226)

In [21]:
# Normalise the two joint scores so that they add up to 1 (one probability per class).
evidence = zh_prob + ja_prob
zh_prob / evidence, ja_prob / evidence

(0.6897586117634673, 0.31024138823653263)

## Bag of Words

Representación vectorial usando "bolsa de palabras".

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer created with scikit-learn's default arguments.
vect = CountVectorizer()

In [23]:
vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [24]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# while its replacement get_feature_names_out() does not exist before 1.0. Reading the terms
# from the fitted vocabulary_ mapping (term -> column index), ordered by column index,
# returns the same list on every version.
sorted(vect.vocabulary_, key=vect.vocabulary_.get)

['beijing', 'chinese', 'japan', 'macao', 'shangai', 'tokyo']

In [25]:
# Vectorize the training documents with the fitted vocabulary.
X2 = vect.transform(X_train)

In [26]:
X2

<4x6 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [27]:
# Show the sparse count matrix densely: one row per training document,
# one column per vocabulary entry (column indices as in vect.vocabulary_).
X2.todense()

matrix([[1, 2, 0, 0, 0, 0],
        [0, 2, 0, 0, 1, 0],
        [0, 1, 0, 1, 0, 0],
        [0, 1, 1, 0, 0, 1]])

In [28]:
vect.vocabulary_

{'chinese': 1, 'beijing': 0, 'shangai': 4, 'macao': 3, 'tokyo': 5, 'japan': 2}

Vectorizar un nuevo documento:

In [29]:
# Same test sentence as in the manual prediction, now kept as a raw string
# (this rebinds `doc`, which previously held the token list).
doc = 'chinese chinese chinese tokyo japan'

In [30]:
X_test = vect.transform([doc])

In [31]:
X_test.todense()

matrix([[0, 3, 1, 0, 0, 1]])

## MNB usando Scikit-learn

In [32]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier created with default arguments.
mnb = MultinomialNB()

Entrenar:

In [33]:
mnb.fit(X2, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

Predecir:

In [34]:
mnb.predict(X_test)

array(['zh'], dtype='<U2')

Obtener probabilidades:

In [35]:
# Columns are ordered as in mnb.classes_ ('ja' first, then 'zh'), i.e. the
# reverse of the (zh, ja) order used in the manual computation.
mnb.predict_proba(X_test)

array([[0.31024139, 0.68975861]])

### Parámetros

In [36]:
mnb.classes_

array(['ja', 'zh'], dtype='<U2')

In [37]:
mnb.class_count_

array([1., 3.])

In [38]:
mnb.feature_count_

array([[0., 1., 1., 0., 0., 1.],
       [1., 5., 0., 1., 1., 0.]])

In [39]:
# numpy is already imported as `np` in the notebook's first cell, so the
# duplicate mid-notebook import is dropped here.
# Exponentiate the stored log-priors to recover the class probabilities.
np.exp(mnb.class_log_prior_)

array([0.25, 0.75])

In [40]:
# Per-class feature probabilities (log-probabilities exponentiated): rows are
# ordered as in mnb.classes_, columns as in the vectorizer's vocabulary.
np.exp(mnb.feature_log_prob_)

array([[0.11111111, 0.22222222, 0.22222222, 0.11111111, 0.11111111,
        0.22222222],
       [0.14285714, 0.42857143, 0.07142857, 0.14285714, 0.14285714,
        0.07142857]])

## Referencias

- [Naive Bayes classifier (Wikipedia)](https://en.wikipedia.org/wiki/Naive_Bayes_classifier)
- [Naive Bayes (scikit-learn)](https://scikit-learn.org/stable/modules/naive_bayes.html#naive-bayes)