In [1]:
import numpy as np

In [2]:
# Example dataset (movie classification): four training samples with three
# 0/1 features each, one class label per training sample, and a single
# sample to classify.

X_train = np.array([
    [0, 1, 1],
    [0, 0, 1],
    [0, 0, 0],
    [1, 1, 0],
])

# Y_train[i] is the class label of row i of X_train.
Y_train = ['Y', 'N', 'Y', 'Y']

X_test = np.array([[1, 1, 0]])

In [3]:
# a priori probability, step 1: group the sample indices by class label

def get_label_indices(labels):
    """Group sample positions by their class label.

    Args:
        labels: iterable of class labels, one per sample.

    Returns:
        A ``defaultdict(list)`` mapping each label to the list of positions
        (0-based, in increasing order) at which it occurs in ``labels``.
        Keys appear in order of first occurrence.
    """
    from collections import defaultdict

    positions_by_label = defaultdict(list)
    for position, label in enumerate(labels):
        # The defaultdict creates the empty list on first sight of a label.
        positions_by_label[label].append(position)
    return positions_by_label



In [4]:
# Group the positions of the training labels by class; this mapping is the
# input to the prior and likelihood computations in the following cells.
label_indices = get_label_indices(Y_train)
print('Indices: \n', label_indices)

Indices: 
 defaultdict(<class 'list'>, {'Y': [0, 2, 3], 'N': [1]})


In [5]:
def get_prior(indices):
    """Compute the a-priori probability of every class.

    The prior of a class is the fraction of labelled samples that belong
    to it.

    Args:
        indices: dict mapping each class label to the list of sample indices
            carrying that label (as returned by ``get_label_indices``).

    Returns:
        dict ``{label: prior}`` whose values sum to 1. An empty mapping
        yields an empty dict.
    """
    # Count from the argument itself rather than from a notebook-level
    # global, so the result depends only on what the caller passes in.
    counts = {label: len(sample_ids) for label, sample_ids in indices.items()}

    total_count = sum(counts.values())

    return {label: count / total_count for label, count in counts.items()}

In [6]:
# Class priors computed from the grouped training-label indices.
prior = get_prior(label_indices)
print('A priori: ', prior)

A priori:  {'Y': 0.75, 'N': 0.25}


In [7]:
# likelihood calculation

def get_likelihood(features, label_indices, smoothing=0):
    """Compute the per-class conditional probability of each feature.

    For every class, the column sums of ``features`` over that class's rows
    are increased by ``smoothing`` and divided by the number of rows in the
    class plus ``2 * smoothing``.

    Args:
        features: 2-D numpy array, indexed as ``features[rows, :]``.
        label_indices: dict mapping each class label to the list of row
            indices of ``features`` that belong to it.
        smoothing: value added to every per-class column sum (default 0,
            i.e. no smoothing).

    Returns:
        dict ``{label: 1-D array}`` with one entry per column of
        ``features``.
    """
    per_class = {}

    for label, rows in label_indices.items():
        smoothed_counts = features[rows, :].sum(axis=0) + smoothing
        denominator = len(rows) + 2 * smoothing
        per_class[label] = smoothed_counts / denominator

    return per_class

In [8]:
# With smoothing = 1 every per-class feature count is increased by one and
# the denominator by two, so on 0/1 features no likelihood is exactly 0 or 1.
smoothing = 1
likelihood = get_likelihood(X_train, label_indices, smoothing)
print('Likelihood: ', likelihood)

Likelihood:  {'Y': array([0.4, 0.6, 0.4]), 'N': array([0.33333333, 0.33333333, 0.66666667])}


In [9]:
# a posteriori calculation

def get_posterior(X, prior, likelihood):
    """Compute normalised a-posteriori class probabilities for each sample.

    Args:
        X: iterable of samples to classify; each sample is a sequence of
            truthy/falsy feature values.
        prior: dict ``{label: a-priori probability}``. It is not modified.
        likelihood: dict ``{label: per-feature probabilities}``, indexed by
            feature position.

    Returns:
        A list with one dict ``{label: a-posteriori probability}`` per
        sample in ``X``, in the same order.
    """
    results = []
    infinity = float('inf')

    for sample in X:
        # The posterior is proportional to prior * likelihood, so start from
        # a copy of the priors and multiply in one factor per feature.
        scores = prior.copy()

        for label, feature_probs in likelihood.items():
            for position, is_set in enumerate(sample):
                if is_set:
                    factor = feature_probs[position]
                else:
                    factor = 1 - feature_probs[position]
                scores[label] *= factor

        # Normalise so the scores of one sample sum to 1. The total is taken
        # once, before any score is rescaled.
        normaliser = sum(scores.values())

        for label in scores:
            if scores[label] == infinity:
                # An infinite score is mapped to 1.0 instead of being divided.
                scores[label] = 1.0
            else:
                scores[label] /= normaliser

        results.append(scores.copy())

    return results

In [10]:
# Posterior class probabilities for every row of X_test; the result is a
# list holding one {label: probability} dict per test sample.
posterior = get_posterior(X_test, prior, likelihood)
print('A posteriori: \n', posterior)

A posteriori: 
 [{'Y': 0.9210360075805433, 'N': 0.07896399241945673}]
