In [1]:
import numpy as np
import pandas as pd

In [2]:
# Toy "buys_computer" training set: four categorical attributes plus the
# class label in the last column, one tuple per training example.
column_names = ['age', 'income', 'student', 'credit_rating', 'buys_computer']

training_rows = [
    ('<=30', 'high', 'no', 'fair', 'no'),
    ('<=30', 'high', 'no', 'excellent', 'no'),
    ('31-40', 'high', 'no', 'fair', 'yes'),
    ('>40', 'medium', 'no', 'fair', 'yes'),
    ('>40', 'low', 'yes', 'fair', 'yes'),
    ('>40', 'low', 'yes', 'excellent', 'no'),
    ('31-40', 'low', 'yes', 'excellent', 'yes'),
    ('<=30', 'medium', 'no', 'fair', 'no'),
    ('<=30', 'low', 'yes', 'fair', 'yes'),
    ('>40', 'medium', 'yes', 'fair', 'yes'),
    ('<=30', 'medium', 'yes', 'excellent', 'yes'),
    ('31-40', 'medium', 'no', 'excellent', 'yes'),
    ('31-40', 'high', 'yes', 'fair', 'yes'),
    ('>40', 'medium', 'no', 'excellent', 'no'),
]

df_data = pd.DataFrame(data=training_rows, columns=column_names)

In [3]:
# Show the training DataFrame as this cell's output.
df_data

Unnamed: 0,age,income,student,credit_rating,buys_computer
0,<=30,high,no,fair,no
1,<=30,high,no,excellent,no
2,31-40,high,no,fair,yes
3,>40,medium,no,fair,yes
4,>40,low,yes,fair,yes
5,>40,low,yes,excellent,no
6,31-40,low,yes,excellent,yes
7,<=30,medium,no,fair,no
8,<=30,low,yes,fair,yes
9,>40,medium,yes,fair,yes


In [4]:
# The sample to classify: one value for each predictor attribute.

X = dict(
    age='31-40',
    income='low',
    student='yes',
    credit_rating='fair',
)

In [5]:
# Collect the distinct class labels present in the target column.

label_column = df_data['buys_computer']
target_unique = label_column.unique()

target_unique

array(['no', 'yes'], dtype=object)

In [6]:
# Tally how many training rows carry each class label.

buys_computer_list = df_data['buys_computer'].tolist()
target_count = {label: buys_computer_list.count(label) for label in target_unique}

target_count

{'no': 5, 'yes': 9}

In [7]:
# Class priors P(Ci): the share of training rows that belong to each class.

n_rows = len(df_data)
prior = {label: count / n_rows for label, count in target_count.items()}

prior

{'no': 0.35714285714285715, 'yes': 0.6428571428571429}

In [8]:
# Estimate P(x_k | Ci) for every attribute value of X and every class Ci.
# When the observed value never occurs together with at least one class,
# add-one (Laplace) smoothing is applied to that attribute for ALL classes.

likelihood = {}

for feature, observed in X.items():

    # How often the observed value appears among the rows of each class.
    matches = {}
    for label in target_unique:
        in_class = df_data.loc[df_data['buys_computer'] == label, feature]
        matches[label] = in_class.tolist().count(observed)

    needs_smoothing = any(n == 0 for n in matches.values())
    # Number of distinct values this attribute takes in the training data;
    # used as the smoothing term in the denominator.
    n_categories = len(df_data[feature].unique())

    for label in target_unique:
        entry = feature + ' : ' + observed + '|' + label
        if needs_smoothing:
            likelihood[entry] = (matches[label] + 1) / (target_count[label] + n_categories)
        else:
            likelihood[entry] = matches[label] / target_count[label]

likelihood

{'age : 31-40|no': 0.125,
 'age : 31-40|yes': 0.4166666666666667,
 'income : low|no': 0.2,
 'income : low|yes': 0.3333333333333333,
 'student : yes|no': 0.2,
 'student : yes|yes': 0.6666666666666666,
 'credit_rating : fair|no': 0.4,
 'credit_rating : fair|yes': 0.6666666666666666}

In [9]:
# Combine the per-attribute likelihoods into one P(X|Ci) per class by
# multiplying them (the naive conditional-independence assumption).

# One accumulator per class label, taken from the data instead of being
# hard-coded to 'no' / 'yes', so additional classes are not silently dropped.
overall_likelihood = {'X|' + label: 1 for label in target_unique}

for key in likelihood:
    # Keys have the form '<attribute> : <value>|<label>'. Take the text after
    # the LAST '|' as the label; a substring test such as "'|no' in key" would
    # also match labels that merely start with 'no'.
    label = key.rsplit('|', 1)[1]
    overall_likelihood['X|' + label] *= likelihood[key]

overall_likelihood

{'X|no': 0.0020000000000000005, 'X|yes': 0.06172839506172839}

In [10]:
# Unnormalised posterior for every class: P(Ci|X) is proportional to
# P(X|Ci) * P(Ci). The shared evidence term P(X) is omitted because it does
# not affect which class scores highest.

posterior = {}

for target in target_unique:
    # Look the class up by its exact key. The substring test "target in key"
    # could pair a label with another class's entry (e.g. 'no' is contained in
    # 'X|unknown') and overwrite the score with the wrong likelihood.
    class_key = 'X|' + target
    if class_key in overall_likelihood:
        posterior[target] = overall_likelihood[class_key] * prior[target]

posterior

{'no': 0.0007142857142857145, 'yes': 0.03968253968253968}

In [11]:
# Predict the class with the highest posterior score.
# A bare max(posterior) compares the dictionary KEYS (the label strings)
# alphabetically and ignores the scores, so the maximum is selected by value.
pred_target = max(posterior, key=posterior.get)

# Message is in Indonesian: "The Naive Bayes prediction is <LABEL>".
print('Prediksi hasil Naive Bayes adalah ' + pred_target.upper())

Prediksi hasil Naive Bayes adalah YES
