### Play tennis classifier implementation


In [1]:
import numpy as np

In [2]:
def create_train_data():
    """Build the toy "play tennis" training table.

    Returns:
        np.ndarray: string array of shape (10, 5). Each row holds four
        categorical feature values followed by the 'yes'/'no' label in the
        last column.
    """
    features = [
        ['Sunny', 'Hot', 'High', 'Weak'],
        ['Sunny', 'Hot', 'High', 'Strong'],
        ['Overcast', 'Hot', 'High', 'Weak'],
        ['Rain', 'Mild', 'High', 'Weak'],
        ['Rain', 'Cool', 'Normal', 'Weak'],
        ['Rain', 'Cool', 'Normal', 'Strong'],
        ['Overcast', 'Cool', 'Normal', 'Strong'],
        ['Overcast', 'Mild', 'High', 'Weak'],
        ['Sunny', 'Cool', 'Normal', 'Weak'],
        ['Rain', 'Mild', 'Normal', 'Weak'],
    ]
    labels = ['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no', 'yes', 'yes']
    # Re-attach each label as the final column of its feature row.
    return np.array([row + [label] for row, label in zip(features, labels)])

In [3]:
# Build the play-tennis table once; the exploration cells that follow read this global.
train_data = create_train_data()

In [4]:
# Keep only the rows labelled 'yes', then show where 'Overcast' occurs among them.
yes_data = train_data[train_data[:, -1] == 'yes']
np.nonzero(yes_data[:, 0] == 'Overcast')

(array([0, 3], dtype=int64),)

In [5]:
def compute_prior_proability(train_data):
    """Estimate the prior probability of each class label.

    Args:
        train_data: 2-D array whose last column holds the class label.

    Returns:
        np.ndarray: relative frequency of every distinct label, ordered by
        the sorted unique labels as returned by ``np.unique``.
    """
    labels = train_data[:, -1]
    _, label_counts = np.unique(labels, return_counts=True)
    return label_counts / label_counts.sum()

In [6]:
prior_probability = compute_prior_proability(train_data)
print(prior_probability)
# np.unique sorts the labels, so index 0 corresponds to 'no' and index 1 to 'yes'.
print("P(play tennis = No)", prior_probability[0])
print("P(play tennis = Yes)", prior_probability[1])

[0.4 0.6]
P(play tennis = No) 0.4
P(play tennis = Yes) 0.6


In [7]:
def compute_conditional_probability(train_data):
    """Estimate P(feature value | class) for every feature column.

    Args:
        train_data: 2-D string array; the last column holds the class label
            ('no' or 'yes'), every other column is a categorical feature.

    Returns:
        tuple: ``(conditional_probability, list_x_name)`` where

        * ``conditional_probability[i]`` is a float array of shape
          ``(2, n_values_i)``; row 0 is the 'no' class, row 1 the 'yes'
          class, and column ``k`` corresponds to ``list_x_name[i][k]``.
        * ``list_x_name[i]`` is the sorted array of distinct values found in
          feature column ``i``.

    A class with no rows in ``train_data`` gets an all-zero row instead of
    triggering a division by zero.
    """
    y_unique = ['no', 'yes']
    labels = train_data[:, -1]
    conditional_probability = []
    list_x_name = []
    for i in range(train_data.shape[1] - 1):
        x_unique = np.unique(train_data[:, i])
        list_x_name.append(x_unique)

        # One row per class, one column per distinct value of this feature.
        table = np.zeros((len(y_unique), len(x_unique)))
        for row, label in enumerate(y_unique):
            # Values of feature i restricted to the rows of this class.
            class_values = train_data[labels == label, i]
            if len(class_values) == 0:
                # No sample of this class: leave the probabilities at zero.
                continue
            for col, x_name in enumerate(x_unique):
                matches = np.count_nonzero(class_values == x_name)
                table[row, col] = matches / len(class_values)
        conditional_probability.append(table)
    return conditional_probability, list_x_name

In [8]:
# Only the per-feature value lists are needed here; the probability tables are discarded.
_, list_x_name = compute_conditional_probability(train_data)
print('x1 = ', list_x_name[0])
print('x2 = ', list_x_name[1])
print('x3 = ', list_x_name[2])
print('x4 = ', list_x_name[3])

x1 =  ['Overcast' 'Rain' 'Sunny']
x2 =  ['Cool' 'Hot' 'Mild']
x3 =  ['High' 'Normal']
x4 =  ['Strong' 'Weak']


In [9]:
def get_index_from_value(feature_name, list_features):
    """Return the position of ``feature_name`` inside ``list_features``.

    Args:
        feature_name: the value to look up.
        list_features: array or plain sequence of candidate values.

    Returns:
        The index of the first element equal to ``feature_name``.

    Raises:
        IndexError: if ``feature_name`` does not occur in ``list_features``.
    """
    # np.asarray lets plain Python lists be compared element-wise; atleast_1d
    # guards against the comparison collapsing to a scalar.
    mask = np.atleast_1d(np.asarray(list_features) == feature_name)
    matches = np.nonzero(mask)[0]
    if matches.size == 0:
        raise IndexError(
            "{!r} not found in {!r}".format(feature_name, list(list_features)))
    return matches[0]

In [10]:
# list_x_name[0] holds the distinct values of the first feature column.
outlook = list_x_name[0]
# Look up the column index of each value in that sorted list.
i1 = get_index_from_value('Overcast', outlook)
i2 = get_index_from_value('Rain', outlook)
i3 = get_index_from_value('Sunny', outlook)
print(i1, i2, i3)

0 1 2


In [11]:
train_data = create_train_data()
conditional_probability, list_x_name = compute_conditional_probability(train_data)
# Column index of 'Sunny' in the first feature's table; row 1 is the 'yes' class.
x1 = get_index_from_value('Sunny', list_x_name[0])
print("P('Outlook'= 'Sunny'| Play Tennis'= 'Yes') = ",
      np.round(conditional_probability[0][1, x1], 2))

P('Outlook'= 'Sunny'| Play Tennis'= 'Yes') =  0.17


In [12]:
# Row 0 of the first feature's table is the 'no' class.
print("P('Outlook'= 'Sunny'| Play Tennis'= 'No') = ",
      np.round(conditional_probability[0][0, x1], 2))

P('Outlook'= 'Sunny'| Play Tennis'= 'No') =  0.5


In [13]:
def train_naive_bayes(train_data):
    """Fit the categorical naive Bayes model on ``train_data``.

    Args:
        train_data: 2-D string array; the last column is the class label.

    Returns:
        tuple: ``(prior_probability, conditional_probability, list_x_name)``
        exactly as produced by ``compute_prior_proability`` and
        ``compute_conditional_probability``.
    """
    # Step 1: prior probability of each class.
    prior_probability = compute_prior_proability(train_data)

    # Step 2: per-feature conditional probability tables and their value lists.
    conditional_probability, list_x_name = compute_conditional_probability(
        train_data)
    return prior_probability, conditional_probability, list_x_name

In [14]:
# ###################
# Prediction
# ###################
def prediction_play_tennis(X, list_x_name, prior_probability, conditional_probability):
    """Predict the class index (0 = 'no', 1 = 'yes') for one sample.

    Args:
        X: sequence of categorical feature values, one per feature table.
        list_x_name: per-feature arrays of known values; ``list_x_name[i]``
            indexes the columns of ``conditional_probability[i]``.
        prior_probability: sequence with the prior of class 0 and class 1.
        conditional_probability: list of arrays; ``[i][c, k]`` is the
            probability of value ``k`` of feature ``i`` given class ``c``.

    Returns:
        int: 0 when the class-0 score is strictly larger, otherwise 1.

    Note:
        No smoothing is applied, so a value that never occurred with a class
        in training zeroes that class's score.
    """
    p0 = prior_probability[0]
    p1 = prior_probability[1]
    # Multiply in the likelihood of each observed feature value, feature by
    # feature, for however many feature tables the model contains.
    for i, table in enumerate(conditional_probability):
        value_index = get_index_from_value(X[i], list_x_name[i])
        p0 = p0 * table[0, value_index]
        p1 = p1 * table[1, value_index]

    # Ties go to class 1, matching the strict comparison.
    return 0 if p0 > p1 else 1

In [15]:
X = ['Sunny', 'Cool', 'High', 'Strong']
data = create_train_data()
prior_probability, conditional_probability, list_x_name_2 = train_naive_bayes(
    data)
# Pass the value lists returned by this training call, so the cell does not
# depend on a `list_x_name` global left over from an earlier cell.
pred = prediction_play_tennis(
    X, list_x_name_2, prior_probability, conditional_probability)

if (pred):
    print("Ad should go!")
else:
    print("Ad should not go!")

Ad should not go!


### Iris classifier implementation


In [1]:
import numpy as np

In [10]:
def create_train_data(path='./iris.data.txt'):
    """Load a comma-separated data file as an array of strings.

    Args:
        path: location of the data file. Defaults to ``'./iris.data.txt'``,
            the path this notebook has always read.

    Returns:
        np.ndarray: 2-D array of ``str`` with one row per line of the file.
        Numeric columns are kept as text and must be converted by the caller.
    """
    return np.loadtxt(path, dtype=str, delimiter=',')

In [12]:
# Rebinds `train_data` to the iris table (the tennis table is no longer referenced by this name).
train_data = create_train_data()
# NOTE(review): this displays the whole array; a slice such as train_data[:5] would keep the output short.
train_data

array([['5.1', '3.5', '1.4', '0.2', 'Iris-setosa'],
       ['4.9', '3.0', '1.4', '0.2', 'Iris-setosa'],
       ['4.7', '3.2', '1.3', '0.2', 'Iris-setosa'],
       ['4.6', '3.1', '1.5', '0.2', 'Iris-setosa'],
       ['5.0', '3.6', '1.4', '0.2', 'Iris-setosa'],
       ['5.4', '3.9', '1.7', '0.4', 'Iris-setosa'],
       ['4.6', '3.4', '1.4', '0.3', 'Iris-setosa'],
       ['5.0', '3.4', '1.5', '0.2', 'Iris-setosa'],
       ['4.4', '2.9', '1.4', '0.2', 'Iris-setosa'],
       ['4.9', '3.1', '1.5', '0.1', 'Iris-setosa'],
       ['5.4', '3.7', '1.5', '0.2', 'Iris-setosa'],
       ['4.8', '3.4', '1.6', '0.2', 'Iris-setosa'],
       ['4.8', '3.0', '1.4', '0.1', 'Iris-setosa'],
       ['4.3', '3.0', '1.1', '0.1', 'Iris-setosa'],
       ['5.8', '4.0', '1.2', '0.2', 'Iris-setosa'],
       ['5.7', '4.4', '1.5', '0.4', 'Iris-setosa'],
       ['5.4', '3.9', '1.3', '0.4', 'Iris-setosa'],
       ['5.1', '3.5', '1.4', '0.3', 'Iris-setosa'],
       ['5.7', '3.8', '1.7', '0.3', 'Iris-setosa'],
       ['5.1

In [38]:
def compute_prior_probability(train_data):
    """Estimate the prior probability of each class label.

    Args:
        train_data: 2-D array whose last column holds the class label.

    Returns:
        np.ndarray: relative frequency of each distinct label, in the sorted
        label order produced by ``np.unique``.
    """
    _, class_counts = np.unique(train_data[:, -1], return_counts=True)
    n_rows = np.sum(class_counts)
    return class_counts / n_rows

In [39]:
# Class priors of the loaded rows, in sorted-label (np.unique) order.
compute_prior_probability(train_data)

array([0.33333333, 0.33333333, 0.33333333])

In [85]:
def compute_mean_std_features_uniques(train_data):
    """Compute the per-class mean and standard deviation of every feature.

    Args:
        train_data: 2-D string array; the last column is the class label and
            every other column holds values convertible to ``float``.

    Returns:
        list[np.ndarray]: one array per feature column, of shape
        ``(n_classes, 2)``. Row ``j`` is ``[mean, std]`` of that feature over
        the rows of the ``j``-th sorted unique label. ``np.std`` is called
        with its defaults.
    """
    labels = train_data[:, -1]
    classes = np.unique(labels)
    n_features = train_data.shape[1] - 1

    mean_std_features = []
    for col in range(n_features):
        stats = np.zeros((len(classes), 2))
        for row, cls in enumerate(classes):
            # Values of this feature for the current class, as floats.
            values = train_data[:, col][np.where(labels == cls)].astype(float)
            stats[row] = [np.mean(values), np.std(values)]
        mean_std_features.append(stats)
    return mean_std_features

In [93]:
# One (n_classes, 2) array per feature column; each row is [mean, std] for one class.
mean_std_features = compute_mean_std_features_uniques(train_data)
mean_std_features

[array([[5.006     , 0.34894699],
        [5.936     , 0.51098337],
        [6.588     , 0.62948868]]),
 array([[3.418     , 0.37719491],
        [2.77      , 0.31064449],
        [2.974     , 0.31925538]]),
 array([[1.464     , 0.17176728],
        [4.26      , 0.46518813],
        [5.552     , 0.54634787]]),
 array([[0.244     , 0.10613199],
        [1.326     , 0.19576517],
        [2.026     , 0.27188968]])]

In [87]:
def train_gaussian_naive_bayes(train_data):
    """Fit the Gaussian naive Bayes parameters on ``train_data``.

    Returns:
        tuple: ``(prior_probability, mean_std_features)`` as returned by
        ``compute_prior_probability`` and
        ``compute_mean_std_features_uniques`` respectively.
    """
    return (
        compute_prior_probability(train_data),
        compute_mean_std_features_uniques(train_data),
    )

In [88]:
def gauss(x, mean, std):
    """Evaluate the normal probability density at ``x``.

    Args:
        x: point at which the density is evaluated.
        mean: mean of the distribution.
        std: standard deviation of the distribution.

    Returns:
        The density ``1 / (std * sqrt(2*pi)) * e ** (-(x - mean)**2 / (2 * std**2))``.
    """
    normalizer = 1 / (std * np.sqrt(2 * np.pi))
    exponent = -(x - mean) ** 2 / (2 * std ** 2)
    return normalizer * (np.e ** exponent)

In [116]:
def prediction_iris(X, prior_probability, mean_std_features):
    """Return the index of the class with the largest naive Bayes score.

    Args:
        X: sequence of numeric feature values, one per entry of
            ``mean_std_features``.
        prior_probability: prior of each class.
        mean_std_features: list of arrays; ``[j][i]`` is ``[mean, std]`` of
            feature ``j`` for class ``i``.

    Returns:
        Index (as given by ``np.argmax``) of the highest-scoring class.
    """
    n_classes = len(mean_std_features[0])
    posteriors = np.zeros(len(prior_probability))
    for cls in range(n_classes):
        # Start from the prior and multiply in each feature's Gaussian likelihood.
        score = prior_probability[cls]
        for feat, value in enumerate(X):
            mean, std = mean_std_features[feat][cls]
            score *= gauss(value, mean, std)
        posteriors[cls] = score
    return np.argmax(posteriors)

In [117]:
# Example 1
# X =[ sepal length , sepal width , petal length , petal width ]
X = [6.3, 3.3, 6.0, 2.5]
train_data = create_train_data()
y_unique = np.unique(train_data[:, 4])
prior_probability, mean_std_features = train_gaussian_naive_bayes(train_data)
pred = y_unique[prediction_iris(X, prior_probability, mean_std_features)]
print(pred)
assert pred == "Iris-virginica"

Iris-virginica


In [118]:
# Example 2 #########################
# X =[ sepal length , sepal width , petal length , petal width ]
X = [5.0, 2.0, 3.5, 1.0]
train_data = create_train_data()
y_unique = np.unique(train_data[:, 4])
prior_probability, mean_std_features = train_gaussian_naive_bayes(train_data)
pred = y_unique[prediction_iris(X, prior_probability, mean_std_features)]
assert pred == "Iris-versicolor"

In [120]:
# Example 3 #########################
X = [4.9, 3.1, 1.5, 0.1]
# X =[ sepal length , sepal width , petal length , petal width ]
train_data = create_train_data()
y_unique = np . unique(train_data[:, 4])
prior_probability, mean_std_features = train_gaussian_naive_bayes(train_data)
pred = y_unique[prediction_iris(X, prior_probability, mean_std_features)]
assert pred == "Iris-setosa"