# Module 3: Naive Bayes' Classification

In [1]:
import pandas as pd
import numpy as np

## Part 1

In [2]:
# Toy dataset shown in class.
# Each row is [label, f1, f2]: label is 'M' (mammal) or 'R' (reptile),
# f1 and f2 are binary features.
data = [
    ['M', 1, 0], ['M', 1, 1],
    ['R', 1, 1], ['R', 1, 1],
    ['M', 0, 1], ['M', 1, 1],
    ['R', 0, 1], ['R', 1, 0], ['R', 0, 0],
]

df_MR = pd.DataFrame(data=data, columns=['label', 'f1', 'f2'])

In [3]:
df_MR

Unnamed: 0,label,f1,f2
0,M,1,0
1,M,1,1
2,R,1,1
3,R,1,1
4,M,0,1
5,M,1,1
6,R,0,1
7,R,1,0
8,R,0,0


In [4]:
# function to compute P(M|f1)

def get_prob_M_f1(df, f1):
    """Return P(M | f1): the probability of a mammal given the value of feature f1.

    Parameters
    ----------
    df : DataFrame with a `label` column ('M' or 'R') and an `f1` column.
    f1 : value of feature f1 to condition on.

    Returns
    -------
    float
        Posterior probability of label 'M', obtained with Bayes' formula
        from frequencies counted in `df`.
    """
    # row selectors
    is_mammal = df.label == 'M'
    is_reptile = df.label == 'R'
    has_f1 = df.f1 == f1

    # raw counts, as plain Python ints so the divisions below are ordinary float divisions
    total = len(df)
    mammals = int(is_mammal.sum())
    reptiles = int(is_reptile.sum())
    mammals_with_f1 = int((is_mammal & has_f1).sum())
    reptiles_with_f1 = int((is_reptile & has_f1).sum())

    # prior * likelihood for each label
    joint_M = (mammals / total) * (mammals_with_f1 / mammals)
    joint_R = (reptiles / total) * (reptiles_with_f1 / reptiles)

    # normalise by the total evidence P(f1)
    return joint_M / (joint_M + joint_R)

In [5]:
get_prob_M_f1(df_MR, 1)

0.5

In [6]:
get_prob_M_f1(df_MR, 0)

0.33333333333333326

# Calculations by hand

Q1: Consider the case of only one feature, f2 = 0. What is the probability that this is a reptile?

Hypothesis (H): This animal is a reptile (R)
Evidence (E): f2 = 0 

P(H) = 5/9
P(E|H) = 2/5
P(¬H) = 1 - P(H) = 4/9
P(E|¬H) = 1/4

P(H|E) = P(H)·P(E|H) / (P(H)·P(E|H) + P(¬H)·P(E|¬H)) = (5/9 · 2/5) / (5/9 · 2/5 + 4/9 · 1/4) = (2/9) / (2/9 + 1/9) = 2/3

Q2. In the case where you have two feature values, (f1 = 1, f2 = 0), what is the probability that this comes from a reptile?

Hypothesis (H): This animal is a reptile (R)
Evidence (E): f1 = 1 and f2 = 0 

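P(H) = 5/9
P(E|H) = 1/5
P(¬H) = 1 - P(H) = 4/9
P(E|¬H) = 1/4

P(H|E) = P(H)·P(E|H) / (P(H)·P(E|H) + P(¬H)·P(E|¬H)) = (5/9 · 1/5) / (5/9 · 1/5 + 4/9 · 1/4) = (1/9) / (1/9 + 1/9) = 1/2

Note: here P(E|H) and P(E|¬H) are estimated from the joint counts of rows with f1 = 1 and f2 = 0. Under the naive Bayes independence assumption one would instead use P(E|H) = P(f1=1|H)·P(f2=0|H) = 3/5 · 2/5 = 6/25 and P(E|¬H) = P(f1=1|¬H)·P(f2=0|¬H) = 3/4 · 1/4 = 3/16, which gives P(H|E) = (2/15) / (2/15 + 1/12) = 8/13.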

## <font color = 'green'> Python coding </font>

Q3. Write an equivalent function, get_prob_R_f1(df, f1) to get the probability of a reptile

In [7]:
# function to compute P(R|f1)

def get_prob_R_f1(df, f1):
    """Return P(R | f1): the probability of a reptile given the value of feature f1.

    Parameters
    ----------
    df : DataFrame with a `label` column ('M' or 'R') and an `f1` column.
    f1 : value of feature f1 to condition on.

    Returns
    -------
    float
        Posterior probability of label 'R', obtained with Bayes' formula
        from frequencies counted in `df`.
    """
    total = len(df)
    mammal_rows = df.label == 'M'
    reptile_rows = df.label == 'R'
    feature_rows = df.f1 == f1

    # class sizes and class sizes restricted to the requested feature value
    # (converted to Python ints so the arithmetic below is plain float division)
    mammals = int(mammal_rows.sum())
    reptiles = int(reptile_rows.sum())
    mammals_hit = int((mammal_rows & feature_rows).sum())
    reptiles_hit = int((reptile_rows & feature_rows).sum())

    # P(label) * P(f1 | label)
    weight_M = (mammals / total) * (mammals_hit / mammals)
    weight_R = (reptiles / total) * (reptiles_hit / reptiles)

    # Bayes' formula: reptile share of the total evidence
    return weight_R / (weight_R + weight_M)

In [8]:
get_prob_R_f1(df_MR, 0)

0.6666666666666666

In [9]:
get_prob_R_f1(df_MR, 1)

0.5

In [10]:
# function to compute P(R|f2)

def get_prob_R_f2(df, f2):
    """Return P(R | f2): the probability of a reptile given the value of feature f2.

    Parameters
    ----------
    df : DataFrame with a `label` column ('M' or 'R') and an `f2` column.
    f2 : value of feature f2 to condition on.

    Returns
    -------
    float
        Posterior probability of label 'R', obtained with Bayes' formula
        from frequencies counted in `df`.
    """
    total = len(df)
    feature_rows = df.f2 == f2

    # For each label store prior * likelihood = P(label) * P(f2 | label)
    weight = {}
    for lab in ('M', 'R'):
        in_class = df.label == lab
        class_size = int(in_class.sum())
        class_hits = int((in_class & feature_rows).sum())
        weight[lab] = (class_size / total) * (class_hits / class_size)

    # Bayes' formula: reptile share of the total evidence
    return weight['R'] / (weight['R'] + weight['M'])

In [11]:
get_prob_R_f2(df_MR, 0) # Equal to the result of Q1: 2/3

0.6666666666666666

Q4: Write a function get_prob_MR(df,f1,f2) to compute the ratio of the probabilities of the two labels, as expressed in equation (6). This function should return 'M' or 'R', depending on the value of the ratio.

In [12]:
import fractions

# function to classify a sample as 'M' or 'R' by comparing P(M|f1,f2) with P(R|f1,f2)
def classify_MR(df, f1, f2):
    """Classify a sample with feature values (f1, f2) as mammal or reptile.

    For each label the unnormalised posterior P(label) * P(f1, f2 | label) is
    estimated from the row counts in `df`; the label with the larger value wins.
    Comparing the two values directly (instead of dividing one by the other)
    avoids a ZeroDivisionError when no reptile row has the requested (f1, f2)
    pair, or when one of the classes is absent from `df`.

    NOTE(review): the likelihood is the *joint* frequency of (f1, f2) within a
    class, not the factorised P(f1 | label) * P(f2 | label) of naive Bayes --
    confirm against equation (6) of the course notes.

    Parameters
    ----------
    df : DataFrame with columns `label` ('M' or 'R'), `f1` and `f2`.
    f1, f2 : feature values of the sample to classify.

    Returns
    -------
    str
        'M' if the mammal posterior is larger, 'R' if the reptile posterior is
        larger, and 'Equal probabilities of M and R' when they are equal
        (including the case where neither class contains the pair).
    """
    # total number of datapoints
    n_all = df.shape[0]

    # rows whose features match the sample being classified
    matches = (df.f1 == f1) & (df.f2 == f2)

    def posterior_numerator(label):
        """Exact P(label) * P(f1, f2 | label); 0 when the class has no rows."""
        in_class = df.label == label
        n_class = int(in_class.sum())
        if n_all == 0 or n_class == 0:
            # an absent class cannot explain the sample
            return fractions.Fraction(0)
        prior = fractions.Fraction(n_class, n_all)
        likelihood = fractions.Fraction(int((in_class & matches).sum()), n_class)
        return prior * likelihood

    score_M = posterior_numerator('M')
    score_R = posterior_numerator('R')

    # score_M > score_R is equivalent to ratio > 1 whenever the ratio is defined
    if score_M > score_R:
        return 'M'
    elif score_M < score_R:
        return 'R'
    # In the case of equal probabilities of two labels
    else:
        return 'Equal probabilities of M and R'

In [13]:
classify_MR(df_MR, 1, 0)

'Equal probabilities of M and R'

In [14]:
classify_MR(df_MR, 0, 0)

'R'

In [15]:
classify_MR(df_MR, 0, 1)

'Equal probabilities of M and R'

In [16]:
classify_MR(df_MR, 1, 1)

'Equal probabilities of M and R'

Q5. Are there cases where the predicted label for a given pair of (f1, f2) does not match the actual label in the dataframe? If so, why? If not, is that always guaranteed?

Due to the equal probabilities of the two labels in some cases, misclassifications can occur if we define the function to randomly select one of the labels. In the function above, I specify that the return value for these cases is "Equal probabilities of M and R".

For the other case (f1 = f2 = 0), the predicted label matches the actual one in the DataFrame. It is not always guaranteed, especially with more data, because the predicted result is based solely on the label with the higher probability, not a 100% probability.

## Part 2

In [17]:
# Q6
# Load the credit-card approvals data from a CSV file located via a relative path
# (resolved against the notebook's working directory), then display it.
df_cc = pd.read_csv('cc_approvals-1-1.csv')
df_cc

Unnamed: 0,Income,CreditScore,ApprovalStatus
0,1,1,1
1,1,0,1
2,0,1,1
3,0,0,1
4,0,0,1
...,...,...,...
684,0,0,0
685,1,1,0
686,0,0,0
687,1,0,0


Q7. What are all the possibilities of values of Income and CreditScore that a person can have? Write them in a table. How many distinct pairs of values are there?

In [18]:
# Income and CreditScore are both binary, so there are 2 x 2 = 4 distinct
# (Income, CreditScore) pairs in total.
data = [[1, 0], [1, 1], [0, 1], [0, 0]]

df_Q7 = pd.DataFrame(data=data, columns=['Income', 'CreditScore'])
df_Q7

Unnamed: 0,Income,CreditScore
0,1,0
1,1,1
2,0,1
3,0,0


Q8. Modify the function in (4) to take df_cc, Income, and CreditScore as input and return a 1 or 0, corresponding to the approval status. 

In [19]:
import fractions

# function to predict the approval status from Income and CreditScore by comparing P(approved|Income, CreditScore) with P(declined|Income, CreditScore)
def approval_status(df, f1, f2):
    """Predict the credit-card approval status for Income=f1 and CreditScore=f2.

    For each outcome the unnormalised posterior
    P(outcome) * P(Income, CreditScore | outcome) is estimated from the row
    counts in `df`; the outcome with the larger value wins. Comparing the two
    values directly (instead of dividing one by the other) avoids a
    ZeroDivisionError when no declined row has the requested pair, or when one
    of the outcomes is absent from `df`.

    NOTE(review): the likelihood is the *joint* frequency of the feature pair
    within an outcome, not the factorised per-feature product of naive Bayes --
    confirm this is the intended formula.

    Parameters
    ----------
    df : DataFrame with columns `Income`, `CreditScore` and `ApprovalStatus`
        (1 = approved, 0 = declined).
    f1 : Income value of the applicant.
    f2 : CreditScore value of the applicant.

    Returns
    -------
    str
        '1' if approval is more probable, '0' if decline is more probable, and
        'Equal probabilities of 1 and 0' when the two are equal. The labels are
        strings, so a column built from them needs a cast before it can be
        compared with the integer `ApprovalStatus` column.
    """
    # total number of datapoints
    n_all = df.shape[0]

    # rows whose features match the applicant being classified
    matches = (df.Income == f1) & (df.CreditScore == f2)

    def posterior_numerator(status):
        """Exact P(status) * P(f1, f2 | status); 0 when the outcome has no rows."""
        in_class = df.ApprovalStatus == status
        n_class = int(in_class.sum())
        if n_all == 0 or n_class == 0:
            # an absent outcome cannot explain the applicant
            return fractions.Fraction(0)
        prior = fractions.Fraction(n_class, n_all)
        likelihood = fractions.Fraction(int((in_class & matches).sum()), n_class)
        return prior * likelihood

    score_approved = posterior_numerator(1)
    score_declined = posterior_numerator(0)

    # score_approved > score_declined is equivalent to ratio > 1 whenever the ratio is defined
    if score_approved > score_declined:
        return '1'
    elif score_approved < score_declined:
        return '0'
    # In the case of equal probabilities of two labels
    else:
        return 'Equal probabilities of 1 and 0'

In [20]:
# Q9
# Predict the approval status once for each distinct (Income, CreditScore) pair.
feature_pairs = [(1, 0), (1, 1), (0, 1), (0, 0)]
new_data = [[income, credit_score, approval_status(df_cc, income, credit_score)]
            for income, credit_score in feature_pairs]

df_nb = pd.DataFrame(data=new_data, columns=['Income', 'CreditScore', 'ApprovalStatus_NB'])
df_nb

Unnamed: 0,Income,CreditScore,ApprovalStatus_NB
0,1,0,0
1,1,1,1
2,0,1,1
3,0,0,0


In [21]:
# Q10
# Attach the predicted label from df_nb to every row of df_cc by matching on
# (Income, CreditScore).
# Any ApprovalStatus_NB column left over from a previous run of this cell is
# dropped first, so re-running the cell does not produce ApprovalStatus_NB_x /
# ApprovalStatus_NB_y columns. validate='many_to_one' makes the merge fail
# loudly if df_nb ever contains a duplicated (Income, CreditScore) pair.
df_cc = (
    df_cc
    .drop(columns=['ApprovalStatus_NB'], errors='ignore')
    .merge(df_nb, on=['Income', 'CreditScore'], validate='many_to_one')
)
df_cc

Unnamed: 0,Income,CreditScore,ApprovalStatus,ApprovalStatus_NB
0,1,1,1,1
1,1,1,1,1
2,1,1,1,1
3,1,1,1,1
4,1,1,1,1
...,...,...,...,...
684,0,0,0,0
685,0,0,0,0
686,0,0,0,0
687,0,0,0,0


In [22]:
df_cc.dtypes

Income                int64
CreditScore           int64
ApprovalStatus        int64
ApprovalStatus_NB    object
dtype: object

In [23]:
# approval_status returns its labels as strings, so the merged ApprovalStatus_NB
# column has object dtype; cast it to int64 so it can be compared element-wise
# with the int64 ApprovalStatus column.
df_cc['ApprovalStatus_NB'] = df_cc['ApprovalStatus_NB'].astype('int64')

Q11. The overall accuracy of correctly predicting the ApprovalStatus can be obtained by counting the number of rows in the updated df_cc table where ApprovalStatus == ApprovalStatus_NB divided by the total number of rows. What is the overall accuracy?

In [24]:
# Flag the rows where the predicted label equals the actual label
is_correct = df_cc['ApprovalStatus'].eq(df_cc['ApprovalStatus_NB'])
correct_pred = int(is_correct.sum())

# Overall accuracy = correctly predicted rows / all rows
n_rows = len(df_cc)
accuracy = correct_pred / n_rows

print("Overall Accuracy:", accuracy)

Overall Accuracy: 0.7358490566037735


Q12. Explain the value of the accuracy - is it what you would expect it to be? if so, why? if not, why?

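The accuracy is about 0.74, so the model's predictions match the actual ApprovalStatus for roughly three quarters of the dataset. This is a reasonable result for a model that uses only two binary features (Income & CreditScore), but it is not close to 1, and that is expected: there are only four distinct (Income, CreditScore) pairs, and every applicant with the same pair receives the same prediction, so whenever applicants with identical feature values have different actual outcomes, some of them must be misclassified.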