In [2]:
import pandas as pd

data = {
    'age': ['<=30', '<=30', '31...40', '>40', '>40', '>40', '31...40', '<=30', '<=30', '>40', '<=30', '31...40', '31...40', '>40'],
    'income': ['high', 'high', 'high', 'medium', 'low', 'low', 'low', 'medium', 'low', 'medium', 'medium', 'medium', 'high', 'medium'],
    'student': ['no', 'no', 'no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no'],
    'credit_rating': ['fair', 'excellent', 'fair', 'fair', 'fair', 'excellent', 'excellent', 'fair', 'fair', 'fair', 'excellent', 'excellent', 'fair', 'excellent'],
    'buys_computer': ['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no']
}

df = pd.DataFrame(data)


df


Unnamed: 0,age,income,student,credit_rating,buys_computer
0,<=30,high,no,fair,no
1,<=30,high,no,excellent,no
2,31...40,high,no,fair,yes
3,>40,medium,no,fair,yes
4,>40,low,yes,fair,yes
5,>40,low,yes,excellent,no
6,31...40,low,yes,excellent,yes
7,<=30,medium,no,fair,no
8,<=30,low,yes,fair,yes
9,>40,medium,yes,fair,yes


In [3]:
total_instances = len(df)
prior_yes = df['buys_computer'].value_counts()['yes'] / total_instances
prior_no = df['buys_computer'].value_counts()['no'] / total_instances

print(f'Prior probability for buys_computer = yes: {prior_yes:.4f}')
print(f'Prior probability for buys_computer = no: {prior_no:.4f}')

Prior probability for buys_computer = yes: 0.6429
Prior probability for buys_computer = no: 0.3571


In [4]:
total_instances = len(df)
income_classes = df['income'].unique()

for income_class in income_classes:
    prior_income = df[df['income'] == income_class].shape[0] / total_instances
    print(f'Prior probability for income = {income_class}: {prior_income:.4f}')

Prior probability for income = high: 0.2857
Prior probability for income = medium: 0.4286
Prior probability for income = low: 0.2857


In [5]:
total_instances = len(df)
student_classes = df['student'].unique()

for student_class in student_classes:
    prior_student = df[df['student'] == student_class].shape[0] / total_instances
    print(f'Prior probability for student = {student_class}: {prior_student:.4f}')

Prior probability for student = no: 0.5000
Prior probability for student = yes: 0.5000


In [6]:
total_instances = len(df)
credit_rating_classes = df['credit_rating'].unique()

for credit_rating_class in credit_rating_classes:
    prior_credit_rating = df[df['credit_rating'] == credit_rating_class].shape[0] / total_instances
    print(f'Prior probability for credit_rating = {credit_rating_class}: {prior_credit_rating:.4f}')

Prior probability for credit_rating = fair: 0.5714
Prior probability for credit_rating = excellent: 0.4286


In [7]:
age_classes = df['age'].unique()

for age_class in age_classes:
    conditional_data = df[df['age'] == age_class]['buys_computer']
    class_conditional_density = conditional_data.value_counts(normalize=True)
    print(f"Class conditional density for 'age' = {age_class}:\n{class_conditional_density}")
    print()


income_classes = df['income'].unique()

for income_class in income_classes:
    conditional_data = df[df['income'] == income_class]['buys_computer']
    class_conditional_density = conditional_data.value_counts(normalize=True)
    print(f"Class conditional density for 'income' = {income_class}:\n{class_conditional_density}")
    print()

Class conditional density for 'age' = <=30:
no     0.6
yes    0.4
Name: buys_computer, dtype: float64

Class conditional density for 'age' = 31...40:
yes    1.0
Name: buys_computer, dtype: float64

Class conditional density for 'age' = >40:
yes    0.6
no     0.4
Name: buys_computer, dtype: float64

Class conditional density for 'income' = high:
no     0.5
yes    0.5
Name: buys_computer, dtype: float64

Class conditional density for 'income' = medium:
yes    0.666667
no     0.333333
Name: buys_computer, dtype: float64

Class conditional density for 'income' = low:
yes    0.75
no     0.25
Name: buys_computer, dtype: float64



In [8]:
from scipy.stats import chi2_contingency

contingency_table = pd.crosstab(index=df['age'], columns=[df['income'], df['student'], df['credit_rating']])

# Performing the Chi-square test
chi2, p, dof, expected = chi2_contingency(contingency_table)

# Output the results
print("Chi-square value:", chi2)
print("p-value:", p)
print("Degrees of freedom:", dof)
print("Expected frequencies table:")
print(expected)


Chi-square value: 12.95
p-value: 0.6764100579553458
Degrees of freedom: 16
Expected frequencies table:
[[0.28571429 0.57142857 0.28571429 0.57142857 0.57142857 0.57142857
  0.57142857 0.28571429 0.28571429]
 [0.35714286 0.71428571 0.35714286 0.71428571 0.71428571 0.71428571
  0.71428571 0.35714286 0.35714286]
 [0.35714286 0.71428571 0.35714286 0.71428571 0.71428571 0.71428571
  0.71428571 0.35714286 0.35714286]]


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

le = LabelEncoder()
df['age'] = le.fit_transform(df['age'])
df['income'] = le.fit_transform(df['income'])
df['student'] = le.fit_transform(df['student'])
df['credit_rating'] = le.fit_transform(df['credit_rating'])
df['buys_computer'] = le.fit_transform(df['buys_computer'])

# Features (X) and target variable (y)
X = df.drop('buys_computer', axis=1)
y = df['buys_computer']

# Splitting the data into training and testing sets
Tr_X, Ts_X, Tr_y, Ts_y = train_test_split(X, y, test_size=0.2, random_state=42)

# Building the Naïve Bayes classifier
model = GaussianNB()
model.fit(Tr_X, Tr_y)

# Making predictions on the test set
predictions = model.predict(Ts_X)

# Evaluating the model
accuracy = accuracy_score(Ts_y, predictions)
conf_matrix = confusion_matrix(Ts_y, predictions)
class_report = classification_report(Ts_y, predictions)

# Output the results
print("Accuracy:", accuracy)
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)


Accuracy: 0.6666666666666666
Confusion Matrix:
[[0 1]
 [0 2]]
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.67      1.00      0.80         2

    accuracy                           0.67         3
   macro avg       0.33      0.50      0.40         3
weighted avg       0.44      0.67      0.53         3



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
