# Naive Bayes Classification
## Purpose: To see if the Naive Bayes classification algorithm can be written from scratch
### Datasets: titanic from seaborn library, iris from seaborn library


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

### Load titanic dataset

In [2]:
# Load seaborn's "titanic" example dataset and preview its first rows.
titanic = sns.load_dataset("titanic")
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


### Clean titanic dataset

In [3]:
# Keep only the first nine columns (survived through class).
# .copy() makes the subset an independent frame, so the column assignment
# performed later on `titanic` does not write into a slice of the original
# frame (the pattern behind pandas' SettingWithCopyWarning).
titanic = titanic.iloc[:, :9].copy()
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class
0,0,3,male,22.0,1,0,7.25,S,Third
1,1,1,female,38.0,1,0,71.2833,C,First
2,1,3,female,26.0,0,0,7.925,S,Third
3,1,1,female,35.0,1,0,53.1,S,First
4,0,3,male,35.0,0,0,8.05,S,Third


Check for null values.

In [4]:
# Count the missing values in each column.
titanic.isna().sum()

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
class         0
dtype: int64

Fill all null values in the age column with the median age of the passenger's class (pclass).

In [5]:
# Impute each missing age with the median age of the rows sharing the same
# pclass (the median is computed per group, not over the whole column),
# then re-count nulls to confirm the fill.
titanic["age"] = titanic.groupby(["pclass"])["age"].transform(lambda x: x.fillna(x.median()))
titanic.isna().sum()

survived    0
pclass      0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    2
class       0
dtype: int64

Drop the remaining rows with null values.

In [6]:
# Drop every row that still contains a null value, then re-count nulls
# to confirm none remain.
titanic = titanic.dropna()
titanic.isna().sum()

survived    0
pclass      0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    0
class       0
dtype: int64

In [7]:
# Preview the cleaned frame.
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class
0,0,3,male,22.0,1,0,7.25,S,Third
1,1,1,female,38.0,1,0,71.2833,C,First
2,1,3,female,26.0,0,0,7.925,S,Third
3,1,1,female,35.0,1,0,53.1,S,First
4,0,3,male,35.0,0,0,8.05,S,Third


Move the target variable column from the beginning of the dataframe to the end, because the Naive Bayes function below treats the last column as the target.

In [8]:
# Rebuild the frame with "survived" as its final column: select every other
# column in its existing order, then append the saved target column.
survived = titanic["survived"]
feature_columns = [name for name in titanic.columns if name != "survived"]
titanic = titanic[feature_columns].assign(survived=survived)
titanic.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,survived
0,3,male,22.0,1,0,7.25,S,Third,0
1,1,female,38.0,1,0,71.2833,C,First,1
2,3,female,26.0,0,0,7.925,S,Third,1
3,1,female,35.0,1,0,53.1,S,First,1
4,3,male,35.0,0,0,8.05,S,Third,0


Naive Bayes classification algorithm written from scratch. It supports a target variable with two or three possible outcomes.

In [9]:
def naive_bayes(train, test_instance):
    """Predict the class of one instance with a categorical Naive Bayes model.

    The last column of ``train`` is the target; every other column is a
    feature. The likelihood of a feature value given a class is the fraction
    of that class's training rows whose feature equals the test value exactly
    (no smoothing, so numeric columns are matched as discrete values). A
    class's score is the product of its likelihoods multiplied by its prior
    (class rows / all rows).

    Any number of classes is supported. Missing target values are not treated
    as a class.

    If every class scores zero (no class has a matching row for every
    feature), the scores carry no information and the class priors are used
    instead, so the most frequent class is predicted.

    Args:
        train: pandas DataFrame of training rows with the target in the last
            column.
        test_instance: feature values, indexed by position (``[0]``, ``[1]``,
            ...) in the same order as the feature columns of ``train``.

    Returns:
        The predicted class label, taken from the target column. When several
        classes share the highest score, the one that first appears latest in
        the target column is returned.

    Raises:
        ValueError: if ``train`` contains no row with a non-missing target.

    Side effects:
        Prints ``Accuracy: <p>``, where ``p`` is the winning score divided by
        the sum of all class scores. Despite the label this is the normalised
        posterior of the prediction, not a measured accuracy.
    """
    target = train.columns[-1]
    features = train.columns.drop(target)
    # unique() keeps order of first appearance, which fixes the tie-break order.
    classes = train[target].dropna().unique()
    if len(classes) == 0:
        raise ValueError("train must contain at least one row with a target value")

    n_rows = len(train)
    priors = []
    scores = []
    for label in classes:
        class_rows = train[train[target] == label]
        class_size = len(class_rows)
        score = 1.0
        for i, feature in enumerate(features):
            matches = len(class_rows[class_rows[feature] == test_instance[i]])
            score *= matches / class_size
        prior = class_size / n_rows
        priors.append(prior)
        scores.append(score * prior)

    # All-zero scores would make the normalisation below divide by zero;
    # fall back to the priors, which are always positive here.
    if sum(scores) == 0:
        scores = priors
    total = sum(scores)

    # ">=" lets a later class win ties, so equal top scores can never hand
    # the prediction to a strictly lower-scoring class.
    best = 0
    for idx in range(1, len(scores)):
        if scores[idx] >= scores[best]:
            best = idx

    accuracy = scores[best] / total
    print(f"Accuracy: {accuracy}")
    return classes[best]
    

### Naive Bayes titanic

In [10]:
# Feature values are matched by position, so they must follow the feature
# column order: pclass, sex, age, sibsp, parch, fare, embarked, class.
test_inst_titanic = [1, "male", 22.0, 1, 0, 53.1, "S", "First"]
naive_bayes(titanic, test_inst_titanic)

Accuracy: 0.8545312771260241


1

The person with the test_inst_titanic attributes is predicted to survive.

### Load the iris dataset

In [11]:
# Load seaborn's "iris" example dataset and preview its first rows.
# The target column (species) is already the last column.
iris = sns.load_dataset("iris")
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


### Naive Bayes iris

In [12]:
# Feature values are matched by position, in feature column order:
# sepal_length, sepal_width, petal_length, petal_width.
test_inst_iris = [5.8, 3.1, 5.3, 1.9]
naive_bayes(iris, test_inst_iris)

Accuracy: 1.0


'virginica'

The flower with the test_inst_iris attributes is predicted to be virginica.