In [492]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # for data visualization purposes
import seaborn as sns # for statistical data visualization
%matplotlib inline

In [493]:
# list every file under the current working directory ('.'), descending into
# subdirectories, so the available input files (e.g. the CSV read later) are visible

import os
# os.walk yields (directory path, sub-directory names, file names); the
# sub-directory names are not needed here, hence the throwaway `_`
for dirname, _, filenames in os.walk('.'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

./ML-Assignment 1 - Student names.ipynb
./README.md
./iris.csv
./naive_bayes_classifier.ipynb


In [494]:
import warnings

# silence every Python warning for the rest of the session
# NOTE(review): this also hides deprecation and numerical warnings from
# pandas / scikit-learn -- consider narrowing the filter to specific categories
warnings.filterwarnings('ignore')

In [495]:
# path of the dataset, relative to the notebook's working directory
data = 'iris.csv'

# load the comma-separated file into a DataFrame
df = pd.read_csv(data, sep=',')

In [496]:
# view dimensions of dataset as (number of rows, number of columns)

df.shape

(150, 5)

View top 5 rows of dataset


In [497]:
# preview the dataset (first 5 rows)

df.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [498]:
# view summary of dataset: column names, non-null counts, dtypes and memory usage

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal length  150 non-null    float64
 1   sepal width   150 non-null    float64
 2   petal length  150 non-null    float64
 3   petal width   150 non-null    float64
 4   class         150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [499]:
# collect the names of all object-dtype (categorical) columns

categorical = [col for col, dtype in df.dtypes.items() if dtype == 'O']

print('There are {} categorical variables\n'.format(len(categorical)))

print('The categorical variables are :\n\n', categorical)

There are 1 categorical variables

The categorical variables are :

 ['class']


In [500]:
# view the first 5 rows of the categorical columns only

df[categorical].head()

Unnamed: 0,class
0,Iris-setosa
1,Iris-setosa
2,Iris-setosa
3,Iris-setosa
4,Iris-setosa


Explore problems within categorical variables
First, I will explore the categorical variables.

Missing values in categorical variables


In [501]:
# count missing (null) values per categorical column

df[categorical].isnull().sum()

class    0
dtype: int64

In [502]:
# print the absolute frequency of every label, one categorical column at a time

for col in categorical:
    print(df[col].value_counts())

class
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64


In [503]:
# print the relative frequency (count / total number of rows) of every label,
# one categorical column at a time

for col in categorical:
    print(df[col].value_counts().div(float(len(df))))

class
Iris-setosa        0.333333
Iris-versicolor    0.333333
Iris-virginica     0.333333
Name: count, dtype: float64


In [504]:
# check the distinct labels present in the class (target) variable

df['class'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [505]:
# check frequency distribution of values in the class variable

df['class'].value_counts()

class
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64

In [506]:
# re-check missing values in categorical variables (same check as run earlier)
df[categorical].isnull().sum()

class    0
dtype: int64

In [507]:
# report how many distinct labels each categorical column holds
# (missing values, if any, are counted as a label too)

for col in categorical:
    print(col, ' contains ', df[col].nunique(dropna=False), ' labels')

class  contains  3  labels


In [508]:
# collect the names of all non-object-dtype (numerical) columns

numerical = [col for col, dtype in df.dtypes.items() if dtype != 'O']

print('There are {} numerical variables\n'.format(len(numerical)))

print('The numerical variables are :', numerical)

There are 4 numerical variables

The numerical variables are : ['sepal length', 'sepal width', 'petal length', 'petal width']


In [509]:
# view the first 5 rows of the numerical columns only

df[numerical].head()

Unnamed: 0,sepal length,sepal width,petal length,petal width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [510]:
# count missing (null) values per numerical column

df[numerical].isnull().sum()

sepal length    0
sepal width     0
petal length    0
petal width     0
dtype: int64

In [511]:
# Count negative entries per numerical column: build a boolean mask and sum it
# (True counts as 1), so every column should report 0 for valid measurements

(df[numerical] < 0).sum()

sepal length    0
sepal width     0
petal length    0
petal width     0
dtype: int64

Declare feature vector and target variable


In [512]:
# Label: the species name to predict
y = df['class']

# Features: every remaining column once the label column is removed
X = df.drop(columns=['class'])

Split data into separate training and test sets


In [513]:
# split X and y into training and testing sets
#
# reference only (kept disabled): the scikit-learn equivalent of the split; a
# function with the same name is implemented by hand later in this notebook

# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [514]:
def train_test_split(X, y, test_size=0.3, random_state=None):
    """Randomly partition ``X`` and ``y`` into aligned train and test subsets.

    The sample positions ``0 .. n-1`` are shuffled once; the first
    ``int(n * test_size)`` shuffled positions form the test set and the
    remainder form the training set. The same positions are applied to both
    ``X`` and ``y`` so rows and labels stay paired.

    Parameters
    ----------
    X : pandas.DataFrame, pandas.Series, numpy.ndarray or sequence
        Feature rows; the first axis indexes samples.
    y : pandas.Series, pandas.DataFrame, numpy.ndarray or sequence
        Targets; must contain as many entries as ``X`` has rows.
    test_size : float, default 0.3
        Fraction of samples (between 0 and 1 inclusive) placed in the test set.
        The resulting count is truncated towards zero.
    random_state : int or None, default None
        Seed for a private random generator. With ``None`` the shuffle draws
        from NumPy's global random state.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. pandas inputs keep their type
        and original index labels; other inputs are returned as NumPy arrays.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``test_size`` is outside [0, 1].
    """
    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError(
            "X and y must contain the same number of samples "
            "(got {} and {})".format(n_samples, len(y))
        )
    if not 0 <= test_size <= 1:
        raise ValueError(
            "test_size must be a fraction between 0 and 1, got {!r}".format(test_size)
        )

    # A seeded private RandomState yields the same permutation as seeding the
    # global generator would, but leaves the caller's global random stream
    # untouched. Without a seed, keep drawing from the global stream.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Shuffle the sample positions
    indices = np.arange(n_samples)
    rng.shuffle(indices)

    # The first `test_count` shuffled positions become the test set
    test_count = int(n_samples * test_size)
    test_indices = indices[:test_count]
    train_indices = indices[test_count:]

    def _take(obj, positions):
        # Any pandas object must be indexed positionally via .iloc: plain
        # obj[positions] would be interpreted as a label / column lookup.
        if hasattr(obj, 'iloc'):
            return obj.iloc[positions]
        return np.asarray(obj)[positions]

    X_train = _take(X, train_indices)
    X_test = _take(X, test_indices)
    y_train = _take(y, train_indices)
    y_test = _take(y, test_indices)

    return X_train, X_test, y_train, y_test


In [515]:
# Split the data
# NOTE(review): test_size=0.9 holds out 90% of the rows for testing and trains on
# only 10% (15 of 150 rows, per the shapes reported further down), whereas the
# disabled scikit-learn reference call uses test_size = 0.3 -- confirm 0.9 is intended
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9, random_state=0)

# print each of the four resulting subsets in full
print("X_train:")
print(X_train)
print("\nX_test:")
print(X_test)
print("\ny_train:")
print(y_train)
print("\ny_test:")
print(y_test)

X_train:
     sepal length  sepal width  petal length  petal width
81            5.5          2.4           3.7          1.0
140           6.7          3.1           5.6          2.4
142           5.8          2.7           5.1          1.9
39            5.1          3.4           1.5          0.2
58            6.6          2.9           4.6          1.3
88            5.6          3.0           4.1          1.3
70            5.9          3.2           4.8          1.8
87            6.3          2.3           4.4          1.3
36            5.5          3.5           1.3          0.2
21            5.1          3.7           1.5          0.4
9             4.9          3.1           1.5          0.1
103           6.3          2.9           5.6          1.8
67            5.8          2.7           4.1          1.0
117           7.7          3.8           6.7          2.2
47            4.6          3.2           1.4          0.2

X_test:
     sepal length  sepal width  petal length  petal wi

In [516]:
# check the shape (rows, columns) of X_train and X_test

X_train.shape, X_test.shape

((15, 4), (135, 4))

In [517]:
# check the shape of y_train and y_test; the lengths should match the row
# counts of X_train and X_test respectively

y_train.shape, y_test.shape

((15,), (135,))

In [518]:
# One-hot encode the target variable with pandas.get_dummies(), then:
#   1. align train and test on the union of their label columns, filling any
#      label that is absent from one split with 0, and
#   2. convert the indicator values to integers (0s and 1s)
y_train_encoded, y_test_encoded = (
    encoded.astype(int)
    for encoded in pd.get_dummies(y_train).align(
        pd.get_dummies(y_test), join='outer', axis=1, fill_value=0
    )
)

In [519]:
# Print the first 5 rows of the one-hot encoded training and testing labels
print("Training labels (encoded):")
print(y_train_encoded.head())

print("\nTesting labels (encoded):")
print(y_test_encoded.head())


Training labels (encoded):
     Iris-setosa  Iris-versicolor  Iris-virginica
81             0                1               0
140            0                0               1
142            0                0               1
39             1                0               0
58             0                1               0

Testing labels (encoded):
     Iris-setosa  Iris-versicolor  Iris-virginica
114            0                0               1
62             0                1               0
33             1                0               0
107            0                0               1
7              1                0               0


In [520]:
# shape of the training feature matrix (already reported earlier alongside X_test)
X_train.shape

(15, 4)

In [521]:
# preview the first 5 rows of the test features
X_test.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width
114,5.8,2.8,5.1,2.4
62,6.0,2.2,4.0,1.0
33,5.5,4.2,1.4,0.2
107,7.3,2.9,6.3,1.8
7,5.0,3.4,1.5,0.2


Model training


In [522]:
# import scikit-learn and its Naive Bayes estimators once, and report the
# installed version once so the results can be tied to a library release
import sklearn
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

print(sklearn.__version__)

1.6.1
1.6.1


In [523]:
# train a Gaussian Naive Bayes classifier on the training set
# (GaussianNB is also imported in the previous cell; repeating the import is harmless)
from sklearn.naive_bayes import GaussianNB


# instantiate the model with its default settings
gnb = GaussianNB()


# fit the model on the training features and the original string class labels
# (the one-hot encoded labels are not used here)
gnb.fit(X_train, y_train)


Predict the results


In [524]:
# predict a class label for every row of the held-out test features
y_pred = gnb.predict(X_test)

# display the predicted labels
y_pred

array(['Iris-virginica', 'Iris-versicolor', 'Iris-setosa',
       'Iris-virginica', 'Iris-setosa', 'Iris-virginica', 'Iris-setosa',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-virginica', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa',
       'Iris-virginica', 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa',
       'Iris-versicolor', 'Iris-setosa', 'Iris-setosa', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-setosa', 'Iris-virginica',
       'Iris-versicolor', 'Iris-setosa', 'Iris-virginica',
       'Iris-virginica', 'Iris-versicolor', 'Iris-setosa',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-virginica', 'Iris-setosa', 'Iris-virginica', 'Iris-setosa',
       'Iris-setosa', 'Iris-versicolor', 'Iris-virginica',
       'Iris-virginica', 'Iris-versicolor', 'Iris-virginica',
       'Iris-versicolo

Check accuracy score


In [525]:
from sklearn.metrics import accuracy_score

# fraction of test rows whose predicted label equals the true label, shown to 4 decimals
print('Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred)))

Model accuracy score: 0.9481
