# Machine Learning

# Classification Algorithm: Naive Bayes

In [3]:
# Import seaborn in this cell: it is the first cell to use `sns`, so on a
# fresh kernel (Restart & Run All) the name would otherwise be undefined.
import seaborn as sns

# List the names of the sample datasets that seaborn can load.
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

In [4]:
# Load the 'penguins' sample dataset through seaborn and bind it to `p`.
import seaborn as sns
p = sns.load_dataset('penguins')
# Display the first rows to see which columns are available.
p.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [None]:
# Let's treat the species column as our target variable.
# Problem statement: create an ML model to predict the species of
# a penguin based on the remaining features.


### List out challenges

#### Convert Category to Numbers

In [9]:
# Peek at the first three island values before they are encoded.
p['island'].head(3)

0    Torgersen
1    Torgersen
2    Torgersen
Name: island, dtype: object

In [7]:
# Distinct island names: these are the categories that need numeric codes.
p['island'].unique()

array(['Torgersen', 'Biscoe', 'Dream'], dtype=object)

In [10]:
# Encode each island name as an integer code so the model gets numeric input.
# The codes are arbitrary identifiers; they do not imply any ordering.
island_codes = {'Torgersen': 0, 'Biscoe': 1, 'Dream': 2}

# Only map while the column still holds island names. Without this guard,
# re-running the cell would look the integer codes up in the dictionary,
# find no match, and silently replace the whole column with NaN.
if p['island'].isin(list(island_codes)).all():
    p['island'] = p['island'].map(island_codes)

In [11]:
# Same three rows again: the island values should now be integer codes.
p['island'].head(3)

0    0
1    0
2    0
Name: island, dtype: int64

In [13]:
# Number of rows for each island code.
p['island'].value_counts()

1    168
2    124
0     52
Name: island, dtype: int64

In [15]:
# Convert sex to numeric codes: Male -> 0, Female -> 1.
sex_codes = {'Male': 0, 'Female': 1}

# Rows with a missing sex stay NaN after the mapping. The guard keeps the cell
# safe to re-run: once the column holds numeric codes, mapping it again would
# find no dictionary match and turn every value into NaN.
if p['sex'].dropna().isin(list(sex_codes)).all():
    p['sex'] = p['sex'].map(sex_codes)

In [17]:
# First two rows of the frame, to check the encoded island and sex columns.
p.iloc[:2]

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,0,39.1,18.7,181.0,3750.0,0.0
1,Adelie,0,39.5,17.4,186.0,3800.0,1.0


#### The next challenge is to deal with NaN values

In [18]:
# Count the missing values in each column.
p.isna().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [19]:
# (rows, columns) of the frame before rows with missing values are removed.
p.shape

(344, 7)

In [20]:
# Drop every row that has a missing value in any column.
# `p` is rebound to the result instead of passing inplace=True, so the cell is
# an ordinary assignment and can be extended with further chained methods.
p = p.dropna()

In [22]:
# Re-count missing values per column to confirm none remain after the drop.
p.isna().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [23]:
# (rows, columns) of the frame after the rows with missing values were dropped.
p.shape

(333, 7)

In [24]:
# Summarize each column's dtype and non-null count.
p.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 333 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            333 non-null    object 
 1   island             333 non-null    int64  
 2   bill_length_mm     333 non-null    float64
 3   bill_depth_mm      333 non-null    float64
 4   flipper_length_mm  333 non-null    float64
 5   body_mass_g        333 non-null    float64
 6   sex                333 non-null    float64
dtypes: float64(5), int64(1), object(1)
memory usage: 20.8+ KB


In [26]:
# `sex` is stored as float64; now that no NaN values remain it can be cast
# to an integer column.
p['sex'] = p['sex'].astype(int)

In [27]:
# Check the dtypes again to confirm that `sex` is now an integer column.
p.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 333 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            333 non-null    object 
 1   island             333 non-null    int64  
 2   bill_length_mm     333 non-null    float64
 3   bill_depth_mm      333 non-null    float64
 4   flipper_length_mm  333 non-null    float64
 5   body_mass_g        333 non-null    float64
 6   sex                333 non-null    int32  
dtypes: float64(4), int32(1), int64(1), object(1)
memory usage: 19.5+ KB


In [30]:
# The algorithm needs the input (features) and the output (target) separately.
# Input: every column except the target. Dropping `species` by name keeps this
# correct even if the column order of `p` changes, which a positional slice
# such as iloc[:, 1:] would not.
X = p.drop(columns='species')
X.head(2)

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,39.1,18.7,181.0,3750.0,0
1,0,39.5,17.4,186.0,3800.0,1


In [31]:
# Output: the species label that the model should learn to predict.
y = p['species']
y.head(2)

0    Adelie
1    Adelie
Name: species, dtype: object

### Import algorithm

In [32]:
from sklearn.naive_bayes import MultinomialNB

In [33]:
# MultinomialNB is a class; calling it with no arguments creates an estimator
# that uses the default hyperparameters.
model = MultinomialNB()
# `model` is the object (instance); MultinomialNB is the class.
# NOTE(review): scikit-learn describes MultinomialNB as suited to discrete,
# count-like features. Most features here are continuous measurements, so
# GaussianNB may be a better fit -- worth comparing before relying on this model.

In [34]:
# Display the estimator's repr.
model

MultinomialNB()

## Training Phase

In [35]:
# Train the model on all rows of X and y; no rows are held out for evaluation.
model.fit(X,y)

MultinomialNB()

## Testing Phase

In [37]:
# Look at a few known rows as a reference for hand-written test inputs.
p.head(3)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,0,39.1,18.7,181.0,3750.0,0
1,Adelie,0,39.5,17.4,186.0,3800.0,1
2,Adelie,0,40.3,18.0,195.0,3250.0,1


In [41]:
# pandas is imported in this cell because this is the first cell that uses `pd`.
import pandas as pd

# Two hand-made penguins, one per row, with values given in the same order as
# the training columns in X.
samples = pd.DataFrame(
    [[0, 39.6, 15.9, 150, 6100, 0],
     [0, 39.6, 15.9, 190, 5100, 1]],
    columns=X.columns,
)

# The model was fitted on a DataFrame, so predicting on a DataFrame with the
# same column names makes the feature order explicit and avoids scikit-learn's
# warning about input that lacks feature names.
model.predict(samples)

array(['Gentoo', 'Gentoo'], dtype='<U9')

## Check the goodness of the model

In [40]:
# Mean accuracy expressed as a percentage. X and y are the same rows that were
# used for fitting, so this is training accuracy, not an estimate of how the
# model performs on unseen penguins.
100 * model.score(X, y)

81.98198198198197

In [44]:
# Number of rows for each species in the target.
y.value_counts()

Adelie       146
Gentoo       119
Chinstrap     68
Name: species, dtype: int64

In [47]:
# pandas is used below to wrap the array of predictions in a Series.
import pandas as pd
# Predict on the same rows the model was trained on.
pred = model.predict(X)
# Count how many rows were assigned to each species.
pd.Series(pred).value_counts()

Adelie       138
Gentoo       131
Chinstrap     64
dtype: int64