In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import load_iris
import seaborn as sns

In [3]:
df=sns.load_dataset('penguins')
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [6]:
df.isnull().sum()


species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [None]:
df.dropna(inplace=True) #drop null values

In [8]:
df.isnull().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [10]:
#Feature Engineering
df.sex.unique() #check unique values in the column

array(['Male', 'Female'], dtype=object)

In [12]:
sex=pd.get_dummies(df['sex'],drop_first=True)
sex.head()

Unnamed: 0,Male
0,True
1,False
2,False
4,False
5,True


In [13]:
df.island.unique() #check unique values in the column

array(['Torgersen', 'Biscoe', 'Dream'], dtype=object)

In [15]:
island=pd.get_dummies(df['island'],drop_first=True)
island.head()

Unnamed: 0,Dream,Torgersen
0,False,True
1,False,True
2,False,True
4,False,True
5,False,True


In [None]:
# cocatinate the dataframes
df=pd.concat([df,sex,island],axis=1)

In [None]:
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,Male,Dream,Torgersen
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,True,False,True
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,False,False,True
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,False,False,True
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female,False,False,True
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male,True,False,True


In [18]:
#drop the columns which are repeated
df.drop(['sex','island'],axis=1,inplace=True)

In [19]:
df.head()

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Male,Dream,Torgersen
0,Adelie,39.1,18.7,181.0,3750.0,True,False,True
1,Adelie,39.5,17.4,186.0,3800.0,False,False,True
2,Adelie,40.3,18.0,195.0,3250.0,False,False,True
4,Adelie,36.7,19.3,193.0,3450.0,False,False,True
5,Adelie,39.3,20.6,190.0,3650.0,True,False,True


In [21]:
Y=df.species
Y.unique()

array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)

In [23]:
Y=Y.map({'Adelie':0,'Chinstrap':1,'Gentoo':2}) #map the values to integers
Y.head()

0    0
1    0
2    0
4    0
5    0
Name: species, dtype: int64

In [None]:
df.drop(['species'],axis=1,inplace=True)


KeyError: "['species'] not found in axis"

In [26]:
df.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Male,Dream,Torgersen
0,39.1,18.7,181.0,3750.0,True,False,True
1,39.5,17.4,186.0,3800.0,False,False,True
2,40.3,18.0,195.0,3250.0,False,False,True
4,36.7,19.3,193.0,3450.0,False,False,True
5,39.3,20.6,190.0,3650.0,True,False,True


In [27]:
X=df

In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [29]:
from sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier(n_estimators=5,criterion='entropy',random_state=0)
model.fit(X_train,Y_train)

0,1,2
,n_estimators,5
,criterion,'entropy'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [30]:
#Predicting the test set results
Y_pred=model.predict(X_test)


In [31]:
from sklearn.metrics import confusion_matrix, accuracy_score,classification_report
cm=confusion_matrix(Y_test,Y_pred)
print("Confusion Matrix:\n", cm)

Confusion Matrix:
 [[48  0  0]
 [ 1 15  0]
 [ 1  0 35]]


In [32]:
accuracy_score(Y_test,Y_pred)

0.98

In [34]:
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98        48
           1       1.00      0.94      0.97        16
           2       1.00      0.97      0.99        36

    accuracy                           0.98       100
   macro avg       0.99      0.97      0.98       100
weighted avg       0.98      0.98      0.98       100



Changing the criteria to Gini


In [41]:
from sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier(n_estimators=7,criterion='gini',random_state=0)
model.fit(X_train,Y_train)

0,1,2
,n_estimators,7
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [42]:
y_pred=model.predict(X_test)

In [43]:
accuracy_score(Y_test,Y_pred)

0.98