## Step-1:Data pre-processing

## Using Random Forest algorithm

1. Importing the required libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

2. Importing the dataset using the pandas library

In [2]:
#read the penguins CSV; the relative path is resolved against the notebook's working directory
data=pd.read_csv("penguins.csv")
#a bare name on the last line makes the notebook display the frame
data

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,FEMALE
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,MALE
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,FEMALE


In [3]:
#(rows, columns) of the frame as loaded, before any cleaning
data.shape

(344, 7)

In [4]:
#per-column dtype and non-null count - a first look at where values are missing
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


3. Checking the missing values in the dataset

In [5]:
#number of missing values in each column
data.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

Some missing values were found: 2 in each of the four measurement columns and 11 in `sex`.

# Drop the null values

In [6]:
#drop every row that has at least one missing value; rebinding the name instead of
#using inplace=True avoids mutating the loaded frame in place and keeps the call chainable
data=data.dropna()

In [7]:
#repeat the per-column null count to confirm that no missing values remain
data.isnull().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

The rows containing missing values have been removed.

# Step-2: Feature engineering step

Our dataset contains categorical variables, so we have to convert them into numerical variables.

# Converting 'sex' categorical variable into numerical variable using dummy variables

In [8]:
#distinct labels left in 'sex' once the rows with missing values are gone
data.sex.unique()

array(['MALE', 'FEMALE'], dtype=object)

In [9]:
#preview the one-hot encoding of 'sex' (one indicator column per label);
#dtype=int keeps the indicators 0/1 on pandas versions whose default dummy dtype is bool
pd.get_dummies(data['sex'],dtype=int).head()

Unnamed: 0,FEMALE,MALE
0,0,1
1,1,0
2,1,0
4,1,0
5,0,1


In [10]:
#keep only the MALE indicator: drop_first removes FEMALE, which MALE already implies;
#dtype=int keeps the column 0/1 on pandas versions whose default dummy dtype is bool
gender=pd.get_dummies(data['sex'],drop_first=True,dtype=int)
gender.head()

Unnamed: 0,MALE
0,1
1,0
2,0
4,0
5,1


# Converting 'island' variable

In [11]:
#distinct island labels that need to be encoded
data.island.unique()

array(['Torgersen', 'Biscoe', 'Dream'], dtype=object)

In [12]:
#preview the one-hot encoding of 'island' (one indicator column per island);
#dtype=int keeps the indicators 0/1 on pandas versions whose default dummy dtype is bool
pd.get_dummies(data['island'],dtype=int).head()

Unnamed: 0,Biscoe,Dream,Torgersen
0,0,0,1
1,0,0,1
2,0,0,1
4,0,0,1
5,0,0,1


In [13]:
#keep the Dream and Torgersen indicators: drop_first removes Biscoe, which is implied
#when both remaining indicators are 0; dtype=int keeps the columns 0/1 on pandas
#versions whose default dummy dtype is bool
island=pd.get_dummies(data['island'],drop_first=True,dtype=int)
island.head()

Unnamed: 0,Dream,Torgersen
0,0,1
1,0,1
2,0,1
4,0,1
5,0,1


# Creating the new dataset by concatenating the original data with the 'sex' and 'island' dummy columns

In [14]:
#place the cleaned frame and the two dummy-column frames side by side;
#all three carry the same row index, so the rows line up one-to-one
new_data=pd.concat(
    [data,gender,island],
    axis="columns",
)
new_data.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,MALE,Dream,Torgersen
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE,1,0,1
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE,0,0,1
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE,0,0,1
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE,0,0,1
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE,1,0,1


# Dropping the unwanted variables

In [15]:
#remove the original text columns now that their dummy columns are present;
#rebinding (no inplace=True) together with errors='ignore' lets this cell be
#re-run without raising KeyError once the columns are already gone
new_data=new_data.drop(columns=['sex','island'],errors='ignore')
new_data.head()

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,MALE,Dream,Torgersen
0,Adelie,39.1,18.7,181.0,3750.0,1,0,1
1,Adelie,39.5,17.4,186.0,3800.0,0,0,1
2,Adelie,40.3,18.0,195.0,3250.0,0,0,1
4,Adelie,36.7,19.3,193.0,3450.0,0,0,1
5,Adelie,39.3,20.6,190.0,3650.0,1,0,1


4. Extracting the independent and dependent variables

In [16]:
#for independent variable
#select the feature columns by name (every column except the 'species' target)
#so the selection does not depend on the column positions in new_data
x=new_data[['bill_length_mm','bill_depth_mm','flipper_length_mm','body_mass_g','MALE','Dream','Torgersen']]
x.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,MALE,Dream,Torgersen
0,39.1,18.7,181.0,3750.0,1,0,1
1,39.5,17.4,186.0,3800.0,0,0,1
2,40.3,18.0,195.0,3250.0,0,0,1
4,36.7,19.3,193.0,3450.0,0,0,1
5,39.3,20.6,190.0,3650.0,1,0,1


In [17]:
#for dependent variable
#select the target by name so it does not depend on 'species' being the first column
y=new_data['species']
y.head()

0    Adelie
1    Adelie
2    Adelie
4    Adelie
5    Adelie
Name: species, dtype: object

# Converting the dependent variable into a numerical format.

In [18]:
#distinct species labels - these are the classes the model has to tell apart
y.unique()

array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)

In [19]:
#encode the species names as integer class labels; mapping straight from
#new_data['species'] (instead of from y itself) keeps the cell safe to re-run -
#mapping an already-encoded y a second time would turn every label into NaN
y=new_data['species'].map({'Adelie':0,'Chinstrap':1,'Gentoo':2})
y.head()

0    0
1    0
2    0
4    0
5    0
Name: species, dtype: int64

5. Splitting the dataset into Training and Test datasets

In [20]:
#importing the train_test_split function from the sklearn.model_selection module
from sklearn.model_selection import train_test_split

#splitting the dataset: 25% of the rows are held out for testing and
#random_state=0 fixes the shuffle so the same split is produced on every run
#NOTE(review): the split is not stratified on y, so the class proportions in the
#test set are left to chance - consider passing stratify=y
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=0)

In [21]:
#training features: (rows, feature columns)
x_train.shape

(249, 7)

In [22]:
#held-out test features: (rows, feature columns)
x_test.shape

(84, 7)

In [23]:
#training labels: one per training row
y_train.shape

(249,)

In [24]:
#held-out test labels: one per test row
y_test.shape

(84,)

## Step-2: Fitting the Random Forest classifier to the training data:

In [26]:
#importing the Random Forest classifier from the sklearn.ensemble module
from sklearn.ensemble import RandomForestClassifier

#create an instance: a forest of 5 trees that uses entropy as the split criterion;
#random_state=0 is fixed so that repeated runs build the same forest
classifier=RandomForestClassifier(n_estimators=5,criterion='entropy',random_state=0)

#fitting the classifier on the training features and labels
classifier.fit(x_train,y_train)

## Step-3:Predicting the test set result:

In [27]:
#predict an encoded species label for every row of the held-out test features
y_pred=classifier.predict(x_test)

In [28]:
#show the predicted labels (0, 1 and 2 are the encoded species)
y_pred

array([0, 0, 2, 0, 0, 0, 1, 2, 2, 1, 2, 0, 0, 1, 0, 0, 2, 0, 1, 0, 0, 0,
       2, 2, 2, 2, 0, 0, 0, 0, 0, 1, 0, 1, 0, 2, 1, 0, 1, 0, 2, 2, 0, 0,
       0, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 1, 0, 2, 0, 0,
       2, 2, 1, 2, 2, 1, 2, 1, 0, 2, 0, 2, 0, 2, 1, 2, 2, 2], dtype=int64)

## Step-4:Creating the confusion matrix

In [29]:
#importing the confusion_matrix function from the sklearn.metrics module
from sklearn.metrics import confusion_matrix

#computing the matrix from the true and the predicted test labels;
#rows follow the true labels and columns the predicted labels
cm=confusion_matrix(y_test,y_pred)

In [30]:
#display the confusion matrix; off-diagonal entries are misclassified test rows
cm

array([[42,  0,  0],
       [ 1, 13,  0],
       [ 0,  0, 28]], dtype=int64)

In [31]:
#importing the classification_report function from the sklearn.metrics module
from sklearn.metrics import classification_report

#printing per-class precision, recall, f1-score and support for the test set
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99        42
           1       1.00      0.93      0.96        14
           2       1.00      1.00      1.00        28

    accuracy                           0.99        84
   macro avg       0.99      0.98      0.98        84
weighted avg       0.99      0.99      0.99        84



## Step-5:Accuracy score

In [32]:
#importing the accuracy_score function from the sklearn.metrics module
from sklearn.metrics import accuracy_score

#computing the fraction of test labels that were predicted correctly
accuracy=accuracy_score(y_test,y_pred)

#printing the accuracy score as a percentage
print("Accuracy score:",accuracy*100,"%")

Accuracy score: 98.80952380952381 %


## Hence the accuracy score of the Random Forest model is approximately 98.81% (83 of the 84 test samples were classified correctly)