In [1]:
import seaborn as sns
import pandas as pd

# Load Iris dataset
df = sns.load_dataset('iris')
df.head()


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
#Handling Missing Values
df['sepal_length'].fillna(df['sepal_length'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sepal_length'].fillna(df['sepal_length'].mean(), inplace=True)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [4]:
#Label Encoding

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['species_encoded'] = le.fit_transform(df['species'])

In [5]:
#One-Hot Encoding

df = pd.get_dummies(df, columns=['species'], prefix='species')

In [6]:
# Feature Scaling
# Standardization

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df[['sepal_length', 'sepal_width']] = scaler.fit_transform(df[['sepal_length', 'sepal_width']])

In [7]:
#Min-Max Normalization

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df[['petal_length', 'petal_width']] = scaler.fit_transform(df[['petal_length', 'petal_width']])

In [8]:
#Feature Creation / Combination
# Sepal area = sepal length × sepal width
df['sepal_area'] = df['sepal_length'] * df['sepal_width']

# Petal area = petal length × petal width
df['petal_area'] = df['petal_length'] * df['petal_width']

In [9]:
#Log Transformation

import numpy as np
df['log_petal_length'] = np.log(df['petal_length'] + 1)  # Add 1 to avoid log(0)

In [10]:
#Binning (Discretization)

df['petal_size'] = pd.cut(df['petal_length'],
                          bins=[0, 2, 4, 7],
                          labels=['Small', 'Medium', 'Large'])

In [11]:
#Polynomial Features

from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(df[['sepal_length', 'sepal_width']])

In [12]:
#Outlier Handling (IQR Method)

Q1 = df['sepal_width'].quantile(0.25)
Q3 = df['sepal_width'].quantile(0.75)
IQR = Q3 - Q1

df = df[(df['sepal_width'] >= Q1 - 1.5*IQR) & (df['sepal_width'] <= Q3 + 1.5*IQR)]

In [13]:
#Feature Selection (Correlation Method)

corr = df.corr(numeric_only=True)
important_features = corr['species_encoded'].abs().sort_values(ascending=False)
print(important_features)

species_encoded       1.000000
petal_width           0.955638
petal_length          0.947460
log_petal_length      0.942125
petal_area            0.941660
species_virginica     0.867483
species_setosa        0.862965
sepal_length          0.788053
sepal_width           0.405380
sepal_area            0.301668
species_versicolor    0.017923
Name: species_encoded, dtype: float64
