In [6]:
import pandas as pd

# Read the dataset from the CSV file in the working directory
df = pd.read_csv('iris.csv')

# Keep the four measurement columns together with the Species column
measurement_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
selected_columns = measurement_columns + ['Species']
df_selected = df[selected_columns]

# Show a heading followed by the first five rows of the selection
print("Selected Features:", df_selected.head(), sep="\n")


Selected Features:
   SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0            5.1           3.5            1.4           0.2  Iris-setosa
1            4.9           3.0            1.4           0.2  Iris-setosa
2            4.7           3.2            1.3           0.2  Iris-setosa
3            4.6           3.1            1.5           0.2  Iris-setosa
4            5.0           3.6            1.4           0.2  Iris-setosa


In [7]:
# Count the null entries in each selected column
missing_counts = df_selected.isna().sum()
print("Missing Values:\n", missing_counts)

# Build a new frame that excludes every row containing a null
df_cleaned = df_selected.dropna()

# Show a heading followed by the first five remaining rows
print("Data after Dropping Missing Values:", df_cleaned.head(), sep="\n")


Missing Values:
 SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64
Data after Dropping Missing Values:
   SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0            5.1           3.5            1.4           0.2  Iris-setosa
1            4.9           3.0            1.4           0.2  Iris-setosa
2            4.7           3.2            1.3           0.2  Iris-setosa
3            4.6           3.1            1.5           0.2  Iris-setosa
4            5.0           3.6            1.4           0.2  Iris-setosa


In [8]:
from sklearn.preprocessing import KBinsDiscretizer

# Apply discretization on Sepal Length: 3 bins, strategy='uniform',
# encode='ordinal' (one bin code per row rather than one-hot columns).
kbins = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
sepal_length_bins = kbins.fit_transform(df_cleaned[['SepalLengthCm']])

# fit_transform is given a single-column frame and returns a 2-D
# (n_rows, 1) array, so flatten it to one value per row.
#
# Bind a new frame with .assign() instead of writing the column into
# df_cleaned in place: df_cleaned is derived from a column selection followed
# by dropna(), and item assignment on such a derived frame can raise
# SettingWithCopyWarning on pandas versions without copy-on-write.
# The name df_cleaned is kept so later cells that read it still see the
# SepalLengthBinned column.
df_cleaned = df_cleaned.assign(SepalLengthBinned=sepal_length_bins.ravel())

print("Data after Sepal Length Discretization:")
print(df_cleaned[['SepalLengthCm', 'SepalLengthBinned']].head())


Data after Sepal Length Discretization:
   SepalLengthCm  SepalLengthBinned
0            5.1                0.0
1            4.9                0.0
2            4.7                0.0
3            4.6                0.0
4            5.0                0.0


In [None]:
import numpy as np
from scipy import stats

# Absolute Z-scores of the four measurement columns
numeric_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
z_scores = np.abs(stats.zscore(df_cleaned[numeric_columns]))

# Keep only the rows whose absolute Z-score is below 3 in every one of those
# columns; a row with any value at or above 3 is treated as an outlier.
within_threshold = z_scores < 3
df_no_outliers = df_cleaned[within_threshold.all(axis=1)]

# Show a heading followed by the first five retained rows
print("Data after Eliminating Outliers:", df_no_outliers.head(), sep="\n")
