In [1]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Load the Iris dataset
iris = load_iris()
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)


In [3]:
# Display the first few rows of the dataframe
print("Original Data:")
print(iris_df.head())

Original Data:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

  species  
0  setosa  
1  setosa  
2  setosa  
3  setosa  
4  setosa  


In [4]:
# Encode the categorical feature (species)
label_encoder = LabelEncoder()
iris_df['species'] = label_encoder.fit_transform(iris_df['species'])

In [5]:
# Display the dataframe after encoding the species
print("\nData after encoding the species:")
print(iris_df.head())


Data after encoding the species:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   species  
0        0  
1        0  
2        0  
3        0  
4        0  


In [6]:
# Standardize the numerical features
scaler = StandardScaler()
iris_df[iris.feature_names] = scaler.fit_transform(iris_df[iris.feature_names])

In [7]:
# Display the dataframe after standardization
print("\nData after standardization:")
print(iris_df.head())


Data after standardization:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0          -0.900681          1.019004          -1.340227         -1.315444   
1          -1.143017         -0.131979          -1.340227         -1.315444   
2          -1.385353          0.328414          -1.397064         -1.315444   
3          -1.506521          0.098217          -1.283389         -1.315444   
4          -1.021849          1.249201          -1.340227         -1.315444   

   species  
0        0  
1        0  
2        0  
3        0  
4        0  


In [8]:
# Feature Engineering: Create additional features
iris_df['sepal_ratio'] = iris_df['sepal length (cm)'] / iris_df['sepal width (cm)']
iris_df['petal_ratio'] = iris_df['petal length (cm)'] / iris_df['petal width (cm)']

In [9]:
# Display the dataframe after feature engineering
print("\nData after feature engineering:")
print(iris_df.head())


Data after feature engineering:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0          -0.900681          1.019004          -1.340227         -1.315444   
1          -1.143017         -0.131979          -1.340227         -1.315444   
2          -1.385353          0.328414          -1.397064         -1.315444   
3          -1.506521          0.098217          -1.283389         -1.315444   
4          -1.021849          1.249201          -1.340227         -1.315444   

   species  sepal_ratio  petal_ratio  
0        0    -0.883884     1.018839  
1        0     8.660565     1.018839  
2        0    -4.218311     1.062047  
3        0   -15.338649     0.975632  
4        0    -0.818002     1.018839  


In [10]:
# Split the data into training and testing sets
X = iris_df.drop('species', axis=1)
y = iris_df['species']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
print("\nTraining features:")
print(X_train.head())
print("\nTraining labels:")
print(y_train.head())


Training features:
    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
22          -1.506521          1.249201          -1.567576         -1.315444   
15          -0.173674          3.090775          -1.283389         -1.052180   
65           1.038005          0.098217           0.364896          0.264142   
11          -1.264185          0.788808          -1.226552         -1.315444   
42          -1.748856          0.328414          -1.397064         -1.315444   

    sepal_ratio  petal_ratio  
22    -1.205987     1.191671  
15    -0.056191     1.219743  
65    10.568453     1.381440  
11    -1.602653     0.932424  
42    -5.325157     1.062047  

Training labels:
22    0
15    0
65    1
11    0
42    0
Name: species, dtype: int32


In [12]:
print("\nTesting features:")
print(X_test.head())
print("\nTesting labels:")
print(y_test.head())


Testing features:
     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
73            0.310998         -0.592373           0.535409          0.000878   
18           -0.173674          1.709595          -1.169714         -1.183812   
118           2.249683         -1.052767           1.785832          1.448832   
78            0.189830         -0.362176           0.421734          0.395774   
76            1.159173         -0.592373           0.592246          0.264142   

     sepal_ratio  petal_ratio  
73     -0.525003   610.118906  
18     -0.101588     0.988091  
118    -2.136925     1.232601  
78     -0.524136     1.065592  
76     -1.956829     2.242151  

Testing labels:
73     1
18     0
118    2
78     1
76     1
Name: species, dtype: int32
