In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [2]:
# Load the raw dataset from a CSV file located next to the notebook.
data = pd.read_csv(filepath_or_buffer='Iris.csv')

In [3]:
# Plain-text preview of the first five rows to sanity-check the load.
print(data.head(n=5))


   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa


In [5]:
# List the column labels so the target column can be picked by name.
print(
    "Column names in the dataset:",
    data.columns,
)


Column names in the dataset: Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')


In [7]:
# Name of the label column; every other column in `data` is treated as a feature.
target: str = 'Species'

In [10]:
# Separate the predictors (X) from the label (y).
# The label column name is stored in `target` (assigned in an earlier cell).
# The previous name `target_column` was never defined, so this cell raised
# NameError on a fresh kernel (Restart & Run All).
# NOTE(review): 'Id' looks like a row identifier and is kept in X, so it is
# scaled as a numeric feature downstream — consider dropping it; confirm intent.
X = data.drop(columns=[target])
y = data[target]

In [11]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Numeric branch: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Columns routed to the numeric branch: every int64 / float64 column of X.
numeric_features = X.select_dtypes(
    include=['int64', 'float64'],
).columns

In [13]:
# Categorical branch: replace missing values with the literal 'missing', then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Columns routed to the categorical branch: every object-dtype column of X.
# NOTE(review): in the preview above the only text column is the label, which
# is not part of X — this selection is presumably empty for this dataset.
categorical_features = X.select_dtypes(
    include=['object'],
).columns


In [14]:
# Route each group of columns through its own preprocessing pipeline and
# concatenate the results into a single feature matrix.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [15]:
# Learn the imputation / scaling / encoding parameters from the training rows
# only, and apply them to those same rows.
X_train_preprocessed = preprocessor.fit_transform(X=X_train)

In [16]:
# Apply the already-fitted preprocessing to the held-out rows (no refitting).
X_test_preprocessed = preprocessor.transform(X=X_test)


In [17]:
# Print a heading followed by the transformed training matrix on the next line.
print('Preprocessed training data:', X_train_preprocessed, sep='\n')

Preprocessed training data:
[[-1.21030717 -1.47393679  1.22037928 -1.5639872  -1.30948358]
 [-1.37240188 -0.13307079  3.02001693 -1.27728011 -1.04292204]
 [-0.21458252  1.08589829  0.09560575  0.38562104  0.28988568]
 [-1.46502743 -1.23014297  0.77046987 -1.21993869 -1.30948358]
 [-0.74717943 -1.7177306   0.32056046 -1.39196294 -1.30948358]
 [ 1.66108484  0.59831066 -1.25412249  0.72966956  0.95628954]
 [-0.53877194  0.72020757  0.32056046  0.44296246  0.42316645]
 [-1.09452523 -0.74255534  0.99542457 -1.27728011 -1.30948358]
 [-1.62712214 -0.98634915  1.22037928 -1.33462153 -1.30948358]
 [-0.9787433  -0.74255534  2.34515281 -1.27728011 -1.44276436]
 [ 1.56845929 -0.01117388 -0.80421307  0.78701097  0.95628954]
 [ 0.24854522  0.23261993  0.77046987  0.44296246  0.55644722]
 [ 0.27170161  1.08589829  0.09560575  0.5576453   0.42316645]
 [-1.34924549 -0.49876152  1.8952434  -1.39196294 -1.04292204]
 [-1.48818382 -0.49876152  1.44533399 -1.27728011 -1.30948358]
 [ 0.15591967 -0.37686461 -