# Import Libraries 

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris

# Import Data


In [2]:
# Fetch the bundled iris data from scikit-learn
iris = load_iris()

# Alternative source, if a CSV export is available:
# dataset = pd.read_csv('iris.csv')

# Assemble one DataFrame: the measurement columns named after the features,
# followed by the class label in a trailing 'target' column
dataset = (
    pd.DataFrame(iris.data, columns=iris.feature_names)
    .assign(target=iris.target)
)

# Preview the first rows
dataset.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
dataset.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [4]:
# Split the frame into the feature matrix (every column except the last) and
# the target vector (the last column).
# to_numpy(copy=True) is used instead of .values so that X and y are
# independent, writable arrays: a later cell assigns into X in place, which
# must neither fail on a read-only view nor silently alter `dataset`.
X = dataset.iloc[:, :-1].to_numpy(copy=True)  # Features
y = dataset.iloc[:, -1].to_numpy(copy=True)   # Target variable

print("Features (X):")
print(X[:5])  # Display first 5 rows of features
print("\nTarget (y):")
print(y[:5])  # Display first 5 target values

Features (X):
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]

Target (y):
[0 0 0 0 0]


# Missing Data

In [5]:
# Count the null entries found in every column of the dataset
missing_data = dataset.isnull().sum()
print("Missing data in each column:", missing_data, sep="\n")

Missing data in each column:
sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
dtype: int64


In [6]:
from sklearn.impute import SimpleImputer

# Replace NaN entries with the mean of their column
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Work on a private, writable copy of the feature matrix: X may be a view of
# the DataFrame's data (possibly read-only), and writing into it in place
# could either fail or change `dataset` behind the reader's back.
X = X.copy()

# Fit on the second and third feature columns (index 1 and 2) and write the
# imputed values back in a single pass (no discarded transform call).
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

# Display the dataset after handling missing values
# NOTE: the imputation is applied to X; `dataset` itself is not modified here.
print("Dataset after handling missing values:")
print(dataset.head())

Dataset after handling missing values:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  


# Encoding data

In [7]:
# Collect the labels of all columns stored with an object or category dtype
categorical_columns = dataset.select_dtypes(
    include=['object', 'category']
).columns
print("Categorical columns:", categorical_columns, sep="\n")

Categorical columns:
Index([], dtype='object')


In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Template: one-hot encode the columns listed in `categorical_columns` and
# pass every remaining column through unchanged.
column_transformer = ColumnTransformer(
    [('encoder', OneHotEncoder(), categorical_columns)],
    remainder='passthrough',
)

# The transformer is only constructed here; fitting is left disabled and is
# needed only when there are categorical columns to encode.
# NOTE(review): `categorical_columns` holds DataFrame column labels while X is
# a NumPy array -- confirm the selector type before enabling the line below.
# X_transformed = column_transformer.fit_transform(X)

In [9]:
# With no categorical columns there is nothing to encode, so the features can be used as they are.
# X_transformed = np.asarray(X)  # Placeholder: the "transformed" data is simply X

In [11]:
from sklearn.preprocessing import LabelEncoder

# Template: label-encode each categorical column that holds exactly two
# distinct values; every other column is skipped.
for column in categorical_columns:
    if dataset[column].nunique() != 2:
        continue
    dataset[column] = LabelEncoder().fit_transform(dataset[column])

# Show the first rows once the encoding pass has finished
print("Dataset after encoding categorical columns:", dataset.head(), sep="\n")

Dataset after encoding categorical columns:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  


# Splitting the Data

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preview the first five entries of each of the four resulting arrays
for heading, split in (
    ("Training set features (X_train):", X_train),
    ("\nTesting set features (X_test):", X_test),
    ("\nTraining set target (y_train):", y_train),
    ("\nTesting set target (y_test):", y_test),
):
    print(heading)
    print(split[:5])


Training set features (X_train):
[[4.6 3.6 1.  0.2]
 [5.7 4.4 1.5 0.4]
 [6.7 3.1 4.4 1.4]
 [4.8 3.4 1.6 0.2]
 [4.4 3.2 1.3 0.2]]

Testing set features (X_test):
[[6.1 2.8 4.7 1.2]
 [5.7 3.8 1.7 0.3]
 [7.7 2.6 6.9 2.3]
 [6.  2.9 4.5 1.5]
 [6.8 2.8 4.8 1.4]]

Training set target (y_train):
[0 0 1 0 0]

Testing set target (y_test):
[1 0 2 1 1]


# Scaling the Data

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits. Be careful to scale numerical features only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Preview the first five scaled rows of each split
print("Scaled training features (X_train_scaled):", X_train_scaled[:5], sep="\n")
print("\nScaled test features (X_test_scaled):", X_test_scaled[:5], sep="\n")

Scaled training features (X_train_scaled):
[[-1.47393679  1.20365799 -1.56253475 -1.31260282]
 [-0.13307079  2.99237573 -1.27600637 -1.04563275]
 [ 1.08589829  0.08570939  0.38585821  0.28921757]
 [-1.23014297  0.75647855 -1.2187007  -1.31260282]
 [-1.7177306   0.30929911 -1.39061772 -1.31260282]]

Scaled test features (X_test_scaled):
[[ 0.35451684 -0.58505976  0.55777524  0.02224751]
 [-0.13307079  1.65083742 -1.16139502 -1.17911778]
 [ 2.30486738 -1.0322392   1.8185001   1.49058286]
 [ 0.23261993 -0.36147005  0.44316389  0.4227026 ]
 [ 1.2077952  -0.58505976  0.61508092  0.28921757]]
