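# ML Pipeline with Categorical and Numerical Data

This notebook builds a scikit-learn `Pipeline` on a small toy dataset: the numeric columns (`Age`, `Salary`) are standardized, the categorical column (`Gender`) is one-hot encoded, and a logistic regression classifier is trained to predict `Purchased`.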

## 1. Import Libraries

In [1]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


## 2. Create Dataset

In [3]:

# Toy records: two numeric features (Age, Salary), one categorical feature
# (Gender) and a binary target (Purchased). Keyword order fixes column order.
data = dict(
    Age=[22, 25, 47, 52, 46, 56, 23, 40],
    Salary=[15000, 29000, 48000, 52000, 46000, 58000, 18000, 41000],
    Gender=["Male", "Female", "Female", "Male", "Male", "Female", "Female", "Male"],
    Purchased=[0, 0, 1, 1, 1, 1, 0, 1],
)

df = pd.DataFrame(data=data)
# Bare name as the last expression so the notebook renders the frame.
df


Unnamed: 0,Age,Salary,Gender,Purchased
0,22,15000,Male,0
1,25,29000,Female,0
2,47,48000,Female,1
3,52,52000,Male,1
4,46,46000,Male,1
5,56,58000,Female,1
6,23,18000,Female,0
7,40,41000,Male,1


## 3. Split Features and Target

In [None]:

# Separate the label column from the predictors; df itself is left unmodified.
y = df["Purchased"]
X = df.drop(columns="Purchased")


## 4. Train-Test Split

In [None]:

# Hold out 25% of the rows (2 of 8) for evaluation. Stratifying on y keeps both
# classes represented in the train and test splits, which a plain shuffle does
# not guarantee on a dataset this small. random_state pins the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)


## 5. Preprocessing

In [None]:

# Column groups, routed to different transformers below.
cat_features = ["Gender"]
num_features = ["Age", "Salary"]

# Numeric columns are standardized; the categorical column is one-hot encoded
# with its first category dropped (drop="first").
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), num_features),
        ("cat", OneHotEncoder(drop="first"), cat_features),
    ]
)


## 6. Build Pipeline

In [None]:

# Chain preprocessing and the classifier so both are fitted and applied as one
# estimator.
pipeline = Pipeline(
    [
        ("preprocessing", preprocessor),
        ("model", LogisticRegression()),
    ]
)


## 7. Train Model

In [None]:

# Fit the whole pipeline (preprocessing + model) on the training split only.
# Left as a bare expression so the notebook displays the fitted estimator.
pipeline.fit(X_train, y_train)


## 8. Predictions

In [None]:

# Predict labels for the held-out rows; X_test passes through the same fitted
# pipeline before reaching the model.
y_pred = pipeline.predict(X_test)
# Bare name as the last expression so the notebook displays the predictions.
y_pred


## 9. Evaluation

In [None]:

# The test split is tiny, so a class can be missing from y_test or y_pred.
# - labels=pipeline.classes_ keeps the confusion matrix the same shape (one
#   row/column per fitted class) regardless of which labels appear.
# - zero_division=0 reports an undefined precision/recall as 0 without
#   emitting UndefinedMetricWarning (same value the default would report).
print("Accuracy:", accuracy_score(y_test, y_pred))
print(
    "Confusion Matrix:\n",
    confusion_matrix(y_test, y_pred, labels=pipeline.classes_),
)
print(
    "Classification Report:\n",
    classification_report(y_test, y_pred, zero_division=0),
)
