In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

print( "STEP 1: Extract" )
# Simulate raw data
data = {
    'CustomerID': [101, 102, 103, 104, 105],
    'Age': [25, 35, None, 45, 28],
    'Gender': ['Male', 'Female', 'Female', None, 'Male'],
    'Category': ['Electronics', 'Clothing', 'Clothing', 'Electronics', 'Furniture'],
    'Purchase': [300, 150, 200, 250, None]
}
df = pd.DataFrame(data)

print("🔹 Raw Data:")
print(df)

print("STEP 2: Transform")
# Features & Target
X = df.drop(['CustomerID', 'Purchase'], axis=1)
y = df['Purchase']

# Define column types
numeric_features = ['Age']
categorical_features = ['Gender', 'Category']

# Pipelines
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

print("STEP 3: Load")
# Fit and transform
X_clean = preprocessor.fit_transform(X)

print("\n✅ Transformed Feature Matrix (X):")
# Check if X_clean is a sparse matrix before calling toarray() (though in this case it's dense)
print(X_clean.toarray() if hasattr(X_clean, 'toarray') else X_clean)

# Handle target column (drop NA)
y_clean = y.fillna(y.mean())

print("\n✅ Target Vector (y):")
print(y_clean.values)

# Optional: Split and save
X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, test_size=0.2, random_state=42)

# Save transformed data
# X_train is already a dense numpy array, no need for toarray()
pd.DataFrame(X_train).to_csv("X_train.csv", index=False)
pd.DataFrame(y_train).to_csv("y_train.csv", index=False)

STEP 1: Extract
🔹 Raw Data:
   CustomerID   Age  Gender     Category  Purchase
0         101  25.0    Male  Electronics     300.0
1         102  35.0  Female     Clothing     150.0
2         103   NaN  Female     Clothing     200.0
3         104  45.0    None  Electronics     250.0
4         105  28.0    Male    Furniture       NaN
STEP 2: Transform
STEP 3: Load

✅ Transformed Feature Matrix (X):
[[-1.19893036  0.          1.          0.          0.          1.
   0.        ]
 [ 0.25431856  1.          0.          0.          1.          0.
   0.        ]
 [ 0.          1.          0.          0.          1.          0.
   0.        ]
 [ 1.70756748  0.          0.          1.          0.          1.
   0.        ]
 [-0.76295568  0.          1.          0.          0.          0.
   1.        ]]

✅ Target Vector (y):
[300. 150. 200. 250. 225.]
