In [10]:
import pandas as pd

In [11]:
import numpy as np

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [14]:
from sklearn.impute import SimpleImputer

In [15]:
from sklearn.compose import ColumnTransformer

In [16]:
from sklearn.pipeline import Pipeline

In [17]:
def preprocess_data(file_path, target_column):
    """
    Loads, preprocesses, and splits the data from a CSV file.
    """
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
        sys.exit() # Exit the script if the file doesn't exist

    print("--- Original Raw Dataset (first 5 rows) ---")
    print(df.head())

    # --- Error Check: Verify the target column exists ---
    if target_column not in df.columns:
        print(f"\n--- ERROR ---")
        print(f"KeyError: The target column '{target_column}' was not found in the CSV file.")
        print("Please check for typos or extra spaces.")
        print("\nAvailable columns are:")
        print(list(df.columns))
        sys.exit() # Exit the script

    print("\nMissing values before processing:")
    print(df.isnull().sum())

    # Separate features (X) from the target variable (y)
    X = df.drop(target_column, axis=1)
    # This assumes the positive class is 'Yes'. Modify if your labels are different (e.g., 1, 'positive').
    y = df[target_column].apply(lambda x: 1 if str(x).lower() == 'yes' else 0)

    numerical_features = X.select_dtypes(include=np.number).columns.tolist()
    categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

    print(f"\nNumerical features identified: {numerical_features}")
    print(f"Categorical features identified: {categorical_features}")

    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='passthrough'
    )

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print(f"\nDataset split into {X_train.shape[0]} training and {X_test.shape[0]} testing samples.")

    # Fit the preprocessor and transform the training data
    X_train_processed = preprocessor.fit_transform(X_train)
    # Transform the test data
    X_test_processed = preprocessor.transform(X_test)

    # Get feature names after one-hot encoding for creating a readable DataFrame
    try:
        ohe_feature_names = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features)
        all_feature_names = numerical_features + list(ohe_feature_names)
    except Exception:
        # Fallback if there are no categorical features
        all_feature_names = numerical_features

    X_train_processed_df = pd.DataFrame(X_train_processed, columns=all_feature_names, index=X_train.index)
    X_test_processed_df = pd.DataFrame(X_test_processed, columns=all_feature_names, index=X_test.index)

    print("\n--- Processed Training Features (first 5 rows) ---")
    print(X_train_processed_df.head())

    print("\n--- Processed Testing Features ---")
    print(X_test_processed_df)

    print("\n--- Shape of Processed Datasets ---")
    print(f"Processed Training Features Shape: {X_train_processed_df.shape}")
    print(f"Processed Testing Features Shape:  {X_test_processed_df.shape}")
    print(f"Training Target Shape:             {y_train.shape}")
    print(f"Testing Target Shape:              {y_test.shape}")

    return X_train_processed_df, X_test_processed_df, y_train, y_test

In [18]:
if __name__ == '__main__':
    csv_file_name = 'data.csv'
    target_column_name = 'species'
    preprocess_data(file_path=csv_file_name, target_column=target_column_name)


--- Original Raw Dataset (first 5 rows) ---
   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa

Missing values before processing:
sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

Numerical features identified: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
Categorical features identified: []

Dataset split into 120 training and 30 testing samples.

--- Processed Training Features (first 5 rows) ---
    sepal_length  sepal_width  petal_length  petal_width
22     -1.473937     1.220379     -1.563987    -1.309484
15     -0.133071     3.020017     -1.277280    -1.042922
65      1.085898     0.