In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
import os
import glob

In [9]:
def load_dataset(path: str) -> pd.DataFrame:
    """Read the CSV file at ``path`` into a DataFrame.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The parsed DataFrame, using pandas' default parsing options.
    """
    frame = pd.read_csv(path)
    return frame

In [10]:
def handle_missing_values(df: pd.DataFrame, strategy: str = 'mean') -> pd.DataFrame:
    """Fill or drop missing values according to ``strategy``.

    Args:
        df: Input frame. It is not modified.
        strategy: One of
            * ``'mean'``   - fill NaNs in numeric columns with the column mean,
            * ``'median'`` - fill NaNs in numeric columns with the column median,
            * ``'drop'``   - remove every row that contains a missing value.
            Any other value leaves the data untouched.

    Returns:
        A new DataFrame for the three known strategies; ``df`` itself for an
        unrecognised strategy.
    """
    if strategy == 'mean':
        # numeric_only=True: without it, pandas >= 2.0 raises TypeError as soon
        # as the frame contains a string/object column. Missing values in
        # non-numeric columns are intentionally left as they are.
        return df.fillna(df.mean(numeric_only=True))
    if strategy == 'median':
        return df.fillna(df.median(numeric_only=True))
    if strategy == 'drop':
        return df.dropna()
    # Unknown strategy: best-effort pass-through, kept for backward compatibility.
    return df

In [11]:
def scale_features(df: pd.DataFrame, method: str = 'standard') -> pd.DataFrame:
    """Return a copy of ``df`` whose numeric columns have been rescaled.

    Args:
        df: Input frame. It is not modified; the result is a new DataFrame.
        method: ``'standard'`` uses ``StandardScaler``; any other value uses
            ``MinMaxScaler``.

    Returns:
        A new DataFrame with the same columns. Numeric columns hold the scaled
        values, non-numeric columns are copied through unchanged. If there are
        no numeric columns or no rows, an unscaled copy is returned.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    # Work on a copy so the caller's frame is never silently overwritten.
    scaled = df.copy()

    # Nothing to fit on: the scalers raise on zero features or zero samples.
    if len(numeric_cols) == 0 or len(df) == 0:
        return scaled

    scaler = StandardScaler() if method == 'standard' else MinMaxScaler()
    scaled[numeric_cols] = scaler.fit_transform(df[numeric_cols])
    return scaled

In [12]:
def apply_pca(df: pd.DataFrame, n_components: int = 2) -> pd.DataFrame:
    """Return a copy of ``df`` with principal-component columns appended.

    PCA is fitted on all numeric columns of ``df``. The resulting components
    are stored in columns named ``PC1``, ``PC2``, ...; an existing column with
    one of those names is overwritten in the result.

    Args:
        df: Input frame. It is not modified; the result is a new DataFrame.
        n_components: Requested number of components. It is capped at
            ``min(number of rows, number of numeric columns)``.

    Returns:
        A new DataFrame containing the original columns plus the ``PC*``
        columns. If there are no rows or no numeric columns, an unchanged copy
        is returned.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    max_components = min(len(df), len(numeric_cols))

    # PCA cannot be fitted without at least one sample and one feature.
    if max_components < 1:
        return df.copy()

    n_components = min(n_components, max_components)

    pca = PCA(n_components=n_components)
    components = pca.fit_transform(df[numeric_cols])

    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    pc_columns = {f'PC{i+1}': components[:, i] for i in range(n_components)}
    return df.assign(**pc_columns)


In [13]:
def preprocess_file(input_path: str, output_path: str, n_pca_components: int = 2, scaling_method: str = 'standard', missing_strategy: str = 'mean') -> None:
    """Run the full preprocessing pipeline on one CSV file and save the result.

    Steps, in order: load the CSV, handle missing values, scale features,
    apply PCA, then write the resulting frame to ``output_path`` as CSV
    without the index.

    Args:
        input_path: CSV file to read.
        output_path: CSV file to write. Its parent directory is created if it
            does not exist yet.
        n_pca_components: Forwarded to ``apply_pca``.
        scaling_method: Forwarded to ``scale_features``.
        missing_strategy: Forwarded to ``handle_missing_values``.
    """
    df = load_dataset(input_path)
    df = handle_missing_values(df, missing_strategy)
    df = scale_features(df, scaling_method)
    df = apply_pca(df, n_pca_components)

    # A bare filename has an empty dirname, and os.makedirs('') raises
    # FileNotFoundError, so only create the directory when one is given.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output_path, index=False)

In [14]:
raw_folder = "../data/raw"
processed_folder = "../data/processed"
# glob.glob returns matches in arbitrary order; sort for reproducible runs.
file_paths = sorted(glob.glob(f"{raw_folder}/*.csv"))

for file_path in file_paths:
    # Keep the original file name so each output maps back to its raw input.
    filename = os.path.basename(file_path)
    output_path = os.path.join(processed_folder, filename)
    preprocess_file(file_path, output_path, n_pca_components=3, scaling_method='standard', missing_strategy='mean')

# Only claim success when something was actually processed.
if file_paths:
    print(f"✅ All files processed and saved in {processed_folder}")
else:
    print(f"⚠️ No CSV files found in {raw_folder}")


✅ All files processed and saved in ../data/processed
