In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import os

# ----------------------------------------------------------
# 1. Load dataset
# ----------------------------------------------------------
df = pd.read_csv("../06_ml/HR-Employee-Attrition.csv")

# Encode the target column: "Yes" -> 1, "No" -> 0.
# Series.map() silently converts any label that is missing from the dict
# into NaN, so check for unmapped rows and fail loudly instead of carrying
# a corrupted target into the saved feature file.
attrition_codes = {"Yes": 1, "No": 0}
attrition_encoded = df["Attrition"].map(attrition_codes)
unmapped_rows = attrition_encoded.isna()
if unmapped_rows.any():
    bad_labels = df.loc[unmapped_rows, "Attrition"].unique().tolist()
    raise ValueError(
        f"Attrition contains values other than 'Yes'/'No': {bad_labels}"
    )
df["Attrition"] = attrition_encoded

print("Shape:", df.shape)
df.head()


Shape: (1470, 35)


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,0,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,1,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,0,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,0,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [7]:
# ----------------------------------------------------------
# 2. Separate features and target
# ----------------------------------------------------------

X = df.drop("Attrition", axis=1)
y = df["Attrition"]

# Drop ID or constant columns (non-informative)
# NOTE(review): "Over18" may also be constant in this dataset — confirm with
# X["Over18"].nunique() and move it to drop_cols if so.
drop_cols = ["EmployeeCount", "EmployeeNumber", "StandardHours"]
X = X.drop(columns=drop_cols)

# Define categorical columns (text)
categorical_features = [
    "BusinessTravel", "Department", "EducationField", "Gender",
    "JobRole", "MaritalStatus", "Over18", "OverTime"
]

# Define ordinal / numeric columns (treated as numeric)
ordinal_numeric = [
    "Education", "EnvironmentSatisfaction", "JobInvolvement", "JobLevel",
    "JobSatisfaction", "PerformanceRating", "RelationshipSatisfaction",
    "StockOptionLevel", "WorkLifeBalance"
]

# Define continuous numeric columns
continuous_numeric = [
    "Age", "DailyRate", "DistanceFromHome", "HourlyRate", "MonthlyIncome",
    "MonthlyRate", "NumCompaniesWorked", "PercentSalaryHike", "TotalWorkingYears",
    "TrainingTimesLastYear", "YearsAtCompany", "YearsInCurrentRole",
    "YearsSinceLastPromotion", "YearsWithCurrManager"
]

# Combine both numeric sets for preprocessing
numeric_features = ordinal_numeric + continuous_numeric

# Sanity check: the lists above are maintained by hand, and a
# ColumnTransformer with the default remainder="drop" silently discards any
# column that is not listed. Verify that the lists and X agree exactly so a
# feature cannot vanish (or a typo slip through) unnoticed.
assigned_columns = set(numeric_features) | set(categorical_features)
unassigned_columns = sorted(set(X.columns) - assigned_columns)
absent_columns = sorted(assigned_columns - set(X.columns))
if unassigned_columns or absent_columns:
    raise ValueError(
        "Feature lists do not match X: "
        f"not assigned to any list = {unassigned_columns}, "
        f"listed but not in X = {absent_columns}"
    )

print("Numeric columns:", numeric_features)
print("Categorical columns:", categorical_features)


Numeric columns: ['Education', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'WorkLifeBalance', 'Age', 'DailyRate', 'DistanceFromHome', 'HourlyRate', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'TotalWorkingYears', 'TrainingTimesLastYear', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']
Categorical columns: ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'Over18', 'OverTime']


In [8]:
# ----------------------------------------------------------
# 3. Create preprocessing steps
# ----------------------------------------------------------

# Numeric branch: impute gaps with the column median, then standardise.
# (MinMaxScaler() could be swapped in for the "scaler" step.)
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# Categorical branch: impute gaps with the most frequent value, then
# one-hot encode. Categories not seen during fit are ignored at transform
# time rather than raising.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(categorical_steps)

# Route each column list to its branch. The branch names ("num", "cat") and
# step names are looked up again when the encoded feature names are built.
column_branches = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(column_branches)


In [13]:
# ----------------------------------------------------------
# 4. Apply transformations
# ----------------------------------------------------------
# Fit every preprocessing step on X and transform X in the same call.
# NOTE(review): the fit uses all rows of X. If a train/test split happens
# downstream, the imputer/scaler statistics should come from the training
# rows only to avoid leakage — confirm against the modelling code.
X_processed = preprocessor.fit_transform(X)

processed_shape = X_processed.shape
print("Transformed feature array shape:", processed_shape)



Transformed feature array shape: (1470, 52)


In [11]:
# ----------------------------------------------------------
# 5. Save processed data
# ----------------------------------------------------------
os.makedirs("../data/processed", exist_ok=True)

# Column labels: the numeric names first, then the one-hot names reported by
# the fitted encoder — the same order in which the branches were registered
# on the ColumnTransformer.
fitted_encoder = preprocessor.named_transformers_["cat"]["encoder"]
encoded_cols = fitted_encoder.get_feature_names_out(categorical_features)
all_feature_names = np.concatenate([numeric_features, encoded_cols])

# The transformed matrix may expose .toarray() (e.g. a sparse result);
# convert it to a dense array in that case, otherwise use it as-is.
if hasattr(X_processed, "toarray"):
    dense_features = X_processed.toarray()
else:
    dense_features = X_processed
processed_df = pd.DataFrame(dense_features, columns=all_feature_names)

# Re-attach the target (positionally, via .values) and write everything out.
processed_df["Attrition"] = y.values
processed_df.to_csv("../data/processed/features.csv", index=False)

print("Saved cleaned dataset → data/processed/features.csv")


Saved cleaned dataset → data/processed/features.csv


### Preprocessing Improvements 

1. **Imputation (Median & Most Frequent):**  
   Filled missing numeric values with the column median and missing categorical values with the most frequent category, so no rows have to be dropped and downstream models do not fail on missing values.

2. **Scaling with StandardScaler:**  
   Standardized all numeric variables (e.g., Age, MonthlyIncome, DistanceFromHome) to zero mean and unit variance, preventing features with large ranges from dominating model training.

3. **Encoding with OneHotEncoder:**  
   Converted categorical text fields (e.g., Department, JobRole, Gender) into machine-readable binary columns.  
   This avoids imposing an artificial order on the categories (as integer label encoding would) and keeps categorical comparisons consistent.

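4. **ColumnTransformer Pipeline:**  
   Unified all preprocessing steps (imputation, scaling, encoding) into one reproducible workflow, so the same fitted transformations can be applied consistently to any future dataset. Note that the preprocessor here is fitted on the full dataset; when evaluating a model, fit it on the training split only to avoid data leakage.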

These preprocessing steps improved dataset consistency, comparability across features, and readiness for model training.
