In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd

# Load the raw employee-promotion table from the shared data folder
# (the path is relative to this notebook's working directory).
csv_path = "../../Data/employee_promotion.csv"
df = pd.read_csv(csv_path)

# Report the (rows, columns) size, then preview the first rows as the cell output.
print("Original shape:", df.shape)
df.head()

Original shape: (54808, 13)


Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,awards_won,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,0,49.0,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,60.0,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,50.0,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,50.0,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,73.0,0


In [2]:
# Remove duplicates.
# drop_duplicates() with no subset compares whole rows, so only exact repeats
# across every column are removed.
print("\nRemoving duplicates...")
df = df.drop_duplicates()
print("Shape after removing duplicates:", df.shape)

# Null value analysis: share of missing values per column, as a percentage.
print("\nNull value analysis:")
print(df.columns)
null_percentages = df.isnull().mean() * 100
for col, null_percentage in null_percentages.items():
    print(f"{col}: {null_percentage:.2f}% null values")

# Define features and target.
# is_promoted is the label; employee_id and region are excluded from the
# feature set.
X = df.drop(columns=["employee_id", "is_promoted", "region"])
y = df["is_promoted"]

# Train/validation split BEFORE any preprocessing to avoid data leakage.
# NOTE(review): the target is imbalanced (roughly 8.5% positives in the
# recorded run); consider passing stratify=y so both splits keep the same
# class ratio. Left unchanged here so previously recorded metrics stay valid.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"\nTrain set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")
# The newline after the label puts the value_counts() table on its own lines.
print(f"Train set class distribution:\n{y_train.value_counts()}")
print(f"Validation set class distribution:\n{y_val.value_counts()}")


Removing duplicates...
Shape after removing duplicates: (54808, 13)

Null value analysis:
Index(['employee_id', 'department', 'region', 'education', 'gender',
       'recruitment_channel', 'no_of_trainings', 'age', 'previous_year_rating',
       'length_of_service', 'awards_won', 'avg_training_score', 'is_promoted'],
      dtype='object')
employee_id: 0.00% null values
department: 0.00% null values
region: 0.00% null values
education: 4.40% null values
gender: 0.00% null values
recruitment_channel: 0.00% null values
no_of_trainings: 0.00% null values
age: 0.00% null values
previous_year_rating: 7.52% null values
length_of_service: 0.00% null values
awards_won: 0.00% null values
avg_training_score: 4.67% null values
is_promoted: 0.00% null values

Train set shape: (43846, 10)
Validation set shape: (10962, 10)
Train set class distribution:\Datnis_promoted
0    40086
1     3760
Name: count, dtype: int64
Validation set class distribution:
is_promoted
0    10054
1      908
Name: count, dtype: int64

In [3]:
# Feature groups handled by the preprocessor: the first list is imputed and
# one-hot encoded, the second is imputed only.
categorical_cols = ["department", "education", "gender", "recruitment_channel"]
numerical_cols = [
    "no_of_trainings",
    "age",
    "previous_year_rating",
    "length_of_service",
    "awards_won",
    "avg_training_score",
]

# Imputation lives inside the pipelines, so its statistics are learned when
# the pipeline is fitted (on the training split) and reused afterwards.

# Numeric branch: fill gaps with the column median.
numerical_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median"))]
)

# Categorical branch: fill gaps with the mode, then one-hot encode while
# dropping the first level of each feature. Unknown categories raise an error.
mode_imputer = SimpleImputer(strategy="most_frequent")
one_hot = OneHotEncoder(drop="first", handle_unknown="error")
categorical_transformer = Pipeline(
    steps=[("imputer", mode_imputer), ("onehot", one_hot)]
)

# Route each column group to its branch; columns in neither list are dropped.
preprocessor = ColumnTransformer(
    [
        ("num", numerical_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ],
    remainder="drop",
)

print("Preprocessor created successfully!")

Preprocessor created successfully!


In [4]:


# Random forest hyper-parameters, gathered in one place so they are easy to
# read and tune. random_state fixes the seed; n_jobs=-1 uses all CPU cores.
forest_params = dict(
    n_estimators=120,
    max_depth=20,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features="sqrt",
    random_state=42,
    n_jobs=-1,
)

# Full pipeline: preprocessing followed by the classifier, so a single
# fit/predict call covers both stages.
model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("clf", RandomForestClassifier(**forest_params)),
    ]
)

# Fit preprocessing and classifier together on the training split only.
print("Training started...")
model.fit(X_train, y_train)
print("Training finished!")

Training started...


Training finished!


In [5]:
from sklearn.metrics import classification_report, confusion_matrix

# Validation results.
# The same imputation stats from training are applied to validation data.
# Predict once and reuse the predictions for every metric below; calling
# model.score(X_val, y_val) as well would repeat the same inference pass.
y_pred = model.predict(X_val)

# Accuracy = fraction of validation rows whose prediction equals the label,
# which is the quantity a classifier's score() reports.
val_score = float((y_pred == y_val).mean())
print(f"Validation accuracy: {val_score:.4f}")

# Per-class precision/recall/F1; more informative than accuracy on an
# imbalanced target.
print("\nClassification Report:")
print(classification_report(y_val, y_pred))

# Rows are true classes, columns are predicted classes.
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred))

Validation accuracy: 0.9379

Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97     10054
           1       0.85      0.31      0.45       908

    accuracy                           0.94     10962
   macro avg       0.89      0.65      0.71     10962
weighted avg       0.93      0.94      0.92     10962


Confusion Matrix:
[[10004    50]
 [  631   277]]


In [6]:
import joblib

# Persist the fitted pipeline (preprocessing + classifier) to a single file
# in the current working directory.
model_path = "hr_promotion_model_full_pipeline.pkl"
joblib.dump(model, model_path)

print("Full model pipeline saved successfully!")

Full model pipeline saved successfully!
