In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
print("Libraries imported successfully!")

Libraries imported successfully!


In [None]:
# Read the expected-CTC CSV (the path lives under /content/drive/MyDrive) into a DataFrame.
df = pd.read_csv('/content/drive/MyDrive/CAPSTONE PROJECT/expected_ctc.csv')

# Unpack the shape once instead of indexing it twice.
n_rows, n_cols = df.shape

print("Dataset loaded successfully!")
print("Rows:", n_rows)
print("Columns:", n_cols)

# Last expression of the cell: preview of the first rows.
df.head()

In [None]:
# Print a heading, then the frame's summary. DataFrame.info() writes its
# report to stdout itself, so it is called bare rather than wrapped in print().
print("Dataset Info:")
df.info()

In [None]:
print("\nSummary Statistics:")

# Descriptive statistics, transposed so that each described column becomes a row.
summary_stats = df.describe().transpose()
display(summary_stats)

In [None]:
print("\nMissing Values:")

# Count missing entries per column and list the ten columns with the most gaps.
missing_per_column = df.isna().sum()
worst_first = missing_per_column.sort_values(ascending=False)
print(worst_first.head(10))

In [None]:
# Remove exact duplicate rows (the original index labels are kept).
df = df.drop_duplicates()

target = 'Expected_CTC'

# Rows without a target cannot serve as training labels or as evaluation
# ground truth. Drop them here; otherwise the median imputation below would
# silently invent a target value for them.
df = df.dropna(subset=[target])

# log1p-transformed copy of the target, stored alongside the raw column.
df['Expected_CTC_log'] = np.log1p(df[target])

num_cols = df.select_dtypes(include=['number']).columns
cat_cols = df.select_dtypes(include=['object']).columns

# Numeric gaps -> column median; categorical gaps -> the literal 'Unknown'.
# NOTE(review): the medians are computed on the full frame, i.e. before the
# train/test split performed later in the notebook, so held-out rows influence
# the imputed values. Consider fitting the imputation on the training rows only.
df[num_cols] = df[num_cols].fillna(df[num_cols].median())
df[cat_cols] = df[cat_cols].fillna('Unknown')

print("Data cleaning completed!")

In [None]:
# Experience_Gap: total experience minus experience in the applied field.
# Only created when both source columns exist.
if 'Total_Experience' in df.columns and 'Total_Experience_in_field_applied' in df.columns:
    df['Experience_Gap'] = df['Total_Experience'] - df['Total_Experience_in_field_applied']

# Inhand_Offer_Flag: 1 = yes, 0 = no, -1 = anything else.
# A plain .map({'Yes': 1, 'No': 0}) turns every unlisted value (for example the
# 'Unknown' placeholder, or 'Y' / 'no' spelling variants) into NaN, which then
# flows into the feature matrix. Normalise case/whitespace, accept the short
# Y/N forms, and use an explicit sentinel for everything unrecognised.
if 'Inhand_Offer' in df.columns:
    offer_codes = {'yes': 1, 'y': 1, 'no': 0, 'n': 0}
    offer_normalized = df['Inhand_Offer'].astype(str).str.strip().str.lower()
    df['Inhand_Offer_Flag'] = offer_normalized.map(offer_codes).fillna(-1).astype(int)

print("Feature engineering completed!")

In [None]:
# Every object-dtype column still in the frame is replaced by integer codes.
cat_cols = df.select_dtypes(include=['object']).columns

# Keep each fitted encoder so the integer codes can be translated back to the
# original labels later via encoders[col].inverse_transform(...).
encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    # Cast to str first: LabelEncoder raises on columns that mix strings and numbers.
    df[col] = le.fit_transform(df[col].astype(str))
    encoders[col] = le

print("Categorical features encoded successfully!")

In [None]:
# Label: the log-scale target. Features: every column except the two target columns.
y = df['Expected_CTC_log']
X = df.drop(columns=[target, 'Expected_CTC_log'])

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("Train-Test split completed!")
print("Training samples:", len(X_train))
print("Testing samples :", len(X_test))

In [None]:
# Random forest fitted on the log-scale target; fixed seed, all CPU cores.
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=20,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1,
)
model.fit(X_train, y_train)

# Undo the log1p transform so the metrics are expressed in the target's original units.
y_pred_log = model.predict(X_test)
y_pred = np.expm1(y_pred_log)
y_true = np.expm1(y_test)

mae = mean_absolute_error(y_true, y_pred)
# mean_squared_error returns the MSE; take the square root so the value
# matches the "RMSE" label it is printed under.
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
r2 = r2_score(y_true, y_pred)

print("Model training completed!")
print(f"MAE : {mae:,.2f}")
print(f"RMSE: {rmse:,.2f}")
print(f"R²  : {r2:.3f}")

In [None]:

# `imp` was never defined in the notebook, so this cell failed on a fresh kernel.
# Build it here: the fitted forest's feature importances, labelled with the
# training columns and ranked from most to least important.
imp = pd.Series(model.feature_importances_, index=X_train.columns).sort_values(ascending=False)
top_imp = imp.head(15)

plt.figure(figsize=(10,6))
sns.barplot(x=top_imp, y=top_imp.index)
plt.title("Top 15 Important Features")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.show()

In [None]:
# Only audit the candidate attributes that are actually present in the frame.
fairness_features = [
    name for name in ['Gender', 'Department', 'Education'] if name in df.columns
]

if not fairness_features:
    print("No categorical features found for fairness check.")
else:
    # Held-out rows plus actual and predicted CTC, both mapped back from the
    # log scale with expm1.
    test_df = X_test.assign(
        Actual_CTC=np.expm1(y_test),
        Predicted_CTC=np.expm1(model.predict(X_test)),
    )

    for feature in fairness_features:
        # Refresh the grouping column from df, aligned on the test-row index.
        test_df[feature] = df.loc[X_test.index, feature]

        # Per-group means; Difference is predicted minus actual, so a positive
        # value means the group is over-predicted on average.
        group_means = test_df.groupby(feature)[['Actual_CTC', 'Predicted_CTC']].mean()
        fairness = group_means.assign(
            Difference=group_means['Predicted_CTC'] - group_means['Actual_CTC']
        )

        print(f"\nFairness Check by {feature}:")
        print(fairness.sort_values('Difference', ascending=False))
