In [123]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import classification_report, accuracy_score

In [124]:
df = pd.read_csv('hr_data.csv') 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8995 entries, 0 to 8994
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   SLNO                         8995 non-null   int64  
 1   Candidate Ref                8995 non-null   int64  
 2   DOJ Extended                 8995 non-null   object 
 3   Duration to accept offer     8995 non-null   int64  
 4   Notice period                8995 non-null   int64  
 5   Offered band                 8995 non-null   object 
 6   Pecent hike expected in CTC  8995 non-null   float64
 7   Percent hike offered in CTC  8995 non-null   float64
 8   Percent difference CTC       8995 non-null   float64
 9   Joining Bonus                8995 non-null   object 
 10  Candidate relocate actual    8995 non-null   object 
 11  Gender                       8995 non-null   object 
 12  Candidate Source             8995 non-null   object 
 13  Rex in Yrs        

In [125]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from statsmodels.stats.outliers_influence import variance_inflation_factor

# drop irrelevant columns: 'SLNO', 'Candidate Ref'
df = df.drop(columns=['SLNO', 'Candidate Ref'])
df = pd.get_dummies(df, drop_first=True)
status_column = [col for col in df.columns if 'Status' in col][0] 
df['Status'] = df[status_column]  
df = df.drop(columns=[status_column])
X = df.drop(columns='Status')  # independent variables
y = df['Status']               # dependent variable

def calculate_vif(X):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_scaled_const = np.column_stack((np.ones(X_scaled.shape[0]), X_scaled))
    vif_data = pd.DataFrame()
    vif_data["Feature"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X_scaled_const, i + 1) for i in range(len(X.columns))]
    return vif_data

def remove_high_vif(X, threshold=10):
    while True:
        vif_data = calculate_vif(X)
        max_vif = vif_data["VIF"].max()
        if max_vif > threshold:
            feature_to_remove = vif_data.loc[vif_data["VIF"].idxmax(), "Feature"]
            print(f"Removing {feature_to_remove} with VIF {max_vif:.2f}")
            X = X.drop(columns=[feature_to_remove])
        else:
            break
    return X

X_reduced = remove_high_vif(X)

print("\nFinal VIF values:")
print(calculate_vif(X_reduced))

X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

logreg = LogisticRegression(max_iter=5000, solver='liblinear')
logreg.fit(X_train_scaled, y_train)

y_pred = logreg.predict(X_test_scaled)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy After Removing columns with high VIF: {accuracy * 100:.2f}%")

Removing Location_Chennai with VIF 343.56
Removing Percent hike offered in CTC with VIF 25.11
Removing Offered band_E2 with VIF 12.90

Final VIF values:
                               Feature       VIF
0             Duration to accept offer  1.332659
1                        Notice period  1.232006
2          Pecent hike expected in CTC  1.111195
3               Percent difference CTC  1.051662
4                           Rex in Yrs  2.769042
5                                  Age  1.535542
6                     DOJ Extended_Yes  1.210812
7                      Offered band_E1  2.118204
8                      Offered band_E3  1.383070
9                    Joining Bonus_Yes  1.077254
10       Candidate relocate actual_Yes  1.063140
11                         Gender_Male  1.032725
12             Candidate Source_Direct  1.472196
13  Candidate Source_Employee Referral  1.453005
14                            LOB_BFSI  3.551174
15                            LOB_CSMP  2.010616
16            

In [126]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.3, random_state=42)


models = {
    'SVM': (SVC(), {
        'C': [1, 10],
        'kernel': ['rbf']
    }),
    'kNN': (KNeighborsClassifier(), {
        'n_neighbors': [3, 5],
        'weights': ['uniform', 'distance']
    }),
    'Random Forest': (RandomForestClassifier(), {
        'n_estimators': [100],
        'max_depth': [None, 10]
    })
}

# Perform grid search for each model
best_models = {}
for name, (model, param_grid) in models.items():
    grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_models[name] = grid_search

    print(f"\n{name} Best Parameters:")
    print(grid_search.best_params_)

    y_pred = grid_search.predict(X_test)
    print(f"{name} Test Accuracy:")
    print(f"{accuracy_score(y_test, y_pred):.4f}")

# Identify the best overall model
best_model_name = max(best_models, key=lambda name: best_models[name].best_score_)
best_model = best_models[best_model_name]

print("\nBest Overall Model:")
print(f"Model: {best_model_name}")
print("Best Parameters:")
print(best_model.best_params_)
print("Best Cross-Validation Score:")
print(f"{best_model.best_score_:.4f}")

# Final evaluation on test set
y_pred = best_model.predict(X_test)
print("Test Accuracy:")
print(f"{accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))


SVM Best Parameters:
{'C': 10, 'kernel': 'rbf'}
SVM Test Accuracy:
0.8114

kNN Best Parameters:
{'n_neighbors': 5, 'weights': 'uniform'}
kNN Test Accuracy:
0.7944

Random Forest Best Parameters:
{'max_depth': 10, 'n_estimators': 100}
Random Forest Test Accuracy:
0.8125

Best Overall Model:
Model: Random Forest
Best Parameters:
{'max_depth': 10, 'n_estimators': 100}
Best Cross-Validation Score:
0.8220
Test Accuracy:
0.8125
Classification Report:
              precision    recall  f1-score   support

       False       0.81      1.00      0.90      2181
        True       0.83      0.03      0.06       518

    accuracy                           0.81      2699
   macro avg       0.82      0.51      0.48      2699
weighted avg       0.82      0.81      0.73      2699



## Applying Standardization and removing outliers (detected using Cook's distance and Leverage value)

In [127]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_scaled_with_const = sm.add_constant(X_scaled)

# fit a model to get leverage values
model = sm.OLS(y, X_scaled_with_const).fit()
influence = model.get_influence()

# calculate Cook's distance
cooks_d = influence.cooks_distance[0]
leverage = influence.hat_matrix_diag


cooks_threshold = 4 / len(X_scaled)  # Common threshold
leverage_threshold = 3 * (X_scaled.shape[1] / len(X_scaled))  # Leverage threshold

# Identify outliers
outliers_cooks = np.where(cooks_d > cooks_threshold)[0]
outliers_leverage = np.where(leverage > leverage_threshold)[0]
outliers = set(outliers_cooks) | set(outliers_leverage)

# remove outliers
X_clean = np.delete(X_scaled, list(outliers), axis=0)
y_clean = np.delete(y, list(outliers), axis=0)


X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, test_size=0.3, random_state=42)

models = {
    'SVM': (SVC(), {
        'C': [1, 10],
        'kernel': ['rbf']
    }),
    'kNN': (KNeighborsClassifier(), {
        'n_neighbors': [3, 5],
        'weights': ['uniform', 'distance']
    }),
    'Random Forest': (RandomForestClassifier(), {
        'n_estimators': [100],
        'max_depth': [None, 10]
    })
}

# Perform grid search for each model
best_models = {}
for name, (model, param_grid) in models.items():
    grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_models[name] = grid_search

    print(f"\n{name} Best Parameters:")
    print(grid_search.best_params_)

    y_pred = grid_search.predict(X_test)
    print(f"{name} Test Accuracy:")
    print(f"{accuracy_score(y_test, y_pred):.4f}")

# Identify the best overall model
best_model_name = max(best_models, key=lambda name: best_models[name].best_score_)
best_model = best_models[best_model_name]

print("\nBest Overall Model:")
print(f"Model: {best_model_name}")
print("Best Parameters:")
print(best_model.best_params_)
print("Best Cross-Validation Score:")
print(f"{best_model.best_score_:.4f}")

# Final evaluation on test set
y_pred = best_model.predict(X_test)
print("Test Accuracy:")
print(f"{accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))



SVM Best Parameters:
{'C': 1, 'kernel': 'rbf'}
SVM Test Accuracy:
0.8541

kNN Best Parameters:
{'n_neighbors': 5, 'weights': 'uniform'}
kNN Test Accuracy:
0.8386

Random Forest Best Parameters:
{'max_depth': None, 'n_estimators': 100}
Random Forest Test Accuracy:
0.8577

Best Overall Model:
Model: Random Forest
Best Parameters:
{'max_depth': None, 'n_estimators': 100}
Best Cross-Validation Score:
0.8564
Test Accuracy:
0.8577
Classification Report:
              precision    recall  f1-score   support

       False       0.87      0.98      0.92      2149
        True       0.59      0.13      0.21       373

    accuracy                           0.86      2522
   macro avg       0.73      0.56      0.56      2522
weighted avg       0.83      0.86      0.82      2522



## After removing outliers and applying standardization accuracy of all models increased:
1. **SVM from 81.14% to 85.41**
2. **kNN from 79.44% to 83.86%**
3. **Random Forest from 81.25% to 85.77%**

### However Random Forest is still the best model