In [15]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [16]:
# Notebook configuration: label column and location of the input CSV.
# NOTE(review): DATA_PATH is an absolute path on one machine; adjust before running elsewhere.
TARGET_COLUMN = 'Attrition_Stayed'
DATA_PATH = r'D:\Desktop\eff.csv'

# Read the full dataset into memory.
df = pd.read_csv(DATA_PATH)


In [17]:
# Remove the 'Employee ID' column before modelling.
# This cell rebinds `df`, so a second execution would find the column already
# gone; errors='ignore' makes the cell safe to re-run instead of raising KeyError.
df = df.drop(columns='Employee ID', errors='ignore')
print(f"Shape after dropping Employee ID: {df.shape}")

Shape after dropping Employee ID: (72629, 42)


In [18]:
# Tally the target classes: absolute counts first, then shares expressed in percent.
target_values = df[TARGET_COLUMN]
class_counts = target_values.value_counts()
class_distribution = target_values.value_counts(normalize=True).mul(100)

In [19]:
# Show the absolute and the percentage class frequencies computed above.
counts_report = f"Class Counts (0=Attrition/Left, 1=Stayed):\n{class_counts}"
distribution_report = f"Class Distribution (%):\n{class_distribution}"
print(counts_report)
print(distribution_report)

Class Counts (0=Attrition/Left, 1=Stayed):
Attrition_Stayed
1    37939
0    34690
Name: count, dtype: int64
Class Distribution (%):
Attrition_Stayed
1    52.23671
0    47.76329
Name: proportion, dtype: float64


In [20]:
# Flag the dataset as imbalanced when the largest class share exceeds 75 %.
# Later cells read `is_imbalanced` to pick class weights and the CV scoring metric.
is_imbalanced = class_distribution.max() > 75
print(
    "\nALERT: The data is highly imbalanced (Major Class > 75%)"
    if is_imbalanced
    else "\nData appears relatively balanced."
)


Data appears relatively balanced.


In [21]:
# Separate the label column from the predictors.
y = df[TARGET_COLUMN]
X = df.drop(columns=[TARGET_COLUMN])

# Hold out 30 % of the rows for testing; stratify on y so both parts keep the class mix.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Learn the scaling statistics from the training rows only, then apply them to both parts.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [22]:
# Request class re-weighting only when the balance check flagged the data.
lr_params = {'class_weight': 'balanced'} if is_imbalanced else {}
if lr_params:
    print("Initializing with 'class_weight = balanced' to handle imbalance")

# Fit on the scaled training split and predict the held-out split.
lr_model = LogisticRegression(random_state=42, solver='liblinear', **lr_params)
lr_model.fit(X_train_scaled, y_train)
y_pred_lr = lr_model.predict(X_test_scaled)

# Compute the evaluation artefacts once, then report them.
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_report = classification_report(y_test, y_pred_lr)
lr_confusion = confusion_matrix(y_test, y_pred_lr)

print("Logistic Regression Results:")
print(f" Accuracy Score: {lr_accuracy:.4f}")
print("Classification Report (Focus on Class 0/Attrition):\n", lr_report)
print("(Confusion Matrix):\n", lr_confusion)

Logistic Regression Results:
 Accuracy Score: 0.7534
Classification Report (Focus on Class 0/Attrition):
               precision    recall  f1-score   support

           0       0.74      0.74      0.74     10407
           1       0.76      0.77      0.76     11382

    accuracy                           0.75     21789
   macro avg       0.75      0.75      0.75     21789
weighted avg       0.75      0.75      0.75     21789

(Confusion Matrix):
 [[7702 2705]
 [2669 8713]]


In [23]:
# Choose k for KNN by 5-fold cross-validation on the training split.
#
# The scaler is placed inside the pipeline so it is re-fit on the training part
# of every fold. Cross-validating the pre-scaled matrix (X_train_scaled) would
# let each validation fold contribute to the scaling statistics, i.e. leak
# information into the score used for model selection.
k_range = range(1, 21)
k_scores = []
# F1 when the balance check flagged the data, plain accuracy otherwise.
scoring_metric = 'f1' if is_imbalanced else 'accuracy'
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn_pipeline = make_pipeline(StandardScaler(), knn)
    scores = cross_val_score(knn_pipeline, X_train, y_train, cv=5, scoring=scoring_metric)
    k_scores.append(scores.mean())
# np.argmax returns the first maximum, so ties resolve to the smallest k.
best_k_index = int(np.argmax(k_scores))
best_k = k_range[best_k_index]
print(f" K = {best_k}")

 K = 19


In [24]:
# Train KNN with the selected k on the scaled training split and score the held-out split.
knn_model = KNeighborsClassifier(n_neighbors=best_k).fit(X_train_scaled, y_train)
y_pred_knn = knn_model.predict(X_test_scaled)

# Compute the evaluation artefacts once, then report them.
knn_accuracy = accuracy_score(y_test, y_pred_knn)
knn_report = classification_report(y_test, y_pred_knn)
knn_confusion = confusion_matrix(y_test, y_pred_knn)

print(f"Accuracy: {knn_accuracy:.4f}")
print("(Classification Report):\n", knn_report)
print("(Confusion Matrix):\n", knn_confusion)


Accuracy: 0.7138
(Classification Report):
               precision    recall  f1-score   support

           0       0.69      0.72      0.71     10407
           1       0.73      0.71      0.72     11382

    accuracy                           0.71     21789
   macro avg       0.71      0.71      0.71     21789
weighted avg       0.71      0.71      0.71     21789

(Confusion Matrix):
 [[7502 2905]
 [3331 8051]]
