In [1]:


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle

In [3]:



df = pd.read_csv("../Data/hr model2/HR_Analytics.csv")
df.head()

Unnamed: 0,EmpID,Age,AgeGroup,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,RM297,18,18-25,Yes,Travel_Rarely,230,Research & Development,3,3,Life Sciences,...,3,80,0,0,2,3,0,0,0,0.0
1,RM302,18,18-25,No,Travel_Rarely,812,Sales,10,3,Medical,...,1,80,0,0,2,3,0,0,0,0.0
2,RM458,18,18-25,Yes,Travel_Frequently,1306,Sales,5,3,Marketing,...,4,80,0,0,3,3,0,0,0,0.0
3,RM728,18,18-25,No,Non-Travel,287,Research & Development,5,2,Life Sciences,...,4,80,0,0,2,3,0,0,0,0.0
4,RM829,18,18-25,Yes,Non-Travel,247,Research & Development,8,1,Medical,...,4,80,0,0,0,3,0,0,0,0.0


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1480 entries, 0 to 1479
Data columns (total 38 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   EmpID                     1480 non-null   object 
 1   Age                       1480 non-null   int64  
 2   AgeGroup                  1480 non-null   object 
 3   Attrition                 1480 non-null   object 
 4   BusinessTravel            1480 non-null   int64  
 5   DailyRate                 1480 non-null   int64  
 6   Department                1480 non-null   int64  
 7   DistanceFromHome          1480 non-null   int64  
 8   Education                 1480 non-null   int64  
 9   EducationField            1480 non-null   int64  
 10  EmployeeCount             1480 non-null   int64  
 11  EmployeeNumber            1480 non-null   int64  
 12  EnvironmentSatisfaction   1480 non-null   int64  
 13  Gender                    1480 non-null   int64  
 14  HourlyRa

In [17]:
df['PerformanceRating'].value_counts()

PerformanceRating
3    1253
4     227
Name: count, dtype: int64

In [4]:


label_cols = ['JobRole', 'Gender', 'EducationField', 'Department', 'BusinessTravel']

for col in label_cols:
    if col in df.columns:
        df[col] = LabelEncoder().fit_transform(df[col])

X = df[['JobSatisfaction', 'EnvironmentSatisfaction', 'WorkLifeBalance',
        'MonthlyIncome', 'YearsAtCompany', 'Education', 'TrainingTimesLastYear']]

y = df['PerformanceRating']


In [5]:


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)


In [6]:

dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)

print("Decision Tree Accuracy:", accuracy_score(y_test, dt_pred))
print("\n", classification_report(y_test, dt_pred))


Decision Tree Accuracy: 0.7263513513513513

               precision    recall  f1-score   support

           3       0.86      0.81      0.83       252
           4       0.19      0.25      0.21        44

    accuracy                           0.73       296
   macro avg       0.52      0.53      0.52       296
weighted avg       0.76      0.73      0.74       296



In [14]:



gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=2, random_state=42)
gb_model.fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)
print("GradientBoostingClassifier Training Accuracy:", gb_model.score(X_train, y_train))
print("Gradient Boosting Accuracy:", accuracy_score(y_test, gb_pred))
print("\n", classification_report(y_test, gb_pred))



GradientBoostingClassifier Training Accuracy: 0.8538851351351351
Gradient Boosting Accuracy: 0.8412162162162162

               precision    recall  f1-score   support

           3       0.85      0.99      0.91       252
           4       0.00      0.00      0.00        44

    accuracy                           0.84       296
   macro avg       0.42      0.49      0.46       296
weighted avg       0.72      0.84      0.78       296



In [15]:


# pickle.dump(dt_model, open("decision_tree_model.pkl", "wb"))
pickle.dump(gb_model, open("../Models/gradient_boosting_model.pkl", "wb"))
pickle.dump(scaler, open("../Scalers/scaler.pkl", "wb"))

print("Models Saved Successfully!")

Models Saved Successfully!
