# Training Models for HR-Employee Attrition and Performance Analysis


In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

from imblearn.over_sampling import ADASYN
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier


This cell loads the HR-Employee dataset and categorizes certain numerical columns into defined ranges.

 - Age: Categories: 0-25, 26-37, 38-47, 48+
 - Distance from Home: Categories: Near, Far
 - Monthly Income: Categories: 0-2000, 2001-3000, 3001-7000, 7001-11000, 11001+
 - Stock Option Level: Categories: Group1-2, Group3-4
 - Total Working Years: Categories: 0-3 Years, 4-10 Years, 11-25 Years, 26+ Years
 - Years at Company: Categories: 0-5, 6-11, 11+
 - Years in Current Role: Categories: 0-2, 3-4, 5-6, 7-9, 9+
 - Years Since Last Promotion: Categories: 0-2, 3-7, 8+
 - Years with Current Manager: Categories: 0-2, 3-7, 8+

In [2]:
# Load dataset
data = pd.read_csv("HR-EmployeeAttrition.csv")

# Categorize columns
data['Age'] = pd.cut(data['Age'], bins=[-1, 25, 37, 47, data['Age'].max()], labels=['0-25', '26-37', '38-47', f'48-{data["Age"].max()}'])
data['DistanceFromHome'] = pd.cut(data['DistanceFromHome'], bins=[-1, 10, data['DistanceFromHome'].max()], labels=['Near', 'Far'])
data['MonthlyIncome'] = pd.cut(data['MonthlyIncome'], bins=[-1, 2000, 3000, 7000, 11000, data['MonthlyIncome'].max()], labels=['0-2000', '2001-3000', '3001-7000', '7001-11000', f'11001-{data["MonthlyIncome"].max()}'])
data['StockOptionLevel'] = pd.cut(data['StockOptionLevel'], bins=[0, 1.5, 3.5], labels=['Group1-2', 'Group3-4'], right=False)
data['TotalWorkingYears'] = pd.cut(data['TotalWorkingYears'], bins=[-1, 3, 10, 25, data['TotalWorkingYears'].max()], labels=['0-3 Years', '4-10 Years', '11-25 Years', f"26-{data['TotalWorkingYears'].max()} Years"])
data['YearsAtCompany'] = pd.cut(data['YearsAtCompany'], bins=[-1, 5, 11, data['YearsAtCompany'].max()], labels=['0-5', '6-11', f'11-{data["YearsAtCompany"].max()}'])
data['YearsInCurrentRole'] = pd.cut(data['YearsInCurrentRole'], bins=[-1, 2, 4, 6, 9, data['YearsInCurrentRole'].max()], labels=['0-2', '3-4', '5-6', '7-9', f'9-{data["YearsInCurrentRole"].max()}'])
data['YearsSinceLastPromotion'] = pd.cut(data['YearsSinceLastPromotion'], bins=[-1, 2, 7, data['YearsSinceLastPromotion'].max()], labels=['0-2', '3-7', f'8-{data["YearsSinceLastPromotion"].max()}'])
data['YearsWithCurrManager'] = pd.cut(data['YearsWithCurrManager'], bins=[-1, 2, 7, data['YearsWithCurrManager'].max()], labels=['0-2', '3-7', f'8-{data["YearsWithCurrManager"].max()}'])

features = ['Age', 'BusinessTravel', 'Department', 'DistanceFromHome', 'Education', 'EducationField', 'EnvironmentSatisfaction', 'Gender', 'JobInvolvement', 'JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome', 'OverTime', 'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']


### Preprocessing of the HR-EmployeeAttrition Dataset
Preprocessing of the HR-EmployeeAttrition dataset is demonstrated in this cell. <br>
The dataset is first separated into features and target variables based on the specified features and target variables in the preceding cell. <br>
Subsequently, OneHotEncoder is applied to categorical columns for transformation. <br>
Lastly, the columns of the test set are adjusted to align with the training set.


In [3]:
# Separate dataset into features and target variable
X = data[features]
y_attrition = data['Attrition']
y_performance = data['PerformanceRating']

X_train_attrition, X_test_attrition, y_train_attrition, y_test_attrition = train_test_split(X, y_attrition, test_size=0.2, random_state=42)
X_train_performance, X_test_performance, y_train_performance, y_test_performance = train_test_split(X, y_performance, test_size=0.2, random_state=42)

# Use OneHotEncoder for categorical columns
X_train_attrition = pd.get_dummies(X_train_attrition, drop_first=True)
X_test_attrition = pd.get_dummies(X_test_attrition, drop_first=True)
X_train_performance = pd.get_dummies(X_train_performance, drop_first=True)
X_test_performance = pd.get_dummies(X_test_performance, drop_first=True)

# Ensure the test set has the same columns as the training set
X_test_attrition = X_test_attrition.reindex(columns=X_train_attrition.columns, fill_value=0)
X_test_performance = X_test_performance.reindex(columns=X_train_performance.columns, fill_value=0)

### Model Training and Evaluation
Logistic Regression and SVM RBF models are instantiated and trained for predicting attrition and performance rating, respectively. <br>
The models are then used to make predictions on both the training and test sets. Accuracy scores are calculated to evaluate the performance of each model. <br>
Additionally, cross-validation is performed to assess the generalization performance of the models. <br>
Finally, classification reports are generated to provide a detailed evaluation of the models' performance in predicting attrition and performance rating.

In [4]:
# Logistic Regression and SVM RBF models
model_logistic_attrition = LogisticRegression(penalty='l2', C=10.0, solver='lbfgs', max_iter=5000)
model_svm_rbf_performance = SVC(kernel='rbf', C=2.0, gamma='scale', random_state=42)


# Fit the model
model_logistic_attrition.fit(X_train_attrition, y_train_attrition)
model_svm_rbf_performance.fit(X_train_performance, y_train_performance)


# Predict the target variable
y_train_attrition_pred = model_logistic_attrition.predict(X_train_attrition)
y_test_attrition_pred = model_logistic_attrition.predict(X_test_attrition)
y_train_svm_rbf_performance_pred = model_svm_rbf_performance.predict(X_train_performance)
y_test_svm_rbf_performance_pred = model_svm_rbf_performance.predict(X_test_performance)


# Calculate the accuracy of the model
train_attrition_accuracy = accuracy_score(y_train_attrition, y_train_attrition_pred)
test_attrition_accuracy = accuracy_score(y_test_attrition, y_test_attrition_pred)
train_svm_rbf_performance_accuracy = accuracy_score(y_train_performance, y_train_svm_rbf_performance_pred)
test_svm_rbf_performance_accuracy = accuracy_score(y_test_performance, y_test_svm_rbf_performance_pred)


# Print the accuracy of the model
print("Train Accuracy (Attrition) - Logistic Regression:", train_attrition_accuracy)
print("Test Accuracy (Attrition) - Logistic Regression:", test_attrition_accuracy)
print("\nTrain Accuracy (Performance Rating) - SVM RBF:", train_svm_rbf_performance_accuracy)
print("Test Accuracy (Performance Rating) - SVM RBF:", test_svm_rbf_performance_accuracy)


# Cross-validation
cv_scores_attrition = cross_val_score(model_logistic_attrition, X_train_attrition, y_train_attrition, cv=5)
cv_scores_svm_rbf_performance = cross_val_score(model_svm_rbf_performance, X_train_performance, y_train_performance, cv=5)


# Print the cross-validation scores and mean of the scores
print("\nCross-validation Scores (Attrition) - Logistic Regression:", cv_scores_attrition)
print("Mean Cross-validation Score (Attrition) - Logistic Regression:", cv_scores_attrition.mean())
print("\nCross-validation Scores (Performance Rating) - SVM RBF:", cv_scores_svm_rbf_performance)
print("Mean Cross-validation Score (Performance Rating) - SVM RBF:", cv_scores_svm_rbf_performance.mean())


# Classification report
print("\nAttrition Model Classification Report:")
print(classification_report(y_test_attrition, y_test_attrition_pred))
print("\nPerformance Rating Model Classification Report:")
print(classification_report(y_test_performance, y_test_svm_rbf_performance_pred))

Train Accuracy (Attrition) - Logistic Regression: 0.888468809073724
Test Accuracy (Attrition) - Logistic Regression: 0.8754716981132076

Train Accuracy (Performance Rating) - SVM RBF: 0.8402646502835539
Test Accuracy (Performance Rating) - SVM RBF: 0.8641509433962264

Cross-validation Scores (Attrition) - Logistic Regression: [0.87735849 0.8490566  0.84433962 0.87677725 0.88625592]
Mean Cross-validation Score (Attrition) - Logistic Regression: 0.8667575784673165

Cross-validation Scores (Performance Rating) - SVM RBF: [0.83962264 0.83962264 0.83962264 0.8436019  0.83886256]
Mean Cross-validation Score (Performance Rating) - SVM RBF: 0.8402664759009211

Attrition Model Classification Report:
              precision    recall  f1-score   support

          No       0.92      0.94      0.93       223
         Yes       0.62      0.55      0.58        42

    accuracy                           0.88       265
   macro avg       0.77      0.74      0.75       265
weighted avg       0.87     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Data Resampling

I observed an imbalance in the data in the previous output, so I used ADASYN to generate synthetic data. <br>
ADASYN enables the generation of synthetic data to augment the minority class, thereby reducing class imbalance.

In [5]:
# Separate dataset into features and target variable
X = data[features]
y_attrition = data['Attrition']
y_performance = data['PerformanceRating']

# One-Hot Encoding for categorical columns
X_encoded = pd.get_dummies(X, drop_first=True)

# Apply ADASYN
adasyn = ADASYN(random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X_encoded, y_performance)
X_resampled_attrition, y_resampled_attrition = adasyn.fit_resample(X_encoded, y_attrition)

# Split the dataset into training and test sets
X_train_attrition, X_test_attrition, y_train_attrition, y_test_attrition = train_test_split(X_resampled_attrition, y_resampled_attrition, test_size=0.2, random_state=42)
X_train_performance, X_test_performance, y_train_performance, y_test_performance = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


The training is done with the same SVM and Logistic Regression as above, but this time using the data augmented with ADASYN.

In [6]:
#Logistic Regression and SVM RBF models
model_logistic_attrition = LogisticRegression(penalty='l2', C=10.0, solver='lbfgs', max_iter=5000)
model_svm_rbf_performance = SVC(kernel='rbf', C=2.0, gamma='scale', random_state=42)


# Fit the models
model_logistic_attrition.fit(X_train_attrition, y_train_attrition)
model_svm_rbf_performance.fit(X_train_performance, y_train_performance)


# Predict the target variable
y_train_attrition_pred = model_logistic_attrition.predict(X_train_attrition)
y_test_attrition_pred = model_logistic_attrition.predict(X_test_attrition)
y_train_svm_rbf_performance_pred = model_svm_rbf_performance.predict(X_train_performance)
y_test_svm_rbf_performance_pred = model_svm_rbf_performance.predict(X_test_performance)


# Calculate the accuracy of the model
train_attrition_accuracy = accuracy_score(y_train_attrition, y_train_attrition_pred)
test_attrition_accuracy = accuracy_score(y_test_attrition, y_test_attrition_pred)
train_svm_rbf_performance_accuracy = accuracy_score(y_train_performance, y_train_svm_rbf_performance_pred)
test_svm_rbf_performance_accuracy = accuracy_score(y_test_performance, y_test_svm_rbf_performance_pred)


# Print the accuracy of the model
print("Train Accuracy (Attrition) - Logistic Regression {:.3f}".format(train_attrition_accuracy))
print("Test Accuracy (Attrition) - Logistic Regression {:.3f}".format(test_attrition_accuracy))
print("\nTrain Accuracy (Performance Rating) - SVM RBF {:.3f}".format(train_svm_rbf_performance_accuracy))
print("Test Accuracy (Performance Rating) - SVM RBF {:.3f}".format(test_svm_rbf_performance_accuracy))


# Cross-validation
cv_scores_attrition = cross_val_score(model_logistic_attrition, X_train_attrition, y_train_attrition, cv=5)
cv_scores_svm_rbf_performance = cross_val_score(model_svm_rbf_performance, X_train_performance, y_train_performance, cv=5)


# Print the cross-validation scores and mean of the scores
print("\nCross-validation Scores (Attrition) - Logistic Regression:", cv_scores_attrition)
print("Mean Cross-validation Score (Attrition) - Logistic Regression:", cv_scores_attrition.mean())
print("\nCross-validation Scores (Performance Rating) - SVM RBF:", cv_scores_svm_rbf_performance)
print("Mean Cross-validation Score (Performance Rating) - SVM RBF:", cv_scores_svm_rbf_performance.mean())


# Classification report
print("\nAttrition Model Classification Report:")
print(classification_report(y_test_attrition, y_test_attrition_pred))
print("\nPerformance Rating Model Classification Report:")
print(classification_report(y_test_performance, y_test_svm_rbf_performance_pred))


Train Accuracy (Attrition) - Logistic Regression 0.914
Test Accuracy (Attrition) - Logistic Regression 0.901

Train Accuracy (Performance Rating) - SVM RBF 0.888
Test Accuracy (Performance Rating) - SVM RBF 0.886

Cross-validation Scores (Attrition) - Logistic Regression: [0.9047619  0.91011236 0.90168539 0.88202247 0.90168539]
Mean Cross-validation Score (Attrition) - Logistic Regression: 0.9000535045478866

Cross-validation Scores (Performance Rating) - SVM RBF: [0.84615385 0.9002849  0.86285714 0.85142857 0.86857143]
Mean Cross-validation Score (Performance Rating) - SVM RBF: 0.8658591778591779

Attrition Model Classification Report:
              precision    recall  f1-score   support

          No       0.87      0.94      0.90       222
         Yes       0.93      0.87      0.90       224

    accuracy                           0.90       446
   macro avg       0.90      0.90      0.90       446
weighted avg       0.90      0.90      0.90       446


Performance Rating Model Cl

### Model Evaluation for Attrition and Performance Rating
Five different models (Logistic Regression, Random Forest, SVM Linear, SVM RBF, Gradient Boosting) have been applied to train and test data for predicting attrition and performance rating. <br/> <br/>
Performance metrics such as training accuracy, test accuracy, cross-validation scores, and classification reports are obtained for each model. These metrics are used to thoroughly evaluate and compare model performance. By observing these values in the output of the code, the performance of each model for each target variable can be analyzed.


### Logistic Regression

In [7]:
# Logistic Regression model with regularization
model_logistic_attrition = LogisticRegression(penalty='l2', C=10.0, solver='lbfgs', max_iter=5000)
model_performance = LogisticRegression(penalty='l2', C=1000.0, solver='lbfgs', max_iter=5000, multi_class='multinomial')

# Train the model
model_logistic_attrition.fit(X_train_attrition, y_train_attrition)
model_performance.fit(X_train_performance, y_train_performance)

# Predict the target attribute
y_train_attrition_pred = model_logistic_attrition.predict(X_train_attrition)
y_test_attrition_pred = model_logistic_attrition.predict(X_test_attrition)
y_train_performance_pred = model_performance.predict(X_train_performance)
y_test_performance_pred = model_performance.predict(X_test_performance)

# Calculate the accuracy of the model
train_attrition_accuracy = accuracy_score(y_train_attrition, y_train_attrition_pred)
test_attrition_accuracy = accuracy_score(y_test_attrition, y_test_attrition_pred)
train_performance_accuracy = accuracy_score(y_train_performance, y_train_performance_pred)
test_performance_accuracy = accuracy_score(y_test_performance, y_test_performance_pred)

# Print the accuracy of the model
print("Train Accuracy (Attrition) - Logistic Regression: {:.3f}".format(train_attrition_accuracy))
print("Test Accuracy (Attrition) - Logistic Regression: {:.3f}".format(test_attrition_accuracy))
print("\nTrain Accuracy (Performance Rating) - Logistic Regression: {:.3f}".format(train_performance_accuracy))
print("Test Accuracy (Performance Rating) - Logistic Regression: {:.3f}".format(test_performance_accuracy))

# Cross-validation
cv_scores_attrition = cross_val_score(model_logistic_attrition, X_train_attrition, y_train_attrition, cv=5)
cv_scores_performance = cross_val_score(model_performance, X_train_performance, y_train_performance, cv=5)

print("\nCross-validation Scores (Attrition) - Logistic Regression:", cv_scores_attrition)
print("Mean Cross-validation Score (Attrition) - Logistic Regression:", cv_scores_attrition.mean())
print("\nCross-validation Scores (Performance Rating) - Logistic Regression:", cv_scores_performance)
print("Mean Cross-validation Score (Performance Rating) - Logistic Regression:", cv_scores_performance.mean())

# Classification report
print("\nAttrition Model Classification Report:")
print(classification_report(y_test_attrition, y_test_attrition_pred))
print("\nPerformance Rating Model Classification Report:")
print(classification_report(y_test_performance, y_test_performance_pred))


Train Accuracy (Attrition) - Logistic Regression: 0.914
Test Accuracy (Attrition) - Logistic Regression: 0.901

Train Accuracy (Performance Rating) - Logistic Regression: 0.862
Test Accuracy (Performance Rating) - Logistic Regression: 0.854

Cross-validation Scores (Attrition) - Logistic Regression: [0.9047619  0.91011236 0.90168539 0.88202247 0.90168539]
Mean Cross-validation Score (Attrition) - Logistic Regression: 0.9000535045478866

Cross-validation Scores (Performance Rating) - Logistic Regression: [0.81481481 0.88319088 0.83428571 0.84       0.84571429]
Mean Cross-validation Score (Performance Rating) - Logistic Regression: 0.8436011396011397

Attrition Model Classification Report:
              precision    recall  f1-score   support

          No       0.87      0.94      0.90       222
         Yes       0.93      0.87      0.90       224

    accuracy                           0.90       446
   macro avg       0.90      0.90      0.90       446
weighted avg       0.90      0.

### Random Forest

In [8]:
# Random Forest model
model_rf_attrition = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10, max_leaf_nodes=10) 
model_rf_performance = RandomForestClassifier(n_estimators=1000, random_state=42, max_depth=10, max_leaf_nodes=20)


# Train the model
model_rf_attrition.fit(X_train_attrition, y_train_attrition)
model_rf_performance.fit(X_train_performance, y_train_performance)


# Predict the target attribute
y_train_rf_attrition_pred = model_rf_attrition.predict(X_train_attrition)
y_test_rf_attrition_pred = model_rf_attrition.predict(X_test_attrition)
y_train_rf_performance_pred = model_rf_performance.predict(X_train_performance)
y_test_rf_performance_pred = model_rf_performance.predict(X_test_performance)


# Calculate the accuracy of the model
train_rf_attrition_accuracy = accuracy_score(y_train_attrition, y_train_rf_attrition_pred)
test_rf_attrition_accuracy = accuracy_score(y_test_attrition, y_test_rf_attrition_pred)
train_rf_performance_accuracy = accuracy_score(y_train_performance, y_train_rf_performance_pred)
test_rf_performance_accuracy = accuracy_score(y_test_performance, y_test_rf_performance_pred)


# Print the accuracy of the model
print("Train Accuracy (Attrition) - Random Forest: {:.3f}".format(train_rf_attrition_accuracy))
print("Test Accuracy (Attrition) - Random Forest: {:.3f}".format(test_rf_attrition_accuracy))
print("\nTrain Accuracy (Performance Rating) - Random Forest: {:.3f}".format(train_rf_performance_accuracy))
print("Test Accuracy (Performance Rating) - Random Forest: {:.3f}".format(test_rf_performance_accuracy))


# Cross-validation
cv_scores_rf_attrition = cross_val_score(model_rf_attrition, X_train_attrition, y_train_attrition, cv=5)
cv_scores_rf_performance = cross_val_score(model_rf_performance, X_train_performance, y_train_performance, cv=5)

print("\nCross-validation Scores (Attrition) - Random Forest:", cv_scores_rf_attrition)
print("Mean Cross-validation Score (Attrition) - Random Forest:", cv_scores_rf_attrition.mean())
print("\nCross-validation Scores (Performance Rating) - Random Forest:", cv_scores_rf_performance)
print("Mean Cross-validation Score (Performance Rating) - Random Forest:", cv_scores_rf_performance.mean())

# Classification report
print("\nAttrition Model Classification Report:")
print(classification_report(y_test_attrition, y_test_rf_attrition_pred))
print("\nPerformance Rating Model Classification Report:")
print(classification_report(y_test_performance, y_test_rf_performance_pred))


Train Accuracy (Attrition) - Random Forest: 0.831
Test Accuracy (Attrition) - Random Forest: 0.809

Train Accuracy (Performance Rating) - Random Forest: 0.870
Test Accuracy (Performance Rating) - Random Forest: 0.827

Cross-validation Scores (Attrition) - Random Forest: [0.78431373 0.81741573 0.83707865 0.80337079 0.81179775]
Mean Cross-validation Score (Attrition) - Random Forest: 0.810795329367702

Cross-validation Scores (Performance Rating) - Random Forest: [0.79202279 0.84330484 0.84571429 0.8        0.82857143]
Mean Cross-validation Score (Performance Rating) - Random Forest: 0.8219226699226698

Attrition Model Classification Report:
              precision    recall  f1-score   support

          No       0.80      0.82      0.81       222
         Yes       0.81      0.80      0.81       224

    accuracy                           0.81       446
   macro avg       0.81      0.81      0.81       446
weighted avg       0.81      0.81      0.81       446


Performance Rating Model

### Gradient Boosting

In [9]:
# Gradient Boosting model
model_gb_attrition = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
model_gb_performance = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=3, random_state=42)


# Train the model
model_gb_attrition.fit(X_train_attrition, y_train_attrition)
model_gb_performance.fit(X_train_performance, y_train_performance)


# Predict the target attribute
y_train_gb_attrition_pred = model_gb_attrition.predict(X_train_attrition)
y_test_gb_attrition_pred = model_gb_attrition.predict(X_test_attrition)
y_train_gb_performance_pred = model_gb_performance.predict(X_train_performance)
y_test_gb_performance_pred = model_gb_performance.predict(X_test_performance)


# Calculate the accuracy of the model
train_gb_attrition_accuracy = accuracy_score(y_train_attrition, y_train_gb_attrition_pred)
test_gb_attrition_accuracy = accuracy_score(y_test_attrition, y_test_gb_attrition_pred)
train_gb_performance_accuracy = accuracy_score(y_train_performance, y_train_gb_performance_pred)
test_gb_performance_accuracy = accuracy_score(y_test_performance, y_test_gb_performance_pred)


# Print the accuracy of the model
print("Train Accuracy (Attrition) - Gradient Boosting: {:.3f}".format(train_gb_attrition_accuracy))
print("Test Accuracy (Attrition) - Gradient Boosting: {:.3f}".format(test_gb_attrition_accuracy))
print("\nTrain Accuracy (Performance Rating) - Gradient Boosting: {:.3f}".format(train_gb_performance_accuracy))
print("Test Accuracy (Performance Rating) - Gradient Boosting: {:.3f}".format(test_gb_performance_accuracy))

# Cross-validation
cv_scores_gb_attrition = cross_val_score(model_gb_attrition, X_train_attrition, y_train_attrition, cv=5)
cv_scores_gb_performance = cross_val_score(model_gb_performance, X_train_performance, y_train_performance, cv=5)

print("\nCross-validation Scores (Attrition) - Gradient Boosting:", cv_scores_gb_attrition)
print("Mean Cross-validation Score (Attrition) - Gradient Boosting:", cv_scores_gb_attrition.mean())
print("\nCross-validation Scores (Performance Rating) - Gradient Boosting:", cv_scores_gb_performance)
print("Mean Cross-validation Score (Performance Rating) - Gradient Boosting:", cv_scores_gb_performance.mean())

# Classification report
print("\nAttrition Model Classification Report:")
print(classification_report(y_test_attrition, y_test_gb_attrition_pred))
print("\nPerformance Rating Model Classification Report:")
print(classification_report(y_test_performance, y_test_gb_performance_pred))


Train Accuracy (Attrition) - Gradient Boosting: 0.941
Test Accuracy (Attrition) - Gradient Boosting: 0.901

Train Accuracy (Performance Rating) - Gradient Boosting: 0.932
Test Accuracy (Performance Rating) - Gradient Boosting: 0.884

Cross-validation Scores (Attrition) - Gradient Boosting: [0.8767507  0.8988764  0.91853933 0.89606742 0.88483146]
Mean Cross-validation Score (Attrition) - Gradient Boosting: 0.895013061404337

Cross-validation Scores (Performance Rating) - Gradient Boosting: [0.84045584 0.87749288 0.85428571 0.83714286 0.86      ]
Mean Cross-validation Score (Performance Rating) - Gradient Boosting: 0.8538754578754579

Attrition Model Classification Report:
              precision    recall  f1-score   support

          No       0.89      0.91      0.90       222
         Yes       0.91      0.89      0.90       224

    accuracy                           0.90       446
   macro avg       0.90      0.90      0.90       446
weighted avg       0.90      0.90      0.90     

### SVM Linear

In [10]:
# SVM model
model_svm_linear_attrition = SVC(kernel='linear', C=100.0, random_state=42)
model_svm_linear_performance = SVC(kernel='linear', C=100.0, random_state=42)


# Train the model
model_svm_linear_attrition.fit(X_train_attrition, y_train_attrition)
model_svm_linear_performance.fit(X_train_performance, y_train_performance)


# Predict the target attribute
y_train_svm_linear_attrition_pred = model_svm_linear_attrition.predict(X_train_attrition)
y_test_svm_linear_attrition_pred = model_svm_linear_attrition.predict(X_test_attrition)
y_train_svm_linear_performance_pred = model_svm_linear_performance.predict(X_train_performance)
y_test_svm_linear_performance_pred = model_svm_linear_performance.predict(X_test_performance)


# Calculate the accuracy of the model
train_svm_linear_attrition_accuracy = accuracy_score(y_train_attrition, y_train_svm_linear_attrition_pred)
test_svm_linear_attrition_accuracy = accuracy_score(y_test_attrition, y_test_svm_linear_attrition_pred)
train_svm_linear_performance_accuracy = accuracy_score(y_train_performance, y_train_svm_linear_performance_pred)
test_svm_linear_performance_accuracy = accuracy_score(y_test_performance, y_test_svm_linear_performance_pred)


# Print the accuracy of the model
print("Train Accuracy (Attrition) - SVM Linear: {:.3f}".format(train_svm_linear_attrition_accuracy))
print("Test Accuracy (Attrition) - SVM Linear: {:.3f}".format(test_svm_linear_attrition_accuracy))
print("\nTrain Accuracy (Performance Rating) - SVM Linear: {:.3f}".format(train_svm_linear_performance_accuracy))
print("Test Accuracy (Performance Rating) - SVM Linear: {:.3f}".format(test_svm_linear_performance_accuracy))


# Cross-validation
cv_scores_svm_linear_attrition = cross_val_score(model_svm_linear_attrition, X_train_attrition, y_train_attrition, cv=5)
cv_scores_svm_linear_performance = cross_val_score(model_svm_linear_performance, X_train_performance, y_train_performance, cv=5)

print("\nCross-validation Scores (Attrition) - SVM Linear:", cv_scores_svm_linear_attrition)
print("Mean Cross-validation Score (Attrition) - SVM Linear:", cv_scores_svm_linear_attrition.mean())
print("\nCross-validation Scores (Performance Rating) - SVM Linear:", cv_scores_svm_linear_performance)
print("Mean Cross-validation Score (Performance Rating) - SVM Linear:", cv_scores_svm_linear_performance.mean())


# Classification report
print("\nAttrition Model Classification Report:")
print(classification_report(y_test_attrition, y_test_svm_linear_attrition_pred))
print("\nPerformance Rating Model Classification Report:")
print(classification_report(y_test_performance, y_test_svm_linear_performance_pred))


Train Accuracy (Attrition) - SVM Linear: 0.915
Test Accuracy (Attrition) - SVM Linear: 0.901

Train Accuracy (Performance Rating) - SVM Linear: 0.861
Test Accuracy (Performance Rating) - SVM Linear: 0.861

Cross-validation Scores (Attrition) - SVM Linear: [0.89355742 0.91011236 0.89044944 0.88764045 0.8988764 ]
Mean Cross-validation Score (Attrition) - SVM Linear: 0.8961272149309163

Cross-validation Scores (Performance Rating) - SVM Linear: [0.81481481 0.89173789 0.86       0.83428571 0.84285714]
Mean Cross-validation Score (Performance Rating) - SVM Linear: 0.8487391127391126

Attrition Model Classification Report:
              precision    recall  f1-score   support

          No       0.87      0.94      0.90       222
         Yes       0.94      0.86      0.90       224

    accuracy                           0.90       446
   macro avg       0.90      0.90      0.90       446
weighted avg       0.90      0.90      0.90       446


Performance Rating Model Classification Report:

### SVM RBF

In [11]:
# SVM model with RBF kernel
model_svm_rbf_attrition = SVC(kernel='rbf', C=2.0, gamma='scale', random_state=42)
model_svm_rbf_performance = SVC(kernel='rbf', C=2.0, gamma='scale', random_state=42)


# Train the model
model_svm_rbf_attrition.fit(X_train_attrition, y_train_attrition)
model_svm_rbf_performance.fit(X_train_performance, y_train_performance)


# Predict the target attribute
y_train_svm_rbf_attrition_pred = model_svm_rbf_attrition.predict(X_train_attrition)
y_test_svm_rbf_attrition_pred = model_svm_rbf_attrition.predict(X_test_attrition)
y_train_svm_rbf_performance_pred = model_svm_rbf_performance.predict(X_train_performance)
y_test_svm_rbf_performance_pred = model_svm_rbf_performance.predict(X_test_performance)


# Calculate the accuracy of the model
train_svm_rbf_attrition_accuracy = accuracy_score(y_train_attrition, y_train_svm_rbf_attrition_pred)
test_svm_rbf_attrition_accuracy = accuracy_score(y_test_attrition, y_test_svm_rbf_attrition_pred)
train_svm_rbf_performance_accuracy = accuracy_score(y_train_performance, y_train_svm_rbf_performance_pred)
test_svm_rbf_performance_accuracy = accuracy_score(y_test_performance, y_test_svm_rbf_performance_pred)


# Print the accuracy of the model
print("Train Accuracy (Attrition) - SVM RBF: {:.3f}".format(train_svm_rbf_attrition_accuracy))
print("Test Accuracy (Attrition) - SVM RBF: {:.3f}".format(test_svm_rbf_attrition_accuracy))
print("\nTrain Accuracy (Performance Rating) - SVM RBF: {:.3f}".format(train_svm_rbf_performance_accuracy))
print("Test Accuracy (Performance Rating) - SVM RBF: {:.3f}".format(test_svm_rbf_performance_accuracy))


# Cross-validation
cv_scores_svm_rbf_attrition = cross_val_score(model_svm_rbf_attrition, X_train_attrition, y_train_attrition, cv=5)
cv_scores_svm_rbf_performance = cross_val_score(model_svm_rbf_performance, X_train_performance, y_train_performance, cv=5)

print("\nCross-validation Scores (Attrition) - SVM RBF:", cv_scores_svm_rbf_attrition)
print("Mean Cross-validation Score (Attrition) - SVM RBF:", cv_scores_svm_rbf_attrition.mean())
print("\nCross-validation Scores (Performance Rating) - SVM RBF:", cv_scores_svm_rbf_performance)
print("Mean Cross-validation Score (Performance Rating) - SVM RBF:", cv_scores_svm_rbf_performance.mean())


# Classification report
print("\nAttrition Model Classification Report:")
print(classification_report(y_test_attrition, y_test_svm_rbf_attrition_pred))
print("\nPerformance Rating Model Classification Report:")
print(classification_report(y_test_performance, y_test_svm_rbf_performance_pred))



Train Accuracy (Attrition) - SVM RBF: 0.925
Test Accuracy (Attrition) - SVM RBF: 0.910

Train Accuracy (Performance Rating) - SVM RBF: 0.888
Test Accuracy (Performance Rating) - SVM RBF: 0.886

Cross-validation Scores (Attrition) - SVM RBF: [0.87114846 0.90730337 0.90449438 0.90168539 0.88483146]
Mean Cross-validation Score (Attrition) - SVM RBF: 0.8938926132250653

Cross-validation Scores (Performance Rating) - SVM RBF: [0.84615385 0.9002849  0.86285714 0.85142857 0.86857143]
Mean Cross-validation Score (Performance Rating) - SVM RBF: 0.8658591778591779

Attrition Model Classification Report:
              precision    recall  f1-score   support

          No       0.88      0.95      0.91       222
         Yes       0.94      0.88      0.91       224

    accuracy                           0.91       446
   macro avg       0.91      0.91      0.91       446
weighted avg       0.91      0.91      0.91       446


Performance Rating Model Classification Report:
              precision