In [155]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [156]:
import sys
sys.path.append('/content/drive/My Drive/People_Analytics')

#Import libraries

In [157]:
#Import libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

#Read in data file

In [158]:
#Read in data file
employee_attrition = pd.read_csv('/content/drive/My Drive/People_Analytics/Employee_Attrition.csv')

#Data validation

In [159]:
#View sample data
employee_attrition.sample(10)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
5,32,No,Travel_Frequently,1005,Research & Development,2,2,Life Sciences,1,8,...,3,80,0,8,2,2,7,7,3,6
59,37,No,Travel_Rarely,1115,Research & Development,1,4,Life Sciences,1,77,...,3,80,1,7,2,4,7,5,0,7
1006,49,Yes,Travel_Frequently,1475,Research & Development,28,2,Life Sciences,1,1420,...,1,80,0,20,2,3,4,3,1,3
1237,32,Yes,Travel_Rarely,964,Sales,1,2,Life Sciences,1,1734,...,2,80,0,10,2,3,0,0,0,0
1271,21,Yes,Travel_Rarely,337,Sales,7,1,Marketing,1,1780,...,2,80,0,1,3,3,1,0,1,0
547,42,Yes,Travel_Frequently,933,Research & Development,19,3,Medical,1,752,...,4,80,0,7,2,3,2,2,2,2
916,46,No,Travel_Rarely,168,Sales,4,2,Marketing,1,1280,...,3,80,1,26,2,3,11,4,0,8
210,32,Yes,Travel_Rarely,1045,Sales,4,4,Medical,1,291,...,3,80,0,14,2,2,14,8,9,8
1459,29,No,Travel_Rarely,1378,Research & Development,13,2,Other,1,2053,...,1,80,1,10,2,3,4,3,0,3
731,20,Yes,Travel_Rarely,1097,Research & Development,11,3,Medical,1,1016,...,1,80,0,1,2,3,1,0,0,0


In [160]:
#Check data types
employee_attrition.dtypes

Unnamed: 0,0
Age,int64
Attrition,object
BusinessTravel,object
DailyRate,int64
Department,object
DistanceFromHome,int64
Education,int64
EducationField,object
EmployeeCount,int64
EmployeeNumber,int64


In [161]:
#Get dataframe shape
employee_attrition.shape

(1470, 35)

#Define feature and target variables

In [162]:
#Drop irrleveant features
drop_cols = ['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours', 'Attrition']
employee_attrition = employee_attrition.drop(columns=drop_cols)
#Validate dropped columns
employee_attrition.shape

(1470, 30)

In [163]:
#Define feature and target variables
X = employee_attrition.drop(columns=['YearsAtCompany'])
y = employee_attrition['YearsAtCompany']

#Encode categorical features

In [164]:
#Encode categoical features
categorical_cols = X.select_dtypes(include=['object']).columns
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

#Train-test split and train model

In [165]:
#Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

#Train model with linear regression
lr = LinearRegression()
lr.fit(X_train, y_train)

#Make predictions and evaluate
y_pred = lr.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"R2 Score: {r2:.2f}")

RMSE: 3.00
R2 Score: 0.77


#Analyze feature coefficients

In [166]:
#Analyze feature coefficients
coef_df = pd.DataFrame({
    "Feature": X_encoded.columns,
    "Coefficient": lr.coef_
})
coef_df["Abs_Coefficient"] = coef_df["Coefficient"].abs()
coef_df = coef_df.sort_values(by="Abs_Coefficient", ascending=False)

#Display top 15 features based on coefficient
print(coef_df[["Feature", "Coefficient"]].head(15))


                              Feature  Coefficient
24  Department_Research & Development    -2.685593
38            JobRole_Sales Executive    -2.103516
39       JobRole_Sales Representative    -1.471180
28             EducationField_Medical     1.323831
32            JobRole_Human Resources    -1.268611
34                    JobRole_Manager    -1.158650
27           EducationField_Marketing     1.131400
36          JobRole_Research Director    -1.123457
26       EducationField_Life Sciences     1.103384
25                   Department_Sales    -0.804579
30    EducationField_Technical Degree     0.769932
37         JobRole_Research Scientist     0.695748
29               EducationField_Other     0.610147
41               MaritalStatus_Single    -0.609941
33      JobRole_Laboratory Technician     0.575762


In [167]:
top_features = coef_df.sort_values(by="Abs_Coefficient", ascending=False).head(8)

In [168]:
top_features

Unnamed: 0,Feature,Coefficient,Abs_Coefficient
24,Department_Research & Development,-2.685593,2.685593
38,JobRole_Sales Executive,-2.103516,2.103516
39,JobRole_Sales Representative,-1.47118,1.47118
28,EducationField_Medical,1.323831,1.323831
32,JobRole_Human Resources,-1.268611,1.268611
34,JobRole_Manager,-1.15865,1.15865
27,EducationField_Marketing,1.1314,1.1314
36,JobRole_Research Director,-1.123457,1.123457


#Feature engineering for improving model

In [169]:
#Ratio-based feature engineering
employee_attrition["Income_Per_Level"] = employee_attrition["MonthlyIncome"] / employee_attrition["JobLevel"]
employee_attrition["YearsInRoleRation"] = employee_attrition["YearsInCurrentRole"] / employee_attrition["TotalWorkingYears"].replace(0, 1)
employee_attrition["YearsWithManagerRatio"] = employee_attrition["YearsWithCurrManager"] / employee_attrition["YearsAtCompany"].replace(0, 1)
employee_attrition["PromotionPerYear"] = employee_attrition["YearsSinceLastPromotion"] / employee_attrition["YearsAtCompany"].replace(0, 1)
#Interaction feature engineering
#employee_attrition["Satisfaction_Involvment"] = employee_attrition["JobSatisfaction"] * employee_attrition["JobInvolvement"]
employee_attrition["JobLevel_Income"] = employee_attrition["JobLevel"] * employee_attrition["MonthlyIncome"]

In [170]:
#Handle division errors and missing values
employee_attrition.replace([np.inf, -np.inf], np.nan, inplace=True)
employee_attrition.fillna(0, inplace=True)

In [171]:
#Define targets and features
target = "YearsAtCompany"
X = employee_attrition.drop(columns=[target])
y = employee_attrition[target]

In [172]:
#One-hot encode categorical variables
categorical_cols = X.select_dtypes(include=["object"]).columns
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

In [173]:
#Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

#Train linear regreesion model
lr_tenure = LinearRegression()
lr_tenure.fit(X_train, y_train)

#Make predictions and evaluate
y_pred = lr_tenure.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"R-squared Score: {r2:.2f}")

RMSE: 2.56
R-squared Score: 0.83


#Train and prediction PercentSalaryHike

In [174]:
#Define target and variables
target_hike = "PercentSalaryHike"
X_hike = employee_attrition.drop(columns=[target_hike, "YearsAtCompany"])
y_hike = employee_attrition[target_hike]

In [175]:
#One-hot encode categorical variables
categorical_cols_hike = X.select_dtypes(include=["object"]).columns.tolist()
X_hike_encoded = pd.get_dummies(X_hike, columns=categorical_cols_hike, drop_first=True)
#Train-test split
X_train_hike, X_test_hike, y_train_hike, y_test_hike = train_test_split(X_hike_encoded, y_hike, test_size=0.2, random_state=42)

#Train linear regression model
lr_hike = LinearRegression()
lr_hike.fit(X_train_hike, y_train_hike)

#Make predictions and evaluate
y_pred_hike = lr_hike.predict(X_test_hike)
rmse_hike = np.sqrt(mean_squared_error(y_test_hike, y_pred_hike))
r2_hike = r2_score(y_test_hike, y_pred_hike)

print(f"RMSE: {rmse_hike:.2f}")
print(f"R-squared Score: {r2_hike:.2f}")

RMSE: 2.27
R-squared Score: 0.61


In [176]:
#Add predictions to entire dataframe
employee_attrition["Predicted_Tenure"] = lr_tenure.predict(X_encoded)
employee_attrition["Predicted_Tenure"] = employee_attrition["Predicted_Tenure"].round(0).astype(int)
employee_attrition["Predicted_Salary_Hike"] = lr_hike.predict(X_hike_encoded)
employee_attrition["Predicted_Salary_Hike"] = employee_attrition["Predicted_Salary_Hike"].round(0).astype(int)
#View sample data
employee_attrition.sample(10)

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Income_Per_Level,YearsInRoleRation,YearsWithManagerRatio,PromotionPerYear,JobLevel_Income,Predicted_Tenure,Predicted_Salary_Hike
1280,37,Travel_Rarely,1239,Human Resources,8,2,Other,3,Male,89,...,0,4,7,2035.5,0.0,0.7,0.4,8142,10,14
1030,31,Travel_Rarely,326,Sales,8,2,Life Sciences,1,Male,31,...,7,9,9,3597.666667,0.538462,0.692308,0.692308,32379,15,14
188,34,Travel_Frequently,1069,Research & Development,2,1,Life Sciences,4,Male,45,...,9,1,9,4773.5,0.9,0.9,0.1,19094,12,14
820,35,Travel_Frequently,1182,Sales,11,2,Marketing,4,Male,54,...,2,0,2,2484.0,0.4,0.4,0.0,9936,5,14
10,35,Travel_Rarely,809,Research & Development,16,3,Medical,1,Male,84,...,4,0,3,2426.0,0.666667,0.6,0.0,2426,5,14
129,49,Travel_Rarely,470,Research & Development,20,4,Medical,3,Female,96,...,11,5,11,3283.5,0.6875,0.733333,0.333333,13134,17,14
452,45,Travel_Rarely,561,Sales,2,3,Other,4,Male,61,...,7,3,7,2402.5,0.777778,0.875,0.375,9610,9,14
647,35,Travel_Rarely,672,Research & Development,25,3,Technical Degree,4,Male,78,...,10,4,8,3634.333333,0.625,0.615385,0.307692,32709,14,14
481,34,Travel_Rarely,254,Research & Development,1,2,Life Sciences,2,Male,83,...,5,1,3,3622.0,0.833333,0.5,0.166667,3622,6,14
552,56,Travel_Rarely,832,Research & Development,9,3,Medical,3,Male,81,...,7,1,1,2775.75,0.233333,0.1,0.1,44412,10,15


#View distribution of Predicted_Tenure

In [177]:
#View distribution of Predicted_Tenure
employee_attrition["Predicted_Tenure"].describe()

Unnamed: 0,Predicted_Tenure
count,1470.0
mean,7.057823
std,5.675121
min,-2.0
25%,3.0
50%,5.0
75%,10.0
max,29.0


#View distribution of Predicted_Salary_Hike

In [178]:
#View distribution of Predicted_Salary_Hike
employee_attrition["Predicted_Salary_Hike"].describe()

Unnamed: 0,Predicted_Salary_Hike
count,1470.0
mean,15.235374
std,2.870066
min,12.0
25%,14.0
50%,14.0
75%,15.0
max,23.0


#Define function based on distributions to create retention risk feature

In [179]:
#Function for retention risk based on distribution
def classify_retention_risk(row):
  if row["Predicted_Tenure"] < 2.8:
    return "High"
  elif row["Predicted_Tenure"] < 5.2:
    return "Medium"
  elif row["Predicted_Salary_Hike"] < 13.8:
    return "Medium"
  else:
    return "Low"

#Apply function to dataframe

In [180]:
#Apply function to create Retention_Risk feature
employee_attrition["Retention_Risk"] = employee_attrition.apply(classify_retention_risk, axis=1)
#View sample data
employee_attrition.sample(10)

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,YearsSinceLastPromotion,YearsWithCurrManager,Income_Per_Level,YearsInRoleRation,YearsWithManagerRatio,PromotionPerYear,JobLevel_Income,Predicted_Tenure,Predicted_Salary_Hike,Retention_Risk
474,24,Travel_Rarely,691,Research & Development,23,3,Medical,2,Male,89,...,1,4,2725.0,0.833333,0.666667,0.166667,2725,6,14,Low
988,41,Travel_Frequently,1200,Research & Development,22,3,Life Sciences,4,Female,75,...,3,3,2733.5,0.166667,0.5,0.5,10934,5,14,Medium
775,43,Travel_Rarely,415,Sales,25,3,Medical,3,Male,79,...,0,0,3599.333333,0.0,0.0,0.0,32394,5,14,Medium
1464,26,Travel_Rarely,1167,Sales,5,3,Other,4,Female,30,...,0,0,2966.0,0.4,0.0,0.0,2966,3,15,Medium
392,54,Travel_Rarely,821,Research & Development,5,2,Medical,1,Male,86,...,1,2,3881.2,0.083333,0.5,0.25,97030,7,14,Low
462,34,Travel_Rarely,258,Sales,21,4,Life Sciences,4,Male,74,...,5,7,2668.5,0.7,0.7,0.5,10674,11,14,Low
16,32,Travel_Rarely,334,Research & Development,5,2,Life Sciences,1,Male,80,...,0,5,3298.0,0.285714,0.833333,0.0,3298,6,14,Low
328,33,Travel_Frequently,508,Sales,10,3,Marketing,2,Male,46,...,0,1,2341.0,0.777778,0.142857,0.0,9364,7,14,Low
684,40,Travel_Rarely,658,Sales,10,4,Marketing,1,Male,67,...,0,0,3235.0,0.0,0.0,0.0,29115,4,14,Medium
1417,31,Travel_Rarely,1154,Sales,2,2,Life Sciences,1,Male,54,...,1,2,3067.0,0.666667,1.0,0.5,3067,1,14,High


#View samples of YearsAtCompany vs. Predicted Tenure

In [181]:
#View samples of YearsAtCompany and Predicted_Tenure
employee_attrition[['YearsAtCompany', 'Predicted_Tenure']].sample(10)

Unnamed: 0,YearsAtCompany,Predicted_Tenure
1334,7,8
1278,12,12
518,7,8
62,27,19
1114,8,6
272,5,4
23,0,2
778,16,13
1412,12,12
70,4,4


#View samples of PercentSalaryHike vs. Predicted_Salary_Hike

In [182]:
employee_attrition[['PercentSalaryHike', 'Predicted_Salary_Hike']].sample(10)

Unnamed: 0,PercentSalaryHike,Predicted_Salary_Hike
993,11,14
547,12,14
730,12,14
1177,15,14
554,19,14
948,23,22
230,15,14
1281,18,14
245,12,15
1089,17,14


#Check for out-of-range values

In [183]:
#View range of values of predicted features
print(f"Min Tenure: {employee_attrition['Predicted_Tenure'].min()}")
print(f"Max Tenure: {employee_attrition['Predicted_Tenure'].max()}")
print(f"Min Salary Hike: {employee_attrition['Predicted_Salary_Hike'].min()}")
print(f"Max Salary Hike: {employee_attrition['Predicted_Salary_Hike'].max()}")

Min Tenure: -2
Max Tenure: 29
Min Salary Hike: 12
Max Salary Hike: 23


#Replace negative values with 0

In [184]:
#Replace negative predicted tenure with 0
employee_attrition["Predicted_Tenure"] = employee_attrition["Predicted_Tenure"].clip(lower=0)

#Valide results

In [185]:
#Validate results
print(f"Min Tenure: {employee_attrition['Predicted_Tenure'].min()}")
print(f"Max Tenure: {employee_attrition['Predicted_Tenure'].max()}")
print(f"Min Salary Hike: {employee_attrition['Predicted_Salary_Hike'].min()}")
print(f"Max Salary Hike: {employee_attrition['Predicted_Salary_Hike'].max()}")

Min Tenure: 0
Max Tenure: 29
Min Salary Hike: 12
Max Salary Hike: 23


#Get MAE of for Predicted_Tenure and Predicted_Salary_Hike

In [186]:
employee_attrition["Tenure_Error"] = (employee_attrition["Predicted_Tenure"] - employee_attrition["YearsAtCompany"]).abs()
tenure_margin_of_error = employee_attrition["Tenure_Error"].mean()
employee_attrition["Salary_Hike_Error"] = (employee_attrition["Predicted_Salary_Hike"] - employee_attrition["PercentSalaryHike"]).abs()
salary_hike_margin_of_error = employee_attrition["Salary_Hike_Error"].mean()
print(f"Tenure MAE: ", tenure_margin_of_error.round(1))
print(f"Salary Hike MAE: ", salary_hike_margin_of_error.round(1))

Tenure MAE:  1.6
Salary Hike MAE:  1.9
