In [None]:
import pandas as pd

# Load the dataset
file_path = 'employee_attrition.csv'
employee_data = pd.read_csv(file_path)

# Prepare the survival-analysis columns:
#   tenure - duration, copied from 'YearsAtCompany'
#   event  - 1 when 'Attrition' equals 'Yes', 0 for any other value
employee_data['tenure'] = employee_data['YearsAtCompany']
employee_data['event'] = employee_data['Attrition'].eq('Yes').astype('int64')

# Drop the columns the analysis does not use: 'EmployeeCount', 'Over18', 'StandardHours'.
# The result is reassigned rather than dropped in place, so the frame is never
# mutated behind the name it is bound to.
employee_data = employee_data.drop(columns=['EmployeeCount', 'Over18', 'StandardHours'])

In [None]:
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt

# Kaplan-Meier estimate of retention: durations come from 'tenure',
# observed events from 'event'.
kmf = KaplanMeierFitter()
kmf.fit(
    durations=employee_data['tenure'],
    event_observed=employee_data['event'],
)

# Draw the fitted survival function and label the axes it was drawn on.
retention_ax = kmf.plot_survival_function()
retention_ax.set_title('Employee Retention over Time')
retention_ax.set_xlabel('Time (Years)')
retention_ax.set_ylabel('Survival Probability')
plt.show()

In [None]:
# Keep only the numeric columns. include='number' states the intent directly;
# exclude='object' would also let boolean, datetime, categorical and dedicated
# string-dtype columns through, which the numeric steps that consume
# numerical_df are not written for.
numerical_df = employee_data.select_dtypes(include='number')

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Design matrix for the VIF check: the numeric features passed through
# add_constant, without the event indicator and the two duration columns.
X = add_constant(
    numerical_df.drop(columns=['event', 'tenure', 'YearsAtCompany'])
)

# One variance inflation factor per column of X, paired with the column name.
vif = pd.DataFrame({
    "Variable": list(X.columns),
    "VIF": [
        variance_inflation_factor(X.values, col_idx)
        for col_idx in range(X.shape[1])
    ],
})
print(vif)

In [None]:
import seaborn as sns
import numpy as np


# Spearman rank correlations between the numeric features
# ('event' and 'tenure' are left out).
corr = (
    numerical_df
    .drop(columns=['event', 'tenure'])
    .corr(method='spearman')
)

# Annotated heatmap of the correlation matrix, values shown to one decimal.
plt.figure(figsize=(11, 8))
sns.heatmap(corr, annot=True, fmt='.1f')
plt.show()

# Pairs to look at in the heatmap:
#   - MonthlyIncome vs JobLevel
#   - YearsAtCompany vs the other years-related features (YearsWithCurrManager, etc.)

In [None]:
def correlation_filter(df, threshold):
    """Return the first pair of labels whose absolute entry reaches ``threshold``.

    ``df`` is read as a square table (for example a correlation matrix):
    for every column label ``a`` and every column label ``b`` with ``a < b``,
    the entry in column ``a`` at row label ``b`` is examined. Columns are
    scanned in their stored order, and the ``a < b`` test keeps each
    unordered pair from being examined twice and skips the diagonal.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to search; its column labels are also used as row labels.
    threshold : float
        Minimum absolute value an entry must reach to count as a match.

    Returns
    -------
    tuple or None
        ``(a, b)`` for the first match found, or ``None`` when no entry
        qualifies.
    """
    labels = df.columns
    for first in labels:
        column = df[first]
        for second in labels:
            if not first < second:
                continue
            if abs(column[second]) >= threshold:
                return first, second
    return None
# Look up a pair of features in `corr` whose absolute correlation is at least 0.9.
# As the last expression of the cell, the returned value is what the cell displays.
correlation_filter(corr,0.9)

In [None]:
from lifelines import CoxPHFitter

# Build the model frame: duration ('tenure'), event indicator ('event') and the
# chosen covariates, with non-numeric columns one-hot encoded (first level of
# each dropped).
cox_data = pd.get_dummies(
    employee_data[[
        'tenure', 'event',
        'Age', 'MonthlyIncome', 'YearsInCurrentRole', 'YearsWithCurrManager',
        'YearsSinceLastPromotion', 'TotalWorkingYears', 'TrainingTimesLastYear',
        'JobSatisfaction', 'RelationshipSatisfaction', 'EnvironmentSatisfaction',
        'OverTime', 'WorkLifeBalance', 'StockOptionLevel', 'BusinessTravel',
        'MaritalStatus', 'DistanceFromHome', 'Gender', 'JobInvolvement',
        'JobLevel', 'Education',
    ]],
    drop_first=True,
)

# Fit a Cox proportional hazards model on the encoded frame.
cph = CoxPHFitter()
cph.fit(cox_data, duration_col='tenure', event_col='event')

# Report the fit: summary table first, then the coefficient plot.
cph.print_summary()
cph.plot().set_title('Cox Proportional Hazards Model Coefficients')
plt.show()

In [None]:
# Build the model frame: duration ('tenure'), event indicator ('event') and a
# reduced set of covariates.
# NOTE(review): the selected columns look numeric, in which case get_dummies
# changes nothing here - confirm against the data.
cox_data = pd.get_dummies(
    employee_data[[
        'tenure', 'event',
        'Age', 'MonthlyIncome', 'MonthlyRate', 'HourlyRate',
        'YearsWithCurrManager', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
        'TotalWorkingYears', 'TrainingTimesLastYear',
    ]],
    drop_first=True,
)

# Fit a Cox proportional hazards model on this frame.
cph = CoxPHFitter()
cph.fit(cox_data, duration_col='tenure', event_col='event')

# Report the fit: summary table first, then the coefficient plot.
cph.print_summary()
cph.plot().set_title('Cox Proportional Hazards Model Coefficients')
plt.show()

In [None]:
# Columns left out of the full model: 'tenure' is a copy of 'YearsAtCompany'
# and 'event' is derived from 'Attrition', so both sources are dropped.
excluded = ['YearsAtCompany','Attrition']

# Every remaining column becomes part of the model frame; non-numeric columns
# are one-hot encoded with the first level of each dropped.
# NOTE(review): any identifier-like column still present in employee_data
# would be treated as a covariate here - confirm that is intended.
cox_data = pd.get_dummies(
    employee_data.drop(columns=excluded),
    drop_first=True,
)

# Fit a Cox proportional hazards model on the encoded frame.
cph = CoxPHFitter()
cph.fit(cox_data, duration_col='tenure', event_col='event')

# Report the fit: summary table first, then the coefficient plot.
cph.print_summary()
cph.plot().set_title('Cox Proportional Hazards Model Coefficients')
plt.show()