#Importing Libraries

In [16]:
import numpy as np
import pandas as pd

# Loading Dataset

In [17]:
df = pd.read_csv('HR-Employee-Attrition.csv')

# Initial Exploration

In [18]:
# Selects and displays 5 random rows from the DataFrame
df.sample(5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
1109,30,No,Travel_Rarely,1288,Sales,29,4,Technical Degree,1,1568,...,2,80,1,9,3,3,4,2,1,3
1160,45,No,Travel_Rarely,1329,Research & Development,2,2,Other,1,1635,...,1,80,2,10,3,3,10,7,3,9
533,40,No,Travel_Frequently,580,Sales,5,4,Life Sciences,1,729,...,3,80,1,20,2,3,18,13,1,12
1216,43,No,Travel_Rarely,1179,Sales,2,3,Medical,1,1706,...,1,80,1,10,3,3,10,9,8,8
894,54,No,Travel_Rarely,685,Research & Development,3,3,Life Sciences,1,1250,...,1,80,0,36,2,3,10,9,0,9


In [19]:
shape = df.shape
missing = df.isnull().sum().sum()
duplicate = df.duplicated().sum()

print("Rows:", shape[0])
print("Columns:", shape[1])
print("Missing values:", missing)
print("Duplicate values:",duplicate)

Rows: 1470
Columns: 35
Missing values: 0
Duplicate values: 0


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [21]:
num_cols = df.select_dtypes(include=['int64']).columns
cat_cols = df.select_dtypes(include=['object']).columns
print("Numerical:", list(num_cols))
print("Categorical:", list(cat_cols))
print(f"Number of Numerical Columns: {len(num_cols)}")
print(f"Number of Categorical Columns: {len(cat_cols)}")

Numerical: ['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']
Categorical: ['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'Over18', 'OverTime']
Number of Numerical Columns: 26
Number of Categorical Columns: 9


# Changed numerical values to categorical to improve visualization

In [22]:
df.select_dtypes(include=['int64']).sample(5)

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
272,28,1158,9,3,1,377,4,94,3,1,...,4,80,1,5,3,2,5,2,0,4
1063,29,1246,19,3,1,1497,3,77,2,2,...,3,80,2,10,3,3,10,7,0,4
1024,47,359,2,4,1,1443,1,82,3,4,...,2,80,2,26,2,4,20,17,5,6
884,40,107,10,3,1,1239,2,84,2,2,...,2,80,1,7,2,4,5,1,1,3
1221,45,1238,1,1,1,1712,3,74,2,3,...,4,80,1,25,3,2,23,15,14,4


In [23]:
df["Education"] = df["Education"].replace({
    1: "High School or Below",
    2: "College Graduate",
    3: "Bachelor’s Degree",
    4: "Master’s Degree",
    5: "Doctorate"
})

df["EnvironmentSatisfaction"] = df["EnvironmentSatisfaction"].replace({
    1: "Low Satisfaction",
    2: "Moderate Satisfaction",
    3: "High Satisfaction",
    4: "Very High Satisfaction"
})

df["JobInvolvement"] = df["JobInvolvement"].replace({
    1: "Low Involvement",
    2: "Moderate Involvement",
    3: "High Involvement",
    4: "Very High Involvement"
})

df["JobLevel"] = df["JobLevel"].replace({
    1: "Entry Level",
    2: "Junior Level",
    3: "Mid Level",
    4: "Senior Level",
    5: "Executive Level"
})

df["JobSatisfaction"] = df["JobSatisfaction"].replace({
    1: "Low Satisfaction",
    2: "Moderate Satisfaction",
    3: "High Satisfaction",
    4: "Very High Satisfaction"
})

df["PerformanceRating"] = df["PerformanceRating"].replace({
    1: "Poor Performance",
    2: "Good Performance",
    3: "Excellent Performance",
    4: "Outstanding Performance"
})

df["RelationshipSatisfaction"] = df["RelationshipSatisfaction"].replace({
    1: "Low Satisfaction",
    2: "Moderate Satisfaction",
    3: "High Satisfaction",
    4: "Very High Satisfaction"
})

df["WorkLifeBalance"] = df["WorkLifeBalance"].replace({
    1: "Poor Balance",
    2: "Good Balance",
    3: "Better Balance",
    4: "Excellent Balance"
})

## Summary stat of numeric variables

In [24]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,1470.0,36.92381,9.135373,18.0,30.0,36.0,43.0,60.0
DailyRate,1470.0,802.485714,403.5091,102.0,465.0,802.0,1157.0,1499.0
DistanceFromHome,1470.0,9.192517,8.106864,1.0,2.0,7.0,14.0,29.0
EmployeeCount,1470.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
EmployeeNumber,1470.0,1024.865306,602.024335,1.0,491.25,1020.5,1555.75,2068.0
HourlyRate,1470.0,65.891156,20.329428,30.0,48.0,66.0,83.75,100.0
MonthlyIncome,1470.0,6502.931293,4707.956783,1009.0,2911.0,4919.0,8379.0,19999.0
MonthlyRate,1470.0,14313.103401,7117.786044,2094.0,8047.0,14235.5,20461.5,26999.0
NumCompaniesWorked,1470.0,2.693197,2.498009,0.0,1.0,2.0,4.0,9.0
PercentSalaryHike,1470.0,15.209524,3.659938,11.0,12.0,14.0,18.0,25.0


# Removing Unnecessary Columns
The minimum age is 18, so all employees are adults — the 'Over18' column is not needed.

'EmployeeCount' and 'StandardHours' have the same value for everyone, so they don't add any useful information.

'EmployeeNumber' is just an identifier and not useful for analysis, so we remove it too.

In [25]:
df.drop(['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'], axis="columns", inplace=True)

In [26]:
categorical = []
for column in df.columns:
    if df[column].dtype == 'object' and df[column].nunique() <= 30:
        categorical.append(column)

        print(f"\nColumn: {column}")
        print("Unique Values:", df[column].unique())
        print("-" * 70)

        print(df[column].value_counts(dropna=False))
        print("=" * 70)

# Remove target variable if needed
if 'Attrition' in categorical:
    categorical.remove('Attrition')


Column: Attrition
Unique Values: ['Yes' 'No']
----------------------------------------------------------------------
Attrition
No     1233
Yes     237
Name: count, dtype: int64

Column: BusinessTravel
Unique Values: ['Travel_Rarely' 'Travel_Frequently' 'Non-Travel']
----------------------------------------------------------------------
BusinessTravel
Travel_Rarely        1043
Travel_Frequently     277
Non-Travel            150
Name: count, dtype: int64

Column: Department
Unique Values: ['Sales' 'Research & Development' 'Human Resources']
----------------------------------------------------------------------
Department
Research & Development    961
Sales                     446
Human Resources            63
Name: count, dtype: int64

Column: Education
Unique Values: ['College Graduate' 'High School or Below' 'Master’s Degree'
 'Bachelor’s Degree' 'Doctorate']
----------------------------------------------------------------------
Education
Bachelor’s Degree       572
Master’s Degree   

In [27]:
df.nunique()

Age                           43
Attrition                      2
BusinessTravel                 3
DailyRate                    886
Department                     3
DistanceFromHome              29
Education                      5
EducationField                 6
EnvironmentSatisfaction        4
Gender                         2
HourlyRate                    71
JobInvolvement                 4
JobLevel                       5
JobRole                        9
JobSatisfaction                4
MaritalStatus                  3
MonthlyIncome               1349
MonthlyRate                 1427
NumCompaniesWorked            10
OverTime                       2
PercentSalaryHike             15
PerformanceRating              2
RelationshipSatisfaction       4
StockOptionLevel               4
TotalWorkingYears             40
TrainingTimesLastYear          7
WorkLifeBalance                4
YearsAtCompany                37
YearsInCurrentRole            19
YearsSinceLastPromotion       16
YearsWithC

##Chi-Square Test of Independence

In [28]:
from scipy.stats import chi2_contingency
df["Attrition"] = df["Attrition"].astype("category")

# Select categorical variables (including 'object' types)
cat_cols = df.select_dtypes(include=["category", "object"]).columns.tolist()
cat_cols = [col for col in cat_cols if col != "Attrition"]  # exclude target

chi_statistic = []
p_val = []
vars_rm = []

for col in cat_cols:
    observed = pd.crosstab(df["Attrition"], df[col])
    stat, p, dof, expected = chi2_contingency(observed)
    chi_statistic.append(stat)
    p_val.append(p)

    if p >= 0.05:
        print(f"Attrition and {col} are independent (p-value = {p:.2f}).\n")
        vars_rm.append(col)

chi_df = pd.DataFrame({
    "Variable": cat_cols,
    "Chi_Statistic": chi_statistic,
    "P_value": p_val
}).sort_values("P_value", ascending=True)

display(chi_df)


Attrition and Education are independent (p-value = 0.55).

Attrition and Gender are independent (p-value = 0.29).

Attrition and PerformanceRating are independent (p-value = 0.99).

Attrition and RelationshipSatisfaction are independent (p-value = 0.15).



Unnamed: 0,Variable,Chi_Statistic,P_value
11,OverTime,87.564294,8.158424e-21
8,JobRole,86.190254,2.752482e-15
7,JobLevel,72.529013,6.634685e-15
10,MaritalStatus,46.163677,9.455511e-11
6,JobInvolvement,28.492021,2.863181e-06
0,BusinessTravel,24.182414,5.608614e-06
4,EnvironmentSatisfaction,22.503881,5.123469e-05
9,JobSatisfaction,17.505077,0.0005563005
14,WorkLifeBalance,16.325097,0.0009725699
1,Department,10.796007,0.004525607


In [29]:
print("All chi-square test results:\n")
print(chi_df.head())

All chi-square test results:

          Variable  Chi_Statistic       P_value
11        OverTime      87.564294  8.158424e-21
8          JobRole      86.190254  2.752482e-15
7         JobLevel      72.529013  6.634685e-15
10   MaritalStatus      46.163677  9.455511e-11
6   JobInvolvement      28.492021  2.863181e-06


Based on the chi-square test results, the factors that significantly influence whether an employee stays with the company include Work-Life Balance, Job Satisfaction, Overtime status, Stock Option Level, and Department.

Conversely, variables such as Education, Gender, Performance Rating, and Relationship Satisfaction were found to be independent of Attrition and will therefore be excluded from the predictor set in the model.