In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Load the employee attrition dataset; the path is relative to the notebook's
# working directory.
df = pd.read_csv('employee_attrition.csv')

# Preview the first rows instead of rendering the whole frame, and report the
# (rows, columns) shape separately so that information is not lost.
print(df.shape)
df.head()

In [None]:
# Missing-value check: print the distinct per-column NaN counts. A bare expression
# in the middle of a cell is evaluated but never displayed, so it has to be printed.
# The notebook's working assumption is that this shows only 0 — confirm on the output.
print(df.isna().sum().unique())

# Drop the columns this notebook treats as uninformative:
# 'EmployeeCount', 'Over18' and 'StandardHours'.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell
# idempotent: re-running it after the columns are gone no longer raises KeyError.
df = df.drop(columns=['EmployeeCount', 'Over18', 'StandardHours'], errors='ignore')

In [None]:
# Columns this notebook treats as categorical.
categorical = [
    'WorkLifeBalance',
    'EducationField',
    'Department',
    'JobRole',
    'MaritalStatus',
    'Gender',
    'OverTime',
    'Attrition',
    'BusinessTravel',
    'Education',
    'EnvironmentSatisfaction',
    'JobInvolvement',
    'JobLevel',
    'JobSatisfaction',
    'PerformanceRating',
    'RelationshipSatisfaction',
    'StockOptionLevel',
]

# Every remaining column is treated as numerical; the frame's column order is kept.
numerical = df.columns[~df.columns.isin(categorical)].tolist()

In [None]:
# Number of employees per value of the Attrition column.
plt.figure(figsize=(6,3))
sns.countplot(x = 'Attrition' , data = df)
# Title fixed: the plotted column is Attrition (listed under `categorical`),
# not a numerical feature.
plt.title('Distribution of Attrition')
plt.show()

In [None]:
# Employee counts per stock option level, with one bar per Attrition value.
fig, ax = plt.subplots(figsize=(6, 3))
sns.countplot(data=df, x='StockOptionLevel', hue='Attrition', ax=ax)
ax.set_title('Stock Option Levels by Attrition')
ax.set_xlabel('Stock Option Level')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Pairwise Pearson correlation between the columns listed in `numerical`,
# rendered as an annotated heatmap (two decimals per cell).
corr = df.loc[:, numerical].corr(method='pearson')

fig, ax = plt.subplots(figsize=(9, 6))
sns.heatmap(corr, annot=True, fmt='.2f', ax=ax)
ax.set_title('Numerical Features Correlation')
plt.show()

In [None]:
# Spearman (rank) correlation over every numeric column, i.e. the numerical
# features plus the categorical columns that are stored as numbers.
# include='number' selects numeric dtypes explicitly. exclude='object' would also
# let through any non-object, non-numeric dtype (e.g. a dedicated string or
# category dtype), on which .corr() would fail.
numerical_df = df.select_dtypes(include='number')

corr = numerical_df.corr(method='spearman')
plt.figure(figsize=(11,8))
sns.heatmap(data=corr,annot = True,fmt = '.1f')
# Title added so this figure can be told apart from the Pearson heatmap.
plt.title('Spearman Correlation of Numeric Features')
plt.show()

In [None]:
# Box plots of every numerical feature, one box per Attrition value, to compare
# how each feature is distributed across attrition outcomes.
numerical_features = df[numerical]

n_features = numerical_features.columns.size
print(n_features)

# Size the grid from the feature count instead of a fixed 5x5: plt.subplot raises
# ValueError once the panel index exceeds rows * cols (more than 25 features),
# and a fixed grid leaves empty rows when there are fewer.
n_cols = 5
n_rows = max(1, int(np.ceil(n_features / n_cols)))

# 4 x 3 inches per panel, the same panel size as a 20 x 15 figure on a 5x5 grid.
plt.figure(figsize=(4 * n_cols, 3 * n_rows))
for i, feature in enumerate(numerical_features.columns, 1):
    plt.subplot(n_rows, n_cols, i)
    sns.boxplot(x='Attrition', y=feature, data=df)
    plt.title(f'Box plot of {feature}')
plt.tight_layout()
plt.show()

In [None]:
# Attrition counts for each marital status, drawn as grouped (side-by-side) bars.
contingency_table = pd.crosstab(df['MaritalStatus'],df['Attrition'])
contingency_table.plot(kind='bar', figsize=(8, 5), colormap='viridis')
# Labels fixed: the table is indexed by MaritalStatus (not Gender), and the bars
# are grouped rather than stacked because stacked=True is not passed.
plt.title('Attrition Counts by Marital Status')
plt.xlabel('Marital Status')
plt.ylabel('Count')
plt.show()

In [None]:
# Share of 'Yes' outcomes per row: Yes / (No + Yes).
# Despite the column name this is a fraction, not a value scaled to 0-100.
contingency_table['percentage'] = contingency_table['Yes'].div(
    contingency_table[['No', 'Yes']].sum(axis=1)
)
contingency_table

In [None]:
# Attrition counts for every (Gender, MaritalStatus) combination.
contingency_table = pd.crosstab([df['Gender'], df['MaritalStatus']], df['Attrition'])
# Heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(contingency_table, annot=True, fmt='d')
# Title fixed: the rows combine Gender and MaritalStatus, not Gender alone.
plt.title('Heatmap of Gender & Marital Status on Attrition')
plt.show()

In [None]:
# Attrition rate (share of 'Yes') per row of the current contingency table,
# which the preceding cell builds from Gender and MaritalStatus.
contingency_table['percentage'] = contingency_table['Yes']/ (contingency_table['No'] + contingency_table['Yes'] )
ax = contingency_table['percentage'].plot(kind='bar', figsize=(9, 6), colormap='viridis',ylim=(0,1))
# Title and y label added so the figure is readable on its own. The x label is
# left to pandas, which derives it from the index names.
ax.set_title('Attrition Rate by Gender and Marital Status')
ax.set_ylabel('Attrition rate')
plt.show()

In [None]:
# Attrition counts for each Education value.
contingency_table = pd.crosstab([df['Education']], df['Attrition'])
# Heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(contingency_table, annot=True, fmt='d')
# Title fixed: the rows are Education values, not Gender.
plt.title('Heatmap of Education and Attrition')
plt.show()

In [None]:
# Attrition rate (share of 'Yes') per row of the current contingency table,
# which the preceding cell builds from Education.
contingency_table['percentage'] = contingency_table['Yes']/ (contingency_table['No'] + contingency_table['Yes'] )
ax = contingency_table['percentage'].plot(kind='bar', figsize=(9, 6), colormap='viridis',ylim=(0,1))
# Title and y label added so the figure is readable on its own.
ax.set_title('Attrition Rate by Education')
ax.set_ylabel('Attrition rate')
plt.show()

In [None]:
# Attrition counts for every (JobRole, Department) pair, shown as an annotated
# heatmap with integer cell labels.
contingency_table = pd.crosstab(
    index=[df['JobRole'], df['Department']],
    columns=df['Attrition'],
)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(contingency_table, annot=True, fmt='d', ax=ax)
ax.set_title('Heatmap of JobRole & Department on Attrition')
plt.show()

In [None]:
# Attrition rate (share of 'Yes') per row of the current contingency table,
# which the preceding cell builds from JobRole and Department.
contingency_table['percentage'] = contingency_table['Yes']/ (contingency_table['No'] + contingency_table['Yes'] )
ax = contingency_table['percentage'].plot(kind='bar', figsize=(9, 6), colormap='viridis',ylim=(0,1))
# Title and y label added so the figure is readable on its own. The x label is
# left to pandas, which derives it from the index names.
ax.set_title('Attrition Rate by JobRole and Department')
ax.set_ylabel('Attrition rate')
plt.show()

In [None]:
# Attrition counts for each StockOptionLevel value.
contingency_table = pd.crosstab([df['StockOptionLevel']], df['Attrition'])
# Heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(contingency_table, annot=True, fmt='d')
# Title fixed: the rows are StockOptionLevel values, not JobRole/Department.
plt.title('Heatmap of StockOptionLevel on Attrition')
plt.show()

In [None]:
# Attrition rate (share of 'Yes') per row of the current contingency table,
# which the preceding cell builds from StockOptionLevel.
contingency_table['percentage'] = contingency_table['Yes']/ (contingency_table['No'] + contingency_table['Yes'] )
ax = contingency_table['percentage'].plot(kind='bar', figsize=(9, 6), colormap='viridis',ylim=(0,1))
# Title and y label added so the figure is readable on its own.
ax.set_title('Attrition Rate by StockOptionLevel')
ax.set_ylabel('Attrition rate')
plt.show()

In [None]:
# Attrition rate for every combination of the three satisfaction ratings.
contingency_table = pd.crosstab([df['EnvironmentSatisfaction'],df['JobSatisfaction'],df['RelationshipSatisfaction']],df['Attrition'])

# Share of 'Yes' outcomes per combination (a fraction, plotted on a 0-1 axis).
contingency_table['percentage'] = contingency_table['Yes']/ (contingency_table['No'] + contingency_table['Yes'] )
contingency_table['percentage'].plot(kind='bar', figsize=(15, 8), colormap='viridis',ylim=(0,1))

# Labels fixed: the bars show an attrition rate per satisfaction combination,
# not a stacked count by Gender.
plt.title('Attrition Rate by Environment, Job and Relationship Satisfaction')
plt.xlabel('EnvironmentSatisfaction, JobSatisfaction, RelationshipSatisfaction')
plt.ylabel('Attrition rate')
plt.show()

In [None]:
# Attrition rate for every (EnvironmentSatisfaction, RelationshipSatisfaction) pair.
contingency_table = pd.crosstab([df['EnvironmentSatisfaction'],df['RelationshipSatisfaction']],df['Attrition'])
# Share of 'Yes' outcomes per pair. The y axis is capped at 0.5 to zoom in.
contingency_table['percentage'] = contingency_table['Yes']/ (contingency_table['No'] + contingency_table['Yes'] )
contingency_table['percentage'].plot(kind='bar', figsize=(15, 8), colormap='viridis',ylim=(0,0.5))

# Labels fixed: the bars show an attrition rate per satisfaction pair,
# not a stacked count by Gender.
plt.title('Attrition Rate by Environment and Relationship Satisfaction')
plt.xlabel('EnvironmentSatisfaction, RelationshipSatisfaction')
plt.ylabel('Attrition rate')
plt.show()

In [None]:
# Recompute the 'Yes' share, Yes / (No + Yes), on the current contingency table
# and display the table with that column.
yes_share = contingency_table['Yes'].div(
    contingency_table['No'].add(contingency_table['Yes'])
)
contingency_table['percentage'] = yes_share
contingency_table

In [None]:
# Overtime in Sales Department is a contributing factor to employee attrition
# Restrict to Sales rows, then count Attrition outcomes per OverTime value.
sales_df = df.loc[df['Department'].eq('Sales')]
contingency_table = pd.crosstab(
    index=sales_df['OverTime'],
    columns=sales_df['Attrition'],
)
contingency_table.plot.bar(figsize=(9, 6), colormap='viridis')

In [None]:
# Attrition rate (share of 'Yes') per row of the current contingency table,
# which the preceding cell builds from OverTime within the Sales department.
contingency_table['percentage'] = contingency_table['Yes']/ (contingency_table['No'] + contingency_table['Yes'] )
ax = contingency_table['percentage'].plot(kind='bar',ylim = (0,1))
# Title and y label added so the figure is readable on its own.
ax.set_title('Attrition Rate by OverTime (Sales Department)')
ax.set_ylabel('Attrition rate')
plt.show()

In [None]:
# Sales-only rows, then Attrition counts per JobInvolvement value as grouped bars.
is_sales_row = df['Department'] == 'Sales'
sales_df = df[is_sales_row]
contingency_table = pd.crosstab(
    index=sales_df['JobInvolvement'],
    columns=sales_df['Attrition'],
)
contingency_table.plot.bar(figsize=(9, 6), colormap='viridis')

In [None]:
# Attrition rate (share of 'Yes') per row of the current contingency table,
# which the preceding cell builds from JobInvolvement within the Sales department.
contingency_table['percentage'] = contingency_table['Yes']/ (contingency_table['No'] + contingency_table['Yes'] )
ax = contingency_table['percentage'].plot(kind='bar',ylim = (0,1))
# Title and y label added so the figure is readable on its own.
ax.set_title('Attrition Rate by JobInvolvement (Sales Department)')
ax.set_ylabel('Attrition rate')
plt.show()

In [None]:
# Split the frame by Attrition value: rows marked 'Yes' and rows marked 'No'.
yes_df = df.loc[df['Attrition'].eq('Yes')]
no_df = df.loc[df['Attrition'].eq('No')]

In [None]:
# Attrition rate for each JobLevel value.
contingency_table = pd.crosstab([df['JobLevel']],df['Attrition'])
# Share of 'Yes' outcomes per level. The y axis is capped at 0.5 to zoom in.
contingency_table['percentage'] = contingency_table['Yes']/ (contingency_table['No'] + contingency_table['Yes'] )
contingency_table['percentage'].plot(kind='bar', figsize=(9,6), colormap='viridis',ylim=(0,0.5))

# Labels fixed: the bars show an attrition rate per JobLevel,
# not a stacked count by Gender.
plt.title('Attrition Rate by Job Level')
plt.xlabel('Job Level')
plt.ylabel('Attrition rate')
plt.show()