# Employee Attrition Analysis
This notebook explores factors influencing employee
attrition using the IBM HR dataset.

## Setup and imports


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Optional: apply seaborn's "whitegrid" style so plots look nicer
sns.set(style="whitegrid")

### Load the dataset

In [None]:
# Load the IBM HR attrition CSV (the path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='employee_attrition_ibm_hr.csv')

# Preview the first five rows of the loaded frame
df.head(n=5)

### Understanding the dataset

In [None]:
# Check column names, non-null counts, and data types
df.info()

In [None]:
# Count the missing entries in each column
df.isna().sum()

In [32]:
# Get summary statistics for numerical columns, rounded to 2 decimal places
df.describe().round(2)

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,...,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92,802.49,9.19,2.91,1.0,1024.87,2.72,65.89,2.73,2.06,...,2.71,80.0,0.79,11.28,2.8,2.76,7.01,4.23,2.19,4.12
std,9.14,403.51,8.11,1.02,0.0,602.02,1.09,20.33,0.71,1.11,...,1.08,0.0,0.85,7.78,1.29,0.71,6.13,3.62,3.22,3.57
min,18.0,102.0,1.0,1.0,1.0,1.0,1.0,30.0,1.0,1.0,...,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,465.0,2.0,2.0,1.0,491.25,2.0,48.0,2.0,1.0,...,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,802.0,7.0,3.0,1.0,1020.5,3.0,66.0,3.0,2.0,...,3.0,80.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,1157.0,14.0,4.0,1.0,1555.75,4.0,83.75,3.0,3.0,...,4.0,80.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1499.0,29.0,5.0,1.0,2068.0,4.0,100.0,4.0,5.0,...,4.0,80.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


 ###    Identify numeric columns that might be categorical

In [None]:
# Numeric columns with only a handful of distinct values are usually coded
# categories (e.g. 1-4 rating scales) rather than true measurements.
# A numeric column with fewer distinct values than this threshold is flagged.
MAX_CATEGORICAL_LEVELS = 10

# Select numeric columns
numeric_cols = df.select_dtypes(include='number').columns

# Count distinct values once per numeric column and reuse the result below
unique_counts = df[numeric_cols].nunique()

# Print number of unique values per numeric column
for col, unique_vals in unique_counts.items():
    print(f"{col}: {unique_vals} unique values")

# Identify likely categorical numeric columns (few unique values).
# Constant columns (a single unique value) pass this test as well.
categorical_numeric = unique_counts[unique_counts < MAX_CATEGORICAL_LEVELS].index.tolist()
print("\nLikely categorical numeric columns:", categorical_numeric)

 ###    Map categorical numeric columns

In [None]:
# Define mappings (numeric code -> human-readable label)
education_map = {1: 'Below College', 2: 'College', 3: 'Bachelor', 4: 'Master', 5: 'Doctor'}
satisfaction_map = {1: 'Low', 2: 'Medium', 3: 'High', 4: 'Very High'}
worklife_map = {1: 'Bad', 2: 'Good', 3: 'Better', 4: 'Best'}
performance_map = {1: 'Low', 2: 'Good', 3: 'Excellent', 4: 'Outstanding'}

# Column -> mapping to apply; three columns share the satisfaction scale
column_maps = {
    'Education': education_map,
    'EnvironmentSatisfaction': satisfaction_map,
    'JobInvolvement': satisfaction_map,
    'JobSatisfaction': satisfaction_map,
    'WorkLifeBalance': worklife_map,
    'PerformanceRating': performance_map,
}
# NOTE(review): RelationshipSatisfaction looks like it uses the same 1-4 scale
# but is not mapped here -- confirm whether that is intentional.

# Apply mappings
for col, mapping in column_maps.items():
    # Only map columns that still hold numeric codes. Series.map turns values
    # missing from the dict into NaN, so re-running this cell on columns that
    # were already labelled would otherwise replace every label with NaN.
    if not pd.api.types.is_numeric_dtype(df[col]):
        continue
    labelled = df[col].map(mapping)
    # Fail loudly if a code has no label instead of silently exporting NaN
    unmapped = df.loc[labelled.isna() & df[col].notna(), col].unique()
    assert len(unmapped) == 0, f"{col}: no label defined for codes {sorted(unmapped.tolist())}"
    df[col] = labelled

# Verify mappings (show every mapped column)
df[list(column_maps)].head()


###     Employee attrition analysis

In [None]:
# Count employees who left vs stayed
attrition_counts = df['Attrition'].value_counts()
# Print the header on its own line: passing both values to a single print()
# inserts a separator space after "\n" and shifts the first row of the table.
print("Attrition counts:")
print(attrition_counts)

# Plot attrition distribution on an explicit Axes so the title and labels
# are attached to this figure rather than to pyplot's implicit current axes
fig, ax = plt.subplots()
sns.countplot(x='Attrition', data=df, ax=ax)
ax.set_title('Employee Attrition Distribution')
ax.set_xlabel('Attrition')
ax.set_ylabel('Number of employees')
plt.show()


### Export cleaned data to Tableau

In [33]:
# Write the current frame to CSV, leaving out the pandas row index
df.to_csv(path_or_buf='employee_attrition_cleaned.csv', index=False)
