# HR Data Analysis

In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the HR records from the CSV file into the working DataFrame
df = pd.read_csv(filepath_or_buffer='HR Data.csv')

In [60]:
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called directly; wrapping it in print() would append a stray
# "None" line after the summary.
print("Initial dataset info:")
df.info()

Initial dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  Jo

In [61]:
# Show the column labels as loaded, before any columns are dropped or renamed.
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

# 1. Removing unnecessary columns

In [62]:
# Columns excluded from the rest of the analysis
columns_to_drop = [
    'EmployeeCount',
    'EmployeeNumber',
    'StandardHours',
    'Over18',
]
df = df.drop(labels=columns_to_drop, axis='columns')

print("Columns removed:", columns_to_drop)

Columns removed: ['EmployeeCount', 'EmployeeNumber', 'StandardHours', 'Over18']


# 2. Rename columns

In [63]:
# Mapping of original column label -> label used for the rest of the notebook.
# NOTE(review): 'LastPromotionYear' replaces 'YearsSinceLastPromotion', which
# reads like a duration in years rather than a calendar year — confirm the
# new name is intended.
column_rename = {
    'BusinessTravel':          'TravelFrequency',
    'DailyRate':               'DailyPay',
    'DistanceFromHome':        'CommuteDistance',
    'EnvironmentSatisfaction': 'EnvSatisfaction',
    'JobInvolvement':          'JobEngagement',
    'MonthlyIncome':           'MonthlySalary',
    'NumCompaniesWorked':      'PriorCompanies',
    'PercentSalaryHike':       'SalaryHikePercent',
    'TotalWorkingYears':       'TotalExperience',
    'TrainingTimesLastYear':   'TrainingLastYear',
    'YearsAtCompany':          'CompanyTenure',
    'YearsInCurrentRole':      'CurrentRoleTenure',
    'YearsSinceLastPromotion': 'LastPromotionYear',
    'YearsWithCurrManager':    'CurrentManagerTenure',
}

# Apply the mapping along the column axis
df = df.rename(mapper=column_rename, axis='columns')

# Report every rename, one "old -> new" pair per line
print("\nColumns renamed:")
print("\n".join(f"{old} -> {new}" for old, new in column_rename.items()))


Columns renamed:
BusinessTravel -> TravelFrequency
DailyRate -> DailyPay
DistanceFromHome -> CommuteDistance
EnvironmentSatisfaction -> EnvSatisfaction
JobInvolvement -> JobEngagement
MonthlyIncome -> MonthlySalary
NumCompaniesWorked -> PriorCompanies
PercentSalaryHike -> SalaryHikePercent
TotalWorkingYears -> TotalExperience
TrainingTimesLastYear -> TrainingLastYear
YearsAtCompany -> CompanyTenure
YearsInCurrentRole -> CurrentRoleTenure
YearsSinceLastPromotion -> LastPromotionYear
YearsWithCurrManager -> CurrentManagerTenure


# 3. Eliminating redundant entries

In [64]:
# Count rows before and after discarding exact duplicate rows.
# NOTE(review): EmployeeNumber was dropped in an earlier cell, so rows are
# compared on the remaining columns only.
df_before = len(df)
df = df.drop_duplicates(keep='first')
df_after = len(df)

print(f"\nRedundant entries removed: {df_before - df_after}")


Redundant entries removed: 0


# 4. Sanitize specific columns

In [65]:
# Normalise the casing of every text (object-dtype) column to Title Case
categorical_columns = df.select_dtypes(include=['object']).columns
df = df.assign(**{col: df[col].str.title() for col in categorical_columns})

# Coerce the key numeric columns to a numeric dtype; values that cannot be
# parsed become NaN (errors='coerce')
numeric_columns = ['Age', 'DailyPay', 'CommuteDistance', 'MonthlySalary', 'PriorCompanies', 'TotalExperience']
df = df.assign(**{col: pd.to_numeric(df[col], errors='coerce') for col in numeric_columns})

print("\nColumns sanitized:")
print(df.dtypes)


Columns sanitized:
Age                          int64
Attrition                   object
TravelFrequency             object
DailyPay                     int64
Department                  object
CommuteDistance              int64
Education                    int64
EducationField              object
EnvSatisfaction              int64
Gender                      object
HourlyRate                   int64
JobEngagement                int64
JobLevel                     int64
JobRole                     object
JobSatisfaction              int64
MaritalStatus               object
MonthlySalary                int64
MonthlyRate                  int64
PriorCompanies               int64
OverTime                    object
SalaryHikePercent            int64
PerformanceRating            int64
RelationshipSatisfaction     int64
StockOptionLevel             int64
TotalExperience              int64
TrainingLastYear             int64
WorkLifeBalance              int64
CompanyTenure                int64


# 5. Eliminate NaN values

In [66]:
# Tally missing cells, drop every row that contains one, then tally again
nan_before = df.isna().to_numpy().sum()
df = df.dropna(axis=0, how='any')
nan_after = df.isna().to_numpy().sum()

print(f"\nNaN values removed: {nan_before - nan_after}")


NaN values removed: 0


# Additional changes

In [67]:
# Bucket ages into labelled groups to make visualizations easier to read.
# pd.cut uses right-closed intervals by default, so the groups are
# (0, 20], (20, 40], (40, 60] and (60, 80]; ages outside (0, 80] become NaN.
bins = [0, 20, 40, 60, 80]
# The last label was misspelled ('sinior-citizan'); the corrected spelling is
# what will appear in category listings and plot legends.
labels = ['young', 'middle-aged', 'old-aged', 'senior-citizen']

df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=labels)

print(df['AgeGroup'].unique())

['old-aged', 'middle-aged', 'young']
Categories (4, object): ['young' < 'middle-aged' < 'old-aged' < 'sinior-citizan']


In [68]:
# Show the column labels after the cleaning steps; the derived 'AgeGroup'
# column is expected to be listed last.
df.columns

Index(['Age', 'Attrition', 'TravelFrequency', 'DailyPay', 'Department',
       'CommuteDistance', 'Education', 'EducationField', 'EnvSatisfaction',
       'Gender', 'HourlyRate', 'JobEngagement', 'JobLevel', 'JobRole',
       'JobSatisfaction', 'MaritalStatus', 'MonthlySalary', 'MonthlyRate',
       'PriorCompanies', 'OverTime', 'SalaryHikePercent', 'PerformanceRating',
       'RelationshipSatisfaction', 'StockOptionLevel', 'TotalExperience',
       'TrainingLastYear', 'WorkLifeBalance', 'CompanyTenure',
       'CurrentRoleTenure', 'LastPromotionYear', 'CurrentManagerTenure',
       'AgeGroup'],
      dtype='object')