In [62]:
# Importing Various Modules
import pandas as pd
import numpy as np
import re

# Location of the raw CSV, given relative to the current working directory
file_path = './Data/IBM-HR-Analytics-Employee-Attrition-and-Performance.csv'

# Read the CSV into the DataFrame that the following cells operate on
df = pd.read_csv(filepath_or_buffer=file_path)


In [63]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe(percentiles=[0.25, 0.5, 0.75])


Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,...,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,802.485714,9.192517,2.912925,1.0,1024.865306,2.721769,65.891156,2.729932,2.063946,...,2.712245,80.0,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755,4.123129
std,9.135373,403.5091,8.106864,1.024165,0.0,602.024335,1.093082,20.329428,0.711561,1.10694,...,1.081209,0.0,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243,3.568136
min,18.0,102.0,1.0,1.0,1.0,1.0,1.0,30.0,1.0,1.0,...,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,465.0,2.0,2.0,1.0,491.25,2.0,48.0,2.0,1.0,...,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,802.0,7.0,3.0,1.0,1020.5,3.0,66.0,3.0,2.0,...,3.0,80.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,1157.0,14.0,4.0,1.0,1555.75,4.0,83.75,3.0,3.0,...,4.0,80.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1499.0,29.0,5.0,1.0,2068.0,4.0,100.0,4.0,5.0,...,4.0,80.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


In [64]:
"""
This code reports the size of a DataFrame, lists the column labels, adds spaces before capital letters in column names, and prints basic information about the attributes of the DataFrame.
"""

# Report the DataFrame's dimensions as a (rows, columns) tuple
print(f"Size of the dataframe: {df.shape}")

# List the original column labels before they are renamed below
print(f"Columns in the dataframe: {df.columns.tolist()}")

# Insert a space before every capital letter that directly follows a word
# character, e.g. 'BusinessTravel' -> 'Business Travel'. Labels with no such
# capital (e.g. 'Over18') are left unchanged.
df.columns = df.columns.map(lambda x: re.sub(r"(?<=\w)([A-Z])", r" \1", x).strip()) # Add space before capital letters

# Summarise column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()


Size of the dataframe: (1470, 35)
Columns in the dataframe: ['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department', 'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Age                         1470 non-null   int64 
 1   Attrition                   1470 non-null

In [65]:
"""
This code defines mappings for categorical columns and maps numerical values to categorical values in a DataFrame.
It then displays the updated DataFrame and provides a summary using the describe() function.
"""

# Define mappings for categorical columns.
# Keys are the raw labels stored in the dataset; values are the display labels.
# The printed frame shows these columns hold strings such as 'Travel_Rarely'
# and 'Research & Development', so maps keyed by integer codes never match and
# Series.map would fill every MAPPED column with NaN.
# NOTE(review): raw labels that are not visible in the notebook output are taken
# from the public IBM HR Attrition dataset -- confirm with df[col].unique().
# NOTE(review): 'Manufacturing Director' is shown as 'Managing Director'; the
# original display label is kept as-is -- confirm this is intended.
business_travel_map = {'Non-Travel': 'No Travel', 'Travel_Frequently': 'Travel Frequently', 'Travel_Rarely': 'Travel Rarely'}
department_map = {'Human Resources': 'HR', 'Research & Development': 'R&D', 'Sales': 'Sales'}
education_field_map = {'Human Resources': 'HR', 'Life Sciences': 'Life Sciences', 'Marketing': 'Marketing', 'Medical': 'Medical Sciences', 'Other': 'Others', 'Technical Degree': 'Technical'}
gender_map = {'Female': 'Female', 'Male': 'Male'}
marital_status_map = {'Divorced': 'Divorced', 'Married': 'Married', 'Single': 'Single'}
job_role_map = {'Healthcare Representative': 'HC Rep', 'Human Resources': 'HR', 'Laboratory Technician': 'Lab Technician', 'Manager': 'Manager', 'Manufacturing Director': 'Managing Director', 'Research Director': 'Research Director', 'Research Scientist': 'Research Scientist', 'Sales Executive': 'Sales Executive', 'Sales Representative': 'Sales Representative'}
over_18_map = {'Y': 'Yes', 'N': 'No'}
overtime_map = {'No': 'No', 'Yes': 'Yes'}

# (source column, new column, mapping) in the order the new columns are appended
column_mappings = [
    ('Business Travel', 'MAPPED Business Travel', business_travel_map),
    ('Department', 'MAPPED Department', department_map),
    ('Education Field', 'MAPPED Education Field', education_field_map),
    ('Gender', 'MAPPED Gender', gender_map),
    ('Marital Status', 'MAPPED Marital Status', marital_status_map),
    ('Job Role', 'MAPPED Job Role', job_role_map),
    ('Over18', 'MAPPED Over18', over_18_map),
    ('Over Time', 'MAPPED Overtime', overtime_map),
]

# Map raw labels to display labels
for source_col, mapped_col, label_map in column_mappings:
    df[mapped_col] = df[source_col].map(label_map)

    # Series.map yields NaN for values missing from the dict; surface those
    # instead of silently writing NaN into the new column.
    no_match = df[mapped_col].isna() & df[source_col].notna()
    if no_match.any():
        print(f"WARNING: no mapping for {source_col!r} values: {df.loc[no_match, source_col].unique().tolist()}")

# Display the updated DataFrame
print(df)

# Provide a summary of the DataFrame
df.describe()


      Age Attrition    Business Travel  Daily Rate              Department  \
0      41       Yes      Travel_Rarely        1102                   Sales   
1      49        No  Travel_Frequently         279  Research & Development   
2      37       Yes      Travel_Rarely        1373  Research & Development   
3      33        No  Travel_Frequently        1392  Research & Development   
4      27        No      Travel_Rarely         591  Research & Development   
...   ...       ...                ...         ...                     ...   
1465   36        No  Travel_Frequently         884  Research & Development   
1466   39        No      Travel_Rarely         613  Research & Development   
1467   27        No      Travel_Rarely         155  Research & Development   
1468   49        No  Travel_Frequently        1023                   Sales   
1469   34        No      Travel_Rarely         628  Research & Development   

      Distance From Home  Education Education Field  Employee C

Unnamed: 0,Age,Daily Rate,Distance From Home,Education,Employee Count,Employee Number,Environment Satisfaction,Hourly Rate,Job Involvement,Job Level,...,Relationship Satisfaction,Standard Hours,Stock Option Level,Total Working Years,Training Times Last Year,Work Life Balance,Years At Company,Years In Current Role,Years Since Last Promotion,Years With Curr Manager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,...,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,802.485714,9.192517,2.912925,1.0,1024.865306,2.721769,65.891156,2.729932,2.063946,...,2.712245,80.0,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755,4.123129
std,9.135373,403.5091,8.106864,1.024165,0.0,602.024335,1.093082,20.329428,0.711561,1.10694,...,1.081209,0.0,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243,3.568136
min,18.0,102.0,1.0,1.0,1.0,1.0,1.0,30.0,1.0,1.0,...,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,465.0,2.0,2.0,1.0,491.25,2.0,48.0,2.0,1.0,...,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,802.0,7.0,3.0,1.0,1020.5,3.0,66.0,3.0,2.0,...,3.0,80.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,1157.0,14.0,4.0,1.0,1555.75,4.0,83.75,3.0,3.0,...,4.0,80.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1499.0,29.0,5.0,1.0,2068.0,4.0,100.0,4.0,5.0,...,4.0,80.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


In [66]:

"""
This code identifies and lists the numerical and categorical features in a DataFrame.
It performs descriptive analysis on the numerical attributes and drops unnecessary columns from the DataFrame.
"""

# Identify and list numerical features in the DataFrame
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()

# Print the list of numerical features
print(f"Numerical features: {numerical_features}")

# Identify and list categorical (non-numeric) features in the DataFrame
categorical_features = df.select_dtypes(exclude=[np.number]).columns.tolist()

# Print the list of categorical features
print(f"Categorical features: {categorical_features}")

# Perform descriptive analysis on numerical attributes and print the result
print(df[numerical_features].describe())

# Drop unnecessary columns from the DataFrame.
# Only columns that are still present are dropped, and the result is assigned
# back instead of mutating in place, so re-running this cell does not raise
# KeyError once the columns are gone.
# NOTE(review): 'MAPPED Over18' is derived from 'Over18' but is not dropped
# here -- confirm whether it should be removed as well.
columns_to_drop = ['Employee Count', 'Employee Number', 'Over18', 'Standard Hours']
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

# Keep the feature lists in sync with the frame: without this they would still
# name the dropped columns, and df[numerical_features] would raise KeyError.
numerical_features = [col for col in numerical_features if col in df.columns]
categorical_features = [col for col in categorical_features if col in df.columns]

Numerical features: ['Age', 'Daily Rate', 'Distance From Home', 'Education', 'Employee Count', 'Employee Number', 'Environment Satisfaction', 'Hourly Rate', 'Job Involvement', 'Job Level', 'Job Satisfaction', 'Monthly Income', 'Monthly Rate', 'Num Companies Worked', 'Percent Salary Hike', 'Performance Rating', 'Relationship Satisfaction', 'Standard Hours', 'Stock Option Level', 'Total Working Years', 'Training Times Last Year', 'Work Life Balance', 'Years At Company', 'Years In Current Role', 'Years Since Last Promotion', 'Years With Curr Manager']
Categorical features: ['Attrition', 'Business Travel', 'Department', 'Education Field', 'Gender', 'Job Role', 'Marital Status', 'Over18', 'Over Time', 'MAPPED Business Travel', 'MAPPED Department', 'MAPPED Education Field', 'MAPPED Gender', 'MAPPED Marital Status', 'MAPPED Job Role', 'MAPPED Over18', 'MAPPED Overtime']
               Age   Daily Rate  Distance From Home    Education  \
count  1470.000000  1470.000000         1470.000000  147

In [67]:
# Summary statistics of the numeric columns that remain after the preceding column drop
df.describe(percentiles=[0.25, 0.5, 0.75])


Unnamed: 0,Age,Daily Rate,Distance From Home,Education,Environment Satisfaction,Hourly Rate,Job Involvement,Job Level,Job Satisfaction,Monthly Income,...,Performance Rating,Relationship Satisfaction,Stock Option Level,Total Working Years,Training Times Last Year,Work Life Balance,Years At Company,Years In Current Role,Years Since Last Promotion,Years With Curr Manager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,...,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,802.485714,9.192517,2.912925,2.721769,65.891156,2.729932,2.063946,2.728571,6502.931293,...,3.153741,2.712245,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755,4.123129
std,9.135373,403.5091,8.106864,1.024165,1.093082,20.329428,0.711561,1.10694,1.102846,4707.956783,...,0.360824,1.081209,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243,3.568136
min,18.0,102.0,1.0,1.0,1.0,30.0,1.0,1.0,1.0,1009.0,...,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,465.0,2.0,2.0,2.0,48.0,2.0,1.0,2.0,2911.0,...,3.0,2.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,802.0,7.0,3.0,3.0,66.0,3.0,2.0,3.0,4919.0,...,3.0,3.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,1157.0,14.0,4.0,4.0,83.75,3.0,3.0,4.0,8379.0,...,3.0,4.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1499.0,29.0,5.0,4.0,100.0,4.0,5.0,4.0,19999.0,...,4.0,4.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


In [68]:
# Write the processed DataFrame to a CSV file in the working directory, without the row index
df.to_csv(path_or_buf='processed_IBM_HR_dataset.csv', index=False)