# Predicting Employee Promotions

This project aims to predict whether an employee will be promoted based on their historical and organizational data. This notebook follows a step-by-step approach for a machine learning project.

## Import Libraries

In [None]:
import numpy as np
import pandas as pd
from numpy import random
import os
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import plotly.express as px

## Load Dataset

In [None]:
# Confirm the dataset is in the working directory; prints True when the file exists.
csv_present = os.path.isfile('employee_promotion.csv')
print(csv_present)


In [None]:
# Read the raw employee records into a DataFrame and display it.
df = pd.read_csv(filepath_or_buffer="employee_promotion.csv")
df


## Data Preprocessing

In [None]:
# Peek at the first five rows.
df.head(n=5)

In [None]:
# Preview only: the interpolated frame is displayed but never assigned,
# so `df` itself is left unchanged by this cell.
df.interpolate()

In [None]:
# Print the column summary (dtypes and non-null counts) of the loaded frame.
df.info()

In [None]:
# Peek at the last five rows.
df.tail(n=5)

In [None]:
# NOTE(review): repeats the earlier df.info() cell; `df` has not been modified in between.
df.info()

In [None]:
# Group by a single column: number of rows for each education level.
# The CSV is re-read here, so `df` is reset to the raw data.
df = pd.read_csv("employee_promotion.csv")
empgby = df.groupby('education').size()
gby = empgby.to_frame()
gby

In [None]:
# Group by two columns: number of rows for each (department, education) pair.
empqtr = df.groupby(by=['department', 'education']).size()
grpqrt = empqtr.to_frame()
grpqrt

In [None]:
# List the column labels of the frame.
df.columns

In [None]:
# Mean training score per department.
# Header names are stripped first so stray whitespace cannot break the column lookups.
df.columns = df.columns.map(str.strip)
avgtrain = df.groupby('department')['avg_training_score'].agg('mean')
print(avgtrain)

In [None]:
# Keep only the rows whose average training score is above 60.
score_mask = df['avg_training_score'] > 60
high_scores = df.loc[score_mask]
high_scores

In [None]:
# Order employees from highest to lowest average training score.
sorted_df = df.sort_values(by=['avg_training_score'], ascending=[False])
sorted_df

In [None]:
# Aggregation: collapse each department into summary statistics.
# - mean of the average training score
# - maximum previous-year rating
agg_spec = {'avg_training_score': 'mean', 'previous_year_rating': 'max'}
summary = df.groupby('department').agg(agg_spec)
summary

In [None]:
# Pivot table (similar to Excel): rows are departments, columns are genders,
# and each cell holds the sum of `is_promoted` for that combination.
pivot = pd.pivot_table(
    df,
    values='is_promoted',
    index='department',
    columns='gender',
    aggfunc='sum',
)
pivot


In [None]:
# Handling missing data (isna(), fillna(), dropna()): check, clean, or replace missing values.
# Example: count the missing values in every column.
df.isnull().sum()

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna()

In [None]:
# Re-count missing values per column after the rows with gaps were dropped.
df.isnull().sum()

In [None]:
# Same per-column missing-value count as the previous cell.
df.isnull().sum()

In [None]:
# Select a subset of columns by label.
selected_data = df.loc[:, ['department', 'education', 'avg_training_score']]
selected_data

## Data Visualization

In [None]:
# Scatter of average training score against previous-year rating.
# The CSV is re-read here, so `df` is reset to the raw data.
df = pd.read_csv("employee_promotion.csv")
x_col, y_col = 'avg_training_score', 'previous_year_rating'
plt.scatter(df[x_col], df[y_col], color='green')
plt.title("Scatter Plot")
plt.xlabel(x_col)
plt.ylabel(y_col)
plt.show()

In [None]:
# Plotting a histogram for average training scores.
# Missing scores are dropped first (as the boxplot cell does for ratings):
# NaN values would make the automatic bin range non-finite.
plt.hist(df['avg_training_score'].dropna(), bins=10, color='skyblue', edgecolor='black')
plt.title('Distribution of Average Training Scores')
plt.xlabel('Average Training Score')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Correlation heatmap for the two score columns and the promotion flag.
corr_columns = ['avg_training_score', 'previous_year_rating', 'is_promoted']
corr_matrix = df[corr_columns].corr()

# Annotated heatmap of the pairwise correlations.
sns.heatmap(data=corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Box plot: visualizes the spread of a numerical column and highlights outliers.
# Example: horizontal box plot of previous-year ratings (missing ratings dropped).
ratings = df['previous_year_rating'].dropna()
box_style = dict(facecolor='lightblue', color='black')
whisker_style = dict(color='black')
outlier_style = dict(markerfacecolor='red', marker='o', markersize=8)
plt.boxplot(
    ratings,
    vert=False,
    patch_artist=True,
    boxprops=box_style,
    whiskerprops=whisker_style,
    flierprops=outlier_style,
)
plt.title('Boxplot of Previous Year Ratings')
plt.xlabel('Previous Year Rating')
plt.show()

In [None]:
# Share of promoted vs. not-promoted employees as a pie chart.
promotion_counts = df['is_promoted'].value_counts()
promotion_counts.plot.pie(autopct='%1.1f%%', colors=['blue', 'lightblue'])
plt.title('Promotion Status Distribution')
plt.ylabel('')  # blank out the default y-axis label
plt.show()


In [None]:
# Total promotions per department, drawn as a bar chart.
promotion_by_dept = df.groupby('department')['is_promoted'].sum()
promotion_by_dept.plot.bar(color='lightgreen')
plt.title('Promotions by Department')
plt.xlabel('Department')
plt.ylabel('Number of Promotions')
plt.xticks(rotation=45)  # tilt the department names so they stay readable
plt.show()

In [None]:
# Mean previous-year rating for each education level, drawn as a bar chart.
education_avg_rating = df.groupby('education')['previous_year_rating'].mean()

education_avg_rating.plot.bar(color='skyblue')
plt.title('Average Rating by Education')
plt.xlabel('Education Level')
plt.ylabel('Average Rating')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Gender distribution; `hue` repeats the x variable only so the palette can be applied.
ax = sns.countplot(data=df, x='gender', hue='gender', palette='Set2')
ax.set_title('Gender Distribution')
ax.set_xlabel('Gender')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Promotion status per department as stacked bars:
# one row per department, one column per `is_promoted` value.
promotion_by_dept = df.groupby(['department', 'is_promoted']).size().unstack()
promotion_by_dept.plot.bar(stacked=True, color=['lightcoral', 'lightgreen'])
plt.title('Promotion Status by Department (Stacked)')
plt.xlabel('Department')
plt.ylabel('Number of Employees')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Reload the raw CSV so the following cells start from unmodified data.
df = pd.read_csv(filepath_or_buffer="employee_promotion.csv")
df

In [None]:
# Department vs. education as a seaborn relational plot.
sns.set_theme(style="darkgrid")
# relplot is a figure-level function: it creates its own figure, whose size is
# controlled by `height` (inches) and `aspect` (width = height * aspect).
# The previous `plt.Figure(figsize=...)` call built an unattached Figure object
# that was immediately discarded, so it has been removed.
sns.relplot(data=df, x="department", y="education", height=6, aspect=1.5)
plt.show()


In [None]:
# Interactive scatter: x and y are given as column names of `df`.
import plotly.express as px
fig = px.scatter(
    data_frame=df,
    x='department',
    y='education',
    title="Scatter Plot of Department vs Education",
)
fig.show()

In [None]:
# Number of employees per education level.
# The previous version stacked raw `employee_id` values on the y axis (summing
# identifiers is meaningless) under a title copied from the scatter plot cell;
# the chart now shows row counts and carries a matching title.
import plotly.express as px
education_counts = df.groupby('education').size().reset_index(name='employee_count')
fig = px.bar(education_counts, x='education', y='employee_count', title="Number of Employees by Education Level")
fig.show()

In [None]:
# Bar plot of `employee_id` by education level, split by promotion status.
plt.figure(figsize=(8, 8))
sns.barplot(data=df, x='education', y='employee_id', hue='is_promoted')
plt.show()



In [None]:
# Gender counts with a green palette.
# `hue` is set to the x variable (as in the earlier gender count plot) because
# passing `palette` without `hue` is deprecated in seaborn.
sns.countplot(x='gender', data=df, hue='gender', palette='Greens_d')
plt.show()

In [None]:
# Age distribution per gender, split by promotion status.
plt.figure(figsize=(17, 6))
sns.violinplot(data=df, x='age', y='gender', hue='is_promoted')
plt.show()

In [None]:
# Share of `awards_won` attributed to each education level.
px.pie(
    data_frame=df,
    names='education',
    values='awards_won',
    template='plotly_dark',
)

In [None]:
# Violin plot with the department column on the x axis.
px.violin(
    data_frame=df,
    x=df['department'],
    template='plotly_dark',
)


In [None]:
# Box plot of length of service against age, coloured by gender.
px.box(
    data_frame=df,
    x=df['age'],
    y=df['length_of_service'],
    color='gender',
    title='Length of Service',
    template='plotly_dark',
)


In [None]:
# Share of promotions (`is_promoted` summed per slice) by department.
px.pie(
    data_frame=df,
    names='department',
    values='is_promoted',
    template='plotly_dark',
)

In [None]:
# Distribution of `awards_won` within each department, one colour per department.
px.violin(
    data_frame=df,
    x=df['department'],
    y=df['awards_won'],
    color='department',
    title='Departments - Awards_won',
    template='plotly_dark',
)

In [None]:
# Display the current frame.
df

In [None]:
# For each categorical column, print its name, its distinct values,
# the number of distinct values, and a blank separator line.
col = ['department', 'region', 'recruitment_channel', 'education']

for name in col:
    print(name, df[name].unique(), df[name].nunique(), '', sep='\n')

In [None]:
# Display the current frame.
df

In [None]:
# Print the column summary (dtypes and non-null counts) before the correlation analysis.
df.info()

In [None]:
# Restrict to numeric columns, then show their pairwise correlations.
numeric_df = df.select_dtypes(include='number')
numeric_df.corr()


In [None]:
# Annotated heatmap of the correlations between all numeric columns.
plt.figure(figsize=(12, 7))
numeric_corr = numeric_df.corr()
sns.heatmap(data=numeric_corr, annot=True, cmap='Blues')
plt.show()

## Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every object (string) column of `df` in place.
# A separate encoder is fitted per column and kept in `label_encoders`
# (keyed by column name) so each category -> integer mapping can be inverted
# or re-applied to new data. Previously one encoder object was refit for every
# column, so all mappings except the last were lost.
label_encoders = {}
for i in df.columns:
    if df[i].dtype == "object":
        l = LabelEncoder()
        df[i] = l.fit_transform(df[i])
        label_encoders[i] = l

df.head()

## Algorithms and Model Implementation

In [None]:
# Feature matrix and target vector.
# `employee_id` is a row identifier, not a predictor, so it is removed together
# with the target column to keep the models from fitting on arbitrary IDs.
x = df.drop(columns=['is_promoted', 'employee_id'])
y = df['is_promoted']



In [None]:
from sklearn.model_selection import train_test_split

# 70/30 hold-out split.
# `random_state` makes the split (and every metric below) reproducible across runs;
# `stratify=y` keeps the promoted / not-promoted ratio the same in both parts.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.30, random_state=42, stratify=y
)


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Scale every feature to the [0, 1] range.
# The scaler is fitted on the training data only and then applied unchanged to
# the test data. Refitting on the test set (the previous `fit_transform`) gave
# the two sets different scales and leaked test-set statistics into preprocessing.
mms = MinMaxScaler(feature_range=(0, 1))
xtrain = mms.fit_transform(xtrain)
xtest = mms.transform(xtest)
xtrain = pd.DataFrame(xtrain)
xtest = pd.DataFrame(xtest)


In [None]:
# Empty result table: one independent, initially empty list per reported column.
R = {metric: [] for metric in ('Model', 'Accuracy', 'Recall', 'Precision', 'F1')}


In [None]:
# Turn the (still empty) result dictionary into a DataFrame and preview it.
Results = pd.DataFrame(data=R)
Results.head()

In [None]:
# Display the current frame.
df

## Prediction and Evaluation

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, recall_score, f1_score, precision_score
import pandas as pd

# Handle missing values: column means are learned from the training data only
# and then reused for the test data.
imputer = SimpleImputer(strategy='mean')
xtrain = imputer.fit_transform(xtrain)
xtest = imputer.transform(xtest)

# The tree-based models are seeded so that repeated runs give the same scores.
lr = LogisticRegression()
dc = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)

# NOTE: `model` is the list of estimators; `models` is the loop variable
# (one estimator per iteration).
model = [lr, dc, rf]

# One metrics record per estimator; the summary frame is built once after the
# loop instead of concatenating onto an empty DataFrame on every iteration.
records = []

for models in model:
    models.fit(xtrain, ytrain)
    ypred = models.predict(xtest)

    # Compute each metric once and reuse it for both the printout and the table.
    accuracy = round(accuracy_score(ytest, ypred), 2)
    precision = round(precision_score(ytest, ypred), 2)
    recall = round(recall_score(ytest, ypred), 2)
    f1 = round(f1_score(ytest, ypred), 2)

    print('Model :', models)
    print('--------------------------------------------------------------------------------------------------------------')
    print('Confusion Matrix:\n', confusion_matrix(ytest, ypred))
    print('Classification Report:\n', classification_report(ytest, ypred))
    print('Accuracy:', accuracy)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1 Score:', f1)

    records.append({
        'Model': type(models).__name__,
        'Accuracy': accuracy,
        'Recall': recall,
        'Precision': precision,
        'F1': f1
    })

Results = pd.DataFrame(records, columns=['Model', 'Accuracy', 'Recall', 'Precision', 'F1'])

Results


In [None]:
# Show the per-model metrics table again.
Results

## Saving Models with Pickle

In [None]:
import pickle

# Save every fitted estimator to "<ClassName>.pkl" in the working directory.
# `model` is the list [lr, dc, rf] built in the training cell. `models` is only
# that cell's loop variable (the last estimator fitted), so iterating over it
# would not visit the three models. A separate loop name is used here so the
# `model` list is not overwritten.
for fitted_model in model:
    filename = fitted_model.__class__.__name__+'.pkl'
    with open(filename,'wb') as f:
        pickle.dump(fitted_model, f)
    print(f"Model saved: {filename}")