# HR Attrition Analysis with Random Forest

This notebook walks through the process of analyzing and predicting employee attrition using synthetic HR data.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

%matplotlib inline

## Load the data

In [None]:
# Read the raw HR dataset from the working directory into `df`;
# every later cell derives its data from this frame.
df = pd.read_csv(filepath_or_buffer="employee_data.csv")

# Preview the first five rows to sanity-check that the file parsed as expected.
df.head(n=5)

## Data Overview

In [None]:
# Column names, dtypes, non-null counts and memory usage (printed to stdout).
df.info()

# Summary statistics for every column. The default `describe()` reports only
# numeric columns; `include="all"` also summarises the non-numeric columns
# (count / unique / top / freq), which are the ones encoded later on.
df.describe(include="all")

## Encode Categorical Variables

In [None]:
# Encode on a copy so the raw `df` stays untouched and this cell can be re-run.
df_encoded = df.copy()

# Fitted encoder per column, kept so integer codes can be mapped back to the
# original labels later (via `classes_` / `inverse_transform`).
label_encoders = {}

# Every object-dtype column is replaced by integer codes.
categorical_columns = df_encoded.select_dtypes(include=["object"]).columns
for column_name in categorical_columns:
    encoder = LabelEncoder().fit(df_encoded[column_name])
    df_encoded[column_name] = encoder.transform(df_encoded[column_name])
    label_encoders[column_name] = encoder

df_encoded.head()

## Train/Test Split

In [None]:
# Features are every column except the target; the target is "Attrition".
X = df_encoded.drop("Attrition", axis=1)
y = df_encoded["Attrition"]

# Hold out 30% of the rows for evaluation, with a fixed seed for reproducibility.
# `stratify=y` keeps the class proportions of the target the same in the train
# and test partitions, so the evaluation is not skewed by an unlucky split.
# NOTE: stratification requires at least two rows of every class in `y`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

## Train Random Forest Model

In [None]:
# Build a 100-tree random forest with a fixed seed and fit it on the training
# partition; `fit` returns the estimator itself, so `clf` is the fitted model.
clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Display the fitted estimator as the cell output.
clf

## Evaluate the Model

In [None]:
# Predict on the held-out partition.
y_pred = clf.predict(X_test)

# Recover human-readable class names. If the target column was label-encoded
# (i.e. it has an entry in `label_encoders`), map the classifier's integer
# classes back to the original labels; otherwise show the classes as they are.
target_encoder = label_encoders.get("Attrition")
if target_encoder is not None:
    class_names = [str(name) for name in target_encoder.inverse_transform(clf.classes_)]
else:
    class_names = [str(label) for label in clf.classes_]

# Per-class precision / recall / F1. `labels` pins the row order to
# `clf.classes_` so that it lines up with `class_names`.
print(classification_report(y_test, y_pred, labels=clf.classes_, target_names=class_names))

# Confusion matrix using the same label order, with named axes instead of
# bare encoded integers.
cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot()
plt.show()

## Feature Importances

In [None]:
# Pair each feature column with the importance the fitted forest assigned to it,
# ordered from most to least important.
importances = pd.DataFrame(
    {"Feature": X.columns, "Importance": clf.feature_importances_}
).sort_values("Importance", ascending=False)

# Horizontal bar chart, one bar per feature, drawn on an explicit Axes.
fig, ax = plt.subplots()
sns.barplot(data=importances, x="Importance", y="Feature", ax=ax)
ax.set_title("Feature Importances")
plt.show()