In [1]:
import pandas as pd
import numpy as np
from faker import Faker

# -----------------------------
# CONFIG (DO NOT CHANGE)
# -----------------------------
fake = Faker()
# Seed BOTH random sources so the whole dataset is reproducible:
# NumPy drives every sampled column, while Faker uses its own generator
# for employee names and is not affected by np.random.seed().
Faker.seed(7)
np.random.seed(7)

# Company-wide totals; they equal the sums of the per-department figures
# in dept_structure (450+300+200+300 employees, 78+46+32+29 exits).
TOTAL_EMPLOYEES = 1250
TOTAL_EXITS = 185
TOTAL_HIRES_2021 = 90

# Destination CSV for the generated dataset (raw string keeps backslashes literal).
OUTPUT_PATH = r"C:\Users\kisho\OneDrive\Desktop\Neelitech.AI\HR_Attrition_2021.csv"

# -----------------------------
# DEPARTMENT SETUP (FIXED)
# -----------------------------
# One row per department: (name, headcount, number of exits).
# Row order is significant: it fixes the order in which departments are
# iterated and therefore the order in which random numbers are consumed.
_dept_plan = (
    ("Sales", 450, 78),
    ("Marketing", 300, 46),
    ("HR", 200, 32),
    ("Finance", 300, 29),
)

# Mapping of department name -> {"employees": headcount, "exits": exit count}.
dept_structure = {
    name: {"employees": headcount, "exits": exits}
    for name, headcount, exits in _dept_plan
}

# Categorical value pools sampled when building the employee records.
regions = [
    "North America",
    "Europe",
    "Asia",
    "South America",
]
ethnicity = [
    "Black",
    "Asian",
    "Other",
]

# -----------------------------
# CREATE EMPLOYEE MASTER
# -----------------------------
# Expand the per-department headcounts into a flat roster with one entry
# per employee, keeping departments in dept_structure order.
dept_roster = [
    dept_name
    for dept_name, dept_meta in dept_structure.items()
    for _ in range(dept_meta["employees"])
]

records = []
for emp_id, dept in enumerate(dept_roster, start=1):
    # The draws below are made in a fixed order (gender, ethnicity, region,
    # salary) so the seeded NumPy stream yields the same values every run.
    name = fake.name()
    gender = np.random.choice(["Male", "Female"])
    ethnic_group = np.random.choice(ethnicity, p=[0.3, 0.4, 0.3])
    region = np.random.choice(regions)
    # Sales has a higher salary floor; the upper bound is exclusive.
    salary_floor = 45000 if dept == "Sales" else 40000
    salary = np.random.randint(salary_floor, 120000)

    records.append({
        "EmployeeID": emp_id,
        "EmployeeName": name,
        "Department": dept,
        "Gender": gender,
        "Ethnicity": ethnic_group,
        "Region": region,
        "MonthlySalary": salary,
    })

# Leave emp_id pointing at the next unused ID.
emp_id = len(records) + 1

df = pd.DataFrame(records)

# -----------------------------
# HIRING LOGIC (LESS THAN EXITS)
# -----------------------------
# Pick which rows are 2021 hires; every other row is a pre-2021 employee.
hire_indices = np.random.choice(df.index, TOTAL_HIRES_2021, replace=False)
existing_indices = df.index.difference(hire_indices)

# Sample the dates in the same order as the assignments below
# (pre-2021 employees first, then 2021 hires) to keep the random stream stable.
legacy_calendar = pd.date_range("2016-01-01", "2020-12-31")
legacy_hire_dates = np.random.choice(legacy_calendar, len(existing_indices))

recent_calendar = pd.date_range("2021-01-01", "2021-07-31")
recent_hire_dates = np.random.choice(recent_calendar, TOTAL_HIRES_2021)

# Start from an all-NaT column, then fill each group of rows.
df["HireDate"] = pd.NaT
df.loc[existing_indices, "HireDate"] = pd.to_datetime(legacy_hire_dates)
df.loc[hire_indices, "HireDate"] = pd.to_datetime(recent_hire_dates)

# -----------------------------
# EXIT LOGIC (CONTROLLED BY DEPT)
# -----------------------------
df["ExitDate"] = pd.NaT

exit_indices = []

# Choose exactly meta["exits"] distinct leavers inside each department.
for dept, meta in dept_structure.items():
    dept_idx = df[df["Department"] == dept].index
    exit_indices.extend(
        np.random.choice(dept_idx, meta["exits"], replace=False)
    )

# Draw one exit date per selected leaver. The count comes from the selection
# itself so it always matches the number of rows being assigned.
sampled_exit_dates = pd.Series(
    pd.to_datetime(
        np.random.choice(
            pd.date_range("2021-06-01", "2021-12-31"),
            len(exit_indices)
        )
    ),
    index=exit_indices,
)

# 2021 hires can join as late as 2021-07-31 while exits start on 2021-06-01,
# so a sampled exit may precede the hire date. Clamp those cases to the hire
# date to avoid an impossible record with negative tenure.
leaver_hire_dates = df.loc[exit_indices, "HireDate"]
df.loc[exit_indices, "ExitDate"] = sampled_exit_dates.where(
    sampled_exit_dates >= leaver_hire_dates,
    leaver_hire_dates,
)

# 1 for employees with an exit date, 0 for everyone else.
df["AttritionFlag"] = np.where(df["ExitDate"].notna(), 1, 0)

# -----------------------------
# PERFORMANCE (ATTRITION BIAS)
# -----------------------------
rating_labels = ["High", "Standard"]
has_left = df["AttritionFlag"] == 1

# Draw a full-length rating vector for each population (leavers first, then
# stayers); each row then takes its value from the vector matching its status.
# Leavers get "High" with probability 0.30, stayers with probability 0.22.
leaver_ratings = np.random.choice(rating_labels, TOTAL_EMPLOYEES, p=[0.30, 0.70])
stayer_ratings = np.random.choice(rating_labels, TOTAL_EMPLOYEES, p=[0.22, 0.78])

df["PerformanceRating"] = np.where(has_left, leaver_ratings, stayer_ratings)

# Flag employees who were rated "High" and also left.
is_high_performer = df["PerformanceRating"] == "High"
df["HighPerformerAttrition"] = np.where(is_high_performer & has_left, 1, 0)

# -----------------------------
# REASONS FOR LEAVING
# -----------------------------
leaving_reasons = ["Better Opportunity", "Work-Life Balance", "Management Issues", "Salary"]
reason_weights = [0.36, 0.27, 0.22, 0.15]

# A reason is sampled for every row, but only rows flagged as attrition keep
# it; everyone else gets None.
sampled_reasons = np.random.choice(leaving_reasons, TOTAL_EMPLOYEES, p=reason_weights)
left_the_company = df["AttritionFlag"] == 1

df["ReasonForLeaving"] = np.where(left_the_company, sampled_reasons, None)

# -----------------------------
# TENURE CALCULATION
# -----------------------------
# Employees without an exit date are measured up to this date.
reference_date = pd.to_datetime("2021-12-31")

# Tenure = (exit date, or the reference date if still employed) - hire date,
# expressed in 365-day years and rounded to two decimals.
service_end = df["ExitDate"].fillna(reference_date)
days_of_service = (service_end - df["HireDate"]).dt.days
df["TenureYears"] = (days_of_service / 365).round(2)

def tenure_bucket(x):
    """Return the tenure band label for a tenure of *x* years.

    Bands are closed on their upper edge: values up to and including 1 map
    to "0-1 Years", up to 3 to "1-3 Years", up to 5 to "3-5 Years", and
    anything that matches no band falls through to "5+ Years".
    """
    # Ascending (inclusive upper bound, label) pairs; the first match wins.
    bands = (
        (1, "0-1 Years"),
        (3, "1-3 Years"),
        (5, "3-5 Years"),
    )
    for upper_bound, label in bands:
        if x <= upper_bound:
            return label
    return "5+ Years"

# Label every employee with the tenure band for their TenureYears value.
df["TenureBucket"] = df["TenureYears"].map(tenure_bucket)

# -----------------------------
# EXPORT
# -----------------------------
# OUTPUT_PATH is defined once in the CONFIG section at the top of the script;
# the identical redefinition that used to sit here was dropped so the
# destination only has to be maintained in one place.
df.to_csv(OUTPUT_PATH, index=False)

# Status summary. The emoji were previously mis-encoded (UTF-8 bytes decoded
# with the wrong charset) and printed as garbage characters.
print("✅ HR Attrition dataset generated successfully")
print(f"📁 Saved to: {OUTPUT_PATH}")
print(f"👥 Employees: {len(df)} | ❌ Exits: {df['AttritionFlag'].sum()}")



✅ HR Attrition dataset generated successfully
📁 Saved to: C:\Users\kisho\OneDrive\Desktop\Neelitech.AI\HR_Attrition_2021.csv
👥 Employees: 1250 | ❌ Exits: 185
