# Task 2: Employee Attrition Prediction using Logistic Regression

## Objective
Employee attrition is a major concern for organisations as retaining skilled employees is crucial for business success. Predicting whether an employee will leave the company can help HR departments take proactive measures.

In this notebook, we will:
1. Load and explore the HR dataset
2. Preprocess and prepare data for logistic regression
3. Build a predictive model using logistic regression
4. Analyze the coefficients and their significance
5. Evaluate model performance
6. Answer the assignment questions about coefficient interpretation

### Target Variable:
- **Attrition**: Yes = Left, No = Stayed

---
## 1. Import Required Libraries

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             confusion_matrix, classification_report, roc_curve, roc_auc_score)

# Statistical analysis
import statsmodels.api as sm

# Settings
import warnings

# NOTE: this silences every warning for the rest of the session, which also
# hides scikit-learn convergence warnings and pandas deprecation notices.
warnings.filterwarnings('ignore')

# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*".
# Try the current name first and fall back to the legacy one so the notebook
# does not fail on older Matplotlib releases; if neither exists the default
# style is kept.
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

print("All libraries imported successfully!")

---
## 2. Load and Explore the Dataset

In [None]:
# Read the HR dataset from the assets folder (path is relative to the notebook)
df = pd.read_csv('../Task_2_Assets/hr_dataset.csv')

# Report the size and column names, then preview the first records
rule = "=" * 60
print(rule)
print("DATASET OVERVIEW")
print(rule)
print(f"\nShape of dataset: {df.shape[0]} rows, {df.shape[1]} columns")
print(f"\nColumn names:\n{df.columns.tolist()}")
print("\n" + rule)
print("FIRST 5 ROWS")
print(rule)
# Last expression of the cell: rendered by Jupyter as a rich table
df.head()

In [None]:
# Check data types and missing values
print("DATA TYPES AND NON-NULL COUNTS")
print("="*60)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()

print("\n" + "="*60)
print("MISSING VALUES")
print("="*60)
# Per-column count of null cells; only columns that actually have gaps are shown.
missing = df.isnull().sum()
columns_with_gaps = missing[missing > 0]
print("No missing values found!" if columns_with_gaps.empty else columns_with_gaps)

In [None]:
# Target variable distribution
print("TARGET VARIABLE: ATTRITION")
print("="*60)
attrition_counts = df['Attrition'].value_counts()
print("\nValue Counts:")
print(attrition_counts)
print("\nPercentage:")
print(df['Attrition'].value_counts(normalize=True) * 100)

# Tie colours and the exploded pie slice to the class label rather than to the
# position in value_counts(), so 'Yes' (left) is always red and highlighted
# regardless of which class happens to be more frequent.
label_colors = {'No': '#2ecc71', 'Yes': '#e74c3c'}
colors = [label_colors.get(label, '#95a5a6') for label in attrition_counts.index]
explode = [0.1 if label == 'Yes' else 0 for label in attrition_counts.index]

# Visualize target distribution
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Count plot
bars = axes[0].bar(attrition_counts.index, attrition_counts.values, color=colors, edgecolor='black')
axes[0].set_title('Attrition Distribution (Count)', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Attrition')
axes[0].set_ylabel('Count')
# bar_label positions the count just above each bar, independent of the data scale.
axes[0].bar_label(bars, padding=3, fontsize=12, fontweight='bold')

# Pie chart
axes[1].pie(attrition_counts.values, labels=attrition_counts.index, autopct='%1.1f%%',
            colors=colors, explode=explode, shadow=True, startangle=90)
axes[1].set_title('Attrition Distribution (Percentage)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

---
## 3. Exploratory Data Analysis (EDA)

In [None]:
# Analyze key features by Attrition
key_features = ['Age', 'YearsAtCompany', 'MonthlyIncome', 'TotalWorkingYears',
                'WorkLifeBalance', 'JobSatisfaction', 'EnvironmentSatisfaction']

# Only plot features that exist in the dataset so no panel is left blank
# in the middle of the grid.
present_features = [feature for feature in key_features if feature in df.columns]

fig, axes = plt.subplots(2, 4, figsize=(18, 10))
axes = axes.flatten()

for ax, feature in zip(axes, present_features):
    df.boxplot(column=feature, by='Attrition', ax=ax)
    ax.set_title(f'{feature} by Attrition', fontsize=11, fontweight='bold')
    ax.set_xlabel('Attrition')

# Hide every unused subplot (the grid has 8 slots; fewer features are plotted)
for ax in axes[len(present_features):]:
    ax.axis('off')

# Set after the loop: it replaces the figure-level title written by df.boxplot(by=...)
plt.suptitle('Key Features Distribution by Attrition Status', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# Compare attrition counts across OverTime and WorkLifeBalance levels
overtime_attrition = pd.crosstab(df['OverTime'], df['Attrition'])
wlb_attrition = pd.crosstab(df['WorkLifeBalance'], df['Attrition'])

# One entry per panel: (contingency table, title, x-axis label)
panel_specs = [
    (overtime_attrition, 'OverTime vs Attrition', 'OverTime'),
    (wlb_attrition, 'Work-Life Balance vs Attrition', 'WorkLifeBalance (1=Low, 4=High)'),
]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for ax, (table, title, x_label) in zip(axes, panel_specs):
    # Grouped bars: one bar per Attrition value within each category
    table.plot(kind='bar', ax=ax, color=['#2ecc71', '#e74c3c'], edgecolor='black')
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel(x_label)
    ax.set_ylabel('Count')
    ax.legend(title='Attrition')
    ax.tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.show()

# Share of employees who left within each OverTime group, in percent
print("\nOverTime Attrition Rate:")
group_totals = overtime_attrition.sum(axis=1)
print((overtime_attrition['Yes'] / group_totals * 100).round(2))

---
## 4. Data Preprocessing

In [None]:
# Create a copy for preprocessing so the raw frame `df` stays untouched
df_processed = df.copy()

# Convert target variable to binary (Yes=1, No=0)
df_processed['Attrition'] = df_processed['Attrition'].map({'Yes': 1, 'No': 0})

# .map() turns any value outside the mapping into NaN; fail here with a clear
# message instead of letting NaN targets surface later during the split/fit.
unmapped_count = int(df_processed['Attrition'].isna().sum())
if unmapped_count:
    raise ValueError(
        f"{unmapped_count} rows have an Attrition value other than 'Yes'/'No'"
    )

# Remove columns that don't add value
# (presumably constant or identifier columns -- confirm against the data)
cols_to_drop = ['EmployeeCount', 'StandardHours', 'Over18', 'EmployeeNumber']
cols_to_drop = [col for col in cols_to_drop if col in df_processed.columns]
df_processed = df_processed.drop(columns=cols_to_drop)

# Identify categorical and numerical columns AFTER the drop, so the reported
# lists describe the columns that are actually kept for modelling.
categorical_cols = df_processed.select_dtypes(include=['object']).columns.tolist()
numerical_cols = [
    col
    for col in df_processed.select_dtypes(include=['int64', 'float64']).columns
    if col != 'Attrition'  # exclude the target variable
]

print("Categorical Columns:", categorical_cols)
print("\nNumerical Columns:", numerical_cols)

print(f"\nDropped columns: {cols_to_drop}")
print(f"Remaining columns: {df_processed.shape[1]}")

In [None]:
# Expand every categorical column into indicator columns; drop_first=True
# omits one level per category so the indicators are not perfectly collinear.
df_encoded = pd.get_dummies(df_processed, drop_first=True)

encoded_column_names = df_encoded.columns.tolist()
print("Shape after encoding:", df_encoded.shape)
print("\nEncoded columns sample:")
print(encoded_column_names[:20], "...")

In [None]:
# Prepare features and target
X = df_encoded.drop('Attrition', axis=1)
y = df_encoded['Attrition']

# Split data (stratified so both sets keep the same attrition ratio)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")
print("\nTarget distribution in training set:")
print(y_train.value_counts())

# Standardize ALL feature columns (numeric and one-hot indicators alike).
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrame for feature names. The original row index is kept
# so the scaled frames stay aligned with y_train / y_test in any pandas-level
# operation (a fresh RangeIndex would silently misalign them).
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

---
## 5. Building the Logistic Regression Model

In [None]:
# Fit the classifier on the standardized training data.
# class_weight='balanced' reweights classes inversely to their frequency.
log_reg = LogisticRegression(
    max_iter=1000,
    random_state=42,
    class_weight='balanced',
)
log_reg.fit(X_train_scaled, y_train)

# Hard class predictions for both splits
y_train_pred = log_reg.predict(X_train_scaled)
y_test_pred = log_reg.predict(X_test_scaled)

# Predicted probability of the positive class (column 1 = Attrition) on the test split
test_class_probabilities = log_reg.predict_proba(X_test_scaled)
y_test_proba = test_class_probabilities[:, 1]

print("Logistic Regression Model Training Complete!")
print("="*60)

In [None]:
# Analyze model coefficients
# The model was fitted on standardized features, so each coefficient (and its
# odds ratio) describes the effect of a ONE STANDARD DEVIATION increase in the
# feature, not of one raw unit.
coefficients = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': log_reg.coef_[0],
    'Odds_Ratio': np.exp(log_reg.coef_[0])
}).sort_values('Coefficient', ascending=False, key=abs)  # largest |coefficient| first

print("LOGISTIC REGRESSION COEFFICIENTS (Top 20 by magnitude)")
print("="*80)
print("\nIntercept:", log_reg.intercept_[0])
print("\nTop 20 Features by Absolute Coefficient Value:")
print(coefficients.head(20).to_string(index=False))

# Visualize top coefficients
top_coefs = coefficients.head(15)

fig, ax = plt.subplots(figsize=(12, 8))
# Same palette as the earlier attrition plots: red = pushes towards attrition,
# green = pushes towards staying.
bar_colors = ['#e74c3c' if coef > 0 else '#2ecc71' for coef in top_coefs['Coefficient']]
positions = range(len(top_coefs))
ax.barh(positions, top_coefs['Coefficient'], color=bar_colors, edgecolor='black')
ax.set_yticks(positions)
ax.set_yticklabels(top_coefs['Feature'])
# barh draws position 0 at the bottom; invert so the strongest feature is on top.
ax.invert_yaxis()
ax.axvline(x=0, color='black', linewidth=0.8)
ax.set_xlabel('Coefficient Value', fontsize=12)
ax.set_ylabel('Feature', fontsize=12)
ax.set_title('Top 15 Logistic Regression Coefficients\n(Positive = Higher Attrition Risk)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

---
## 6. Model Evaluation

In [None]:
# Report training accuracy, then the full set of held-out metrics
print("MODEL EVALUATION METRICS")
print("="*60)

print("\n--- Training Set ---")
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Accuracy: {train_accuracy:.4f}")

print("\n--- Testing Set ---")
# (label including its alignment padding, metric value)
test_metrics = [
    ("Accuracy:  ", accuracy_score(y_test, y_test_pred)),
    ("Precision: ", precision_score(y_test, y_test_pred)),
    ("Recall:    ", recall_score(y_test, y_test_pred)),
    ("F1 Score:  ", f1_score(y_test, y_test_pred)),
    # ROC AUC is computed from predicted probabilities, not hard labels
    ("ROC AUC:   ", roc_auc_score(y_test, y_test_proba)),
]
for metric_label, metric_value in test_metrics:
    print(f"{metric_label}{metric_value:.4f}")

print("\n--- Classification Report ---")
class_names = ['No Attrition', 'Attrition']
print(classification_report(y_test, y_test_pred, target_names=class_names))

In [None]:
# Side-by-side diagnostics: confusion matrix (left) and ROC curve (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
cm_ax, roc_ax = axes

# Left panel: counts of actual vs predicted classes on the test split
tick_names = ['No Attrition', 'Attrition']
cm = confusion_matrix(y_test, y_test_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax,
            xticklabels=tick_names,
            yticklabels=tick_names)
cm_ax.set_title('Confusion Matrix', fontsize=14, fontweight='bold')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')

# Right panel: ROC curve from the predicted positive-class probabilities,
# with the diagonal of a random classifier for reference
fpr, tpr, thresholds = roc_curve(y_test, y_test_proba)
auc = roc_auc_score(y_test, y_test_proba)
roc_ax.plot(fpr, tpr, color='blue', linewidth=2, label=f'ROC Curve (AUC = {auc:.3f})')
roc_ax.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random Classifier')
roc_ax.fill_between(fpr, tpr, alpha=0.2)
roc_ax.set_xlabel('False Positive Rate', fontsize=11)
roc_ax.set_ylabel('True Positive Rate', fontsize=11)
roc_ax.set_title('ROC Curve', fontsize=14, fontweight='bold')
roc_ax.legend(loc='lower right')

plt.tight_layout()
plt.show()

---
## 7. Answers to Assignment Questions

### Question 1: Coefficient Interpretation and Strongest Influence

In [None]:
# Question 1: split coefficients by sign and report the strongest one
heavy_rule = "=" * 80
light_rule = "-" * 60

print(heavy_rule)
print("QUESTION 1: Coefficient Interpretation and Feature Influence")
print(heavy_rule)

# Ten most positive and ten most negative coefficients
is_positive = coefficients['Coefficient'] > 0
is_negative = coefficients['Coefficient'] < 0
positive_coefs = coefficients[is_positive].sort_values('Coefficient', ascending=False).head(10)
negative_coefs = coefficients[is_negative].sort_values('Coefficient', ascending=True).head(10)

print("\n1. POSITIVE COEFFICIENTS (Increase Attrition Risk):")
print(light_rule)
for rec in positive_coefs.itertuples(index=False):
    print(f"   {rec.Feature:35s}: +{rec.Coefficient:.4f} (Odds Ratio: {rec.Odds_Ratio:.4f})")

print("\n2. NEGATIVE COEFFICIENTS (Decrease Attrition Risk):")
print(light_rule)
for rec in negative_coefs.itertuples(index=False):
    print(f"   {rec.Feature:35s}: {rec.Coefficient:.4f} (Odds Ratio: {rec.Odds_Ratio:.4f})")

# `coefficients` is ordered by absolute coefficient (descending), so its first
# row is the feature with the largest magnitude.
strongest = coefficients.iloc[0]
print("\n3. STRONGEST INFLUENCE ON ATTRITION:")
print(light_rule)
print(f"   Feature: {strongest['Feature']}")
print(f"   Coefficient: {strongest['Coefficient']:.4f}")
print(f"   Odds Ratio: {strongest['Odds_Ratio']:.4f}")

#### Answer to Question 1: Coefficient Interpretation

In logistic regression, coefficients indicate the direction and magnitude of the relationship between each independent variable and the log-odds of the target variable (Attrition).

**Interpretation of Coefficients:**

1. **Positive Coefficient**: A positive coefficient means that as the feature value increases, the **probability of attrition increases**. For example, if OverTime has a positive coefficient, employees who work overtime are more likely to leave.

2. **Negative Coefficient**: A negative coefficient means that as the feature value increases, the **probability of attrition decreases**. For example, if YearsAtCompany has a negative coefficient, employees with longer tenure are less likely to leave.

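3. **Odds Ratio**: The exponentiated coefficient ($e^{\beta}$) gives the odds ratio, which tells us how much the odds of attrition multiply for each one-unit increase in the feature, holding the other features constant. Because the features were standardized before fitting, one unit here corresponds to one standard deviation of the original feature. An odds ratio > 1 increases attrition risk, while an odds ratio < 1 decreases it.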

**Key Findings:**
- Features with high positive coefficients (like OverTime, JobLevel in certain roles) increase attrition risk
- Features with negative coefficients (like YearsAtCompany, JobInvolvement) decrease attrition risk
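- The feature with the largest absolute coefficient has the **strongest influence** on employee attrition; comparing magnitudes in this way is meaningful because all features were standardized to the same scale before fitting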

### Question 2: Practical Implications

In [None]:
# Analyze specific features for Question 2
print("="*80)
print("QUESTION 2: Practical Implications Analysis")
print("="*80)

# Get coefficients for key features mentioned in the question
key_features_q2 = ['OverTime_Yes', 'WorkLifeBalance', 'MonthlyIncome', 'YearsAtCompany']

print("\nKey Features Analysis:")
print("-"*60)
for feature in key_features_q2:
    exact_match = coefficients[coefficients['Feature'] == feature]
    if not exact_match.empty:
        row = exact_match.iloc[0]
        print(f"\n{feature}:")
        print(f"   Coefficient: {row['Coefficient']:.4f}")
        print(f"   Odds Ratio: {row['Odds_Ratio']:.4f}")
        if row['Coefficient'] > 0:
            print("   Interpretation: INCREASES attrition risk")
        else:
            print("   Interpretation: DECREASES attrition risk")
        continue

    # No exact column: fall back to features sharing the stem before the first
    # underscore (e.g. other dummy levels). regex=False matches the stem
    # literally, so characters such as '.' or '(' in a name cannot be
    # misread as a regular-expression pattern.
    stem = feature.split('_')[0]
    similar = coefficients[coefficients['Feature'].str.contains(stem, case=False, regex=False)]
    if similar.empty:
        # Say so explicitly rather than skipping the feature without a trace.
        print(f"\n{feature}: not found among the model features")
        continue

    print(f"\n{feature} (related features):")
    for _, r in similar.head(3).iterrows():
        print(f"   {r['Feature']}: {r['Coefficient']:.4f} (OR: {r['Odds_Ratio']:.4f})")

#### Answer to Question 2: Practical Implications

**1. If Work-Life Balance has a Negative Coefficient:**

A negative coefficient for Work-Life Balance implies that as work-life balance **improves** (higher scores), the probability of attrition **decreases**. This makes intuitive sense because:

- Employees with better work-life balance are more satisfied with their jobs
- They experience less burnout and stress
- They have time for personal pursuits, family, and self-care
- They feel the organization values their well-being

**HR Implication**: Organizations should invest in work-life balance initiatives such as flexible working hours, remote work options, and reasonable workloads to reduce attrition.

---

**2. If OverTime has a High Positive Coefficient:**

A high positive coefficient for OverTime suggests that employees who frequently work overtime are **significantly more likely** to leave. This indicates:

- Overtime work leads to burnout and dissatisfaction
- Employees feel overworked and undervalued
- Work-life balance suffers when overtime is frequent
- It may signal poor resource planning or unrealistic workloads

**HR Implication**: Management should monitor overtime patterns, hire additional staff if needed, improve project planning, and ensure overtime is compensated fairly. Persistent overtime is a strong warning sign for potential attrition.

---

**3. If Salary/Monthly Income has a Small or Non-Significant Coefficient:**

A small or non-significant coefficient for Salary/Monthly Income suggests that salary alone may not be a primary driver of attrition. This could be because:

- **Base salaries are competitive**: If the organization already pays market-competitive salaries, further increases have diminishing returns
- **Other factors matter more**: Work environment, growth opportunities, management quality, and work-life balance may be more important than salary
- **Threshold effect**: Once employees earn above a certain threshold, additional income has less impact on their decision to stay
- **Non-monetary compensation**: Benefits, recognition, career development opportunities may compensate for salary concerns

**HR Implication**: While competitive salaries are important, organizations should not rely solely on compensation to retain employees. Focus on holistic employee experience including career development, work environment, and meaningful work.

---
## 8. Summary and Conclusions

In [None]:
# Closing recap: test-set metrics, leading drivers in each direction, HR actions
heavy_rule = "=" * 80
light_rule = "-" * 60

print(heavy_rule)
print("FINAL SUMMARY: EMPLOYEE ATTRITION PREDICTION MODEL")
print(heavy_rule)

print("\n1. MODEL PERFORMANCE:")
print(light_rule)
# (label, value) pairs, all measured on the held-out test split
performance_lines = [
    ("   - Accuracy: ", accuracy_score(y_test, y_test_pred)),
    ("   - ROC AUC: ", roc_auc_score(y_test, y_test_proba)),
    ("   - Precision: ", precision_score(y_test, y_test_pred)),
    ("   - Recall: ", recall_score(y_test, y_test_pred)),
]
for line_label, line_value in performance_lines:
    print(f"{line_label}{line_value:.4f}")

# positive_coefs / negative_coefs come from the Question 1 cell
print("\n2. TOP FACTORS INCREASING ATTRITION RISK:")
print(light_rule)
for rank, rec in enumerate(positive_coefs.head(5).itertuples(index=False), start=1):
    print(f"   {rank}. {rec.Feature}: +{rec.Coefficient:.4f}")

print("\n3. TOP FACTORS DECREASING ATTRITION RISK:")
print(light_rule)
for rank, rec in enumerate(negative_coefs.head(5).itertuples(index=False), start=1):
    print(f"   {rank}. {rec.Feature}: {rec.Coefficient:.4f}")

print("\n4. KEY HR RECOMMENDATIONS:")
print(light_rule)
hr_recommendations = [
    "   - Monitor and reduce overtime requirements",
    "   - Improve work-life balance initiatives",
    "   - Focus on employee engagement and job satisfaction",
    "   - Invest in career development and growth opportunities",
    "   - Address environmental and relationship satisfaction factors",
]
for recommendation in hr_recommendations:
    print(recommendation)

print("\n" + heavy_rule)
print("END OF TASK 2 SOLUTION")
print(heavy_rule)