# Exploratory Data Analysis (EDA)

This notebook explores the HR Employee Attrition dataset.

**Steps:**
1. Import libraries and define helper functions
2. Load and view data
3. Clean data
4. Analyze target variable (Attrition)
5. Analyze features, including correlations and attrition by key features

## Step 1: Import Libraries & Define Functions

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's 'whitegrid' style to the plots that follow.
sns.set_style('whitegrid')

# --- Data Loading & Cleaning Functions ---
def load_data(filepath):
    """Read a CSV file into a DataFrame and report the outcome on stdout.

    Args:
        filepath: Location of the CSV file; handed unchanged to
            ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when no file exists at
        ``filepath``.
    """
    try:
        frame = pd.read_csv(filepath)
    except FileNotFoundError:
        # A missing file is reported, not raised; the caller receives None.
        print(f"Error: File not found at {filepath}")
        return None
    print(f"Data loaded successfully. Shape: {frame.shape}")
    return frame

def clean_data(df):
    """Return a copy of ``df`` without the columns the analysis discards.

    The input frame is left untouched. Listed columns that are absent from
    ``df`` are skipped silently.

    Args:
        df: DataFrame to clean.

    Returns:
        A new DataFrame with 'EmployeeCount', 'Over18', 'StandardHours' and
        'EmployeeNumber' removed wherever they were present.
    """
    # NOTE(review): presumably constant or identifier-only columns in this
    # dataset -- confirm against the data before relying on that.
    unwanted = ['EmployeeCount', 'Over18', 'StandardHours', 'EmployeeNumber']
    # errors='ignore' drops only the labels that actually exist.
    return df.copy().drop(columns=unwanted, errors='ignore')

# --- Plotting Functions ---
def plot_distribution(df, column, title=None, ax=None):
    """Draw a histogram with a KDE overlay for one column of ``df``.

    Args:
        df: DataFrame holding the data.
        column: Name of the column plotted on the x-axis.
        title: Optional title; nothing is set when it is falsy.
        ax: Axes to draw on. When omitted, a new 10x6 figure is created.
    """
    target_ax = ax
    if target_ax is None:
        _fig, target_ax = plt.subplots(figsize=(10, 6))
    sns.histplot(data=df, x=column, kde=True, ax=target_ax)
    if not title:
        return
    target_ax.set_title(title)
    
def plot_categorical(df, column, target='Attrition', ax=None):
    """Draw a count plot of ``column`` with bars split by ``target``.

    Args:
        df: DataFrame holding the data.
        column: Name of the categorical column plotted on the x-axis.
        target: Name of the column used for the hue split.
        ax: Axes to draw on. When omitted, a new 10x6 figure is created.
    """
    if ax is None:
        fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=df, x=column, hue=target, ax=ax)
    # Title names the actual hue column rather than a fixed 'Attrition'.
    ax.set_title(f'{column} vs {target}')
    # Rotate labels on *this* axes; plt.xticks would act on pyplot's current
    # axes, which is not necessarily the one passed in by the caller.
    ax.tick_params(axis='x', labelrotation=45)


## Step 2: Load Data

In [None]:
# Load the raw attrition CSV; load_data returns None if the file is missing.
df = load_data('../data/HR_Employee_Attrition.csv')

In [None]:
# Preview the first rows of the loaded data.
df.head()

## Step 3: Clean Data

In [None]:
# Work on a cleaned copy; `df` keeps the data as loaded.
df_clean = clean_data(df)
# Print the remaining columns with their dtypes and non-null counts.
df_clean.info()

## Step 4: Analyze Target (Attrition)

In [None]:
# Count plot of the target column; it is used both as x and as the hue.
plot_categorical(df_clean, 'Attrition')

## Step 5: Analyze Features

In [None]:
# Histogram with KDE overlay of employee age.
plot_distribution(df_clean, 'Age')

In [None]:
# Count of employees per department, split by attrition.
plot_categorical(df_clean, 'Department')

### Correlation Analysis

In [None]:
# Correlation heatmap of the numeric columns (lower triangle only)
numerical_df = df_clean.select_dtypes(include=[np.number])
corr = numerical_df.corr()
# True above and on the diagonal: those cells are hidden from the heatmap
mask = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(14, 10))
heatmap_ax = sns.heatmap(
    corr,
    mask=mask,
    cmap='coolwarm',
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    annot=False,
)
heatmap_ax.set_title('Correlation Heatmap')
plt.show()

### Attrition Analysis by Key Features

In [None]:
# Horizontal count plot: one bar group per job role, split by attrition
plt.figure(figsize=(12, 6))
job_role_ax = sns.countplot(data=df_clean, y='JobRole', hue='Attrition')
job_role_ax.set_title('Attrition by Job Role')
plt.show()

In [None]:
# Count of employees at each job-satisfaction level, split by attrition
plt.figure(figsize=(8, 6))
satisfaction_ax = sns.countplot(data=df_clean, x='JobSatisfaction', hue='Attrition')
satisfaction_ax.set_title('Job Satisfaction vs Attrition')
plt.show()

In [None]:
# Box plot comparing monthly income between the attrition groups
plt.figure(figsize=(10, 6))
income_ax = sns.boxplot(data=df_clean, x='Attrition', y='MonthlyIncome')
income_ax.set_title('Monthly Income vs Attrition')
plt.show()

In [None]:
# OverTime vs Attrition: count of employees per OverTime value, split by attrition
plot_categorical(df_clean, 'OverTime')