# Employee Attrition Analysis - Exploratory Data Analysis

This notebook contains the initial exploratory data analysis for the HR Employee Attrition dataset.

## Objectives
1. Load and inspect the dataset
2. Understand data structure and types
3. Identify missing values and data quality issues
4. Perform univariate and bivariate analysis
5. Visualize key patterns and relationships
6. Identify features correlated with attrition

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

# Set visualization style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

%matplotlib inline

## 1. Load Data

In [None]:
# Read the attrition CSV; the path is relative to the notebook's working directory
data_path = '../WA_Fn-UseC_-HR-Employee-Attrition.csv'
df = pd.read_csv(data_path)

# Report the frame's dimensions: the full shape tuple, then each axis on its own line
print(
    f"Dataset shape: {df.shape}",
    f"Number of rows: {df.shape[0]}",
    f"Number of columns: {df.shape[1]}",
    sep="\n",
)

## 2. Initial Data Inspection

In [None]:
# Preview the first rows of df to sanity-check the parsed columns and values.
# As the cell's last expression, the result is rendered as the cell output.
df.head()

In [None]:
# Print a per-column summary of df (dtype and non-null count) to spot
# mis-typed columns and columns with missing entries.
df.info()

In [None]:
# Descriptive statistics for df, rendered as the cell output.
# Called with defaults, so which columns are summarised follows pandas' default selection.
df.describe()

In [None]:
# Check for missing values: count nulls per column and list only the columns
# that actually contain some.
missing_values = df.isnull().sum()
columns_with_missing = missing_values[missing_values > 0]

print("Missing values per column:")
if columns_with_missing.empty:
    # With nothing missing the filter leaves an empty Series, whose printed form
    # ("Series([], dtype: int64)") reads like an error; state the result instead.
    print("None")
else:
    print(columns_with_missing)
print(f"\nTotal missing values: {missing_values.sum()}")

## 3. Target Variable Analysis

In [None]:
# Attrition distribution: absolute class counts plus the share of 'Yes' rows
attrition_counts = df['Attrition'].value_counts()
print("Attrition Distribution:")
print(attrition_counts)
# Series.get falls back to 0 instead of raising KeyError when no row is labelled 'Yes'
print(f"\nAttrition Rate: {(attrition_counts.get('Yes', 0) / len(df) * 100):.2f}%")

# Visualize: bar chart of counts (left) next to a pie chart of shares (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Draw both panels in value_counts() order. Without an explicit order, countplot
# infers the category order from the data, which need not match value_counts();
# the same class could then get a different Set2 colour in each panel.
class_order = attrition_counts.index.tolist()

# Count plot
sns.countplot(data=df, x='Attrition', order=class_order, ax=axes[0], palette='Set2')
axes[0].set_title('Attrition Distribution', fontsize=14, fontweight='bold')
axes[0].set_ylabel('Count')

# Pie chart
axes[1].pie(attrition_counts.values, labels=class_order, autopct='%1.1f%%',
            colors=sns.color_palette('Set2'), startangle=90)
axes[1].set_title('Attrition Percentage', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

## 4. Feature Analysis

### 4.1 Numerical Features

In [None]:
# Identify numerical and categorical columns by dtype family.
# 'number' selects every numeric dtype rather than only int64/float64, and
# 'category' is accepted alongside 'object', so columns stay classified if
# the frame is down-cast or converted before this cell runs.
numerical_cols = df.select_dtypes(include='number').columns.tolist()
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Numerical columns ({len(numerical_cols)}): {numerical_cols}")
print(f"\nCategorical columns ({len(categorical_cols)}): {categorical_cols}")

In [None]:
# Histograms for a short list of headline numerical features
key_numerical = ['Age', 'MonthlyIncome', 'YearsAtCompany', 'JobSatisfaction', 'WorkLifeBalance']
# Keep only the requested features that are present in this dataset
available_numerical = [name for name in key_numerical if name in df.columns]

if available_numerical:
    fig, axes = plt.subplots(2, 3, figsize=(16, 10))
    axes = axes.flatten()

    # zip stops at the shorter sequence, so no more than len(axes) features are drawn
    for ax, col in zip(axes, available_numerical):
        df[col].hist(bins=30, ax=ax, edgecolor='black', alpha=0.7)
        ax.set_title(f'Distribution of {col}', fontweight='bold')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')

    # Panels left over after the last feature would render as empty frames; hide them
    for spare_ax in axes[len(available_numerical):]:
        spare_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

### 4.2 Categorical Features

In [None]:
# For each categorical column, print its cardinality and its most frequent values
for col in categorical_cols:
    # The target is analysed in its own section, so leave it out here
    if col == 'Attrition':
        continue
    print(f"\n{col} - Unique values: {df[col].nunique()}")
    print(df[col].value_counts().head())

## 5. Correlation Analysis

In [None]:
# Correlation matrix for numerical features
correlation_matrix = df[numerical_cols].corr()

# A column whose correlation is undefined against every column (for example one
# with zero variance or no data) produces an all-NaN row and column, which the
# heatmap would draw as blank bands. Leave such columns out of the plot only and
# report them; correlation_matrix itself still holds every column.
undefined_cols = correlation_matrix.columns[correlation_matrix.isna().all()].tolist()
if undefined_cols:
    print(f"Omitted from heatmap (correlation undefined): {undefined_cols}")
heatmap_data = correlation_matrix.drop(index=undefined_cols, columns=undefined_cols)

# Explicit figure/axes so the heatmap and its title target the same Axes
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(heatmap_data, annot=False, cmap='coolwarm', center=0,
            linewidths=0.5, cbar_kws={"shrink": 0.8}, ax=ax)
ax.set_title('Correlation Matrix - Numerical Features', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

## 6. Attrition Analysis by Features

In [None]:
# Grouped bar chart: employee counts per department, split by attrition status
if 'Department' in df.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=df, x='Department', hue='Attrition', palette='Set1', ax=ax)
    ax.set_title('Attrition by Department', fontsize=14, fontweight='bold')
    ax.set_xlabel('Department')
    ax.set_ylabel('Count')
    ax.legend(title='Attrition')
    plt.tight_layout()
    plt.show()

In [None]:
# Box plot comparing the age spread of the two attrition groups
if 'Age' in df.columns:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(data=df, x='Attrition', y='Age', palette='Set2', ax=ax)
    ax.set_title('Age Distribution by Attrition Status', fontsize=14, fontweight='bold')
    ax.set_xlabel('Attrition')
    ax.set_ylabel('Age')
    plt.tight_layout()
    plt.show()

## 7. Next Steps

- Feature engineering and data preprocessing
- Handle class imbalance
- Build predictive models
- Model evaluation and selection
- Feature importance analysis