In [None]:
# PHASE 1 - DATA COLLECTION AND EXPLORATORY DATA ANALYSIS (EDA)

# Step 1: Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide plotting defaults: seaborn's white grid theme and a wide
# 12x6 inch default canvas for every matplotlib figure.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully!")

In [None]:
# Step 1: Load and Preprocess Data
# The CSV is read from Google Drive, so Drive is mounted into the Colab
# filesystem first.
from google.colab import drive
drive.mount('/content/drive')

# Read the employee dataset into `df`.
# Edit the path below if your copy of employee_data.csv is stored elsewhere.
df = pd.read_csv('/content/drive/MyDrive/employee_data.csv')

print("Dataset loaded successfully!")
print("\nDataset Shape: {}".format(df.shape))
print("\nFirst 5 rows:")
# Last expression of the cell: rendered by the notebook as a table.
df.head()

In [None]:
# Step 1 (continued): Data Inspection and Cleaning
# Reports column info, missing values and duplicate rows, then drops both
# incomplete rows and exact duplicates from `df`.
print("Dataset Info:")
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called directly; wrapping it in print() would emit a stray "None".
df.info()
print("\n" + "="*50)
print("\nMissing Values:")
print(df.isnull().sum())
print("\n" + "="*50)
print("\nDuplicate Rows:")
print(f"Number of duplicate rows: {df.duplicated().sum()}")

# Handle missing values: any row with at least one missing value is removed
df = df.dropna()

# Remove exact duplicate rows (first occurrence is kept)
df = df.drop_duplicates()

print("\n" + "="*50)
print(f"\nCleaned Dataset Shape: {df.shape}")
print("Data cleaning completed successfully!")

In [None]:
# Step 2: Exploratory Data Analysis (EDA)
# Prints central-tendency and spread measures for each numerical column,
# followed by pandas' own describe() summary table.
print("DESCRIPTIVE STATISTICS")
print("="*80)

# Numerical columns analysed here and reused by later cells
numerical_cols = ['Age', 'Salary', 'YearsAtCompany', 'PerformanceScore']

for col in numerical_cols:
    series = df[col]
    modes = series.mode()
    # mode() may return several values (ties) or none; report the first one
    mode_text = modes.values[0] if len(modes) > 0 else 'N/A'

    print(f"\n{col}:")
    print(f"  Mean: {series.mean():.2f}")
    print(f"  Median: {series.median():.2f}")
    print(f"  Mode: {mode_text}")
    print(f"  Variance: {series.var():.2f}")
    print(f"  Standard Deviation: {series.std():.2f}")
    print(f"  Min: {series.min():.2f}")
    print(f"  Max: {series.max():.2f}")

print("\n" + "="*80)
print("\nOverall Statistical Summary:")
print(df[numerical_cols].describe())

In [None]:
# Step 2 (continued): Visualizations - Pairplot
# Pairwise scatter plots (KDE on the diagonal) of the numerical columns,
# coloured by the Attrition label.
print("Creating Pairplot to explore relationships between features...")
# sns.pairplot builds and returns its own figure-level PairGrid, so no
# plt.figure() call is made beforehand: a pre-created figure would stay empty
# and be rendered as a blank plot by plt.show().
pairplot = sns.pairplot(df, hue='Attrition', vars=numerical_cols, diag_kind='kde')
# y > 1 places the title just above the grid instead of over the top row
pairplot.fig.suptitle('Pairplot: Feature Relationships by Attrition', y=1.02, fontsize=16)
plt.show()
print("Pairplot created successfully!")

In [None]:
# Step 2 (continued): Correlation Heatmap and Outliers
# Part 1: annotated correlation matrix of the numerical columns.
print("Creating Correlation Heatmap...")
plt.figure(figsize=(10, 8))
corr_matrix = df[numerical_cols].corr()
heatmap_options = dict(annot=True, cmap='coolwarm', center=0, fmt='.2f',
                       square=True, linewidths=1)
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Correlation Heatmap of Numerical Features', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Part 2: one boxplot per numerical column on a 2x2 grid to spot outliers.
print("\nIdentifying Outliers using Boxplots...")
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
for idx, col in enumerate(numerical_cols):
    # axes.flat walks the 2x2 grid row by row, i.e. position (idx//2, idx%2)
    ax = axes.flat[idx]
    ax.boxplot(df[col])
    ax.set_title(f'Boxplot: {col}', fontweight='bold')
    ax.set_ylabel('Values')
    ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("Visualizations completed successfully!")

In [None]:
# Step 3: Probability and Statistical Analysis
# Empirical attrition probabilities: overall, per performance score and per
# department. All probabilities are simple relative frequencies in `df`.
print("PROBABILITY AND STATISTICAL ANALYSIS")
print("="*80)

# Raw counts per Attrition label
attrition_counts = df['Attrition'].value_counts()
print("\nAttrition Distribution:")
print(attrition_counts)

# Overall probabilities; a label that never occurs counts as 0
total_employees = len(df)
attrition_yes = attrition_counts.get('Yes', 0)
attrition_no = attrition_counts.get('No', 0)

prob_attrition = attrition_yes / total_employees
prob_retention = attrition_no / total_employees

print(f"\nProbability of Employee Leaving (Attrition): {prob_attrition:.4f} ({prob_attrition*100:.2f}%)")
print(f"Probability of Employee Staying (Retention): {prob_retention:.4f} ({prob_retention*100:.2f}%)")

# Conditional probability for each distinct performance score (ascending)
print("\n" + "="*80)
print("\nProbability of Attrition by Performance Score:")
for score in sorted(df['PerformanceScore'].unique()):
    subset = df[df['PerformanceScore'] == score]
    group_size = len(subset)
    attrition_at_score = (subset['Attrition'] == 'Yes').to_numpy().tolist().count(True)
    prob = attrition_at_score / group_size if group_size > 0 else 0
    print(f"  Performance Score {score}: {prob:.4f} ({prob*100:.2f}%)")

# Conditional probability for each department (order of first appearance)
print("\nProbability of Attrition by Department:")
for dept in df['Department'].unique():
    subset = df[df['Department'] == dept]
    group_size = len(subset)
    attrition_at_dept = (subset['Attrition'] == 'Yes').to_numpy().tolist().count(True)
    prob = attrition_at_dept / group_size if group_size > 0 else 0
    print(f"  {dept}: {prob:.4f} ({prob*100:.2f}%)")

In [None]:
# Step 3 (continued): Bayes' Theorem
# Computes P(Attrition | PerformanceScore >= 85) from its three ingredients:
#     P(A|B) = P(B|A) * P(A) / P(B)
# with A = "employee left" and B = "performance score is at least 85".
print("\n" + "="*80)
print("\nBAYES' THEOREM - P(Attrition|Performance Score)\n")

# Split employees at the high-performance cut-off (>= 85)
high_perf = df[df['PerformanceScore'] >= 85]
low_perf = df[df['PerformanceScore'] < 85]

# P(Attrition)
attrition_df = df[df['Attrition'] == 'Yes']
P_attrition = len(attrition_df) / len(df)

# P(High Performance|Attrition)
# Guarded like P(High Performance) below: with no 'Yes' rows the ratio is
# undefined, so 0 is used instead of raising ZeroDivisionError.
if len(attrition_df) > 0:
    P_high_given_attrition = len(attrition_df[attrition_df['PerformanceScore'] >= 85]) / len(attrition_df)
else:
    P_high_given_attrition = 0

# P(High Performance)
P_high_perf = len(high_perf) / len(df)

# Apply Bayes' Theorem (0 when nobody is a high performer)
if P_high_perf > 0:
    P_attrition_given_high = (P_high_given_attrition * P_attrition) / P_high_perf
else:
    P_attrition_given_high = 0

print(f"P(Attrition) = {P_attrition:.4f}")
print(f"P(High Performance | Attrition) = {P_high_given_attrition:.4f}")
print(f"P(High Performance) = {P_high_perf:.4f}")
print(f"\nP(Attrition | High Performance >= 85) = {P_attrition_given_high:.4f}")
print(f"\nInterpretation: The probability of attrition given high performance is {P_attrition_given_high*100:.2f}%")

In [None]:
# Step 3 (continued): Hypothesis Testing
# One-way ANOVA on PerformanceScore with Department as the grouping factor,
# evaluated at a 0.05 significance level.
print("\n" + "="*80)
print("\nHYPOTHESIS TESTING - ANOVA Test")
print("H0: Mean performance score is the same across all departments")
print("H1: Mean performance score differs across departments\n")

# Per-department score lists (kept for inspection) and the arrays fed to ANOVA
departments = df.groupby('Department')['PerformanceScore'].apply(list)
dept_groups = []
for dept in df['Department'].unique():
    in_dept = df['Department'] == dept
    dept_groups.append(df[in_dept]['PerformanceScore'].values)

# f_oneway takes one positional array per group
f_statistic, p_value = stats.f_oneway(*dept_groups)

print(f"F-Statistic: {f_statistic:.4f}")
print(f"P-Value: {p_value:.4f}")
print("\nSignificance Level (alpha): 0.05")

# Pick the verdict/conclusion pair matching the test outcome
verdict, conclusion = (
    ("\nResult: REJECT the null hypothesis",
     "Conclusion: There is a significant difference in mean performance scores across departments.")
    if p_value < 0.05 else
    ("\nResult: FAIL TO REJECT the null hypothesis",
     "Conclusion: There is no significant difference in mean performance scores across departments.")
)
print(verdict)
print(conclusion)

print("\n" + "="*80)
print("\nPhase 1 Complete!\n")

In [None]:
# PHASE 2 - PREDICTIVE MODELING
print("="*80)
print("PHASE 2: PREDICTIVE MODELING")
print("="*80)

# Step 4: Feature Engineering and Encoding
# Builds `df_processed`: a copy of `df` with integer-encoded Attrition and
# Department columns and Min-Max scaled numerical features.
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

print("\nStep 4: Feature Engineering and Encoding\n")

# Work on a copy so the raw `df` stays available for reporting cells
df_processed = df.copy()

# Label Encoding for categorical variables.
# Each column gets its own encoder: a single shared encoder is overwritten by
# the second fit, which made the printed "Attrition" mapping show Department
# classes instead.
attrition_encoder = LabelEncoder()
department_encoder = LabelEncoder()
df_processed['Attrition_Encoded'] = attrition_encoder.fit_transform(df_processed['Attrition'])
df_processed['Department_Encoded'] = department_encoder.fit_transform(df_processed['Department'])
# Backward-compatible alias: `le` previously ended up fitted on Department
le = department_encoder

# LabelEncoder assigns each class its index in the sorted classes_ array
attrition_mapping = {label: code for code, label in enumerate(attrition_encoder.classes_)}
department_mapping = {label: code for code, label in enumerate(department_encoder.classes_)}

print("Encoding Completed:")
print(f"  - Attrition: {attrition_mapping}")
print(f"  - Department: {department_mapping}")

# Min-Max Scaling for numerical features.
# NOTE(review): the scaler is fitted on the full dataset before any train/test
# split, so test rows influence the scaling range; PerformanceScore is scaled
# too, so models that predict it report errors in scaled units.
scaler = MinMaxScaler()
numerical_features = ['Age', 'Salary', 'YearsAtCompany', 'PerformanceScore']
df_processed[numerical_features] = scaler.fit_transform(df_processed[numerical_features])

print(f"\nMin-Max Scaling applied to: {numerical_features}")
print(f"\nProcessed Dataset Shape: {df_processed.shape}")
print("\nFirst 5 rows of processed data:")
df_processed.head()

In [None]:
# Step 5: Employee Attrition Prediction Model
# Trains a Random Forest on the scaled/encoded features to predict the
# encoded Attrition label and reports standard classification metrics.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

print("\n" + "="*80)
print("Step 5: Employee Attrition Prediction Model")
print("="*80 + "\n")

# Prepare features and target
X = df_processed[['Age', 'Salary', 'YearsAtCompany', 'PerformanceScore', 'Department_Encoded']]
y = df_processed['Attrition_Encoded']

# Split the data. Stratifying keeps the attrition ratio equal in both splits,
# so the test set cannot end up without positive cases by chance. Stratified
# splitting needs at least two rows per class, hence the fallback to None.
stratify_on = y if y.value_counts().min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_on
)

print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")

# Train Random Forest Classifier
print("\nTraining Random Forest Classifier...")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate the model. zero_division=0 makes the "no positive predictions"
# case an explicit 0 score rather than relying on a (suppressed) warning.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='binary', zero_division=0)
recall = recall_score(y_test, y_pred, average='binary', zero_division=0)
f1 = f1_score(y_test, y_pred, average='binary', zero_division=0)

print("\nModel Performance:")
print(f"  Accuracy: {accuracy:.4f}")
print(f"  Precision: {precision:.4f}")
print(f"  Recall: {recall:.4f}")
print(f"  F1-Score: {f1:.4f}")

print("\nClassification Report:")
# labels=[0, 1] pins both classes so the two target_names always match, even
# if one class is absent from y_test and y_pred.
print(classification_report(y_test, y_pred, labels=[0, 1],
                            target_names=['No Attrition', 'Attrition'],
                            zero_division=0))

In [None]:
# Step 5 (continued): Confusion Matrix Visualization
# Heatmap of actual (rows) versus predicted (columns) attrition labels for the
# Random Forest test predictions.
print("\nConfusion Matrix:")
# labels=[0, 1] forces a 2x2 matrix in a fixed order, so it always lines up
# with the two hard-coded tick labels even if a class is missing from the
# test split or from the predictions.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['No Attrition', 'Attrition'],
            yticklabels=['No Attrition', 'Attrition'])
plt.title('Confusion Matrix - Attrition Prediction', fontsize=14, fontweight='bold')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.tight_layout()
plt.show()

print("\nStep 5 Complete!")

In [None]:
# Step 6: Employee Performance Prediction Model (Regression)
# Fits an ordinary least-squares model that predicts PerformanceScore from the
# remaining features of `df_processed`. The target column was Min-Max scaled
# in Step 4, so the error metrics below are expressed in scaled units.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

print("\n" + "="*80)
print("Step 6: Employee Performance Prediction Model")
print("="*80 + "\n")

# Features and regression target
perf_feature_names = ['Age', 'Salary', 'YearsAtCompany', 'Department_Encoded']
X_perf = df_processed[perf_feature_names]
y_perf = df_processed['PerformanceScore']

# 80/20 hold-out split with a fixed seed
perf_split = train_test_split(X_perf, y_perf, test_size=0.2, random_state=42)
X_train_perf, X_test_perf, y_train_perf, y_test_perf = perf_split

print(f"Training set size: {X_train_perf.shape}")
print(f"Testing set size: {X_test_perf.shape}")

# Fit the model and predict the held-out rows
print("\nTraining Linear Regression Model...")
lr_model = LinearRegression()
lr_model.fit(X_train_perf, y_train_perf)
y_pred_perf = lr_model.predict(X_test_perf)

# Regression metrics on the test split
r2 = r2_score(y_test_perf, y_pred_perf)
mse = mean_squared_error(y_test_perf, y_pred_perf)
mae = mean_absolute_error(y_test_perf, y_pred_perf)
rmse = np.sqrt(mse)

print("\nModel Performance:")
print(f"  R² Score: {r2:.4f}")
print(f"  Mean Squared Error (MSE): {mse:.6f}")
print(f"  Root Mean Squared Error (RMSE): {rmse:.6f}")
print(f"  Mean Absolute Error (MAE): {mae:.6f}")

In [None]:
# Step 6 (continued): Visualize Predicted vs Actual Performance
# Scatter of test-set targets against linear-regression predictions, with the
# identity line marking where a perfect prediction would fall.
print("\nVisualizing Predicted vs Actual Performance Scores...")
plt.figure(figsize=(10, 6))
plt.scatter(y_test_perf, y_pred_perf, alpha=0.6, edgecolors='k')

# Identity line spanning the observed range of actual scores
actual_range = [y_test_perf.min(), y_test_perf.max()]
plt.plot(actual_range, actual_range, 'r--', lw=2, label='Perfect Prediction')

plt.xlabel('Actual Performance Score', fontsize=12)
plt.ylabel('Predicted Performance Score', fontsize=12)
plt.title('Predicted vs Actual Performance Scores', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("\n" + "="*80)
print("Phase 2 Complete!\n")

In [None]:
# PHASE 3 - DEEP LEARNING MODELS
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

print("="*80)
print("PHASE 3: DEEP LEARNING MODELS")
print("="*80)

# Step 7: Deep Learning for Employee Performance Prediction
# Builds and compiles a small feed-forward regression network on the same
# train/test split used by the linear regression model.
print("\nStep 7: Deep Learning for Performance Prediction\n")

# Seed Python, NumPy and the backend so weight initialisation and dropout are
# repeatable, matching the random_state=42 used by the scikit-learn steps.
keras.utils.set_random_seed(42)

# Prepare data (using the same split as before); Keras is fed plain arrays
X_train_dl = X_train_perf.values
X_test_dl = X_test_perf.values
y_train_dl = y_train_perf.values
y_test_dl = y_test_perf.values

print(f"Training data shape: {X_train_dl.shape}")
print(f"Testing data shape: {X_test_dl.shape}")

# Build Feedforward Neural Network
print("\nBuilding Feedforward Neural Network...")
model_performance = Sequential([
    # Explicit Input object instead of the deprecated input_shape= layer kwarg
    keras.Input(shape=(X_train_dl.shape[1],)),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(16, activation='relu'),
    Dense(1)  # Single linear unit: output layer for regression
])

# Compile the model: MSE loss, MAE tracked as an additional metric
model_performance.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='mean_squared_error',
    metrics=['mae']
)

print("\nModel Architecture:")
model_performance.summary()

In [None]:
# Step 7 (continued): Train the model
# Fits the regression network silently, then reports loss, MAE and R² on the
# held-out test split.
print("\nTraining the Neural Network...")
perf_fit_options = dict(validation_split=0.2, epochs=50, batch_size=8, verbose=0)
history = model_performance.fit(X_train_dl, y_train_dl, **perf_fit_options)

print("Training completed!")

# evaluate() returns [loss, mae] in the order given to compile()
test_loss, test_mae = model_performance.evaluate(X_test_dl, y_test_dl, verbose=0)
print("\nTest Performance:")
print(f"  Test Loss (MSE): {test_loss:.6f}")
print(f"  Test MAE: {test_mae:.6f}")

# R² from the network's test-set predictions
y_pred_dl = model_performance.predict(X_test_dl, verbose=0)
r2_dl = r2_score(y_test_dl, y_pred_dl)
print(f"  R² Score: {r2_dl:.4f}")

In [None]:
# Step 8: Employee Attrition Analysis with Deep Learning
# Builds and compiles a small binary classifier on the same train/test split
# used by the Random Forest model.
print("\n" + "="*80)
print("Step 8: Employee Attrition Prediction using Deep Learning")
print("="*80 + "\n")

# Seed Python, NumPy and the backend so this cell is repeatable on its own
keras.utils.set_random_seed(42)

# Prepare data for classification; Keras is fed plain arrays
X_train_attr = X_train.values
X_test_attr = X_test.values
y_train_attr = y_train.values
y_test_attr = y_test.values

print(f"Training data shape: {X_train_attr.shape}")
print(f"Testing data shape: {X_test_attr.shape}")

# Build Neural Network for Classification
print("\nBuilding Neural Network for Attrition Prediction...")
model_attrition = Sequential([
    # Explicit Input object instead of the deprecated input_shape= layer kwarg
    keras.Input(shape=(X_train_attr.shape[1],)),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(16, activation='relu'),
    Dropout(0.3),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid')  # Binary classification
])

# Compile the model.
# Precision/Recall are passed as metric objects: the lowercase string names
# are not resolvable by every Keras version, whereas the classes always are.
# The order (accuracy, precision, recall) fixes the layout of evaluate()'s
# result list: [loss, accuracy, precision, recall].
model_attrition.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
    ]
)

print("\nModel Architecture:")
model_attrition.summary()

In [None]:
# Step 8 (continued): Train and Evaluate
# Fits the attrition network silently and reports its test-set metrics.
print("\nTraining the Attrition Prediction Model...")
attr_fit_options = dict(validation_split=0.2, epochs=50, batch_size=8, verbose=0)
history_attr = model_attrition.fit(X_train_attr, y_train_attr, **attr_fit_options)

print("Training completed!")

# evaluate() returns the loss followed by the compiled metrics, in order
test_results = model_attrition.evaluate(X_test_attr, y_test_attr, verbose=0)
metric_captions = ["Test Loss", "Test Accuracy", "Test Precision", "Test Recall"]
print("\nTest Performance:")
for position, caption in enumerate(metric_captions):
    print(f"  {caption}: {test_results[position]:.4f}")

# F1 is the harmonic mean of precision and recall; 0 when both are 0
dl_precision = test_results[2]
dl_recall = test_results[3]
if (dl_precision + dl_recall) > 0:
    f1_dl = 2 * (dl_precision * dl_recall) / (dl_precision + dl_recall)
else:
    f1_dl = 0
print(f"  F1-Score: {f1_dl:.4f}")

print("\n" + "="*80)
print("Phase 3 Complete!\n")

In [None]:
# PHASE 4 - REPORTING AND INSIGHTS
print("="*80)
print("PHASE 4: REPORTING AND INSIGHTS")
print("="*80)

# Step 9: Insights and Recommendations
# Summarises what drives attrition according to the Random Forest, which
# departments lose the most people, and how performance scores are spread.
print("\nStep 9: Insights and Recommendations\n")
print("="*80)

print("\nKEY FINDINGS:\n")
# rf_model was trained to predict attrition, so its importances describe
# attrition drivers, not performance drivers.
print("1. KEY ATTRITION DRIVERS:")

# Take the feature names from the fitted model (falling back to the training
# frame's columns) so they cannot drift out of sync with the importances.
importance_names = [
    str(name).replace('_Encoded', '')
    for name in getattr(rf_model, 'feature_names_in_', X.columns)
]
feature_importance = pd.DataFrame({
    'Feature': importance_names,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)

print("\n   Top factors contributing to employee attrition (Random Forest importance):")
for idx, row in feature_importance.iterrows():
    print(f"   - {row['Feature']}: {row['Importance']:.4f}")

print("\n2. HIGH-RISK DEPARTMENTS FOR ATTRITION:")
# Percentage of 'Yes' labels per department, highest first
attrition_by_dept = df.groupby('Department')['Attrition'].apply(
    lambda x: (x == 'Yes').sum() / len(x) * 100
).sort_values(ascending=False)

for dept, rate in attrition_by_dept.items():
    # Risk bands: above 40% HIGH, above 25% MEDIUM, otherwise LOW
    risk_level = "HIGH" if rate > 40 else "MEDIUM" if rate > 25 else "LOW"
    print(f"   - {dept}: {rate:.2f}% attrition rate [{risk_level} RISK]")

print("\n3. PERFORMANCE SCORE DISTRIBUTION:")
perf_stats = df['PerformanceScore'].describe()
# Uses the unscaled scores in `df`; each head-count is computed once
n_high_performers = int((df['PerformanceScore'] >= 85).sum())
n_low_performers = int((df['PerformanceScore'] < 70).sum())
print(f"   - Average Performance Score: {perf_stats['mean']:.2f}")
print(f"   - High Performers (>= 85): {n_high_performers} employees ({n_high_performers/len(df)*100:.1f}%)")
print(f"   - Low Performers (< 70): {n_low_performers} employees ({n_low_performers/len(df)*100:.1f}%)")

In [None]:
# Step 9 (continued): Recommendations
# Prints per-department average performance (flagging averages below 80)
# followed by fixed recommendation lists and the headline model scores.
print("\n" + "="*80)
print("\nRECOMMENDATIONS FOR IMPROVEMENT:\n")

print("1. Department-wise Performance Improvement Plans:")
for dept in df['Department'].unique():
    dept_data = df[df['Department'] == dept]
    avg_perf = dept_data['PerformanceScore'].mean()
    print(f"   - {dept}: Average performance {avg_perf:.2f}")
    needs_training = avg_perf < 80
    if needs_training:
        print("     → Implement targeted training and development programs")

# Static recommendation sections: (heading, bullet lines)
static_sections = [
    ("\n2. Targeted Employee Engagement Programs:", [
        "   - Focus on high-risk departments with attrition rates > 30%",
        "   - Develop mentorship programs for employees with < 5 years experience",
        "   - Create career development paths for high performers",
    ]),
    ("\n3. Retention Strategies:", [
        "   - Regular performance reviews and feedback sessions",
        "   - Competitive compensation packages for high performers",
        "   - Work-life balance initiatives",
        "   - Recognition and rewards programs",
    ]),
]
for heading, bullets in static_sections:
    print(heading)
    for bullet in bullets:
        print(bullet)

# Model-backed recommendations quote the Random Forest accuracy and the
# linear-regression R² computed earlier in the notebook
print("\n4. Predictive Model Integration:")
print(f"   - Use attrition prediction model (Accuracy: {accuracy:.2%}) to identify at-risk employees")
print(f"   - Use performance prediction model (R²: {r2:.2f}) for workforce planning")
print("   - Implement early warning system for potential attrition")

In [None]:
# Step 10: Data Visualization and Reporting
# Draws a 2x2 dashboard: performance by tenure, attrition by department,
# salary versus performance, and a comparison of the four trained models.
print("\n" + "="*80)
print("Step 10: Data Visualization and Reporting")
print("="*80 + "\n")

# Create comprehensive visualizations
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Line Plot - Performance Trends by Years at Company
ax1 = axes[0, 0]
perf_by_years = df.groupby('YearsAtCompany')['PerformanceScore'].mean().sort_index()
ax1.plot(perf_by_years.index, perf_by_years.values, marker='o', linewidth=2, markersize=8)
ax1.set_title('Performance Score Trends by Years at Company', fontsize=14, fontweight='bold')
ax1.set_xlabel('Years at Company')
ax1.set_ylabel('Average Performance Score')
ax1.grid(True, alpha=0.3)

# 2. Bar Chart - Attrition by Department
ax2 = axes[0, 1]
# Stored under its own name so the `attrition_counts` Series computed in
# Step 3 is not silently replaced by a DataFrame. The columns are pinned to
# ['No', 'Yes'] so the green/red colours and the legend labels always line up
# with the right group, even when one label does not occur.
dept_attrition_counts = (
    df.groupby(['Department', 'Attrition']).size()
    .unstack(fill_value=0)
    .reindex(columns=['No', 'Yes'], fill_value=0)
)
dept_attrition_counts.plot(kind='bar', ax=ax2, color=['#2ecc71', '#e74c3c'])
ax2.set_title('Attrition Distribution by Department', fontsize=14, fontweight='bold')
ax2.set_xlabel('Department')
ax2.set_ylabel('Number of Employees')
ax2.legend(['No Attrition', 'Attrition'])
ax2.tick_params(axis='x', rotation=45)

# 3. Scatter Plot - Salary vs Performance
ax3 = axes[1, 0]
for dept in df['Department'].unique():
    dept_data = df[df['Department'] == dept]
    ax3.scatter(dept_data['Salary'], dept_data['PerformanceScore'],
               label=dept, alpha=0.6, s=100)
ax3.set_title('Salary vs Performance Score by Department', fontsize=14, fontweight='bold')
ax3.set_xlabel('Salary')
ax3.set_ylabel('Performance Score')
ax3.legend()
ax3.grid(True, alpha=0.3)

# 4. Model Performance Comparison
ax4 = axes[1, 1]
models = ['Random Forest\n(Attrition)', 'Linear Regression\n(Performance)', 'Deep Learning\n(Performance)', 'Deep Learning\n(Attrition)']
scores = [accuracy, r2, r2_dl, test_results[1]]
colors = ['#3498db', '#2ecc71', '#e67e22', '#9b59b6']
ax4.bar(models, scores, color=colors, alpha=0.7)
ax4.set_title('Model Performance Comparison', fontsize=14, fontweight='bold')
ax4.set_ylabel('Score (Accuracy/R²)')
# R² can be negative for a model worse than predicting the mean, so the lower
# limit follows the data instead of being fixed at 0; the upper limit leaves
# headroom for the value labels above bars close to 1.
ax4.set_ylim([min(0, min(scores) - 0.1), 1.1])
ax4.axhline(0, color='black', linewidth=0.8)
for i, v in enumerate(scores):
    # Put the label just outside the bar: above positive, below negative bars
    if v >= 0:
        ax4.text(i, v + 0.02, f'{v:.3f}', ha='center', va='bottom', fontweight='bold')
    else:
        ax4.text(i, v - 0.02, f'{v:.3f}', ha='center', va='top', fontweight='bold')

plt.tight_layout()
plt.show()

# Final project summary. The "best model" lines are derived from the scores
# computed earlier rather than hard-coded, so they stay true whichever model
# actually wins on a given run.
attrition_model_scores = {'Random Forest': accuracy, 'Deep Learning': test_results[1]}
performance_model_scores = {'Linear Regression': r2, 'Deep Learning': r2_dl}
# max() keeps the first entry on ties, i.e. the simpler classical model
best_attrition_model = max(attrition_model_scores, key=attrition_model_scores.get)
best_performance_model = max(performance_model_scores, key=performance_model_scores.get)

print("\n" + "="*80)
print("\n✅ ALL PHASES COMPLETED SUCCESSFULLY!")
print("\n" + "="*80)
print("\nPROJECT SUMMARY:")
print(f"  • Total Employees Analyzed: {len(df)}")
print(f"  • Features Examined: Age, Salary, Years at Company, Performance, Department")
print(f"  • Attrition Rate: {prob_attrition*100:.2f}%")
print(f"  • Best Attrition Model: {best_attrition_model} (Accuracy: {attrition_model_scores[best_attrition_model]:.2%})")
print(f"  • Best Performance Model: {best_performance_model} (R²: {performance_model_scores[best_performance_model]:.4f})")
print("\n" + "="*80)