# Task 5: Flight Risk Identification

This notebook implements **Task 5** of the Employee Sentiment Analysis project. The objective is to identify employees who are at risk of leaving based on their messaging patterns.

## Flight Risk Definition:
- **Flight Risk**: Any employee who has sent **4 or more negative messages** in any **30-day rolling period**
- **Rolling Window**: The 30-day period is calculated as a rolling count, irrespective of month boundaries
- **Criteria**: Based on message count, not cumulative score

## Requirements:
- Implement 30-day rolling window analysis
- Identify all employees meeting the flight risk criteria
- Provide a robust flagging process
- Extract a comprehensive list of at-risk employees

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Read the sentiment-labelled emails and parse their timestamps in one step
df = pd.read_csv('../data/processed/email_data_with_sentiment.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

# Keep only the negative messages, ordered by sender and then chronologically
negative_messages = (
    df.loc[df['sentiment_final'].eq('Negative')]
    .sort_values(by=['from', 'date'])
    .copy()
)

# Quick sanity summary of what the flight-risk scan will work on
print(f"Total negative messages: {len(negative_messages):,}")
print(f"Employees with negative messages: {negative_messages['from'].nunique():,}")

def identify_flight_risk_employees(df_negative, window_days=30, threshold=4):
    """
    Flag employees whose negative messages cluster inside a rolling window.

    An employee is a flight risk when at least ``threshold`` of their rows in
    ``df_negative`` fall inside some half-open interval
    ``[start, start + window_days)`` that begins at one of their own message
    dates.

    Args:
        df_negative: DataFrame of negative messages with a 'from' column
            (employee identifier) and a datetime 'date' column. Rows with a
            missing employee or a missing date are ignored.
        window_days: Length of the rolling window in days (default 30).
        threshold: Minimum number of messages inside one window for the
            employee to be flagged (default 4).

    Returns:
        A tuple ``(flight_risk_employees, flight_risk_details)``:
        the set of flagged employees, and a list holding one dict per flagged
        employee (in order of first appearance in ``df_negative``) that
        describes the employee's busiest window, earliest one on ties. Keys:
        'employee', 'risk_period_start', 'risk_period_end' (exclusive bound,
        i.e. start + window_days), 'negative_message_count' and 'risk_level'
        ('HIGH' for 6 or more messages, otherwise 'MEDIUM').
    """
    flight_risk_employees = set()
    flight_risk_details = []
    window = timedelta(days=window_days)

    # sort=False keeps employees in order of first appearance
    for employee, employee_data in df_negative.groupby('from', sort=False):
        dates = employee_data['date'].dropna().sort_values().tolist()

        # Two-pointer sweep: `end` only moves forward because the dates are
        # sorted, so every candidate window is counted in one linear pass.
        best_count = 0
        best_start = None
        end = 0
        for start, start_date in enumerate(dates):
            limit = start_date + window
            end = max(end, start)  # guards against a non-positive window
            while end < len(dates) and dates[end] < limit:
                end += 1

            count = end - start
            # Strict '>' keeps the earliest window when several tie
            if count > best_count:
                best_count = count
                best_start = start_date

        if best_start is None or best_count < threshold:
            continue

        flight_risk_employees.add(employee)

        # Report the peak window so the count and the risk level reflect the
        # employee's worst 30-day stretch, not merely the first one that
        # happened to reach the threshold.
        flight_risk_details.append({
            'employee': employee,
            'risk_period_start': best_start.date(),
            'risk_period_end': (best_start + window).date(),
            'negative_message_count': best_count,
            'risk_level': 'HIGH' if best_count >= 6 else 'MEDIUM'
        })

    return flight_risk_employees, flight_risk_details

# Run the rolling-window scan over the negative messages
flight_risk_set, risk_details = identify_flight_risk_employees(negative_messages)

print("\n=== FLIGHT RISK ANALYSIS RESULTS ===")
print(f"Employees identified as flight risk: {len(flight_risk_set)}")

if flight_risk_set:
    print("\n⚠️ FLIGHT RISK EMPLOYEES:")
    flight_risk_df = pd.DataFrame(risk_details)

    for i, employee in enumerate(sorted(flight_risk_set), 1):
        employee_risks = flight_risk_df[flight_risk_df['employee'] == employee]

        # Take the count and the level from the same (worst) row so the two
        # printed lines can never describe different risk periods.
        worst_period = employee_risks.loc[
            employee_risks['negative_message_count'].idxmax()
        ]
        max_negative_count = worst_period['negative_message_count']
        risk_level = worst_period['risk_level']

        print(f"{i}. {employee}")
        print(f"   - Max negative messages in 30 days: {max_negative_count}")
        print(f"   - Risk Level: {risk_level}")

        # List every recorded risk period for this employee
        for _, risk in employee_risks.iterrows():
            print(f"   - Risk Period: {risk['risk_period_start']} to {risk['risk_period_end']} ({risk['negative_message_count']} negative messages)")
        print()

    # Persist the per-employee risk periods
    flight_risk_df.to_csv('../data/processed/flight_risk_employees.csv', index=False)

    # One-row summary consumed by the final report
    flight_risk_summary = {
        'total_flight_risk_employees': len(flight_risk_set),
        'flight_risk_employees': sorted(flight_risk_set),
        'analysis_date': datetime.now().strftime('%Y-%m-%d'),
        'criteria': '4+ negative messages in any 30-day rolling period'
    }

    pd.DataFrame([flight_risk_summary]).to_csv('../data/processed/flight_risk_summary.csv', index=False)

    print("Flight risk analysis saved to:")
    print("- '../data/processed/flight_risk_employees.csv'")
    print("- '../data/processed/flight_risk_summary.csv'")

else:
    print("✅ No employees identified as flight risk based on current criteria.")

print("\n" + "="*60)
print("Task 5 completed successfully!")
print("Next: Run Task 6 (Predictive Modeling)")
print("="*60)