## INSIDER THREAT DETECTION SYSTEM
WITH ACCURACY CALCULATION & COMPLETE DATASET TRAINING

In [None]:

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')



## 1. DATA LOADING & PREPROCESSING

In [None]:

# Stage banner: rule, title, rule -- emitted with a single call.
print("=" * 60, "STEP 1: LOADING AND PREPROCESSING DATA", "=" * 60, sep="\n")

def load_and_preprocess_data():
    """
    Load the email dataset and derive the per-email columns used downstream.

    Reads ``email.csv`` from the current working directory. If the file does
    not exist, falls back to ``create_sample_data()``. The same preprocessing
    is applied on both paths so that the returned frame always carries the
    derived columns (``hour``, ``is_weekend``, ``is_after_hours``, ...) that
    the feature-engineering step reads.

    Returns:
        pandas.DataFrame: the loaded (or generated) rows plus the derived
        columns ``from_domain``, ``to_domain``, ``hour``, ``day_of_week``,
        ``is_weekend``, ``is_after_hours`` and, when a ``content`` column is
        present, ``content_length``.
    """
    try:
        # Load the dataset
        df = pd.read_csv('email.csv')
    except FileNotFoundError:
        print("‚úó File 'email.csv' not found. Creating sample data for demonstration...")
        # Create sample data for demonstration
        df = create_sample_data()
    else:
        print(f"‚úì Data loaded successfully: {df.shape[0]} rows, {df.shape[1]} columns")

        # Display initial info
        print(f"\nInitial data info:")
        print(f"Columns: {list(df.columns)}")
        print(f"First few rows:\n{df.head()}")

    # Convert date to datetime. Generated sample data is already datetime64,
    # so only string-typed dates are parsed (unparseable values become NaT).
    if not pd.api.types.is_datetime64_any_dtype(df['date']):
        df['date'] = pd.to_datetime(df['date'], format='%d/%m/%Y %H:%M:%S', errors='coerce')

    # Check for missing values
    print(f"\nMissing values per column:")
    print(df.isnull().sum())

    # Fill missing values
    if 'attachments' in df.columns:
        df['attachments'] = df['attachments'].fillna(0)
    if 'size' in df.columns:
        df['size'] = df['size'].fillna(df['size'].median())

    # Extract email domains for analysis (text after the last '@')
    df['from_domain'] = df['from'].str.split('@').str[-1]
    df['to_domain'] = df['to'].str.split('@').str[-1]

    # Extract time-based features
    df['hour'] = df['date'].dt.hour
    df['day_of_week'] = df['date'].dt.dayofweek
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    # "After hours" means strictly before 08:00 or from 19:00 onwards.
    df['is_after_hours'] = ((df['hour'] < 8) | (df['hour'] > 18)).astype(int)

    # Create content length feature (the column is optional downstream too)
    if 'content' in df.columns:
        df['content_length'] = df['content'].str.len()

    return df

def create_sample_data(n_samples=5000, seed=42):
    """
    Create synthetic email records for use when real data is not available.

    Args:
        n_samples: number of email rows to generate (default 5000).
        seed: value passed to ``np.random.seed`` so the output is repeatable.
            Note this reseeds NumPy's global random state.

    Returns:
        pandas.DataFrame with the columns ``id``, ``date``, ``user``, ``pc``,
        ``to``, ``cc``, ``bcc``, ``from``, ``size``, ``attachments`` and
        ``content``. Dates are hourly starting at 2023-01-01.
    """
    np.random.seed(seed)

    # Generate sample data. The random draws happen in dict-literal order, so
    # keep the key order stable to keep the generated values stable.
    data = {
        'id': [f'R3I7-S4TX96FG-{i:04d}' for i in range(n_samples)],
        # Lower-case 'h' is the supported hourly alias; upper-case 'H' is deprecated.
        'date': pd.date_range('2023-01-01', periods=n_samples, freq='h'),
        'user': np.random.choice([f'EMP{str(i).zfill(3)}' for i in range(1, 101)], n_samples),
        'pc': np.random.choice([f'PC-{i}' for i in range(1001, 1021)], n_samples),
        'to': [f'user{np.random.randint(1, 50)}@domain.com' for _ in range(n_samples)],
        'cc': [''] * n_samples,
        'bcc': [''] * n_samples,
        'from': [f'employee{np.random.randint(1, 101)}@company.com' for _ in range(n_samples)],
        'size': np.random.exponential(5000, n_samples).astype(int),
        'attachments': np.random.binomial(1, 0.3, n_samples),
        'content': ['sample email content ' * np.random.randint(1, 10) for _ in range(n_samples)]
    }

    df = pd.DataFrame(data)

    # Pick a handful of users to report as "potential malicious".
    # NOTE(review): the selection is only counted in the message below; no
    # anomalous behaviour is injected into their rows.
    unique_users = df['user'].unique()
    # Cap the draw so a small n_samples cannot ask for more users than exist.
    malicious_users = np.random.choice(unique_users, min(5, len(unique_users)), replace=False)
    print(f"Generated sample data with {len(malicious_users)} potential malicious users")

    return df

# Load the data
# Runs when the cell/module executes; the resulting module-level `df` is the
# frame later passed to engineer_features().
df = load_and_preprocess_data()



## 2. FEATURE ENGINEERING (UPDATED FOR ALL USERS)

In [None]:

# Stage banner: blank line, rule, title, rule -- emitted with a single call.
print("\n" + "=" * 60, "STEP 2: FEATURE ENGINEERING (TRAINING ON ALL USERS)", "=" * 60, sep="\n")

def convert_numpy_types(obj):
    """
    Recursively replace NumPy values with built-in Python equivalents.

    Used before ``json.dump``, which cannot serialise NumPy scalars or arrays.

    Args:
        obj: any value; dicts, lists and tuples are walked recursively.

    Returns:
        A structure of the same shape where NumPy integers, floats and
        booleans become ``int``/``float``/``bool`` and NumPy arrays become
        (nested) lists. Dict keys that are NumPy scalars are converted as
        well. Anything else is returned unchanged.
    """
    if isinstance(obj, dict):
        return {
            (k.item() if isinstance(k, np.generic) else k): convert_numpy_types(v)
            for k, v in obj.items()
        }
    if isinstance(obj, list):
        return [convert_numpy_types(v) for v in obj]
    if isinstance(obj, tuple):
        # Keep tuples as tuples so callers see the same container type.
        return tuple(convert_numpy_types(v) for v in obj)
    if isinstance(obj, np.ndarray):
        # tolist() already yields built-in scalars; recurse for object arrays.
        return convert_numpy_types(obj.tolist())
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    return obj

def engineer_features(df, window_days=7):
    """
    Build one row of behavioural features per user.

    Users with fewer than 5 emails are skipped. Expects ``df`` to carry the
    columns ``user``, ``date``, ``size``, ``attachments``, ``to``, ``hour``,
    ``is_after_hours`` and ``is_weekend``; ``content`` is optional.

    Args:
        df: per-email DataFrame.
        window_days: length of the "recent" window, counted back from each
            user's latest email.

    Returns:
        pandas.DataFrame with one row per retained user. Missing numeric
        values (for example ``size_change_ratio`` for users with too few
        emails) are filled with the column median.
    """
    features_list = []
    window = timedelta(days=window_days)
    has_content = 'content' in df.columns

    # Get ALL unique users
    users = df['user'].unique()
    print(f"Processing ALL {len(users)} unique users...")

    # groupby(sort=False) visits users in order of first appearance, matching
    # unique(), without re-scanning the whole frame once per user.
    for user, user_df in df.groupby('user', sort=False):
        if len(user_df) < 5:  # Skip users with very few emails
            continue

        user_df = user_df.sort_values('date')
        first_seen = user_df['date'].min()
        last_seen = user_df['date'].max()
        cutoff = last_seen - window

        # Time-based features
        recent_emails = user_df[user_df['date'] > cutoff]

        # Basic email statistics
        features = {
            'user': user,
            'total_emails': len(user_df),
            # max(1, ...) avoids dividing by zero when all emails fall within one day
            'avg_emails_per_day': len(user_df) / max(1, (last_seen - first_seen).days),
            'recent_email_count': len(recent_emails),
            'avg_email_size': user_df['size'].mean(),
            'max_email_size': user_df['size'].max(),
            'attachment_rate': user_df['attachments'].mean(),
        }

        # Time pattern features
        features['after_hours_ratio'] = user_df['is_after_hours'].mean()
        features['weekend_ratio'] = user_df['is_weekend'].mean()

        # Recipient pattern features
        unique_recipients = user_df['to'].nunique()
        features['recipient_diversity'] = unique_recipients / max(1, len(user_df))

        # Content-based features
        if has_content:
            content_lengths = user_df['content'].str.len()
            features['content_length_std'] = content_lengths.std()
            features['avg_content_length'] = content_lengths.mean()
        else:
            features['content_length_std'] = 0
            features['avg_content_length'] = 0

        # Temporal patterns
        email_hours = user_df['hour']
        if len(email_hours) > 1:
            features['hour_std'] = email_hours.std()
            features['unusual_hour_emails'] = ((email_hours < 6) | (email_hours > 20)).sum() / len(email_hours)

        # Change detection features: recent vs. historical mean email size.
        # Left unset (later median-filled) for users with at most window_days emails.
        if len(user_df) > window_days:
            recent_avg = recent_emails['size'].mean()
            historical_avg = user_df[user_df['date'] <= cutoff]['size'].mean()
            features['size_change_ratio'] = abs(recent_avg - historical_avg) / max(1, historical_avg)

        features_list.append(features)

    features_df = pd.DataFrame(features_list)

    # Handle NaN values. numeric_only=True is required because the frame also
    # holds the string 'user' column, which has no median.
    features_df = features_df.fillna(features_df.median(numeric_only=True))

    print(f"‚úì Engineered {len(features_df)} feature vectors (ALL users processed)")
    print(f"Features created: {list(features_df.columns)}")

    return features_df

# Create features for ALL users
# Uses the default window_days=7; the result is one row per retained user.
features_df = engineer_features(df)



## 3. ANOMALY DETECTION MODEL (WITH GROUND TRUTH SIMULATION)

In [None]:

# Stage banner: blank line, rule, title, rule -- emitted with a single call.
print("\n" + "=" * 60, "STEP 3: BUILDING ANOMALY DETECTION MODEL WITH ACCURACY CALCULATION", "=" * 60, sep="\n")

from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns

def create_ground_truth(features_df, num_malicious=5, top_k=3):
    """
    Create simulated ground truth for accuracy calculation.

    In a real scenario the labels would come from labelled data such as
    security incident reports. Here the ``top_k`` users with the largest
    ``after_hours_ratio``, ``max_email_size`` and ``attachment_rate`` are
    collected (in that order), de-duplicated, and the first ``num_malicious``
    of them are labelled malicious.

    NOTE(review): the labels are derived from the same features the model is
    trained on, so accuracy figures computed against them are optimistic.

    Args:
        features_df: per-user feature frame; modified in place by adding a
            ``ground_truth`` column (0 = normal, 1 = malicious).
        num_malicious: maximum number of users to label malicious.
        top_k: how many top users to take from each feature.

    Returns:
        tuple: ``(features_df, extreme_users)`` where ``extreme_users`` is the
        list of user ids labelled malicious.
    """
    features_df['ground_truth'] = 0  # 0 = normal, 1 = malicious

    # Collect the most extreme users for each indicator, in a fixed order.
    candidates = []
    for column in ('after_hours_ratio', 'max_email_size', 'attachment_rate'):
        candidates.extend(features_df.nlargest(top_k, column)['user'].tolist())

    # De-duplicate while preserving order. A plain set() would make the
    # selection depend on string-hash ordering, which differs between runs.
    extreme_users = list(dict.fromkeys(candidates))[:num_malicious]

    # Mark as malicious
    features_df.loc[features_df['user'].isin(extreme_users), 'ground_truth'] = 1

    print(f"Simulated {len(extreme_users)} malicious users for accuracy testing")
    print(f"Malicious users: {extreme_users}")

    return features_df, extreme_users

def build_anomaly_detection_model(features_df, contamination=0.1):
    """
    Fit an Isolation Forest on the per-user features and score every user.

    Args:
        features_df: per-user feature frame. Modified in place: the columns
            ``anomaly_score`` (``decision_function`` output) and
            ``predicted_anomaly`` (1 = anomaly, 0 = normal) are added or
            overwritten.
        contamination: value passed to ``IsolationForest(contamination=...)``.

    Returns:
        tuple: ``(features_df, iso_forest, scaler, X_scaled)`` -- the scored
        frame, the fitted model, the fitted ``StandardScaler`` and the scaled
        feature matrix the model was trained on.
    """
    # Prepare feature matrix. Besides the identifier and the label, drop the
    # output columns of a previous run: when this cell is re-executed they are
    # already present and would otherwise be fed back in as features.
    non_feature_cols = ['user', 'ground_truth', 'anomaly_score', 'predicted_anomaly']
    X = features_df.drop(columns=non_feature_cols, errors='ignore')

    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Train Isolation Forest
    print("Training Isolation Forest model on complete dataset...")
    iso_forest = IsolationForest(
        n_estimators=200,
        contamination=contamination,
        random_state=42,
        verbose=0,
        max_samples=0.8
    )

    iso_forest.fit(X_scaled)

    # Predict anomalies: predict() returns 1 for inliers and -1 for outliers.
    predictions = iso_forest.predict(X_scaled)
    anomaly_scores = iso_forest.decision_function(X_scaled)

    # Add results to dataframe, converting {1, -1} to {0, 1}.
    features_df['anomaly_score'] = anomaly_scores
    features_df['predicted_anomaly'] = predictions
    features_df['predicted_anomaly'] = features_df['predicted_anomaly'].map({1: 0, -1: 1})  # Convert to 0/1

    print(f"‚úì Model trained successfully on {len(features_df)} users")
    print(f"Anomalies detected: {features_df['predicted_anomaly'].sum()} out of {len(features_df)} users")

    return features_df, iso_forest, scaler, X_scaled

# Create simulated ground truth for accuracy calculation
# (adds a 'ground_truth' column to features_df and returns the labelled users)
features_df, malicious_users = create_ground_truth(features_df, num_malicious=5)

# Build the model on COMPLETE dataset
# contamination=0.08 overrides the function default of 0.1.
features_df, model, scaler, X_scaled = build_anomaly_detection_model(features_df, contamination=0.08)



## 4. ACCURACY CALCULATION & MODEL EVALUATION

In [None]:

# Stage banner: blank line, rule, title, rule -- emitted with a single call.
print("\n" + "=" * 60, "STEP 4: MODEL ACCURACY CALCULATION & EVALUATION", "=" * 60, sep="\n")

def calculate_model_accuracy(features_df):
    """
    Calculate and print accuracy metrics for the model.

    Compares the ``predicted_anomaly`` column against ``ground_truth``.

    Args:
        features_df: scored per-user frame with ``user``, ``anomaly_score``,
            ``predicted_anomaly`` and (optionally) ``ground_truth`` columns.

    Returns:
        dict with ``accuracy``, ``precision``, ``recall``, ``f1_score`` and a
        ``confusion_matrix`` dict (``TP``/``FP``/``TN``/``FN`` as plain ints),
        or ``None`` when no ``ground_truth`` column is present.
    """
    if 'ground_truth' not in features_df.columns:
        print("‚ö†Ô∏è Ground truth not available. Cannot calculate accuracy.")
        return None

    # Get predictions and ground truth
    y_true = features_df['ground_truth']
    y_pred = features_df['predicted_anomaly']

    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    # Confusion matrix cells as boolean masks, reused for the listings below.
    tp_mask = (y_pred == 1) & (y_true == 1)
    fp_mask = (y_pred == 1) & (y_true == 0)
    tn_mask = (y_pred == 0) & (y_true == 0)
    fn_mask = (y_pred == 0) & (y_true == 1)

    # Plain ints keep the returned dict JSON-friendly.
    tp = int(tp_mask.sum())
    fp = int(fp_mask.sum())
    tn = int(tn_mask.sum())
    fn = int(fn_mask.sum())

    # Guard the rates: with no positives (or no negatives) in the ground
    # truth the denominator is zero.
    detection_rate = tp / (tp + fn) if (tp + fn) else 0.0
    false_alarm_rate = fp / (fp + tn) if (fp + tn) else 0.0

    print("\n" + "=" * 50)
    print("MODEL ACCURACY METRICS")
    print("=" * 50)
    print(f"‚úì Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(f"‚úì Precision: {precision:.4f} ({precision*100:.2f}%)")
    print(f"‚úì Recall:    {recall:.4f} ({recall*100:.2f}%)")
    print(f"‚úì F1-Score:  {f1:.4f} ({f1*100:.2f}%)")

    print(f"\n‚úì Confusion Matrix:")
    print(f"  True Positives (TP):  {tp:3d}  | Predicted malicious and actually malicious")
    print(f"  False Positives (FP): {fp:3d}  | Predicted malicious but actually normal")
    print(f"  True Negatives (TN):  {tn:3d}  | Predicted normal and actually normal")
    print(f"  False Negatives (FN): {fn:3d}  | Predicted normal but actually malicious")

    print(f"\n‚úì Detection Rate: {tp}/{tp+fn} = {detection_rate:.2%}")
    print(f"‚úì False Alarm Rate: {fp}/{fp+tn} = {false_alarm_rate:.2%}")

    # Detailed analysis
    print(f"\n" + "-" * 50)
    print("DETAILED ANALYSIS")
    print("-" * 50)

    def _print_users(rows):
        # One line per user with the model's anomaly score.
        for _, row in rows.iterrows():
            print(f"  - User: {row['user']} | Score: {row['anomaly_score']:.4f}")

    # Users correctly detected as malicious
    correct_detections = features_df[tp_mask]
    print(f"‚úì Correctly detected malicious users ({len(correct_detections)}):")
    _print_users(correct_detections)

    # False negatives (missed malicious users)
    false_negatives = features_df[fn_mask]
    if len(false_negatives) > 0:
        print(f"\n‚ö†Ô∏è Missed malicious users ({len(false_negatives)}):")
        _print_users(false_negatives)

    # False positives (normal users flagged as malicious)
    false_positives = features_df[fp_mask]
    if len(false_positives) > 0:
        print(f"\n‚ö†Ô∏è False alarms ({len(false_positives)} normal users flagged):")
        _print_users(false_positives.head(5))  # Show the first 5 only

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'confusion_matrix': {'TP': tp, 'FP': fp, 'TN': tn, 'FN': fn}
    }

# Calculate accuracy metrics
# accuracy_metrics is None when features_df has no 'ground_truth' column.
accuracy_metrics = calculate_model_accuracy(features_df)

def evaluate_and_visualize(features_df, X_scaled, accuracy_metrics):
    """
    Evaluate model performance and create visualizations.

    Prints a summary of the predictions, ranks features by how far the mean
    of the flagged users is from the mean of the normal users, and draws a
    2x3 figure (score histogram, PCA scatter, feature comparison, confusion
    matrix, metric bars, ROC curve) that is saved to
    ``insider_threat_analysis_with_accuracy.png`` and then shown.

    Args:
        features_df: scored per-user frame (``user``, ``anomaly_score``,
            ``predicted_anomaly``, optional ``ground_truth`` plus features).
        X_scaled: scaled feature matrix, row-aligned with ``features_df``.
        accuracy_metrics: dict returned by ``calculate_model_accuracy`` or a
            falsy value; the confusion-matrix and metric panels are left
            empty when it is falsy.

    Returns:
        pandas.DataFrame with columns ``feature``, ``normal_mean``,
        ``anomaly_mean`` and ``difference_ratio``, sorted by
        ``difference_ratio`` descending.
    """

    # 4.1 Statistical summary
    print("\n" + "-" * 40)
    print("MODEL PERFORMANCE SUMMARY")
    print("-" * 40)

    normal_users = features_df[features_df['predicted_anomaly'] == 0]
    anomaly_users = features_df[features_df['predicted_anomaly'] == 1]

    print(f"Total users analyzed: {len(features_df)}")
    print(f"Anomalous users detected: {len(anomaly_users)}")
    print(f"Normal users: {len(normal_users)}")
    print(f"Detection rate: {len(anomaly_users)/len(features_df):.2%}")

    print(f"\nTop anomalous users (highest anomaly scores):")
    # Ascending sort: the lowest decision_function values are listed first.
    top_anomalies = anomaly_users.sort_values('anomaly_score').head(10)
    for idx, row in top_anomalies.iterrows():
        gt_status = "‚úì MALICIOUS" if row.get('ground_truth', 0) == 1 else "normal"
        print(f"  User: {row['user']} | Score: {row['anomaly_score']:.4f} | Status: {gt_status}")

    # 4.2 Feature importance
    print("\n" + "-" * 40)
    print("FEATURE IMPORTANCE ANALYSIS")
    print("-" * 40)

    feature_cols = [col for col in features_df.columns if col not in ['user', 'anomaly_score', 'predicted_anomaly', 'ground_truth']]

    importance_data = []
    for col in feature_cols:
        normal_mean = normal_users[col].mean()
        anomaly_mean = anomaly_users[col].mean()
        if normal_mean != 0:
            # Divide by the magnitude so a negative baseline cannot produce a
            # negative "difference ratio" and sink to the bottom of the ranking.
            diff_ratio = abs(anomaly_mean - normal_mean) / abs(normal_mean)
            importance_data.append({
                'feature': col,
                'normal_mean': normal_mean,
                'anomaly_mean': anomaly_mean,
                'difference_ratio': diff_ratio
            })

    # Explicit columns keep sort_values() working when no feature qualified.
    importance_df = pd.DataFrame(
        importance_data,
        columns=['feature', 'normal_mean', 'anomaly_mean', 'difference_ratio']
    ).sort_values('difference_ratio', ascending=False)

    print("\nTop 10 features distinguishing anomalies:")
    for idx, row in importance_df.head(10).iterrows():
        print(f"  {row['feature']}: {row['difference_ratio']:.2f}x difference")

    # 4.3 Visualizations
    print("\n" + "-" * 40)
    print("CREATING VISUALIZATIONS")
    print("-" * 40)

    fig, axes = plt.subplots(2, 3, figsize=(18, 12))

    # Plot 1: Anomaly score distribution
    axes[0, 0].hist(features_df['anomaly_score'], bins=30, edgecolor='black', alpha=0.7)
    axes[0, 0].axvline(x=0, color='r', linestyle='--', label='Decision Boundary')
    axes[0, 0].set_xlabel('Anomaly Score')
    axes[0, 0].set_ylabel('Frequency')
    axes[0, 0].set_title('Distribution of Anomaly Scores')
    axes[0, 0].legend()
    axes[0, 0].grid(True, alpha=0.3)

    # Plot 2: PCA visualization
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)

    # Color by ground truth if available
    if 'ground_truth' in features_df.columns:
        colors = features_df['ground_truth'].map({0: 'blue', 1: 'red'})
        axes[0, 1].scatter(X_pca[:, 0], X_pca[:, 1],
                           c=colors,
                           alpha=0.6,
                           s=50)
        axes[0, 1].set_title('PCA: Blue=Normal, Red=Malicious (Ground Truth)')
    else:
        scatter = axes[0, 1].scatter(X_pca[:, 0], X_pca[:, 1],
                                     c=features_df['anomaly_score'],
                                     cmap='coolwarm',
                                     alpha=0.6,
                                     s=50)
        axes[0, 1].set_title('PCA Visualization of User Behavior')
        plt.colorbar(scatter, ax=axes[0, 1], label='Anomaly Score')

    axes[0, 1].set_xlabel('PCA Component 1')
    axes[0, 1].set_ylabel('PCA Component 2')
    axes[0, 1].grid(True, alpha=0.3)

    # Plot 3: Top features comparison
    top_features = importance_df.head(5)['feature'].tolist()
    x = np.arange(len(top_features))
    width = 0.35

    normal_vals = [normal_users[feat].mean() for feat in top_features]
    anomaly_vals = [anomaly_users[feat].mean() for feat in top_features]

    axes[0, 2].bar(x - width/2, normal_vals, width, label='Normal Users', alpha=0.8)
    axes[0, 2].bar(x + width/2, anomaly_vals, width, label='Anomalous Users', alpha=0.8)
    axes[0, 2].set_xlabel('Features')
    axes[0, 2].set_ylabel('Average Value')
    axes[0, 2].set_title('Feature Comparison: Normal vs Anomalous Users')
    axes[0, 2].set_xticks(x)
    axes[0, 2].set_xticklabels(top_features, rotation=45, ha='right')
    axes[0, 2].legend()
    axes[0, 2].grid(True, alpha=0.3)

    # Plot 4: Confusion matrix visualization (if accuracy metrics available)
    if accuracy_metrics:
        cm = accuracy_metrics['confusion_matrix']
        # Rows are the actual class, columns the predicted class, matching the
        # tick labels below: row "Actually Malicious" = [TP, FN],
        # row "Actually Normal" = [FP, TN].
        cm_matrix = np.array([[cm['TP'], cm['FN']], [cm['FP'], cm['TN']]])
        axes[1, 0].imshow(cm_matrix, cmap='Blues')
        axes[1, 0].set_title('Confusion Matrix')
        axes[1, 0].set_xticks([0, 1])
        axes[1, 0].set_yticks([0, 1])
        axes[1, 0].set_xticklabels(['Predicted Malicious', 'Predicted Normal'])
        axes[1, 0].set_yticklabels(['Actually Malicious', 'Actually Normal'])

        # Add text annotations (text(x, y) takes column first, then row)
        for i in range(2):
            for j in range(2):
                axes[1, 0].text(j, i, f'{cm_matrix[i, j]}',
                                ha='center', va='center', color='black', fontsize=12, fontweight='bold')

    # Plot 5: Accuracy metrics bar chart
    if accuracy_metrics:
        metrics_names = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
        metrics_values = [
            accuracy_metrics['accuracy'],
            accuracy_metrics['precision'],
            accuracy_metrics['recall'],
            accuracy_metrics['f1_score']
        ]

        colors = ['green', 'blue', 'orange', 'red']
        bars = axes[1, 1].bar(metrics_names, metrics_values, color=colors, alpha=0.7)
        axes[1, 1].set_title('Model Performance Metrics')
        axes[1, 1].set_ylabel('Score')
        axes[1, 1].set_ylim([0, 1.1])
        axes[1, 1].grid(True, alpha=0.3, axis='y')

        # Add value labels on bars
        for bar, value in zip(bars, metrics_values):
            height = bar.get_height()
            axes[1, 1].text(bar.get_x() + bar.get_width()/2., height + 0.02,
                            f'{value:.3f}', ha='center', va='bottom')

    # Plot 6: ROC curve approximation (using anomaly scores as probability)
    if 'ground_truth' in features_df.columns:
        from sklearn.metrics import roc_curve, auc

        # Negate the scores so that larger values mean "more likely malicious",
        # which is the orientation roc_curve expects for the positive class.
        y_scores = -features_df['anomaly_score']
        y_true = features_df['ground_truth']

        fpr, tpr, thresholds = roc_curve(y_true, y_scores)
        roc_auc = auc(fpr, tpr)

        axes[1, 2].plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
        axes[1, 2].plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')
        axes[1, 2].set_xlim([0.0, 1.0])
        axes[1, 2].set_ylim([0.0, 1.05])
        axes[1, 2].set_xlabel('False Positive Rate')
        axes[1, 2].set_ylabel('True Positive Rate')
        axes[1, 2].set_title('ROC Curve')
        axes[1, 2].legend(loc="lower right")
        axes[1, 2].grid(True, alpha=0.3)

    plt.tight_layout()

    # Save the visualization (before show(), which may clear the figure)
    plt.savefig('insider_threat_analysis_with_accuracy.png', dpi=150, bbox_inches='tight')
    print("‚úì Visualizations saved as 'insider_threat_analysis_with_accuracy.png'")

    plt.show()

    return importance_df

# Run evaluation with accuracy metrics
# importance_df (feature ranking) is reused by prioritize_threats and the report.
importance_df = evaluate_and_visualize(features_df, X_scaled, accuracy_metrics)



## 5. THREAT PRIORITIZATION SYSTEM (ENHANCED)

In [None]:

# Stage banner: blank line, rule, title, rule -- emitted with a single call.
print("\n" + "=" * 60, "STEP 5: ENHANCED THREAT PRIORITIZATION & ALERT SYSTEM", "=" * 60, sep="\n")

def prioritize_threats(features_df, importance_df, top_n=15):
    """
    Create a threat prioritization list with confidence scores.

    For every user flagged as an anomaly, measures how many standard
    deviations each of the top-5 ranked features lies from the mean of the
    non-flagged users. Deviations above 1.5 sigma add ``2 * z`` to the threat
    score and 20 points (capped at 100) to the confidence.

    Side effects: prints the ``top_n`` highest-scoring users and writes the
    full list to ``prioritized_threats.csv`` in the working directory.

    Args:
        features_df: scored per-user frame with ``predicted_anomaly`` and
            ``anomaly_score`` columns.
        importance_df: feature ranking with a ``feature`` column.
        top_n: number of users to print.

    Returns:
        pandas.DataFrame sorted by ``threat_score`` descending, or ``None``
        when no user is flagged.
    """
    # Get anomalous users
    anomalies = features_df[features_df['predicted_anomaly'] == 1].copy()

    if len(anomalies) == 0:
        print("No anomalies detected!")
        return None

    # Calculate threat score based on feature deviations
    if 'feature' in importance_df.columns:
        top_features = importance_df.head(5)['feature'].tolist()
    else:
        top_features = []

    # Baseline mean/std of the normal population, computed once per feature
    # rather than once per (user, feature) pair.
    normal_rows = features_df[features_df['predicted_anomaly'] == 0]
    baseline = {
        feat: (normal_rows[feat].mean(), normal_rows[feat].std())
        for feat in top_features
        if feat in features_df.columns
    }

    threat_scores = []
    for idx, row in anomalies.iterrows():
        score = 0
        deviations = []  # (z_score, human-readable reason)

        for feat, (normal_mean, std_dev) in baseline.items():
            if std_dev > 0:
                # Calculate how far this user is from normal
                z_score = abs(row[feat] - normal_mean) / std_dev
                if z_score > 1.5:  # More than 1.5 standard deviations
                    score += z_score * 2  # Weighted contribution
                    direction = "higher" if row[feat] > normal_mean else "lower"
                    deviations.append((z_score, f"{feat}: {z_score:.1f}œÉ {direction}"))

        # Calculate confidence based on number of deviating features
        confidence = min(100, len(deviations) * 20) if deviations else 0

        # Largest deviations first, so the slice below really is the top 3.
        deviations.sort(key=lambda item: item[0], reverse=True)
        reasons = [text for _, text in deviations]

        threat_scores.append({
            'user': row['user'],
            'threat_score': round(score, 2),
            'anomaly_score': row['anomaly_score'],
            'confidence': f"{confidence}%",
            'reasons': reasons[:3],  # Top 3 reasons
            'total_emails': int(row.get('total_emails', 0)),
            'after_hours_ratio': f"{row.get('after_hours_ratio', 0)*100:.1f}%",
            'max_email_size': int(row.get('max_email_size', 0)),
            'is_ground_truth_malicious': bool(row.get('ground_truth', 0) == 1)
        })

    threat_df = pd.DataFrame(threat_scores)
    threat_df = threat_df.sort_values('threat_score', ascending=False)

    print("\n" + "=" * 60)
    print("HIGH PRIORITY THREATS")
    print("=" * 60)

    for i, (idx, row) in enumerate(threat_df.head(top_n).iterrows()):
        ground_truth_marker = "[CONFIRMED]" if row['is_ground_truth_malicious'] else ""
        print(f"\n{i+1}. USER: {row['user']} {ground_truth_marker}")
        print(f"   Threat Score: {row['threat_score']:.2f} | Confidence: {row['confidence']}")
        print(f"   Anomaly Score: {row['anomaly_score']:.4f}")
        print(f"   Emails Sent: {row['total_emails']}")
        print(f"   After Hours Ratio: {row['after_hours_ratio']}")
        print(f"   Max Email Size: {row['max_email_size']:,} bytes")
        print(f"   Key Indicators:")
        for reason in row['reasons']:
            print(f"     * {reason}")

    # Save threats to CSV for further investigation
    threat_df.to_csv('prioritized_threats.csv', index=False)
    print(f"\n[OK] Threat list saved to 'prioritized_threats.csv'")

    return threat_df

# Generate enhanced threat prioritization
# threat_df is None when no user was flagged; later cells check for that.
threat_df = prioritize_threats(features_df, importance_df, top_n=15)



## 6. COMPREHENSIVE PERFORMANCE REPORT

In [None]:

# Section banner: blank line, rule, title, rule -- emitted with a single call.
print("\n" + "=" * 60, "FINAL COMPREHENSIVE PERFORMANCE REPORT", "=" * 60, sep="\n")

# Create a detailed performance report
# Builds the module-level string `report` from the globals produced by the
# earlier cells (features_df, importance_df, threat_df, accuracy_metrics).
# Two templates exist: the full one when accuracy metrics are available, and
# a shorter one otherwise. `report` is also read by save_model_artifacts().
if accuracy_metrics:
    # Metrics are stored as fractions; the report shows percentages.
    accuracy = accuracy_metrics['accuracy'] * 100
    precision = accuracy_metrics['precision'] * 100
    recall = accuracy_metrics['recall'] * 100
    f1 = accuracy_metrics['f1_score'] * 100
    
    # The 'N/A' fallbacks cover rankings with fewer than five features, and
    # the `threat_df is not None` checks cover the no-anomaly case.
    report = f"""
COMPREHENSIVE MODEL PERFORMANCE REPORT:
{'=' * 60}
* Total Users Analyzed: {len(features_df):,}
* Anomalies Detected: {features_df['predicted_anomaly'].sum():,}
* Detection Rate: {features_df['predicted_anomaly'].sum()/len(features_df):.2%}

ACCURACY METRICS (Based on Simulated Ground Truth):
{'=' * 60}
[OK] Overall Accuracy:    {accuracy:6.2f}%
[OK] Precision:           {precision:6.2f}%  (Correctly flagged anomalies)
[OK] Recall:              {recall:6.2f}%  (Malicious users detected)
[OK] F1-Score:            {f1:6.2f}%  (Balance of precision and recall)

MODEL STATISTICS:
{'=' * 60}
* Average Anomaly Score:      {features_df['anomaly_score'].mean():.4f}
* Score Standard Deviation:   {features_df['anomaly_score'].std():.4f}
* Minimum Anomaly Score:      {features_df['anomaly_score'].min():.4f}
* Maximum Anomaly Score:      {features_df['anomaly_score'].max():.4f}

TOP ANOMALY INDICATORS:
{'=' * 60}
1. {importance_df.iloc[0]['feature'] if len(importance_df) > 0 else 'N/A'}
2. {importance_df.iloc[1]['feature'] if len(importance_df) > 1 else 'N/A'}
3. {importance_df.iloc[2]['feature'] if len(importance_df) > 2 else 'N/A'}
4. {importance_df.iloc[3]['feature'] if len(importance_df) > 3 else 'N/A'}
5. {importance_df.iloc[4]['feature'] if len(importance_df) > 4 else 'N/A'}

RISK ASSESSMENT:
{'=' * 60}
* High-Risk Users Identified: {len(threat_df) if threat_df is not None else 0}
* Average Threat Score:       {threat_df['threat_score'].mean() if threat_df is not None else 0:.2f}
* Max Threat Score:           {threat_df['threat_score'].max() if threat_df is not None else 0:.2f}

SYSTEM PERFORMANCE SUMMARY:
{'=' * 60}
* Model trained on COMPLETE dataset ({len(features_df)} users)
* Accuracy metrics successfully calculated
* Threats prioritized by risk level
* Visualizations generated for analysis
* All artifacts saved for deployment

RECOMMENDED ACTIONS:
{'=' * 60}
1. Investigate top {min(10, len(threat_df) if threat_df is not None else 0)} high-priority threats
2. Review false positives for model refinement
3. Implement continuous monitoring
4. Set alert thresholds based on threat scores
5. Regular model retraining with new data

SYSTEM READY FOR PRODUCTION DEPLOYMENT
"""
else:
    # No ground truth: report only the counts and score statistics.
    report = f"""
MODEL PERFORMANCE SUMMARY (Without Ground Truth):
{'=' * 60}
* Total Users Analyzed: {len(features_df):,}
* Anomalies Detected: {features_df['predicted_anomaly'].sum():,}
* Detection Rate: {features_df['predicted_anomaly'].sum()/len(features_df):.2%}
* Average Anomaly Score: {features_df['anomaly_score'].mean():.4f}
* Score Standard Deviation: {features_df['anomaly_score'].std():.4f}

NOTE: Accuracy calculation requires labeled data (ground truth)
     For real deployment, collect incident reports to validate model

SYSTEM READY FOR PRODUCTION DEPLOYMENT
"""

print(report)



## 7. MODEL PERSISTENCE & DEPLOYMENT (FIXED)

In [None]:

# Stage banner: blank line, rule, title, rule -- emitted with a single call.
print("\n" + "=" * 60, "STEP 7: SAVING MODEL ARTIFACTS FOR DEPLOYMENT", "=" * 60, sep="\n")

import joblib
import json

def save_model_artifacts(model, scaler, features_df, importance_df, accuracy_metrics, report_text=None):
    """
    Save all model artifacts for deployment.

    Writes, into the current working directory:
      - ``insider_threat_model.pkl``  (the model, via joblib)
      - ``feature_scaler.pkl``        (the scaler, via joblib)
      - ``model_metadata.json``       (feature names, ranking, metadata)
      - ``performance_report.txt``    (the text report)

    Args:
        model: fitted estimator to persist.
        scaler: fitted scaler to persist.
        features_df: scored per-user frame (needs ``predicted_anomaly``).
        importance_df: feature ranking; its first 10 rows are stored.
        accuracy_metrics: metrics dict, or a falsy value when unavailable.
        report_text: report to write. When omitted, the module-level
            ``report`` string is used, as before.
    """
    if report_text is None:
        # Backward-compatible default: fall back to the module-level report.
        report_text = report

    # Save the trained model
    joblib.dump(model, 'insider_threat_model.pkl')

    # Save the scaler
    joblib.dump(scaler, 'feature_scaler.pkl')

    # Save feature statistics. Hyper-parameters are read from the model so
    # the metadata cannot drift from what was actually trained; the previous
    # literal values remain as fallbacks for objects lacking the attributes.
    feature_stats = {
        'feature_columns': [col for col in features_df.columns if col not in ['user', 'anomaly_score', 'predicted_anomaly', 'ground_truth']],
        'importance_ranking': convert_numpy_types(
            importance_df.head(10).to_dict('records')
        ),
        'model_metadata': convert_numpy_types({
            'model_type': type(model).__name__,
            'contamination': getattr(model, 'contamination', 0.08),
            'n_estimators': getattr(model, 'n_estimators', 200),
            'training_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'total_users': len(features_df),
            'anomaly_count': features_df['predicted_anomaly'].sum(),
            'accuracy_metrics': accuracy_metrics if accuracy_metrics else "Not available"
        })
    }

    with open('model_metadata.json', 'w', encoding='utf-8') as f:
        json.dump(feature_stats, f, indent=2)

    # Save performance report with decorative symbols replaced by plain text.
    # Each symbol is listed once, in the original replacement order.
    symbol_replacements = (
        ('üìä', '='),
        ('üî¨', '='),
        ('üìà', '='),
        ('üîç', '='),
        ('‚ö†Ô∏è', '='),
        ('‚úÖ', '='),
        ('üöÄ', '='),
        ('üéØ', '='),
        ('üìß', '='),
        ('üåô', '='),
        ('üìé', '='),
        ('üî¥', '='),
        ('‚úì', '[OK]'),
    )
    text_report = report_text
    for symbol, plain in symbol_replacements:
        text_report = text_report.replace(symbol, plain)

    with open('performance_report.txt', 'w', encoding='utf-8') as f:
        f.write(text_report)

    print("‚úì Model artifacts saved:")
    print("  - insider_threat_model.pkl (trained model)")
    print("  - feature_scaler.pkl (feature scaler)")
    print("  - model_metadata.json (model metadata)")
    print("  - insider_threat_analysis_with_accuracy.png (visualizations)")
    print("  - performance_report.txt (detailed performance report)")
    print("  - prioritized_threats.csv (list of high-risk users)")
    

# Persist the model, scaler, metadata and report to disk.
save_model_artifacts(model, scaler, features_df, importance_df, accuracy_metrics)

# Completion banner followed by a short status checklist.
print("\n" + "=" * 60, "INSIDER THREAT DETECTION SYSTEM - COMPLETED SUCCESSFULLY", "=" * 60, sep="\n")
print("[OK] Model trained on COMPLETE dataset")
print(
    f"[OK] Accuracy calculated: {accuracy_metrics['accuracy']*100:.2f}%"
    if accuracy_metrics
    else "[OK] Model trained successfully"
)
print(f"[OK] {len(threat_df) if threat_df is not None else 0} threats prioritized")
print("[OK] All artifacts saved for production deployment")
print("=" * 60)

# One-line closing summary, plus the accuracy figure when it is available.
print(f"\nSYSTEM READY: {len(features_df)} users analyzed | {features_df['predicted_anomaly'].sum()} threats detected")
print(
    f"MODEL ACCURACY: {accuracy_metrics['accuracy']*100:.2f}%"
    if accuracy_metrics
    else "MODEL: Trained and ready for deployment"
)