# 🔍 ANOMALY DETECTION IN MAMBA SEEDLING STUDENTS
## Phase 3: MODELING (Implement Anomaly Detection Algorithms)

---

### OBJECTIVES OF THIS PHASE:
1. Load prepared and normalized data
2. Implement 4 anomaly detection algorithms
3. Test with 3 different train/test splits (70/30, 60/40, 80/20)
4. Compare performance across different splits
5. Identify best configuration for each model

### EXPECTED OUTPUT:
- 4 trained anomaly detection models
- Performance metrics for each train/test split
- Detected anomalies flagged and scored
- Comparison analysis

## INITIAL SETUP: Load Data and Libraries

In [None]:
# SETUP: Import libraries and load prepared data from Phase 2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pickle
import warnings
warnings.filterwarnings('ignore')

# Import anomaly detection models
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, silhouette_score
from sklearn.impute import SimpleImputer
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Plot defaults applied to every figure created after this point
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (14, 6),
    'font.size': 10,
})

print("‚úÖ All libraries imported successfully")

# ----------------------------------------------------------------------------
# Read the prepared (Min-Max normalized) dataset written by Phase 2
# ----------------------------------------------------------------------------
data_path = r'c:\Users\DELL\Documents\GitHub\material-DT-1\An√°lisis Nuevo\data\dataset_prepared_minmax.csv'
df_data = pd.read_csv(data_path)

# Rows are students, columns are features.
n_samples, n_features = df_data.shape

print("\nüìä PREPARED DATA LOADED FROM PHASE 2")
print("   File: dataset_prepared_minmax.csv")
print(f"   Samples (students): {n_samples}")
print(f"   Variables (features): {n_features}")
print("   Data normalization: Min-Max [0, 1]")

# ============================================================================
# DATA VALIDATION AND INTELLIGENT CLEANING
# ============================================================================
# Cleans df_data without dropping rows:
#   1. NaN in numeric columns is replaced by the column median.
#   2. +/-inf is converted to NaN and then median-imputed the same way.
# Numeric columns that contain no observed value at all cannot be imputed and
# are removed (and reported) instead.


def _impute_numeric_median(frame, imputer):
    """Median-impute NaN in the numeric columns of *frame* and return the result.

    Columns without a single observed value have no median. SimpleImputer
    discards such columns from its output, which would make the column-wise
    assignment below fail on a shape mismatch, so they are dropped explicitly
    and reported first.
    """
    if len(frame) == 0:
        # Nothing to impute; also avoids treating every column as "all NaN".
        return frame

    numeric = frame.select_dtypes(include=[np.number]).columns
    empty_cols = [col for col in numeric if frame[col].isna().all()]
    if empty_cols:
        print(f"      WARNING: Dropping {len(empty_cols)} column(s) with no observed values: {empty_cols}")
        frame = frame.drop(columns=empty_cols)
        numeric = numeric.drop(empty_cols)

    if len(numeric) > 0:
        frame[numeric] = imputer.fit_transform(frame[numeric])
    return frame


print(f"\nüîç DATA VALIDATION AND CLEANING:")

initial_rows = len(df_data)

# Created unconditionally: both the NaN branch and the inf branch need it.
imputer = SimpleImputer(strategy='median')

# Check for missing values
missing_count = df_data.isnull().sum().sum()
print(f"   1. Missing values (NaN): {missing_count}")

if missing_count > 0:
    print(f"      ‚ö†Ô∏è  Found {missing_count} NaN values in data")
    print(f"      Strategy: Using SimpleImputer with median strategy")

    # Use median imputation instead of dropping rows
    df_data = _impute_numeric_median(df_data, imputer)

    print(f"      ‚úÖ NaN values imputed with median")

# Check for infinite values
has_inf = np.isinf(df_data.select_dtypes(include=[np.number]).values).any()
if has_inf:
    print(f"   2. Infinite values detected")
    print(f"      ‚ö†Ô∏è  Replacing infinite values with the column median...")

    # Turn +/-inf into NaN so the same median imputation can fill them.
    numeric_cols = df_data.select_dtypes(include=[np.number]).columns
    df_data[numeric_cols] = df_data[numeric_cols].replace([np.inf, -np.inf], np.nan)
    df_data = _impute_numeric_median(df_data, imputer)
    print(f"      ‚úÖ Infinite values fixed")
else:
    print(f"   2. Infinite values: None detected")

# Check data types are numeric
print(f"   3. Data type validation:")
print(f"      - Numeric columns: {len(df_data.select_dtypes(include=[np.number]).columns)}")

# Verify data is in expected range
numeric_df = df_data.select_dtypes(include=[np.number])
min_val = numeric_df.min().min()
max_val = numeric_df.max().max()
print(f"      - Value range: [{min_val:.4f}, {max_val:.4f}]")

# Final validation
final_missing = df_data.isnull().sum().sum()
final_inf = np.isinf(df_data.select_dtypes(include=[np.number]).values).any()

print(f"\n   ‚úÖ DATA VALIDATION COMPLETE:")
print(f"      Initial rows: {initial_rows}")
# Report the real difference instead of a hard-coded "0 dropped".
print(f"      Final rows: {len(df_data)} ({initial_rows - len(df_data)} dropped)")
print(f"      Missing values after cleaning: {final_missing}")
print(f"      Infinite values after cleaning: {final_inf}")

if len(df_data) == 0:
    print(f"\n   ‚ùå ERROR: Dataset is empty! Check source data.")
else:
    print(f"   ‚úÖ Dataset is ready for modeling")

# Restore the Min-Max scaler pickled alongside the dataset (kept for reference).
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# a scaler file that you produced yourself.
scaler_path = r'c:\Users\DELL\Documents\GitHub\material-DT-1\An√°lisis Nuevo\data\scaler_minmax.pkl'
with open(scaler_path, 'rb') as scaler_file:
    scaler_minmax = pickle.load(scaler_file)

print(f"\n‚úÖ Min-Max Scaler loaded (feature_range={scaler_minmax.feature_range})")

print("\nüìä Dataset Preview (First 5 rows):")
print(df_data.head())

# Summary statistics aggregated over every numeric column.
print("\nüìà Data Statistics:")
numeric_cols = df_data.select_dtypes(include=[np.number])
overall_min = numeric_cols.min().min()
overall_max = numeric_cols.max().max()
mean_of_means = numeric_cols.mean().mean()
mean_of_stds = numeric_cols.std().mean()
print(f"   Min value (all variables): {overall_min:.6f}")
print(f"   Max value (all variables): {overall_max:.6f}")
print(f"   Mean value (all variables): {mean_of_means:.6f}")
print(f"   Std value (all variables): {mean_of_stds:.6f}")

print("\n‚ú® Data ready for model training!")

‚úÖ All libraries imported successfully

üìä PREPARED DATA LOADED FROM PHASE 2
   File: dataset_prepared_minmax.csv
   Samples (students): 81
   Variables (features): 45
   Data normalization: Min-Max [0, 1]

üîç DATA VALIDATION:
   Missing values (NaN): 243
   ‚ö†Ô∏è  Found NaN values! Removing rows with missing data...
   Removed 81 rows with NaN
   Remaining samples: 0
   ‚úÖ Data is CLEAN - No NaN or infinite values
   Data type validation:
   - Numeric columns: 45
   - All values in [0, 1]? True

‚úÖ Min-Max Scaler loaded (feature_range=(0, 1))

üìä Dataset Preview (First 5 rows):
Empty DataFrame
Columns: [Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15, Q16, Q17, Q18, Q19, Q20, Q21, Q22, Q23, Q24, Q25, Q26, Q27, Q28, Q29, Q30, Q31, Q32, Q33, Q34, Q35, F_Average_Performance, F_Academic_Load, F_Life_Balance, F_Psychological_Stress, F_Family_Support, F_Grade_Consistency, F_Responsibility_Result_Index, F_Parental_Education, F_Socioeconomic_Risk, F_Interest_Perform

## CELL 8: ISOLATION FOREST MODEL

Isolation Forest is an ensemble method that isolates anomalies by randomly selecting features and split values. It's efficient and doesn't require distance metrics.

In [None]:
# CELL 8: ISOLATION FOREST MODEL
# Fits one IsolationForest per train/test split of df_data and stores the model,
# the cleaned splits, the predictions and the scores in
# iso_forest_results[split_name]. splits_config is reused by the later cells.


def _impute_split(train_df, test_df):
    """Return NaN/inf-free copies of a train/test pair without dropping rows.

    +/-inf is treated as missing. Missing numeric values in BOTH sets are
    replaced by the column median of the training set, so no statistic is
    computed from the test set. A column with no observed training value has
    no median and falls back to 0.0.
    """
    # replace() returns new frames, so the assignments below never write into
    # a slice of df_data.
    train_df = train_df.replace([np.inf, -np.inf], np.nan)
    test_df = test_df.replace([np.inf, -np.inf], np.nan)

    numeric_cols = train_df.select_dtypes(include=[np.number]).columns
    if train_df[numeric_cols].isnull().values.any():
        print(f"   ‚ö†Ô∏è  Imputing NaN in training set with median...")
    if test_df[numeric_cols].isnull().values.any():
        print(f"   ‚ö†Ô∏è  Imputing NaN in test set with median...")

    train_medians = train_df[numeric_cols].median().fillna(0.0)
    train_df[numeric_cols] = train_df[numeric_cols].fillna(train_medians)
    test_df[numeric_cols] = test_df[numeric_cols].fillna(train_medians)
    return train_df, test_df


print("="*80)
print("ISOLATION FOREST - ANOMALY DETECTION")
print("="*80)
print("\nAlgorithm: Ensemble method for anomaly detection")
print("Principle: Isolates anomalies by randomly selecting features")
print("Features used: All prepared and normalized variables from Phase 2")

# Define train/test split configurations
splits_config = [
    {'train_size': 0.70, 'test_size': 0.30, 'name': '70/30'},
    {'train_size': 0.60, 'test_size': 0.40, 'name': '60/40'},
    {'train_size': 0.80, 'test_size': 0.20, 'name': '80/20'}
]

# Fail early with an actionable message; train_test_split would otherwise
# raise a less helpful ValueError about an empty train set.
if df_data.empty:
    raise ValueError(
        "df_data is empty - the data loading/cleaning cell left no samples to "
        "split. Re-run the setup cell and check the source data."
    )

iso_forest_results = {}

for config in splits_config:
    train_size = config['train_size']
    split_name = config['name']

    print(f"\n{'‚îÄ'*80}")
    print(f"Training Isolation Forest with {split_name} split:")
    print(f"{'‚îÄ'*80}")

    # Split data using the prepared normalized dataset
    raw_train, raw_test = train_test_split(
        df_data,
        train_size=train_size,
        random_state=42
    )

    # Light validation after the split: impute, never drop rows.
    X_train, X_test = _impute_split(raw_train, raw_test)

    print(f"\nüìä Data Split (from prepared dataset):")
    print(f"   Total students: {len(df_data)}")
    print(f"   Training samples: {len(X_train)} ({100*len(X_train)/len(df_data):.1f}%)")
    print(f"   Test samples: {len(X_test)} ({100*len(X_test)/len(df_data):.1f}%)")
    print(f"   Features used: {X_train.shape[1]} variables")
    print(f"   Training set - Missing values: {X_train.isnull().sum().sum()} (after handling)")
    print(f"   Test set - Missing values: {X_test.isnull().sum().sum()} (after handling)")

    # Train Isolation Forest with optimized hyperparameters
    iso_forest = IsolationForest(
        contamination=0.10,  # Assume 10% anomalies
        random_state=42,
        n_estimators=100,
        max_samples='auto',
        max_features=1.0
    )

    print(f"\nü§ñ Model Configuration:")
    print(f"   Contamination rate: 10% (expected anomaly percentage)")
    print(f"   Number of trees: 100")
    print(f"   Max features: All")

    # Train on normalized data
    iso_forest.fit(X_train)

    # Predict anomalies (-1 = anomaly, 1 = normal)
    train_predictions = iso_forest.predict(X_train)
    test_predictions = iso_forest.predict(X_test)

    # Get anomaly scores (lower scores = more anomalous)
    train_scores = iso_forest.score_samples(X_train)
    test_scores = iso_forest.score_samples(X_test)

    # Count anomalies
    train_anomalies = (train_predictions == -1).sum()
    test_anomalies = (test_predictions == -1).sum()

    print(f"\nüîç Anomalies Detected:")
    print(f"   Training set: {train_anomalies} anomalies ({100*train_anomalies/len(X_train):.2f}%)")
    print(f"   Test set: {test_anomalies} anomalies ({100*test_anomalies/len(X_test):.2f}%)")

    print(f"\nüìà Anomaly Scores (Lower = More Anomalous):")
    print(f"   Training - Min: {train_scores.min():.4f}, Max: {train_scores.max():.4f}, Mean: {train_scores.mean():.4f}")
    print(f"   Test - Min: {test_scores.min():.4f}, Max: {test_scores.max():.4f}, Mean: {test_scores.mean():.4f}")

    # Store results with additional metadata
    iso_forest_results[split_name] = {
        'model': iso_forest,
        'X_train': X_train,
        'X_test': X_test,
        'train_predictions': train_predictions,
        'test_predictions': test_predictions,
        'train_scores': train_scores,
        'test_scores': test_scores,
        'train_anomalies': train_anomalies,
        'test_anomalies': test_anomalies,
        # offset_ is the cut-off applied to score_samples() values
        'decision_threshold': iso_forest.offset_
    }

print(f"\n‚úÖ Isolation Forest training completed for all splits (70/30, 60/40, 80/20)")

ISOLATION FOREST - ANOMALY DETECTION

Algorithm: Ensemble method for anomaly detection
Principle: Isolates anomalies by randomly selecting features
Features used: All prepared and normalized variables from Phase 2

‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
Training Isolation Forest with 70/30 split:
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ


ValueError: With n_samples=0, test_size=None and train_size=0.7, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [None]:
# CELL 8 CONTINUED: Isolation Forest - Visualization
# One panel per split: histogram of training scores for points predicted normal
# vs anomalous, plus the model's decision threshold (offset_).

print("\n" + "="*80)
print("ISOLATION FOREST - PERFORMANCE COMPARISON")
print("="*80)

fig, axes = plt.subplots(1, 3, figsize=(16, 5))
fig.suptitle('Isolation Forest - Anomaly Scores Distribution (3 Train/Test Splits)',
             fontsize=13, fontweight='bold')

for idx, split_name in enumerate(['70/30', '60/40', '80/20']):
    results = iso_forest_results[split_name]
    ax = axes[idx]

    scores = results['train_scores']
    labels = results['train_predictions']
    n_train = len(results["X_train"])
    n_test = len(results["X_test"])

    # Score distributions of the two predicted classes
    ax.hist(scores[labels == 1], bins=20, alpha=0.6,
            label='Normal (Train)', color='blue')
    ax.hist(scores[labels == -1], bins=20, alpha=0.6,
            label='Anomaly (Train)', color='red')
    ax.axvline(results['model'].offset_, color='black', linestyle='--',
               linewidth=2, label='Decision Boundary')

    ax.set_xlabel('Anomaly Score')
    ax.set_ylabel('Frequency')
    ax.set_title(f'Split {split_name}\n(Train: {n_train}, Test: {n_test})')
    ax.legend()
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

print("\n‚úÖ Visualization completed")

## CELL 9: LOCAL OUTLIER FACTOR (LOF) MODEL

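LOF detects local, density-based anomalies by comparing the density around each point with the density around its nearest neighbors. It is effective when the data contains regions of differing density, where a single global distance threshold would fail.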

In [None]:
# CELL 9: LOCAL OUTLIER FACTOR (LOF) MODEL
# Fits one LocalOutlierFactor (novelty mode) per split, reusing the splits stored
# by the Isolation Forest cell, and stores the outcome in lof_results[split_name].

print("\n" + "="*80)
print("LOCAL OUTLIER FACTOR (LOF) - ANOMALY DETECTION")
print("="*80)
print("\nAlgorithm: Density-based anomaly detection")
print("Principle: Compares local density with neighbours")
print("Features used: All prepared and normalized variables from Phase 2")

lof_results = {}

for config in splits_config:
    train_size = config['train_size']
    split_name = config['name']

    print(f"\n{'‚îÄ'*80}")
    print(f"Training LOF with {split_name} split:")
    print(f"{'‚îÄ'*80}")

    # Use same splits as Isolation Forest for consistency
    X_train = iso_forest_results[split_name]['X_train'].copy()
    X_test = iso_forest_results[split_name]['X_test'].copy()

    # ADDITIONAL DATA VALIDATION (defensive: LOF rejects NaN/inf input)
    if X_train.isnull().sum().sum() > 0:
        print(f"   ‚ö†Ô∏è  Cleaning NaN from training set...")
        X_train = X_train.dropna()

    if X_test.isnull().sum().sum() > 0:
        print(f"   ‚ö†Ô∏è  Cleaning NaN from test set...")
        X_test = X_test.dropna()

    # Remove infinite values
    X_train = X_train[~np.isinf(X_train.select_dtypes(include=[np.number])).any(axis=1)]
    X_test = X_test[~np.isinf(X_test.select_dtypes(include=[np.number])).any(axis=1)]

    print(f"\nüìä Data Split (from prepared dataset):")
    print(f"   Training samples: {len(X_train)} ({100*len(X_train)/len(df_data):.1f}%)")
    print(f"   Test samples: {len(X_test)} ({100*len(X_test)/len(df_data):.1f}%)")
    print(f"   Features used: {X_train.shape[1]} variables")
    print(f"   Data quality: {X_train.isnull().sum().sum()} NaN values")

    # Train LOF with optimized hyperparameters
    lof = LocalOutlierFactor(
        n_neighbors=20,
        contamination=0.10,
        novelty=True  # Allow prediction on new data
    )

    print(f"\nü§ñ Model Configuration:")
    print(f"   Number of neighbours: 20")
    print(f"   Contamination rate: 10%")
    print(f"   Novelty mode: Enabled")

    # Train on normalized data
    lof.fit(X_train)

    # Get anomaly scores (lower values = more anomalous).
    # Training scores come from negative_outlier_factor_, computed during fit.
    train_scores = lof.negative_outlier_factor_
    test_scores = lof.score_samples(X_test)

    # Predict anomalies (-1 = anomaly, 1 = normal).
    # With novelty=True, predict() is only valid for unseen data; scikit-learn
    # documents that calling it on the training set gives wrong results.
    # Training labels are therefore derived from the fitted scores and the
    # model's own threshold, so they agree with train_scores.
    train_predictions = np.where(train_scores < lof.offset_, -1, 1)
    test_predictions = lof.predict(X_test)

    # Count anomalies
    train_anomalies = (train_predictions == -1).sum()
    test_anomalies = (test_predictions == -1).sum()

    print(f"\nüîç Anomalies Detected:")
    print(f"   Training set: {train_anomalies} anomalies ({100*train_anomalies/len(X_train):.2f}%)")
    print(f"   Test set: {test_anomalies} anomalies ({100*test_anomalies/len(X_test):.2f}%)")

    print(f"\nüìà LOF Scores (Lower = More Anomalous):")
    print(f"   Training - Min: {train_scores.min():.4f}, Max: {train_scores.max():.4f}, Mean: {train_scores.mean():.4f}")
    print(f"   Test - Min: {test_scores.min():.4f}, Max: {test_scores.max():.4f}, Mean: {test_scores.mean():.4f}")

    # Store results with additional metadata
    lof_results[split_name] = {
        'model': lof,
        'X_train': X_train,
        'X_test': X_test,
        'train_predictions': train_predictions,
        'test_predictions': test_predictions,
        'train_scores': train_scores,
        'test_scores': test_scores,
        'train_anomalies': train_anomalies,
        'test_anomalies': test_anomalies
    }

print(f"\n‚úÖ Local Outlier Factor training completed for all splits (70/30, 60/40, 80/20)")

In [None]:
# CELL 9 CONTINUED: LOF - Visualization
# One panel per split: histogram of training LOF scores for points labelled
# normal vs anomalous, plus the model's score threshold.

print("\n" + "="*80)
print("LOF - PERFORMANCE COMPARISON")
print("="*80)

fig, axes = plt.subplots(1, 3, figsize=(16, 5))
fig.suptitle('Local Outlier Factor (LOF) - Anomaly Scores Distribution (3 Train/Test Splits)', 
             fontsize=13, fontweight='bold')

for idx, split_name in enumerate(['70/30', '60/40', '80/20']):
    results = lof_results[split_name]
    
    # Plot distributions
    axes[idx].hist(results['train_scores'][results['train_predictions'] == 1], 
                   bins=20, alpha=0.6, label='Normal (Train)', color='green')
    axes[idx].hist(results['train_scores'][results['train_predictions'] == -1], 
                   bins=20, alpha=0.6, label='Anomaly (Train)', color='red')
    # Same boundary marker as the other model plots: scores below the fitted
    # model's offset_ are the ones LOF labels as anomalies.
    axes[idx].axvline(results['model'].offset_, 
                      color='black', linestyle='--', linewidth=2, label='Decision Boundary')
    
    axes[idx].set_xlabel('LOF Score')
    axes[idx].set_ylabel('Frequency')
    axes[idx].set_title(f'Split {split_name}\n(Train: {len(results["X_train"])}, Test: {len(results["X_test"])})')
    axes[idx].legend()
    axes[idx].grid(alpha=0.3)

plt.tight_layout()
plt.show()

print("\n‚úÖ Visualization completed")

## CELL 10: ONE-CLASS SVM MODEL

One-Class SVM learns a boundary around normal behavior and flags points that fall outside it as anomalies. It is powerful with high-dimensional data but requires careful hyperparameter tuning.

In [None]:
# CELL 10: ONE-CLASS SVM MODEL
# Fits one OneClassSVM per split, reusing the splits stored by the Isolation
# Forest cell, and records the outcome in ocsvm_results[split_name].

print("\n" + "="*80)
print("ONE-CLASS SVM - ANOMALY DETECTION")
print("="*80)
print("\nAlgorithm: Support Vector Machine for single-class classification")
print("Principle: Learns boundary of normal behaviour")
print("Features used: All prepared and normalized variables from Phase 2")

ocsvm_results = {}

for config in splits_config:
    split_name = config['name']
    train_size = config['train_size']

    separator = '‚îÄ'*80
    print(f"\n{separator}")
    print(f"Training One-Class SVM with {split_name} split:")
    print(f"{separator}")

    # Work on copies of the exact splits used for Isolation Forest
    shared_split = iso_forest_results[split_name]
    X_train = shared_split['X_train'].copy()
    X_test = shared_split['X_test'].copy()

    # Defensive cleaning: drop any row that still carries NaN ...
    if X_train.isnull().values.any():
        print("   ‚ö†Ô∏è  Cleaning NaN from training set...")
        X_train = X_train.dropna()

    if X_test.isnull().values.any():
        print("   ‚ö†Ô∏è  Cleaning NaN from test set...")
        X_test = X_test.dropna()

    # ... or an infinite value in a numeric column
    train_has_inf = np.isinf(X_train.select_dtypes(include=[np.number])).any(axis=1)
    X_train = X_train[~train_has_inf]
    test_has_inf = np.isinf(X_test.select_dtypes(include=[np.number])).any(axis=1)
    X_test = X_test[~test_has_inf]

    total_rows = len(df_data)
    print("\nüìä Data Split (from prepared dataset):")
    print(f"   Training samples: {len(X_train)} ({100*len(X_train)/total_rows:.1f}%)")
    print(f"   Test samples: {len(X_test)} ({100*len(X_test)/total_rows:.1f}%)")
    print(f"   Features used: {X_train.shape[1]} variables")
    print(f"   Data quality: {X_train.isnull().sum().sum()} NaN values")

    # RBF kernel; nu is the expected fraction of anomalies (10%)
    ocsvm = OneClassSVM(kernel='rbf', gamma='auto', nu=0.10)

    print("\nü§ñ Model Configuration:")
    print("   Kernel: RBF (Radial Basis Function)")
    print("   Gamma: Auto (1/n_features)")
    print("   Nu (anomaly fraction): 0.10 (10%)")

    # Fit on the normalized training data
    ocsvm.fit(X_train)

    # Labels: -1 = anomaly, 1 = normal
    train_predictions = ocsvm.predict(X_train)
    test_predictions = ocsvm.predict(X_test)

    # Signed distance to the separating hyperplane (negative = anomaly)
    train_scores = ocsvm.decision_function(X_train)
    test_scores = ocsvm.decision_function(X_test)

    # Number of flagged samples per set
    train_anomalies = (train_predictions == -1).sum()
    test_anomalies = (test_predictions == -1).sum()

    train_rate = 100*train_anomalies/len(X_train)
    test_rate = 100*test_anomalies/len(X_test)
    print("\nüîç Anomalies Detected:")
    print(f"   Training set: {train_anomalies} anomalies ({train_rate:.2f}%)")
    print(f"   Test set: {test_anomalies} anomalies ({test_rate:.2f}%)")

    print("\nüìà One-Class SVM Decision Function Scores (Negative = Anomaly):")
    print(f"   Training - Min: {train_scores.min():.4f}, Max: {train_scores.max():.4f}, Mean: {train_scores.mean():.4f}")
    print(f"   Test - Min: {test_scores.min():.4f}, Max: {test_scores.max():.4f}, Mean: {test_scores.mean():.4f}")

    # Keep everything later cells need for plotting and comparison
    ocsvm_results[split_name] = {
        'model': ocsvm,
        'X_train': X_train,
        'X_test': X_test,
        'train_predictions': train_predictions,
        'test_predictions': test_predictions,
        'train_scores': train_scores,
        'test_scores': test_scores,
        'train_anomalies': train_anomalies,
        'test_anomalies': test_anomalies
    }

print("\n‚úÖ One-Class SVM training completed for all splits (70/30, 60/40, 80/20)")

In [None]:
# CELL 10 CONTINUED: One-Class SVM - Visualization
# One panel per split: histogram of training decision-function values for
# points predicted normal vs anomalous; the boundary sits at 0.

print("\n" + "="*80)
print("ONE-CLASS SVM - PERFORMANCE COMPARISON")
print("="*80)

fig, axes = plt.subplots(1, 3, figsize=(16, 5))
fig.suptitle('One-Class SVM - Decision Function Scores (3 Train/Test Splits)',
             fontsize=13, fontweight='bold')

for idx, split_name in enumerate(['70/30', '60/40', '80/20']):
    results = ocsvm_results[split_name]
    ax = axes[idx]

    scores = results['train_scores']
    labels = results['train_predictions']
    n_train = len(results["X_train"])
    n_test = len(results["X_test"])

    # Score distributions of the two predicted classes
    ax.hist(scores[labels == 1], bins=20, alpha=0.6,
            label='Normal (Train)', color='purple')
    ax.hist(scores[labels == -1], bins=20, alpha=0.6,
            label='Anomaly (Train)', color='red')
    ax.axvline(0, color='black', linestyle='--', linewidth=2,
               label='Decision Boundary')

    ax.set_xlabel('Decision Function Score')
    ax.set_ylabel('Frequency')
    ax.set_title(f'Split {split_name}\n(Train: {n_train}, Test: {n_test})')
    ax.legend()
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

print("\n‚úÖ Visualization completed")

## CELL 11: AUTOENCODER MODEL (Deep Learning)

Autoencoder detects anomalies based on reconstruction error. Points that are reconstructed poorly are considered anomalies. It captures complex non-linear patterns.

In [None]:
# CELL 11: AUTOENCODER MODEL (DEEP LEARNING)
# Trains one dense autoencoder per split (reusing the Isolation Forest splits),
# flags samples whose reconstruction MSE exceeds the 95th percentile of the
# training MSE, and stores the outcome in autoencoder_results[split_name].
# NOTE(review): the 95th-percentile threshold flags ~5% of the training set,
# while the other three models are configured for 10% - confirm this difference
# is intended before comparing detection rates across models.

print("\n" + "="*80)
print("AUTOENCODER (DEEP LEARNING) - ANOMALY DETECTION")
print("="*80)
print("\nAlgorithm: Neural network (Encoder-Decoder architecture)")
print("Principle: Detects anomalies by reconstruction error")
print("Features used: All prepared and normalized variables from Phase 2")

autoencoder_results = {}

for config in splits_config:
    train_size = config['train_size']
    split_name = config['name']
    
    print(f"\n{'‚îÄ'*80}")
    print(f"Training Autoencoder with {split_name} split:")
    print(f"{'‚îÄ'*80}")
    
    # Re-seed before building each model so that a split's result does not
    # depend on which cells or splits ran before it. The other models use
    # random_state=42; this is the autoencoder's counterpart.
    # Exact repeatability may still depend on the TensorFlow backend/hardware.
    np.random.seed(42)
    tf.random.set_seed(42)
    
    # Use same splits as Isolation Forest for consistency
    X_train_df = iso_forest_results[split_name]['X_train'].copy()
    X_test_df = iso_forest_results[split_name]['X_test'].copy()
    
    # ADDITIONAL DATA VALIDATION
    if X_train_df.isnull().sum().sum() > 0:
        print(f"   ‚ö†Ô∏è  Cleaning NaN from training set...")
        X_train_df = X_train_df.dropna()
    
    if X_test_df.isnull().sum().sum() > 0:
        print(f"   ‚ö†Ô∏è  Cleaning NaN from test set...")
        X_test_df = X_test_df.dropna()
    
    # Remove infinite values
    X_train_df = X_train_df[~np.isinf(X_train_df.select_dtypes(include=[np.number])).any(axis=1)]
    X_test_df = X_test_df[~np.isinf(X_test_df.select_dtypes(include=[np.number])).any(axis=1)]
    
    # Convert to numpy arrays for neural network
    X_train = X_train_df.values
    X_test = X_test_df.values
    
    print(f"\nüìä Data Split (from prepared dataset):")
    print(f"   Training samples: {len(X_train)} ({100*len(X_train)/len(df_data):.1f}%)")
    print(f"   Test samples: {len(X_test)} ({100*len(X_test)/len(df_data):.1f}%)")
    print(f"   Features used: {X_train.shape[1]} variables")
    print(f"   Data quality: NaN={np.isnan(X_train).sum()}, Inf={np.isinf(X_train).sum()}")
    
    # Build Autoencoder model
    input_dim = X_train.shape[1]
    encoding_dim = int(input_dim / 2)
    
    # Encoder: Compress input to bottleneck
    encoder = keras.Sequential([
        layers.Dense(64, activation='relu', input_shape=(input_dim,)),
        layers.Dense(32, activation='relu'),
        layers.Dense(encoding_dim, activation='relu')
    ])
    
    # Decoder: Reconstruct from bottleneck.
    # Sigmoid output matches the [0, 1] range of the Min-Max-normalized inputs.
    decoder = keras.Sequential([
        layers.Dense(32, activation='relu', input_shape=(encoding_dim,)),
        layers.Dense(64, activation='relu'),
        layers.Dense(input_dim, activation='sigmoid')
    ])
    
    # Full Autoencoder
    autoencoder = keras.Sequential([encoder, decoder])
    autoencoder.compile(optimizer='adam', loss='mse')
    
    print(f"\nüèóÔ∏è  Autoencoder Architecture:")
    print(f"   Input dimension: {input_dim} variables")
    print(f"   Layer 1: 64 neurons (ReLU)")
    print(f"   Layer 2: 32 neurons (ReLU)")
    print(f"   Bottleneck (Encoder output): {encoding_dim} neurons (ReLU)")
    print(f"   Layer 4: 32 neurons (ReLU)")
    print(f"   Layer 5: 64 neurons (ReLU)")
    print(f"   Output: {input_dim} neurons (Sigmoid)")
    print(f"   Total parameters: {autoencoder.count_params():,}")
    
    # Train Autoencoder (input and target are the same array)
    print(f"\nüìö Training Autoencoder...")
    print(f"   Epochs: 50 | Batch size: 32 | Validation split: 10%")
    
    history = autoencoder.fit(
        X_train, X_train,
        epochs=50,
        batch_size=32,
        validation_split=0.1,
        verbose=0
    )
    
    print(f"   Final training loss: {history.history['loss'][-1]:.6f}")
    print(f"   Final validation loss: {history.history['val_loss'][-1]:.6f}")
    
    # Calculate reconstruction errors
    train_reconstructions = autoencoder.predict(X_train, verbose=0)
    test_reconstructions = autoencoder.predict(X_test, verbose=0)
    
    # Mean Squared Error for each sample
    train_mse = np.mean(np.square(X_train - train_reconstructions), axis=1)
    test_mse = np.mean(np.square(X_test - test_reconstructions), axis=1)
    
    # Calculate threshold (95th percentile of training MSE)
    threshold = np.percentile(train_mse, 95)
    
    # Classify anomalies based on reconstruction error (-1 = anomaly, 1 = normal)
    train_predictions = np.where(train_mse > threshold, -1, 1)
    test_predictions = np.where(test_mse > threshold, -1, 1)
    
    # Count anomalies
    train_anomalies = (train_predictions == -1).sum()
    test_anomalies = (test_predictions == -1).sum()
    
    print(f"\nüîç Anomalies Detected:")
    print(f"   Training set: {train_anomalies} anomalies ({100*train_anomalies/len(X_train):.2f}%)")
    print(f"   Test set: {test_anomalies} anomalies ({100*test_anomalies/len(X_test):.2f}%)")
    
    print(f"\nüìà Reconstruction Error (Mean Squared Error):")
    print(f"   Training - Min: {train_mse.min():.6f}, Max: {train_mse.max():.6f}, Mean: {train_mse.mean():.6f}")
    print(f"   Test - Min: {test_mse.min():.6f}, Max: {test_mse.max():.6f}, Mean: {test_mse.mean():.6f}")
    print(f"   Anomaly threshold (95th percentile): {threshold:.6f}")
    
    # Store results with additional metadata
    autoencoder_results[split_name] = {
        'model': autoencoder,
        'encoder': encoder,
        'decoder': decoder,
        'X_train': X_train,
        'X_test': X_test,
        'train_predictions': train_predictions,
        'test_predictions': test_predictions,
        'train_mse': train_mse,
        'test_mse': test_mse,
        'train_anomalies': train_anomalies,
        'test_anomalies': test_anomalies,
        'threshold': threshold,
        'history': history
    }

print(f"\n‚úÖ Autoencoder training completed for all splits (70/30, 60/40, 80/20)")

In [None]:
# CELL 11 CONTINUED: Autoencoder - Visualization
# One panel per split: histogram of training reconstruction errors for points
# labelled normal vs anomalous, plus the stored anomaly threshold.

print("\n" + "="*80)
print("AUTOENCODER - RECONSTRUCTION ERROR ANALYSIS")
print("="*80)

fig, axes = plt.subplots(1, 3, figsize=(16, 5))
fig.suptitle('Autoencoder - Reconstruction Error Distribution (3 Train/Test Splits)',
             fontsize=13, fontweight='bold')

for idx, split_name in enumerate(['70/30', '60/40', '80/20']):
    results = autoencoder_results[split_name]
    ax = axes[idx]

    errors = results['train_mse']
    labels = results['train_predictions']
    n_train = len(results["X_train"])
    n_test = len(results["X_test"])

    # Error distributions of the two predicted classes
    ax.hist(errors[labels == 1], bins=20, alpha=0.6,
            label='Normal (Train)', color='orange')
    ax.hist(errors[labels == -1], bins=20, alpha=0.6,
            label='Anomaly (Train)', color='red')
    ax.axvline(results['threshold'], color='black', linestyle='--',
               linewidth=2, label='Threshold')

    ax.set_xlabel('Reconstruction Error (MSE)')
    ax.set_ylabel('Frequency')
    ax.set_title(f'Split {split_name}\n(Train: {n_train}, Test: {n_test})')
    ax.legend()
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

print("\n‚úÖ Visualization completed")

## COMPARISON: All Models Performance Across Train/Test Splits

In [None]:
# MODELS COMPARISON
# Builds comparison_df with one row per (split, model): anomaly counts and
# anomaly rates (in %) for the training and test sets.

print("\n" + "="*80)
print("COMPREHENSIVE MODEL COMPARISON")
print("="*80)

# Rows are appended split by split, models in a fixed order within each split
comparison_data = []

for split_name in ['70/30', '60/40', '80/20']:
    if_results = iso_forest_results[split_name]
    lof = lof_results[split_name]
    ocsvm = ocsvm_results[split_name]
    ae = autoencoder_results[split_name]

    per_model = (
        ('Isolation Forest', if_results),
        ('LOF', lof),
        ('One-Class SVM', ocsvm),
        ('Autoencoder', ae),
    )
    for model_label, split_results in per_model:
        n_train = len(split_results['X_train'])
        n_test = len(split_results['X_test'])
        comparison_data.append({
            'Model': model_label,
            'Split': split_name,
            'Train_Anomalies': split_results['train_anomalies'],
            'Test_Anomalies': split_results['test_anomalies'],
            'Train_Rate_%': 100 * split_results['train_anomalies'] / n_train,
            'Test_Rate_%': 100 * split_results['test_anomalies'] / n_test
        })

comparison_df = pd.DataFrame(comparison_data)

print("\nüìä ANOMALY DETECTION RESULTS ACROSS ALL MODEL-SPLIT COMBINATIONS:")
print("\n" + comparison_df.to_string(index=False))

print("\n\n‚ú® KEY INSIGHTS:")
print("\n1Ô∏è‚É£  ISOLATION FOREST:")
if_avg = iso_forest_results['70/30']['test_anomalies'] + iso_forest_results['60/40']['test_anomalies'] + iso_forest_results['80/20']['test_anomalies']
print(f"    Total anomalies detected (test sets): {if_avg}")
print(f"    Average detection rate: {comparison_df[comparison_df['Model']=='Isolation Forest']['Test_Rate_%'].mean():.2f}%")
print(f"    Best split: {comparison_df[comparison_df['Model']=='Isolation Forest'].loc[comparison_df[comparison_df['Model']=='Isolation Forest']['Test_Rate_%'].idxmax(), 'Split']}")

print("\n2Ô∏è‚É£  LOCAL OUTLIER FACTOR (LOF):")
lof_avg = lof_results['70/30']['test_anomalies'] + lof_results['60/40']['test_anomalies'] + lof_results['80/20']['test_anomalies']
print(f"    Total anomalies detected (test sets): {lof_avg}")
print(f"    Average detection rate: {comparison_df[comparison_df['Model']=='LOF']['Test_Rate_%'].mean():.2f}%")
print(f"    Best split: {comparison_df[comparison_df['Model']=='LOF'].loc[comparison_df[comparison_df['Model']=='LOF']['Test_Rate_%'].idxmax(), 'Split']}")

print("\n3Ô∏è‚É£  ONE-CLASS SVM:")
ocsvm_avg = ocsvm_results['70/30']['test_anomalies'] + ocsvm_results['60/40']['test_anomalies'] + ocsvm_results['80/20']['test_anomalies']
print(f"    Total anomalies detected (test sets): {ocsvm_avg}")
print(f"    Average detection rate: {comparison_df[comparison_df['Model']=='One-Class SVM']['Test_Rate_%'].mean():.2f}%")
print(f"    Best split: {comparison_df[comparison_df['Model']=='One-Class SVM'].loc[comparison_df[comparison_df['Model']=='One-Class SVM']['Test_Rate_%'].idxmax(), 'Split']}")

print("\n4Ô∏è‚É£  AUTOENCODER:")
ae_avg = autoencoder_results['70/30']['test_anomalies'] + autoencoder_results['60/40']['test_anomalies'] + autoencoder_results['80/20']['test_anomalies']
print(f"    Total anomalies detected (test sets): {ae_avg}")
print(f"    Average detection rate: {comparison_df[comparison_df['Model']=='Autoencoder']['Test_Rate_%'].mean():.2f}%")
print(f"    Best split: {comparison_df[comparison_df['Model']=='Autoencoder'].loc[comparison_df[comparison_df['Model']=='Autoencoder']['Test_Rate_%'].idxmax(), 'Split']}")

In [None]:
# Visualization: Model Comparison
# Draws two side-by-side line charts from comparison_df, one line per model:
# the left panel shows test-set anomaly counts, the right panel test-set
# anomaly rates, both plotted against the train/test split.

fig, axes = plt.subplots(1, 2, figsize=(16, 5))
fig.suptitle('Model Performance Comparison Across Train/Test Splits', fontsize=14, fontweight='bold')

split_order = ['70/30', '60/40', '80/20']
models = ['Isolation Forest', 'LOF', 'One-Class SVM', 'Autoencoder']

# Position of each split label in split_order; used as the sort key so rows
# follow that order rather than alphabetical order.
split_position = {label: pos for pos, label in enumerate(split_order)}

# (axis, comparison_df column, marker, y-axis label, panel title)
panel_specs = [
    (axes[0], 'Test_Anomalies', 'o', 'Anomalies Detected (Test Set)', 'Absolute Anomaly Count'),
    (axes[1], 'Test_Rate_%', 's', 'Detection Rate (%)', 'Anomaly Detection Rate'),
]

for panel, metric, marker_style, y_label, panel_title in panel_specs:
    for model in models:
        model_data = comparison_df[comparison_df['Model'] == model].sort_values(
            'Split', key=lambda col: col.map(split_position)
        )
        panel.plot(model_data['Split'], model_data[metric], marker=marker_style, label=model, linewidth=2, markersize=8)

    panel.set_xlabel('Train/Test Split', fontweight='bold')
    panel.set_ylabel(y_label, fontweight='bold')
    panel.set_title(panel_title)
    panel.legend()
    panel.grid(alpha=0.3)

plt.tight_layout()
plt.show()

print("\n‚úÖ Comparison visualization completed")

## FINAL RECOMMENDATIONS

In [None]:
# PHASE 3 FINAL SUMMARY
# Prints a closing report built from comparison_df: per-model anomaly totals,
# mean test-set anomaly rates, and the split with the highest test-set rate.

section_rule = "="*80

print("\n" + section_rule)
print("PHASE 3 SUCCESSFULLY COMPLETED")
print(section_rule)


def _summarize_model(model_label):
    """Return the summary figures for one model's rows in comparison_df.

    The result has three keys: 'total' (sum of Test_Anomalies over the model's
    rows), 'rate' (mean of Test_Rate_%) and 'best' (the Split value of the row
    with the highest Test_Rate_%).
    """
    model_rows = comparison_df[comparison_df['Model'] == model_label]
    test_rates = model_rows['Test_Rate_%']
    return {
        'total': model_rows['Test_Anomalies'].sum(),
        'rate': test_rates.mean(),
        'best': model_rows.loc[test_rates.idxmax(), 'Split'],
    }


if_stats = _summarize_model('Isolation Forest')
lof_stats = _summarize_model('LOF')
ocsvm_stats = _summarize_model('One-Class SVM')
ae_stats = _summarize_model('Autoencoder')

# NOTE(review): the figures under "TOTAL STUDENTS ANALYZED" are test-set anomaly
# counts summed over the model's rows (one per split), and "best performing"
# means the highest test-set anomaly rate; no ground-truth labels are used here.
print(f"""
üéâ MODELS TRAINED AND EVALUATED:

‚úÖ Isolation Forest - 3 different train/test splits
‚úÖ Local Outlier Factor (LOF) - 3 different train/test splits
‚úÖ One-Class SVM - 3 different train/test splits
‚úÖ Autoencoder (Deep Learning) - 3 different train/test splits

üìä TRAIN/TEST SPLITS EVALUATED:
   ‚Ä¢ 70% Training / 30% Testing
   ‚Ä¢ 60% Training / 40% Testing
   ‚Ä¢ 80% Training / 20% Testing

üîç TOTAL STUDENTS ANALYZED:
   ‚Ä¢ Isolation Forest anomalies: {if_stats['total']}
   ‚Ä¢ LOF anomalies: {lof_stats['total']}
   ‚Ä¢ One-Class SVM anomalies: {ocsvm_stats['total']}
   ‚Ä¢ Autoencoder anomalies: {ae_stats['total']}

üìà DETECTION RATES (Average across all splits):
   ‚Ä¢ Isolation Forest: {if_stats['rate']:.2f}%
   ‚Ä¢ LOF: {lof_stats['rate']:.2f}%
   ‚Ä¢ One-Class SVM: {ocsvm_stats['rate']:.2f}%
   ‚Ä¢ Autoencoder: {ae_stats['rate']:.2f}%

üèÜ BEST PERFORMING CONFIGURATIONS:
   ‚Ä¢ Isolation Forest: {if_stats['best']} split
   ‚Ä¢ LOF: {lof_stats['best']} split
   ‚Ä¢ One-Class SVM: {ocsvm_stats['best']} split
   ‚Ä¢ Autoencoder: {ae_stats['best']} split

‚ú® Next Phase: Phase 4 - VALIDATION (Consensus and Analysis)
""")

print(section_rule)
print("Ready for Phase 4: VALIDATION AND CONSENSUS ANALYSIS")
print(section_rule)