In [2]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GraphSAGE, GATConv
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import joblib
import json
from datetime import datetime

# Pin the PyTorch and NumPy random number generators to a fixed seed so that
# repeated runs of this notebook start from the same random state.
torch.manual_seed(42)
np.random.seed(42)

# Visible confirmation that the imports above succeeded.
print("âœ… All dependencies installed and ready!")

# ============================================================
# PART 1: GNN FRAUD GRAPH MODEL (Member 3)
# ============================================================

class FraudGraphSAGE(nn.Module):
    """Node scorer for UPI fraud graphs built on a GraphSAGE encoder.

    The wrapped ``GraphSAGE`` network produces ``out_channels`` values per
    node; ``forward`` passes them through a sigmoid so every output lies
    strictly between 0 and 1.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, num_layers=2):
        super().__init__()
        # Collect the encoder configuration in one place before building it.
        encoder_config = {
            'in_channels': in_channels,
            'hidden_channels': hidden_channels,
            'num_layers': num_layers,
            'out_channels': out_channels,
        }
        self.graphsage = GraphSAGE(**encoder_config)

    def forward(self, x, edge_index):
        """Return sigmoid-squashed encoder outputs for node features ``x``."""
        raw_scores = self.graphsage(x, edge_index)
        # Sigmoid turns the raw encoder output into a probability-like value.
        return raw_scores.sigmoid()


class GNNFraudDetector:
    """GraphSAGE pipeline: load an edge list, train, score nodes, save.

    Intended call order is ``load_graph_data()`` -> ``train()`` ->
    ``detect_fraud_clusters()`` / ``get_node_risk_scores()`` -> ``save_model()``.
    Methods that need a graph or a trained model raise ``RuntimeError`` when
    called too early.
    """

    # Columns the edge-list CSV must provide.
    REQUIRED_COLUMNS = ('src_id', 'dst_id', 'risk_hint')

    def __init__(self, dataset_path='dataset_graph.csv'):
        self.dataset_path = dataset_path
        self.model = None
        # Filled in by load_graph_data(); initialised here so premature calls
        # fail with a clear message instead of an AttributeError.
        self.graph_data = None
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.node_mapping = {}
        self.reverse_node_mapping = {}
        self.edge_mapping = {}
        # Last result of detect_fraud_clusters(): {node id: risk score}.
        self.fraud_clusters = {}

    @staticmethod
    def _build_graph_tensors(df):
        """Convert an edge-list frame into ``(node_mapping, edge_index, x, y)``.

        * ``node_mapping`` maps each distinct ``src_id``/``dst_id`` value to a
          contiguous index, assigned in sorted order.
        * ``edge_index`` is a ``(2, num_edges)`` long tensor of (src, dst) indices.
        * ``x`` is a ``(num_nodes, 3)`` float tensor: in-degree, out-degree and
          mean ``risk_hint`` over all incident edges.
        * ``y`` is a ``(num_nodes, 1)`` float tensor holding the maximum
          ``risk_hint`` over all incident edges (0 when none is positive).

        Raises:
            KeyError: if a required column is missing.
        """
        missing = [col for col in GNNFraudDetector.REQUIRED_COLUMNS
                   if col not in df.columns]
        if missing:
            raise KeyError(f"edge list is missing required column(s): {missing}")

        all_nodes = set(df['src_id'].unique()) | set(df['dst_id'].unique())
        node_mapping = {node: idx for idx, node in enumerate(sorted(all_nodes))}
        num_nodes = len(node_mapping)

        # Vectorised lookups replace the per-row iterrows() loop.
        src = df['src_id'].map(node_mapping).to_numpy(dtype=np.int64)
        dst = df['dst_id'].map(node_mapping).to_numpy(dtype=np.int64)
        risk = df['risk_hint'].to_numpy(dtype=np.float64)

        edge_index = torch.from_numpy(np.stack([src, dst])).long().contiguous()

        in_degree = np.bincount(dst, minlength=num_nodes)
        out_degree = np.bincount(src, minlength=num_nodes)

        # Each edge contributes its risk to both endpoints.
        risk_sum = (np.bincount(src, weights=risk, minlength=num_nodes)
                    + np.bincount(dst, weights=risk, minlength=num_nodes))
        incident_edges = in_degree + out_degree
        avg_risk = np.divide(
            risk_sum,
            incident_edges,
            out=np.zeros(num_nodes, dtype=np.float64),
            where=incident_edges > 0,  # avoid division by zero
        )

        # A node is labelled with the highest risk among its incident edges.
        labels = np.zeros(num_nodes, dtype=np.float64)
        np.maximum.at(labels, src, risk)
        np.maximum.at(labels, dst, risk)

        # NOTE(review): the third feature (mean incident risk) and the label
        # (max incident risk) are both derived from `risk_hint`, so the model
        # can read the target off its own input. Confirm this is intended
        # before trusting the training loss as a quality signal.
        features = np.column_stack([in_degree, out_degree, avg_risk])
        x = torch.from_numpy(features).float()
        y = torch.from_numpy(labels).float().view(-1, 1)
        return node_mapping, edge_index, x, y

    def load_graph_data(self):
        """Read the edge-list CSV and build ``self.graph_data``.

        Also populates ``node_mapping`` and ``reverse_node_mapping``.

        Returns:
            The ``Data`` object holding ``x``, ``edge_index`` and node labels ``y``.
        """
        df = pd.read_csv(self.dataset_path)
        print(f"âœ… Loaded {len(df)} edges from {self.dataset_path}")

        self.node_mapping, edge_index, x, y = self._build_graph_tensors(df)
        self.reverse_node_mapping = {idx: node for node, idx in self.node_mapping.items()}

        # Create the Data object with node-level target labels
        self.graph_data = Data(x=x, edge_index=edge_index, y=y)
        print(f"âœ… Graph created: {self.graph_data.num_nodes} nodes, {self.graph_data.num_edges} edges")
        return self.graph_data

    def train(self, epochs=50, hidden_channels=64, learning_rate=0.01):
        """Fit a two-layer ``FraudGraphSAGE`` on the whole graph.

        Trains full-batch with Adam and ``nn.BCELoss`` applied to the model's
        sigmoid outputs; every node is used for training (no hold-out split).

        Returns:
            The trained model (also stored on ``self.model``).

        Raises:
            RuntimeError: if ``load_graph_data()`` has not been called.
        """
        if self.graph_data is None:
            raise RuntimeError("No graph loaded; call load_graph_data() before train().")

        self.model = FraudGraphSAGE(
            in_channels=self.graph_data.x.shape[1],
            hidden_channels=hidden_channels,
            out_channels=1,
            num_layers=2
        ).to(self.device)

        optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate)
        criterion = nn.BCELoss()

        # Move data to device
        x = self.graph_data.x.to(self.device)
        edge_index = self.graph_data.edge_index.to(self.device)
        y = self.graph_data.y.to(self.device)  # already shaped (num_nodes, 1)

        print("\nðŸ”„ Training GNN...")
        self.model.train()
        for epoch in range(epochs):
            optimizer.zero_grad()

            # Forward pass
            out = self.model(x, edge_index)
            loss = criterion(out, y)

            # Backward pass
            loss.backward()
            optimizer.step()

            if (epoch + 1) % 10 == 0:
                print(f"  Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

        print("âœ… GNN training complete!")
        return self.model

    def get_node_risk_scores(self):
        """Score every node with the trained model.

        Returns:
            dict mapping original node id -> risk score as a plain ``float``.

        Raises:
            RuntimeError: if the graph has not been loaded or the model has
                not been trained.
        """
        if self.model is None or self.graph_data is None:
            raise RuntimeError(
                "Model is not ready; call load_graph_data() and train() first."
            )

        self.model.eval()
        x = self.graph_data.x.to(self.device)
        edge_index = self.graph_data.edge_index.to(self.device)

        with torch.no_grad():
            scores = self.model(x, edge_index).cpu().numpy()

        # First output channel is the risk score; tolist() yields Python floats
        # so the result can be serialised (e.g. with json) without conversion.
        return {
            self.reverse_node_mapping[idx]: score
            for idx, score in enumerate(scores[:, 0].tolist())
        }

    def detect_fraud_clusters(self, threshold=0.5):
        """Return the nodes whose risk score is strictly above ``threshold``.

        Despite the name, no connectivity analysis is performed: the result is
        a flat ``{node id: score}`` dict. It is also cached on
        ``self.fraud_clusters``.
        """
        node_risks = self.get_node_risk_scores()
        high_risk_nodes = {k: v for k, v in node_risks.items() if v > threshold}
        self.fraud_clusters = high_risk_nodes

        print(f"âœ… Detected {len(high_risk_nodes)} high-risk nodes")
        return high_risk_nodes

    def save_model(self, save_path='models/gnn/gnn_model.pt'):
        """Save the model weights and node id mappings with ``torch.save``.

        Raises:
            RuntimeError: if no model has been trained yet.
        """
        if self.model is None:
            raise RuntimeError("No trained model to save; call train() first.")

        # dirname is '' for a bare filename, and os.makedirs('') raises.
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        torch.save(
            {
                'model_state': self.model.state_dict(),
                'node_mapping': self.node_mapping,
                'reverse_node_mapping': self.reverse_node_mapping
            }, save_path)
        print(f"âœ… GNN model saved to {save_path}")


# ============================================================
# PART 2: BEHAVIORAL ANOMALY MODEL (Member 4)
# ============================================================

class BehavioralAnomalyDetector:
    """Isolation Forest detector for anomalous user-session behaviour.

    Intended call order is ``load_data()`` -> ``train(X)`` ->
    ``predict_risk(X)`` -> ``save_model()``.
    """

    def __init__(self, dataset_path='dataset_behavior.csv', contamination=0.3):
        self.dataset_path = dataset_path
        self.contamination = contamination
        self.model = None
        self.scaler = StandardScaler()
        self.feature_names = None

    def load_data(self):
        """Read the behavioural CSV and split it into features and labels.

        Every column except ``session_id`` and ``label`` is treated as a
        feature; the chosen names are stored on ``self.feature_names``.

        Returns:
            tuple ``(X, y)``: feature DataFrame and the ``label`` Series.
        """
        df = pd.read_csv(self.dataset_path)
        self.feature_names = [col for col in df.columns
                             if col not in ['session_id', 'label']]
        X = df[self.feature_names]
        y = df['label']

        print(f"âœ… Loaded {len(df)} behavioral sessions")
        print(f"  Features: {self.feature_names}")
        print(f"  Label distribution: {y.value_counts().to_dict()}")

        return X, y

    def train(self, X):
        """Fit the scaler and the Isolation Forest on ``X``.

        Returns:
            The fitted ``IsolationForest`` (also stored on ``self.model``).
        """
        # If load_data() was skipped, recover feature names from a DataFrame
        # input so save_model() still records them.
        if self.feature_names is None and hasattr(X, 'columns'):
            self.feature_names = list(X.columns)

        # Scale features
        X_scaled = self.scaler.fit_transform(X)

        # Train model
        self.model = IsolationForest(
            contamination=self.contamination,
            random_state=42,
            n_estimators=100
        )
        self.model.fit(X_scaled)

        print(f"âœ… Behavioral anomaly model trained!")
        print(f"  Contamination: {self.contamination}")
        # Count columns from the data itself; feature_names may be None when
        # X is a plain array and load_data() was never called.
        print(f"  Features: {X_scaled.shape[1]}")

        return self.model

    def predict_risk(self, X):
        """Return an anomaly risk in [0, 1] per row (higher = more anomalous).

        ``IsolationForest.score_samples`` returns the *negated* anomaly score
        of the original Isolation Forest paper, i.e. values in [-1, 0) where
        lower means more abnormal. Negating it recovers the paper's score:
        values near 1 indicate anomalies and values well below 0.5 indicate
        normal points. The previous ``1 - (s + 1) / 2`` mapping assumed a
        [-1, 1] range and therefore squeezed every result into [0.5, 1].

        Raises:
            RuntimeError: if the model has not been trained.
        """
        if self.model is None:
            raise RuntimeError("Model is not trained; call train() before predict_risk().")

        X_scaled = self.scaler.transform(X)

        scores = np.asarray(self.model.score_samples(X_scaled), dtype=float)
        # Clip only as a guard against floating-point drift at the edges.
        return np.clip(-scores, 0.0, 1.0)

    def save_model(self, save_path='models/behavior/behavior_model.pkl'):
        """Persist the model, scaler and feature names with ``joblib.dump``.

        Raises:
            RuntimeError: if the model has not been trained.
        """
        if self.model is None:
            raise RuntimeError("No trained model to save; call train() first.")

        # dirname is '' for a bare filename, and os.makedirs('') raises.
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        joblib.dump(
            {
                'model': self.model,
                'scaler': self.scaler,
                'feature_names': self.feature_names
            }, save_path)
        print(f"âœ… Behavioral model saved to {save_path}")


# ============================================================
# PART 3: FUSION ENGINE (Member 4)
# ============================================================

class FraudFusionEngine:
    """Fuse the four per-model risk scores into one fraud verdict.

    All methods are static; the class only groups the fusion weights, the
    fraud-type catalogue and the decision thresholds.
    """

    # Weights for fusion (customizable); they sum to 1.0.
    WEIGHTS = {
        'nlp': 0.30,
        'transaction': 0.35,
        'graph': 0.20,
        'behavior': 0.15
    }

    FRAUD_TYPES = {
        'refund_scam': 'Refund/UPI reversal scam',
        'kyc_scam': 'Fake KYC/identity verification',
        'impersonation': 'Impersonation (delivery/bank/support)',
        'verification_fraud': 'Fake verification transaction',
        'safe': 'Safe transaction'
    }

    # Final-score bands shared by the explanation and the fraud-type decision.
    HIGH_RISK_THRESHOLD = 0.7
    MEDIUM_RISK_THRESHOLD = 0.4
    # A single component above this value is named as a reason.
    SIGNAL_THRESHOLD = 0.6

    @staticmethod
    def compute_final_score(nlp_risk, transaction_risk, graph_risk, behavior_risk):
        """
        Compute final fraud score using weighted fusion

        Args:
            nlp_risk (float): 0-1, scam language likelihood
            transaction_risk (float): 0-1, suspicious transaction pattern
            graph_risk (float): 0-1, linked to fraud network
            behavior_risk (float): 0-1, user behavior anomaly

        Returns:
            dict: final_score, fraud_type, confidence, explanation and
            component_scores. ``fraud_type`` is ``'safe'`` whenever the final
            score is in the low-risk band (<= MEDIUM_RISK_THRESHOLD);
            otherwise it is chosen from the highest component score.
        """
        # Coerce to plain floats so numpy scalars do not leak into the result
        # (round() on them is not JSON-serialisable).
        risks = {
            'nlp': float(nlp_risk),
            'transaction': float(transaction_risk),
            'graph': float(graph_risk),
            'behavior': float(behavior_risk)
        }

        final_score = sum(
            FraudFusionEngine.WEIGHTS[name] * value for name, value in risks.items()
        )

        # Determine fraud type (highest contributing risk)
        dominant_risk = max(risks, key=risks.get)
        fraud_type_map = {
            'nlp': 'refund_scam' if risks['nlp'] > 0.7 else 'kyc_scam',
            'transaction': 'verification_fraud',
            'graph': 'impersonation',
            'behavior': 'refund_scam'
        }
        if final_score <= FraudFusionEngine.MEDIUM_RISK_THRESHOLD:
            # Previously unreachable: the map lookup always succeeded, so even
            # low-risk inputs were labelled with a scam type.
            predicted_type = 'safe'
        else:
            predicted_type = fraud_type_map[dominant_risk]

        # Confidence based on consensus: close to 1 when all components agree,
        # close to 0 when the lowest is far below the highest. The epsilon
        # keeps the division defined when every score is 0.
        max_score = max(risks.values())
        min_score = min(risks.values())
        confidence = 1 - (abs(max_score - min_score) / (max_score + 1e-8))

        # Human-readable explanation
        explanation = FraudFusionEngine.generate_explanation(
            final_score, risks['nlp'], risks['transaction'],
            risks['graph'], risks['behavior']
        )

        return {
            'final_score': round(final_score, 3),
            'fraud_type': predicted_type,
            'confidence': round(confidence, 3),
            'explanation': explanation,
            'component_scores': {
                'nlp_risk': round(risks['nlp'], 3),
                'transaction_risk': round(risks['transaction'], 3),
                'graph_risk': round(risks['graph'], 3),
                'behavior_risk': round(risks['behavior'], 3)
            }
        }

    @staticmethod
    def generate_explanation(final_score, nlp_risk, transaction_risk, graph_risk, behavior_risk):
        """Generate human-readable explanation for fraud score

        The severity label comes from ``final_score``; each component above
        ``SIGNAL_THRESHOLD`` contributes one reason phrase.
        """
        if final_score > FraudFusionEngine.HIGH_RISK_THRESHOLD:
            severity = "ðŸ”´ HIGH RISK"
        elif final_score > FraudFusionEngine.MEDIUM_RISK_THRESHOLD:
            severity = "ðŸŸ  MEDIUM RISK"
        else:
            severity = "ðŸŸ¢ LOW RISK"

        signals = (
            (nlp_risk, "scam language detected"),
            (transaction_risk, "suspicious transaction pattern"),
            (graph_risk, "linked to fraud network"),
            (behavior_risk, "unusual user behavior"),
        )
        reasons = [
            reason for value, reason in signals
            if value > FraudFusionEngine.SIGNAL_THRESHOLD
        ]

        reason_str = ", ".join(reasons) if reasons else "all signals normal"

        return f"{severity} - {reason_str.capitalize()}"

    @staticmethod
    def save_fusion_config(save_path='models/fusion_engine.json'):
        """Write the weights, fraud types and a timestamp to a JSON file."""
        # dirname is '' for a bare filename, and os.makedirs('') raises.
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        config = {
            'weights': FraudFusionEngine.WEIGHTS,
            'fraud_types': FraudFusionEngine.FRAUD_TYPES,
            'timestamp': datetime.now().isoformat()
        }
        # Explicit encoding so the file does not depend on the platform default.
        with open(save_path, 'w', encoding='utf-8') as f:
            json.dump(config, f, indent=2)
        print(f"âœ… Fusion engine config saved to {save_path}")


# ============================================================
# EXECUTION PIPELINE
# ============================================================

def run_full_pipeline():
    """Run the graph model, the behavioural model and the fusion demo in order.

    Each stage trains its model on the CSV in the working directory, prints a
    short report and saves its artifact under ``models/``.
    """
    rule = "=" * 60

    print(rule)
    print("UPI-GUARDIAN: MEMBER 3 & 4 PIPELINE (Colab)")
    print(rule)

    # ---------- Stage 1: graph model (Member 3) ----------
    print("\n" + rule)
    print("MEMBER 3: GNN FRAUD GRAPH MODEL")
    print(rule)

    graph_model = GNNFraudDetector('dataset_graph.csv')
    graph_model.load_graph_data()
    graph_model.train(epochs=50, hidden_channels=64)

    # Rank the flagged nodes by score, highest first, and show the top ten.
    flagged_nodes = graph_model.detect_fraud_clusters(threshold=0.5)
    print("\nðŸ“Š Top 10 High-Risk Nodes:")
    ranked_nodes = sorted(flagged_nodes.items(), key=lambda pair: pair[1], reverse=True)
    for node_id, node_score in ranked_nodes[:10]:
        print(f"  {node_id}: {node_score:.3f}")

    graph_model.save_model('models/gnn/gnn_model.pt')

    # ---------- Stage 2: behavioural model (Member 4A) ----------
    print("\n" + rule)
    print("MEMBER 4A: BEHAVIORAL ANOMALY MODEL")
    print(rule)

    behavior_model = BehavioralAnomalyDetector('dataset_behavior.csv', contamination=0.3)
    X, y = behavior_model.load_data()
    behavior_model.train(X)

    # Score the first five sessions as a quick sanity check.
    first_five_risks = behavior_model.predict_risk(X[:5])
    print("\nðŸ“Š Sample Risk Predictions (first 5 sessions):")
    for session_no, session_risk in enumerate(first_five_risks):
        if session_risk > 0.6:
            verdict = 'ðŸ”´ HIGH RISK'
        else:
            verdict = 'ðŸŸ¢ LOW RISK'
        print(f"  Session {session_no}: {session_risk:.3f} {verdict}")

    behavior_model.save_model('models/behavior/behavior_model.pkl')

    # ---------- Stage 3: fusion engine (Member 4B) ----------
    print("\n" + rule)
    print("MEMBER 4B: FUSION ENGINE")
    print(rule)

    # Hand-written component scores standing in for the four real models.
    print("\nðŸ“Š Example Fusion Results (Simulated Inputs):")

    scenarios = [
        {'nlp': 0.85, 'transaction': 0.90, 'graph': 0.75, 'behavior': 0.65, 'name': 'High Risk (Refund Scam)'},
        {'nlp': 0.20, 'transaction': 0.15, 'graph': 0.10, 'behavior': 0.25, 'name': 'Low Risk (Safe)'},
        {'nlp': 0.60, 'transaction': 0.55, 'graph': 0.40, 'behavior': 0.50, 'name': 'Medium Risk (KYC Scam)'}
    ]

    for scenario in scenarios:
        fused = FraudFusionEngine.compute_final_score(
            scenario['nlp'],
            scenario['transaction'],
            scenario['graph'],
            scenario['behavior'],
        )
        print(f"\n  {scenario['name']}:")
        print(f"    Final Score: {fused['final_score']}")
        print(f"    Fraud Type: {fused['fraud_type']}")
        print(f"    Confidence: {fused['confidence']}")
        print(f"    Explanation: {fused['explanation']}")

    FraudFusionEngine.save_fusion_config('models/fusion_engine.json')

    # ---------- Summary ----------
    print("\n" + rule)
    print("âœ… PIPELINE COMPLETE!")
    print(rule)
    print("\nðŸ“¦ Saved Artifacts:")
    saved_artifacts = (
        "models/gnn/gnn_model.pt",
        "models/behavior/behavior_model.pkl",
        "models/fusion_engine.json",
    )
    for artifact_path in saved_artifacts:
        print(f"  - {artifact_path}")
    print("\n Ready for integration with Member 1 & 2 models!")


# Entry point: run the whole pipeline only when this file is executed directly
# (or as a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    run_full_pipeline()


✅ All dependencies installed and ready!
UPI-GUARDIAN: MEMBER 3 & 4 PIPELINE (Colab)

MEMBER 3: GNN FRAUD GRAPH MODEL
✅ Loaded 604 edges from dataset_graph.csv
✅ Graph created: 290 nodes, 604 edges

🔄 Training GNN...
  Epoch 10/50, Loss: 0.3486
  Epoch 20/50, Loss: 0.1427
  Epoch 30/50, Loss: 0.0503
  Epoch 40/50, Loss: 0.0210
  Epoch 50/50, Loss: 0.0113
✅ GNN training complete!
✅ Detected 158 high-risk nodes

📊 Top 10 High-Risk Nodes:
  device:android_49: 1.000
  device:android_14: 1.000
  device:android_4: 1.000
  phone:9507943839: 1.000
  device:android_6: 1.000
  device:ios_16: 1.000
  device:android_8: 1.000
  phone:9914763202: 1.000
  phone:9734036506: 1.000
  phone:9485451171: 1.000
✅ GNN model saved to models/gnn/gnn_model.pt

MEMBER 4A: BEHAVIORAL ANOMALY MODEL
✅ Loaded 3000 behavioral sessions
  Features: ['total_time_seconds', 'tab_switches', 'scroll_loops', 'hover_time_ms', 'avg_click_interval_ms', 'click_repetition_count', 'keystroke_latency_ms', 'window