# 🐄 Cow Lameness Detection - Comprehensive Multi-Modal Pipeline v18

**Architecture:**
- YOLO v8: Multi-cow detection (select largest)
- SAM: Segment Anything (cow isolation)
- Pose: DeepLabCut/MMPose (configurable)
- VideoMAE: Self-supervised visual features
- Optical flow: OpenCV Farneback, used as a lightweight alternative to RAFT (temporal motion)
- Temporal Transformer: Multi-modal fusion

**Dataset:** 1167 videos (Sağlıklı / healthy: 642, Topal / lame: 525)

---

## 1. Setup & Installation

In [None]:
# Install all required packages.
# Each line starting with `!` is run as a shell command by the notebook
# (IPython/Colab syntax), so this cell is not plain Python.
# NOTE(review): the segment-anything line names both the PyPI package and the
# GitHub repository - presumably one source is enough; confirm which is wanted.
# NOTE(review): the optical-flow function in this notebook calls OpenCV's
# Farneback routine, so the "RAFT dependencies" label on the opencv line looks
# stale - confirm.
!pip install -q ultralytics  # YOLO v8
!pip install -q segment-anything git+https://github.com/facebookresearch/segment-anything.git  # SAM
!pip install -q transformers  # VideoMAE
!pip install -q opencv-python-headless  # RAFT dependencies
!pip install -q pandas numpy scikit-learn scipy matplotlib seaborn tqdm
!pip install -q torch torchvision
!pip install -q timm  # For VideoMAE
!pip install -q pyyaml

# Confirmation message printed once every install command above has returned.
print("‚úÖ All packages installed")

In [None]:
# Imports
import os
import glob
import json
import yaml
from pathlib import Path
from datetime import datetime

import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from scipy import stats
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, roc_auc_score, roc_curve

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from ultralytics import YOLO
from segment_anything import sam_model_registry, SamPredictor
from transformers import VideoMAEFeatureExtractor, VideoMAEModel

# Use the GPU when PyTorch reports one, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"‚úÖ Device: {device}")

## 2. Mount Drive & Load Config

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Project locations on the mounted Drive.
BASE = "/content/drive/MyDrive/Inek Topallik Tespiti Parcalanmis Inek Videolari"
VIDEO_DIR = BASE + "/cow_single_videos"
POSE_CSV_DIR = BASE + "/outputs/deeplabcut"  # or mmpose based on config
OUTPUT_DIR = BASE + "/outputs/colab_results"

# Create the results directory and its two sub-directories up front.
for _subdir in ("", "/figures", "/models"):
    os.makedirs(OUTPUT_DIR + _subdir, exist_ok=True)

# Pipeline configuration, defined inline in this cell (no file is read here).
config = dict(
    pose_framework='deeplabcut',
    features=dict(
        yolo_detection=True,
        sam_segmentation=True,
        videomae=True,
        optical_flow=True,
        back_curvature=True,
    ),
)

print(f"‚úÖ Config loaded: {config['pose_framework']}")
print(f"   Features: {list(config['features'])}")

## 2.5 Detect Available Pose Framework

In [None]:
def detect_pose_framework(video_dir, pose_base_dir):
    """
    Work out which pose-estimation outputs exist under ``pose_base_dir``.

    Looks for ``*DLC*.csv`` files in ``<pose_base_dir>/deeplabcut`` and for
    ``*_MMPose.csv`` files in ``<pose_base_dir>/mmpose``. When both are
    present the user is asked through ``input`` which one to use; any answer
    other than "1" or "2" falls back to DeepLabCut.

    Args:
        video_dir: Accepted as part of the signature; not read by this function.
        pose_base_dir: Directory that holds the per-framework sub-folders.

    Returns:
        tuple: (framework_name, pose_csv_dir)
            - framework_name: 'deeplabcut', 'mmpose', or None
            - pose_csv_dir: path to the chosen CSV directory, or None
    """
    def _find_csvs(folder, pattern):
        # A folder that does not exist contributes no files.
        if not os.path.exists(folder):
            return []
        return glob.glob(f"{folder}/{pattern}")

    dlc_dir = f"{pose_base_dir}/deeplabcut"
    mmpose_dir = f"{pose_base_dir}/mmpose"

    n_dlc = len(_find_csvs(dlc_dir, "*DLC*.csv"))
    n_mmpose = len(_find_csvs(mmpose_dir, "*_MMPose.csv"))

    # Case 1: nothing found - explain where we looked and how to produce data.
    if n_dlc == 0 and n_mmpose == 0:
        print("‚ùå ERROR: No pose estimation outputs found!")
        print(f"   Checked directories:")
        print(f"     - {dlc_dir}")
        print(f"     - {mmpose_dir}")
        print("\n   Please run pose estimation first:")
        print("     - DeepLabCut: python DeepLabCut/process_videos.py --batch")
        print("     - MMPose: python MMPose/process_videos.py --batch")
        return None, None

    # Case 2: exactly one framework has outputs - use it without asking.
    if n_mmpose == 0:
        print(f"‚úÖ DeepLabCut outputs detected: {n_dlc} files")
        return "deeplabcut", dlc_dir
    if n_dlc == 0:
        print(f"‚úÖ MMPose outputs detected: {n_mmpose} files")
        return "mmpose", mmpose_dir

    # Case 3: both frameworks have outputs - let the user pick.
    print("‚ö†Ô∏è  Both DeepLabCut and MMPose outputs detected!")
    print(f"   DeepLabCut: {n_dlc} files")
    print(f"   MMPose: {n_mmpose} files")
    print("\nWhich framework would you like to use?")
    print("  1. DeepLabCut")
    print("  2. MMPose")

    answer = input("Enter choice (1 or 2): ").strip()
    if answer == "2":
        return "mmpose", mmpose_dir
    if answer != "1":
        print("‚ùå Invalid choice. Defaulting to DeepLabCut.")
    return "deeplabcut", dlc_dir

# Detect which pose framework has CSV outputs under <BASE>/outputs.
_detected = detect_pose_framework(VIDEO_DIR, BASE + "/outputs")
POSE_FRAMEWORK, POSE_CSV_DIR = _detected

# Without pose data the rest of the notebook cannot run, so stop here.
if POSE_FRAMEWORK is None:
    raise RuntimeError("No pose estimation data found. Cannot proceed.")

print(f"\nüéØ Using pose framework: {POSE_FRAMEWORK.upper()}")
print(f"   CSV directory: {POSE_CSV_DIR}")

# Record the detected framework in the pipeline config.
config.update(pose_framework=POSE_FRAMEWORK)
print("\n‚úÖ Config updated with detected framework")

## 3. Initialize Models

In [None]:
# YOLO v8 for cow detection
yolo_model = YOLO('yolov8n.pt')
print("‚úÖ YOLO v8 loaded")

# SAM for segmentation
sam_checkpoint = "sam_vit_h_4b8939.pth"
!wget -q https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth
sam = sam_model_registry["vit_h"](checkpoint=sam_checkpoint)
sam.to(device)
sam_predictor = SamPredictor(sam)
print("‚úÖ SAM loaded")

# VideoMAE for visual features
videomae_extractor = VideoMAEFeatureExtractor.from_pretrained("MCG-NJU/videomae-base")
videomae_model = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base").to(device)
print("‚úÖ VideoMAE loaded")

## 4. Video Processing Pipeline

In [None]:
def detect_largest_cow(frame, yolo_model):
    """Run YOLO on one frame and return the box of the biggest detected cow.

    Args:
        frame: Image passed straight to ``yolo_model``.
        yolo_model: Callable detector; called with ``classes=[19]``.

    Returns:
        Integer array ``[x1, y1, x2, y2]`` of the detection with the largest
        area, or ``None`` when nothing was detected.
    """
    detections = yolo_model(frame, classes=[19])[0].boxes  # Class 19 = cow in COCO

    if len(detections) == 0:
        return None

    # Area of every box at once: (x2 - x1) * (y2 - y1)
    xyxy = detections.xyxy.cpu().numpy()
    widths = xyxy[:, 2] - xyxy[:, 0]
    heights = xyxy[:, 3] - xyxy[:, 1]

    return xyxy[np.argmax(widths * heights)].astype(int)

def segment_cow(frame, bbox, sam_predictor, image_format="BGR"):
    """Segment the cow inside ``bbox`` using SAM.

    Args:
        frame: Image to segment. The frames in this notebook are read with
            OpenCV and are therefore in BGR channel order.
        bbox: Box prompt ``[x1, y1, x2, y2]`` for the predictor.
        sam_predictor: Predictor exposing ``set_image`` and ``predict``.
        image_format: Channel order of ``frame`` ("BGR" or "RGB"). It is
            forwarded to ``set_image`` so the predictor can reorder channels;
            the predictor's own default is "RGB", which does not match
            OpenCV frames.

    Returns:
        The first (and only requested) mask returned by the predictor.
    """
    sam_predictor.set_image(frame, image_format=image_format)

    # Use bbox as prompt; a single mask is requested.
    masks, _, _ = sam_predictor.predict(
        box=bbox,
        multimask_output=False
    )

    return masks[0]

def extract_optical_flow(frames):
    """Summarise Farneback optical flow between each pair of consecutive frames.

    Farneback is used as a lightweight alternative to RAFT.

    Args:
        frames: Sequence of BGR images.

    Returns:
        One dict per consecutive pair with the mean flow 'magnitude' and the
        mean flow 'angle'; empty when fewer than two frames are given.
    """
    # Positional Farneback settings after the (absent) initial flow:
    # pyr_scale, levels, winsize, iterations, poly_n, poly_sigma, flags
    farneback_settings = (0.5, 3, 15, 3, 5, 1.2, 0)

    def _pair_summary(earlier, later):
        gray_a = cv2.cvtColor(earlier, cv2.COLOR_BGR2GRAY)
        gray_b = cv2.cvtColor(later, cv2.COLOR_BGR2GRAY)
        flow = cv2.calcOpticalFlowFarneback(gray_a, gray_b, None, *farneback_settings)

        # Convert the (dx, dy) field to polar form and average each component.
        magnitude, angle = cv2.cartToPolar(flow[..., 0], flow[..., 1])
        return {'magnitude': magnitude.mean(), 'angle': angle.mean()}

    return [_pair_summary(a, b) for a, b in zip(frames, frames[1:])]

def extract_videomae_features(frames, videomae_model, videomae_extractor, bgr_input=True):
    """Extract self-supervised visual features using VideoMAE.

    Args:
        frames: Non-empty sequence of HxWx3 numpy images.
        videomae_model: Model called with the extractor's tensors; its output
            must expose ``last_hidden_state``.
        videomae_extractor: Preprocessor called with the sampled frames and
            ``return_tensors="pt"``.
        bgr_input: When True (default) the channel axis of every sampled frame
            is reversed before preprocessing, because the frames in this
            notebook come from OpenCV in BGR order while the preprocessing
            is applied per RGB channel.

    Returns:
        Numpy array: ``last_hidden_state`` averaged over dim 1, with
        singleton dimensions squeezed away.
    """
    # Sample 16 frames (VideoMAE input); short clips repeat some indices.
    indices = np.linspace(0, len(frames)-1, 16, dtype=int)
    sampled_frames = [frames[i] for i in indices]
    if bgr_input:
        # Reverse the last axis (BGR -> RGB) and make the result contiguous
        # so downstream code never sees a negatively-strided view.
        sampled_frames = [np.ascontiguousarray(f[..., ::-1]) for f in sampled_frames]

    # Preprocess, then move the tensors to wherever the model's weights live
    # instead of relying on a module-level `device` variable.
    inputs = videomae_extractor(sampled_frames, return_tensors="pt")
    model_device = next(videomae_model.parameters()).device
    inputs = {k: v.to(model_device) for k, v in inputs.items()}

    # Extract features without tracking gradients.
    with torch.no_grad():
        outputs = videomae_model(**inputs)
        features = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

    return features

print("‚úÖ Processing functions defined")

## 5. Load Data & Extract Multi-Modal Features

In [None]:
# Get all videos as (path, label) pairs: label 0 for the 'Saglikli' folder,
# label 1 for the 'Topal' folder.
video_files = []
for label_folder in ['Saglikli', 'Topal']:
    folder_path = f"{VIDEO_DIR}/{label_folder}"
    # glob returns matches in arbitrary order; sorting keeps the file list
    # (and therefore the demo subset and the later split) reproducible.
    videos = sorted(glob.glob(f"{folder_path}/*.mp4"))
    video_files.extend([(v, 0 if label_folder=='Saglikli' else 1) for v in videos])

print(f"üìä Found {len(video_files)} videos")
print(f"   Healthy: {sum(1 for _, l in video_files if l==0)}")
print(f"   Lame: {sum(1 for _, l in video_files if l==1)}")

# Process subset for demo (full processing takes hours)
DEMO_MODE = True
DEMO_SIZE = 50  # total number of videos kept in demo mode
if DEMO_MODE:
    # The list is ordered folder by folder, so taking the first DEMO_SIZE
    # entries would keep a single class whenever the first folder holds at
    # least that many videos. Take an equal share from each class instead so
    # the stratified split and the ROC-AUC later on have both classes.
    per_class = DEMO_SIZE // 2
    demo_subset = []
    for class_label in (0, 1):
        of_class = [item for item in video_files if item[1] == class_label]
        demo_subset.extend(of_class[:per_class])
    video_files = demo_subset
    print(f"\n‚ö†Ô∏è  DEMO MODE: Processing {len(video_files)} videos")

In [None]:
# Extract multi-modal features
POSE_FEATURE_DIM = 50  # length of the pose part of every feature vector


def _fixed_length(vector, length):
    """Truncate or zero-pad a 1-D array so it has exactly `length` entries."""
    vector = np.asarray(vector, dtype=float).ravel()[:length]
    return np.pad(vector, (0, length - vector.size))


def _load_pose_features(video_name):
    """Return the column means of the video's pose CSV as a fixed-length vector.

    A vector of zeros is returned when no CSV exists for the video.
    """
    if POSE_FRAMEWORK == "deeplabcut":
        # DeepLabCut pattern: {video}DLC*.csv
        matches = sorted(glob.glob(f"{POSE_CSV_DIR}/{video_name}DLC*.csv"))
        if not matches:
            return np.zeros(POSE_FEATURE_DIM)
        # DLC has a multi-level header. index_col=0 keeps the leading
        # frame-number column out of the values, so it is not averaged into
        # the features (it would otherwise encode the video length).
        pose_df = pd.read_csv(matches[0], header=[1, 2], index_col=0)
    elif POSE_FRAMEWORK == "mmpose":
        # MMPose pattern: {video}_MMPose.csv (simple header)
        pose_csv = f"{POSE_CSV_DIR}/{video_name}_MMPose.csv"
        if not os.path.exists(pose_csv):
            return np.zeros(POSE_FEATURE_DIM)
        pose_df = pd.read_csv(pose_csv, index_col=0)
    else:
        return np.zeros(POSE_FEATURE_DIM)

    # Pad as well as truncate: a CSV with fewer columns must not produce a
    # shorter feature vector than the other videos.
    return _fixed_length(pose_df.values.mean(axis=0), POSE_FEATURE_DIM)


dataset = []

for video_path, label in tqdm(video_files, desc="Processing videos"):
    try:
        # Load every frame; release the capture even if reading fails.
        cap = cv2.VideoCapture(video_path)
        frames = []
        try:
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                frames.append(frame)
        finally:
            cap.release()

        if len(frames) < 10:
            continue

        key_frame = frames[len(frames)//2]

        # 1. YOLO: Detect largest cow in the middle frame
        bbox = detect_largest_cow(key_frame, yolo_model)

        # 2. SAM: Segment cow and express the mask as a fraction of the
        #    frame, using the frame's real size rather than assuming 640x480.
        if bbox is not None:
            mask = segment_cow(key_frame, bbox, sam_predictor)
            mask_fraction = mask.sum() / (key_frame.shape[0] * key_frame.shape[1])
        else:
            mask_fraction = 0.0

        # 3. Optical Flow on every 5th frame
        flows = extract_optical_flow(frames[::5])
        flow_magnitudes = [f['magnitude'] for f in flows]
        flow_mag_mean = np.mean(flow_magnitudes)
        flow_mag_std = np.std(flow_magnitudes)

        # 4. VideoMAE features
        videomae_feat = extract_videomae_features(frames, videomae_model, videomae_extractor)

        # 5. Pose features from the CSV of the detected framework
        video_name = Path(video_path).stem
        pose_features = _load_pose_features(video_name)

        # Combine all features
        combined_features = np.concatenate([
            pose_features,                    # Pose (50)
            videomae_feat[:100],              # VideoMAE (100)
            [flow_mag_mean, flow_mag_std],    # Optical flow (2)
            [mask_fraction]                   # Segmentation (1)
        ])

        dataset.append({
            'video': video_name,
            'label': label,
            'features': combined_features
        })

    except Exception as e:
        # Best effort: report the failing video and carry on with the rest.
        print(f"  ‚ö†Ô∏è  Failed: {Path(video_path).name} - {e}")

print(f"\n‚úÖ Processed {len(dataset)} videos")
if not dataset:
    # Fail with a clear message instead of an IndexError on dataset[0].
    raise RuntimeError("No videos were processed successfully. Cannot proceed.")
print(f"   Feature dimension: {dataset[0]['features'].shape[0]}")

## 6. Prepare Data for Training

In [None]:
# Academic Standard: Train/Test split only
# The test portion is set aside for the final evaluation; nothing in this cell
# is fitted on it.
X = np.array([sample['features'] for sample in dataset])
y = np.array([sample['label'] for sample in dataset])

# Stratified split into Train (85%) and Test (15%) with a fixed seed
split_options = dict(test_size=0.15, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Standardize features: statistics are learned from the training rows only and
# then applied to both parts.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

train_pct = len(X_train)/len(X)*100
test_pct = len(X_test)/len(X)*100
print(f"üìä Data Split (Academic Standard):")
print(f"   Train: {len(X_train)} samples ({train_pct:.1f}%)")
print(f"   Test:  {len(X_test)} samples ({test_pct:.1f}%)")
print(f"\n   ‚úÖ Test set isolated for final evaluation")
print(f"   ‚úÖ 5-Fold CV will be performed on Train set for hyperparameter tuning")

## 7. Temporal Transformer Model

In [None]:
class TemporalTransformer(nn.Module):
    """Multi-modal Temporal Transformer for lameness classification.

    A feature vector is projected to ``hidden_dim``, passed through a
    Transformer encoder as a sequence of length 1, and classified into
    two classes.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim: Width of the projection and of the encoder
            (the feed-forward layers use ``4 * hidden_dim``).
        num_heads: Attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        dropout: Dropout probability used in the encoder layers and in the
            classifier head. Defaults to 0.3, the value that was previously
            fixed, so existing callers are unaffected.
    """
    def __init__(self, input_dim=153, hidden_dim=256, num_heads=8, num_layers=4, dropout=0.3):
        super().__init__()

        self.input_proj = nn.Linear(input_dim, hidden_dim)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim*4,
            dropout=dropout,
            batch_first=True
        )

        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 2)
        )

    def forward(self, x):
        """Map a ``(batch, features)`` tensor to ``(batch, 2)`` class logits."""
        # x: (batch, features)
        x = self.input_proj(x).unsqueeze(1)  # (batch, 1, hidden)
        x = self.transformer(x)
        x = x.squeeze(1)  # (batch, hidden)
        return self.classifier(x)

# Build the network with one input unit per scaled feature column and report
# how many parameters it holds.
n_features = X_train_scaled.shape[1]
model = TemporalTransformer(input_dim=n_features).to(device)
n_params = sum(weight.numel() for weight in model.parameters())
print(f"‚úÖ Model created: {n_params} parameters")

## 8. Hyperparameter Tuning with 5-Fold CV

In [None]:
def train_model(model, X_tr, y_tr, X_val, y_val, epochs=20, lr=0.001):
    """Train `model` on one fold and report its best validation accuracy.

    Training is full-batch with Adam and cross-entropy loss, mirroring the
    final-training cell of this notebook. After every epoch the model is
    scored on the validation arrays and the weights of the best-scoring epoch
    are kept.

    Args:
        model: Network to train (already on `device`).
        X_tr, y_tr: Training features and integer labels (numpy arrays).
        X_val, y_val: Validation features and integer labels (numpy arrays).
        epochs: Number of full-batch updates.
        lr: Adam learning rate.

    Returns:
        tuple: (model restored to its best epoch, best validation accuracy
        as a Python float). Because the epoch is chosen on the validation
        fold, this accuracy is an optimistic estimate.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    X_tr_t = torch.FloatTensor(X_tr).to(device)
    y_tr_t = torch.LongTensor(y_tr).to(device)
    X_val_t = torch.FloatTensor(X_val).to(device)
    y_val_t = torch.LongTensor(y_val).to(device)

    best_acc = 0.0
    best_state = None
    for _ in range(epochs):
        model.train()
        optimizer.zero_grad()
        loss = criterion(model(X_tr_t), y_tr_t)
        loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            val_preds = model(X_val_t).argmax(dim=1)
            acc = (val_preds == y_val_t).float().mean().item()

        if best_state is None or acc > best_acc:
            best_acc = acc
            # Clone the tensors so later updates do not overwrite the snapshot.
            best_state = {k: v.detach().clone() for k, v in model.state_dict().items()}

    if best_state is not None:
        model.load_state_dict(best_state)
    return model, best_acc


# Hyperparameter configurations to test
# NOTE(review): 'dropout' is listed for every configuration but is not
# forwarded to the model constructor below, so it does not currently
# differentiate the configurations.
configs_to_test = [
    {'hidden_dim': 256, 'num_heads': 8, 'num_layers': 4, 'dropout': 0.3},  # Default
    {'hidden_dim': 512, 'num_heads': 8, 'num_layers': 6, 'dropout': 0.2},  # Larger
    {'hidden_dim': 128, 'num_heads': 4, 'num_layers': 2, 'dropout': 0.4},  # Smaller
]

print("="*60)
print("HYPERPARAMETER TUNING WITH 5-FOLD CROSS-VALIDATION")
print("="*60)
print(f"Testing {len(configs_to_test)} configurations")
print(f"Each configuration evaluated with 5-Fold CV")
print("="*60)

# Track best configuration. Start below any possible accuracy so that one
# configuration is always selected.
best_cv_score = float('-inf')
best_params = None
all_results = []

# The loop variable is deliberately NOT called `config`: that name holds the
# pipeline configuration (with its 'features' entry) that is read again when
# the results are saved.
for config_idx, hp in enumerate(configs_to_test):
    print(f"\n{'='*60}")
    print(f"Configuration {config_idx+1}/{len(configs_to_test)}")
    print(f"  hidden_dim={hp['hidden_dim']}, num_heads={hp['num_heads']}")
    print(f"  num_layers={hp['num_layers']}, dropout={hp['dropout']}")
    print(f"{'='*60}")

    # 5-Fold CV for this configuration (same folds for every configuration)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_results = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_scaled, y_train)):
        print(f"  Fold {fold+1}/5...", end=' ')

        # Fresh model for every fold so no weights carry over
        model = TemporalTransformer(
            input_dim=X_train_scaled.shape[1],
            hidden_dim=hp['hidden_dim'],
            num_heads=hp['num_heads'],
            num_layers=hp['num_layers']
        ).to(device)

        # Train on this fold
        best_model, val_acc = train_model(
            model,
            X_train_scaled[train_idx], y_train[train_idx],
            X_train_scaled[val_idx], y_train[val_idx],
            epochs=20
        )

        fold_results.append(float(val_acc))
        print(f"Accuracy: {val_acc:.4f}")

    # Calculate mean CV score (plain floats keep the results JSON-friendly)
    mean_cv_score = float(np.mean(fold_results))
    std_cv_score = float(np.std(fold_results))

    print(f"\n  üìä Configuration {config_idx+1} Results:")
    print(f"     Mean CV Accuracy: {mean_cv_score:.4f} ¬± {std_cv_score:.4f}")
    print(f"     Fold Accuracies: {[f'{acc:.4f}' for acc in fold_results]}")

    all_results.append({
        'config': hp,
        'mean_cv_score': mean_cv_score,
        'std_cv_score': std_cv_score,
        'fold_results': fold_results
    })

    # Track best configuration
    if mean_cv_score > best_cv_score:
        best_cv_score = mean_cv_score
        best_params = hp
        print(f"     ‚úÖ New best configuration!")

print(f"\n{'='*60}")
print("HYPERPARAMETER TUNING COMPLETE")
print(f"{'='*60}")
print(f"Best Configuration:")
print(f"  {best_params}")
print(f"  CV Accuracy: {best_cv_score:.4f}")
print(f"{'='*60}")

## 9. Final Model Training & Test Evaluation

In [None]:
_rule = "=" * 60
print(f"\n{_rule}")
print("FINAL MODEL TRAINING")
print(_rule)
print("Training with best hyperparameters on FULL training set")
print(f"  Parameters: {best_params}")
print(f"  Training samples: {len(X_train_scaled)}")
print(f"{_rule}\n")

# Build the final network from the architecture entries of the winning
# hyperparameter set.
_arch = {name: best_params[name] for name in ('hidden_dim', 'num_heads', 'num_layers')}
final_model = TemporalTransformer(input_dim=X_train_scaled.shape[1], **_arch).to(device)

# Full-batch training on the whole training set (no validation split)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(final_model.parameters(), lr=0.001)

X_train_t = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)
y_train_t = torch.tensor(y_train, dtype=torch.long).to(device)

# The model stays in training mode for the whole loop; each epoch is a single
# gradient step over all training samples.
final_model.train()
for epoch in range(30):
    optimizer.zero_grad()
    outputs = final_model(X_train_t)
    loss = criterion(outputs, y_train_t)
    loss.backward()
    optimizer.step()

    # Report the loss every fifth epoch
    epoch_no = epoch + 1
    if epoch_no % 5 == 0:
        print(f"  Epoch {epoch_no}/30 - Loss: {loss.item():.4f}")

print("\n‚úÖ Final model training complete")

# Evaluate on held-out test set
_bar = "=" * 60
for _line in (f"\n{_bar}", "EVALUATING ON HELD-OUT TEST SET", _bar):
    print(_line)

# Inference only: evaluation mode and no gradient tracking.
final_model.eval()
with torch.no_grad():
    X_test_t = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)
    test_outputs = final_model(X_test_t)
    test_preds = test_outputs.argmax(dim=1).cpu().numpy()
    # Probability assigned to class 1, used for the ROC computations
    test_probs = torch.softmax(test_outputs, dim=1)[:, 1].cpu().numpy()

# Calculate metrics
accuracy = accuracy_score(y_test, test_preds)
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, test_preds, average='binary'
)
cm = confusion_matrix(y_test, test_preds)
auc = roc_auc_score(y_test, test_probs)

for _line in (
    f"\n{_bar}",
    "FINAL TEST RESULTS",
    _bar,
    f"Test Accuracy:  {accuracy:.4f}",
    f"Precision:      {precision:.4f}",
    f"Recall:         {recall:.4f}",
    f"F1-Score:       {f1:.4f}",
    f"ROC-AUC:        {auc:.4f}",
    "\nConfusion Matrix:",
):
    print(_line)
print(cm)
print(_bar)

# Academic reporting: cross-validation score next to the test-set scores
for _line in (
    f"\n{_bar}",
    "ACADEMIC SUMMARY",
    _bar,
    "Cross-Validation (Training Set):",
    f"  Mean Accuracy: {best_cv_score:.4f}",
    "\nFinal Test Set Performance:",
    f"  Accuracy: {accuracy:.4f}",
    f"  F1-Score: {f1:.4f}",
    f"  ROC-AUC:  {auc:.4f}",
    _bar,
):
    print(_line)

# Save comprehensive results
# NOTE(review): `config` below is the module-level name; make sure no earlier
# cell has rebound it to something without a 'features' key.
_split_summary = {
    'train_size': len(X_train),
    'test_size': len(X_test),
    'train_percentage': 85,
    'test_percentage': 15
}
_cv_summary = {
    'mean_accuracy': float(best_cv_score),
    'all_configurations': all_results
}
_test_summary = {
    'accuracy': float(accuracy),
    'precision': float(precision),
    'recall': float(recall),
    'f1_score': float(f1),
    'roc_auc': float(auc),
    'confusion_matrix': cm.tolist()
}
metrics = {
    'methodology': '5-Fold Cross-Validation for Hyperparameter Tuning',
    'data_split': _split_summary,
    'best_hyperparameters': best_params,
    'cv_results': _cv_summary,
    'test_results': _test_summary,
    'features_used': list(config['features'].keys()),
    'timestamp': datetime.now().isoformat()
}

# Human-readable metrics file
metrics_path = f"{OUTPUT_DIR}/metrics.json"
with open(metrics_path, 'w') as f:
    json.dump(metrics, f, indent=2)

# Checkpoint: weights plus everything needed to reproduce the preprocessing
# and the reported scores. The fitted scaler object is stored as-is.
model_path = f"{OUTPUT_DIR}/models/best_model_multimodal.pth"
checkpoint = {
    'model_state_dict': final_model.state_dict(),
    'scaler': scaler,
    'config': config,
    'best_params': best_params,
    'cv_score': best_cv_score,
    'test_metrics': metrics['test_results']
}
torch.save(checkpoint, model_path)

print("\n‚úÖ Results saved to:")
print(f"   - {metrics_path}")
print(f"   - {model_path}")

## 10. Visualizations

In [None]:
# Confusion Matrix: annotated heatmap, saved to the figures folder and shown
_class_names = ['Healthy', 'Lame']
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=_class_names,
            yticklabels=_class_names,
            ax=cm_ax)
cm_ax.set_ylabel('True')
cm_ax.set_xlabel('Predicted')
cm_ax.set_title(f'Confusion Matrix (Acc: {accuracy:.2%})')
cm_fig.savefig(f"{OUTPUT_DIR}/figures/confusion_matrix.png", dpi=300, bbox_inches='tight')
plt.show()

# ROC Curve: model curve plus the dashed diagonal as a visual reference
fpr, tpr, _ = roc_curve(y_test, test_probs)
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, linewidth=2, label=f'AUC = {auc:.3f}')
roc_ax.plot([0, 1], [0, 1], 'k--')
roc_ax.set_xlabel('FPR')
roc_ax.set_ylabel('TPR')
roc_ax.set_title('ROC Curve')
roc_ax.legend()
roc_ax.grid(alpha=0.3)
roc_fig.savefig(f"{OUTPUT_DIR}/figures/roc_curve.png", dpi=300, bbox_inches='tight')
plt.show()

print("‚úÖ Complete!")