# BetaMove Model Training on Google Colab

This notebook enables training YOLO and XGBoost models using Google Colab's GPU resources.

## Workflow
1. Mount Google Drive (training data storage)
2. Clone the repository
3. Install dependencies
4. Load training data from Google Drive
5. Train the model
6. Save results to Google Drive

## Prerequisites
- Training data uploaded to your Google Drive (by default under `MyDrive/BetaMove/training_data/`; the paths are configured in section 1)
- GPU runtime enabled (Runtime > Change runtime type > GPU)

## 1. Mount Google Drive

In [None]:
from google.colab import drive

# Mount Google Drive under /content/drive.
drive.mount('/content/drive')

# Drive layout used by the rest of the notebook - adjust these to your setup.
# The values stay plain strings because later cells embed them in f-strings.
DRIVE_ROOT = '/content/drive/MyDrive'
BETAMOVE_DIR = DRIVE_ROOT + '/BetaMove'  # Root folder for BetaMove data
TRAINING_DATA_DIR = BETAMOVE_DIR + '/training_data'
MODELS_DIR = BETAMOVE_DIR + '/models'

# Echo the resolved locations so a wrong folder is noticed before training.
print(f'Google Drive mounted at: {DRIVE_ROOT}')
print(f'Training data directory: {TRAINING_DATA_DIR}')
print(f'Models directory: {MODELS_DIR}')

## 2. Clone Repository and Install Dependencies

In [None]:
import os
import sys
from pathlib import Path

# Clone the repository
REPO_URL = 'https://github.com/harrisonkimdev/6156-capstone-project.git'
REPO_DIR = '/content/6156-capstone-project'

if not Path(REPO_DIR).exists():
    !git clone {REPO_URL}
    print(f'Repository cloned to {REPO_DIR}')
else:
    %cd {REPO_DIR}
    !git pull
    print('Repository updated')

%cd {REPO_DIR}
print(f'Working directory: {os.getcwd()}')

In [None]:
# Install dependencies
!pip install -q -r requirements.txt

# Add src to Python path
sys.path.insert(0, str(Path(REPO_DIR) / 'src'))

print('Dependencies installed')
print(f'Python path includes: {Path(REPO_DIR) / "src"}')

## 3. Check GPU Availability

In [None]:
import torch

# Select the training device: CUDA when PyTorch can see a GPU, CPU otherwise.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

if device == 'cpu':
    print('WARNING: No GPU available. Training will be slow.')
    print('Go to Runtime > Change runtime type > GPU')
else:
    # Report the first visible GPU and its total memory (bytes -> GB, base 10).
    gpu_name = torch.cuda.get_device_name(0)
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f'GPU available: {gpu_name}')
    print(f'GPU memory: {gpu_memory:.1f} GB')

## 4. Setup Training Data Directories

In [None]:
import os
from pathlib import Path

# Create directories if they don't exist
for dir_path in [TRAINING_DATA_DIR, MODELS_DIR]:
    Path(dir_path).mkdir(parents=True, exist_ok=True)
    print(f'Directory ready: {dir_path}')

# List existing training data.
# The directory always exists at this point (it was created above), so the
# "empty" hint has to be driven by its contents rather than by exists().
print('\nExisting training data:')
training_items = sorted(Path(TRAINING_DATA_DIR).iterdir())
if not training_items:
    print('  (empty - upload training data to Google Drive)')
for item in training_items:
    if item.is_dir():
        # Count regular files only; rglob('*') also yields sub-directories.
        file_count = sum(1 for path in item.rglob('*') if path.is_file())
        print(f'  {item.name}/ ({file_count} files)')
    else:
        print(f'  {item.name} ({item.stat().st_size / 1024:.1f} KB)')

---
# YOLO Hold Detection Training

Train a YOLO model for hold detection.

In [None]:
# Settings for the YOLO hold-detection run; later cells read them by key.
YOLO_CONFIG = dict(
    dataset_yaml=f'{TRAINING_DATA_DIR}/hold_detection/dataset.yaml',  # Path to dataset.yaml in Drive
    model='yolov8n.pt',  # Base model: yolov8n.pt, yolov8s.pt, yolov8m.pt
    epochs=100,
    batch=16,
    imgsz=640,
    project=f'{MODELS_DIR}/yolo_runs',
    name='hold_detection',
)

# Echo every setting so the run parameters are visible in the cell output.
print('YOLO Training Configuration:')
for option, setting in YOLO_CONFIG.items():
    print(f'  {option}: {setting}')

In [None]:
# Check that the dataset description file is present on Drive before training.
dataset_yaml = Path(YOLO_CONFIG['dataset_yaml'])
if dataset_yaml.exists():
    # Show the YAML so the configured paths and classes can be reviewed.
    print(f'Dataset found: {dataset_yaml}')
    print('\nDataset configuration:')
    print(dataset_yaml.read_text())
else:
    # Explain the folder layout the training cell expects.
    print(f'ERROR: Dataset not found at {dataset_yaml}')
    print('\nPlease upload your YOLO dataset to Google Drive with this structure:')
    print(f'  {TRAINING_DATA_DIR}/hold_detection/')
    for tree_line in (
        '    ├── dataset.yaml',
        '    ├── images/',
        '    │   ├── train/',
        '    │   └── val/',
        '    └── labels/',
        '        ├── train/',
        '        └── val/',
    ):
        print(tree_line)

In [None]:
# Train the YOLO hold-detection model with the settings from YOLO_CONFIG.
from ultralytics import YOLO
from datetime import datetime
import json

print('Starting YOLO training...')
print(f'Time: {datetime.now().isoformat()}')

# Start from the configured base weights.
model = YOLO(YOLO_CONFIG['model'])

# Settings forwarded to train() under the same keyword names they have in the config.
train_options = {
    key: YOLO_CONFIG[key]
    for key in ('epochs', 'batch', 'imgsz', 'project', 'name')
}

# `device` comes from the GPU availability cell.
results = model.train(
    data=YOLO_CONFIG['dataset_yaml'],
    device=device,
    verbose=True,
    **train_options,
)

print(f'\nTraining completed at: {datetime.now().isoformat()}')

In [None]:
# Save training metadata next to the weights of the run that just finished.
import json
from datetime import datetime

# Resolve the directory this run actually wrote to. Ultralytics appends a
# numeric suffix (hold_detection2, ...) when project/name already exists, so
# rebuilding the path from the config alone would point at an older run.
# Fall back to the configured location if the trainer does not expose save_dir.
configured_run_dir = Path(YOLO_CONFIG['project']) / YOLO_CONFIG['name']
trainer = getattr(model, 'trainer', None)
run_dir = Path(getattr(trainer, 'save_dir', None) or configured_run_dir)

# Get best model path
best_model_path = run_dir / 'weights' / 'best.pt'
if not best_model_path.exists():
    print(f'WARNING: best weights not found at {best_model_path}')

# Extract metrics (left empty when the training result carries no results_dict)
metrics = {}
if hasattr(results, 'results_dict'):
    metrics = results.results_dict

# Create metadata
metadata = {
    'model_type': 'yolo',
    'base_model': YOLO_CONFIG['model'],
    'hyperparameters': {
        'epochs': YOLO_CONFIG['epochs'],
        'batch': YOLO_CONFIG['batch'],
        'imgsz': YOLO_CONFIG['imgsz'],
    },
    'metrics': metrics,
    'best_model_path': str(best_model_path),
    'trained_on': 'colab',
    'device': device,
    'completed_at': datetime.now().isoformat(),
}

# Save metadata in the run directory; default=str stringifies values that are
# not natively JSON-serializable.
metadata_path = run_dir / 'model_metadata.json'
with open(metadata_path, 'w') as f:
    json.dump(metadata, f, indent=2, default=str)

print(f'Metadata saved to: {metadata_path}')
print(f'Best model saved to: {best_model_path}')
print('\nTraining Results:')
print(json.dumps(metadata, indent=2, default=str))

---
# XGBoost Pose Classification Training

Train an XGBoost model for pose classification.

In [None]:
# Settings for the XGBoost pose-classification run; later cells read them by key.
XGB_CONFIG = dict(
    features_path=f'{TRAINING_DATA_DIR}/pose_features/features.json',  # Path to features file
    task='classification',
    label_column='detection_score',
    label_threshold=0.6,
    test_size=0.2,
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    model_out=f'{MODELS_DIR}/xgb_pose/model.json',
)

# Echo every setting so the run parameters are visible in the cell output.
print('XGBoost Training Configuration:')
for option, setting in XGB_CONFIG.items():
    print(f'  {option}: {setting}')

In [None]:
# Check that the pose-features file is present on Drive and summarize it.
features_path = Path(XGB_CONFIG['features_path'])
if features_path.exists():
    import json
    with open(features_path) as f:
        data = json.load(f)
    print(f'Features file found: {features_path}')
    print(f'Number of samples: {len(data)}')
    # Use the first sample's keys as the feature count
    # (assumes a list of dict records - TODO confirm the file schema).
    if data:
        first_sample = data[0]
        print(f'Features per sample: {len(first_sample.keys())}')
else:
    print(f'ERROR: Features file not found at {features_path}')
    print('\nPlease upload your pose features to Google Drive:')
    print(f'  {TRAINING_DATA_DIR}/pose_features/features.json')

In [None]:
# Train the XGBoost model with the settings from XGB_CONFIG.
from pose_ai.ml.xgb_trainer import TrainParams, train_from_file
from datetime import datetime
import json

print('Starting XGBoost training...')
print(f'Time: {datetime.now().isoformat()}')

# Make sure the folder that will hold the model file exists.
model_out = Path(XGB_CONFIG['model_out'])
model_out.parent.mkdir(parents=True, exist_ok=True)

# Settings passed to TrainParams under the same names they have in the config;
# model_out is passed separately as a Path.
param_values = {
    key: XGB_CONFIG[key]
    for key in (
        'task',
        'label_column',
        'label_threshold',
        'test_size',
        'n_estimators',
        'learning_rate',
        'max_depth',
    )
}
params = TrainParams(model_out=model_out, **param_values)

# `features_path` comes from the features verification cell.
metrics = train_from_file(features_path, params)

print(f'\nTraining completed at: {datetime.now().isoformat()}')
print(f'Metrics: {metrics}')

In [None]:
# Write a JSON summary of the XGBoost run next to the saved model.
import json
from datetime import datetime

# Hyperparameters copied from XGB_CONFIG, in the order they are reported.
hyperparameter_keys = ('task', 'n_estimators', 'learning_rate', 'max_depth', 'test_size')

metadata = dict(
    model_type='xgboost',
    hyperparameters={key: XGB_CONFIG[key] for key in hyperparameter_keys},
    metrics=metrics,
    model_path=str(model_out),
    features_path=str(features_path),
    trained_on='colab',
    completed_at=datetime.now().isoformat(),
)

# The metadata file lives in the same folder as the model file; default=str
# stringifies values that are not natively JSON-serializable.
metadata_path = model_out.parent / 'model_metadata.json'
with open(metadata_path, 'w') as f:
    json.dump(metadata, f, indent=2, default=str)

print(f'Metadata saved to: {metadata_path}')
print(f'Model saved to: {model_out}')
print('\nTraining Results:')
print(json.dumps(metadata, indent=2, default=str))

---
# Summary

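After training, your models are saved to Google Drive under `MODELS_DIR` (by default `MyDrive/BetaMove/models`):
- YOLO: `MODELS_DIR/yolo_runs/hold_detection/weights/best.pt` (Ultralytics appends a numeric suffix to the run folder, e.g. `hold_detection2`, when the folder already exists, so look for the newest run folder after re-training)
- XGBoost: `MODELS_DIR/xgb_pose/model.json`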

To use these models in the BetaMove application:
1. Download the model files from Google Drive
2. Place them in the appropriate `models/` directory
3. Update the model paths in the application configuration

In [None]:
# Print every file stored under each model folder in Google Drive, with its size.
print('Trained models in Google Drive:')
print('=' * 50)

models_path = Path(MODELS_DIR)
if not models_path.exists():
    print('No models found')
else:
    # Only sub-directories of MODELS_DIR are treated as model folders.
    model_dirs = (entry for entry in models_path.iterdir() if entry.is_dir())
    for model_dir in model_dirs:
        print(f'\n{model_dir.name}/')
        for item in model_dir.rglob('*'):
            if not item.is_file():
                continue
            # Show the path relative to the model folder and the size in KB.
            rel_path = item.relative_to(model_dir)
            size_kb = item.stat().st_size / 1024
            print(f'  {rel_path} ({size_kb:.1f} KB)')