# Video Translation - GPU Accelerated Processing

This notebook provides GPU-accelerated processing for:
- **Whisper Speech-to-Text** (much faster on GPU)
- **LLM Translation** (much faster on GPU)

## Instructions
1. Enable GPU: `Runtime → Change runtime type → Hardware accelerator: GPU`
2. Run cells sequentially
3. Upload your audio file when prompted
4. Download results at the end

## Step 1: Check GPU Availability

In [None]:
import torch

# Report whether PyTorch can see a CUDA device, and explain how to enable one
# in Colab when it cannot. The status glyphs/arrows were mis-encoded (UTF-8
# read as cp1252) and are restored here.
if torch.cuda.is_available():
    gpu_props = torch.cuda.get_device_properties(0)
    print("✓ GPU is available")
    print(f"  Device: {torch.cuda.get_device_name(0)}")
    # total_memory is divided by 1024**3 so the figure is shown in GB.
    print(f"  Memory: {gpu_props.total_memory / 1024**3:.2f} GB")
    print(f"  CUDA Version: {torch.version.cuda}")
else:
    print("✗ No GPU available - using CPU")
    print("\nTo enable GPU:")
    print("  Runtime → Change runtime type → Hardware accelerator: GPU")

## Step 2: Install Dependencies

In [None]:
# !pip install -q openai-whisper transformers accelerate bitsandbytes
# NOTE(review): the install command above is commented out, so the message
# below is printed whether or not the packages are actually present.
print("✓ Dependencies installed successfully")

In [None]:
# !pip install -q flask flask-cors pyngrok openai-whisper transformers accelerate bitsandbytes


## Step 3: Whisper Speech-to-Text Function

In [None]:
def whisper_transcribe_gpu(audio_path, output_json, output_txt,
                          model_size="medium", language=None):
    """
    Transcribe audio using Whisper, on the GPU when one is available

    The full result is written to ``output_json`` and the plain transcript
    (``result['text']``) to ``output_txt``, both as UTF-8.

    Args:
        audio_path: Path to audio file
        output_json: Path to save JSON output
        output_txt: Path to save text output
        model_size: Whisper model size (tiny, base, small, medium, large)
        language: Language code (e.g., 'en', 'es') or None for auto-detect

    Returns:
        Transcription result dictionary
    """
    import json

    import torch
    import whisper

    # Fall back to the CPU instead of crashing when the runtime has no GPU;
    # the GPU-check cell already tells the user the notebook will use the CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    device_label = "GPU" if device == "cuda" else "CPU"

    print(f"Loading Whisper {model_size} on {device_label}...")
    model = whisper.load_model(model_size, device=device)

    print(f"Transcribing {audio_path}...")
    result = model.transcribe(
        audio_path,
        language=language,
        verbose=True
    )

    # Save JSON (ensure_ascii=False keeps non-ASCII text readable in the file)
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2, ensure_ascii=False)

    # Save text
    with open(output_txt, 'w', encoding='utf-8') as f:
        f.write(result['text'])

    print(f"\n✓ Transcription saved to {output_json} and {output_txt}")
    print(f"Detected language: {result.get('language', 'unknown')}")
    print(f"Number of segments: {len(result.get('segments', []))}")

    return result

print("✓ Whisper function loaded")

## Step 4: LLM Translation Function

In [None]:
def llm_translate_gpu(input_json, output_json, output_txt,
                     target_language,
                     model_name="mistralai/Mistral-7B-Instruct-v0.2",
                     visual_context=None):
    """
    Translate transcription using LLM on GPU

    Each entry of the transcription's ``segments`` list is translated with
    its own prompt. The translated segments are written to ``output_json``
    and their space-joined text to ``output_txt``, both as UTF-8.

    Args:
        input_json: Path to transcription JSON (from Whisper)
        output_json: Path to save translated JSON
        output_txt: Path to save translated text
        target_language: Target language (e.g., 'Spanish', 'French')
        model_name: HuggingFace model name
        visual_context: Optional visual context string

    Returns:
        List of translated segments, each a dict with the keys
        'start', 'end', 'original' and 'translated'
    """
    import json
    import torch
    from transformers import AutoTokenizer, AutoModelForCausalLM

    # Read the (cheap) input first so a missing or malformed file fails
    # before the expensive model load.
    with open(input_json, 'r', encoding='utf-8') as f:
        transcription = json.load(f)
    segments = transcription.get('segments', [])

    print(f"Loading {model_name} on GPU...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        load_in_8bit=True
    )

    print(f"Translating {len(segments)} segments to {target_language}...")

    # The visual-context suffix is the same for every segment.
    context_info = f"\n\nVisual Context: {visual_context}" if visual_context else ""

    translated_segments = []

    for i, segment in enumerate(segments):
        # Build prompt
        prompt = f"""[INST] You are a professional translator. Translate the following text to {target_language}.
Only provide the translation, nothing else.{context_info}

Text to translate: {segment['text']}

Translation: [/INST]"""

        # Put the inputs on the device the model was placed on
        # (device_map="auto" decides that, so it is not hard-coded here).
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        prompt_length = inputs["input_ids"].shape[1]

        # Generate translation
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )

        # generate() returns prompt + continuation. Decode only the new tokens:
        # stripping the prompt with str.replace() silently leaves it in the
        # output whenever the decoded text does not match the prompt exactly.
        generated_tokens = outputs[0][prompt_length:]
        translation = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

        translated_segments.append({
            'start': segment['start'],
            'end': segment['end'],
            'original': segment['text'],
            'translated': translation
        })

        if (i + 1) % 5 == 0:
            print(f"  Translated {i + 1}/{len(segments)} segments")

    # Save JSON
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(translated_segments, f, indent=2, ensure_ascii=False)

    # Save text
    full_text = " ".join(seg['translated'] for seg in translated_segments)
    with open(output_txt, 'w', encoding='utf-8') as f:
        f.write(full_text)

    print(f"\n✓ Translation saved to {output_json} and {output_txt}")
    return translated_segments

print("✓ Translation function loaded")

## Step 5: Upload Audio File

In [None]:
from google.colab import files

print("Please upload your audio file:")
uploaded = files.upload()

# Nothing uploaded (e.g. the dialog was dismissed): fail with a clear message
# instead of an IndexError from indexing an empty key list.
if not uploaded:
    raise RuntimeError("No file was uploaded - re-run this cell and choose an audio file")

# When several files are selected, the first key is used (as before).
audio_file = next(iter(uploaded))
print(f"\n✓ Uploaded: {audio_file}")

## Step 6: Run Whisper Transcription

In [None]:
# Configure settings
WHISPER_MODEL = "medium"  # Options: tiny, base, small, medium, large
SOURCE_LANGUAGE = None    # None for auto-detect, or 'en', 'es', 'fr', etc.

print("="*60)
print("RUNNING WHISPER SPEECH-TO-TEXT")
print("="*60)

# Writes transcription.json / transcription.txt, which the translation and
# download cells read.
transcription_result = whisper_transcribe_gpu(
    audio_path=audio_file,
    output_json="transcription.json",
    output_txt="transcription.txt",
    model_size=WHISPER_MODEL,
    language=SOURCE_LANGUAGE
)

print("\n✓ Transcription complete!")
print("\nTranscribed text preview:")

# Show at most the first 500 characters, with an ellipsis when truncated.
transcript_text = transcription_result['text']
if len(transcript_text) > 500:
    print(transcript_text[:500] + "...")
else:
    print(transcript_text)

## Step 7: Run LLM Translation

In [None]:
# Configure settings
TARGET_LANGUAGE = "Spanish"  # Change to your target language
LLM_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"  # Or use another model

print("="*60)
print(f"RUNNING LLM TRANSLATION TO {TARGET_LANGUAGE.upper()}")
print("="*60)

# Reads transcription.json and writes translation.json / translation.txt,
# which the download cell fetches.
translation_result = llm_translate_gpu(
    input_json="transcription.json",
    output_json="translation.json",
    output_txt="translation.txt",
    target_language=TARGET_LANGUAGE,
    model_name=LLM_MODEL
)

print("\n✓ Translation complete!")
print("\nTranslated text preview:")
# Preview only the first three translated segments.
preview_text = " ".join(seg['translated'] for seg in translation_result[:3])
print(preview_text)

## Step 8: Download Results

In [None]:
from google.colab import files

print("Downloading results...\n")

# Output files written by the transcription and translation cells.
for result_file in (
    "transcription.json",
    "transcription.txt",
    "translation.json",
    "translation.txt",
):
    files.download(result_file)

print("\n✓ All files downloaded!")
print("\nNext steps:")
print("1. Use these files in your local pipeline")
print("2. The translation.json contains timing information")
print("3. Feed to gTTS for speech synthesis")
print("4. Reconstruct video with new audio")

In [None]:
# SECURITY: an ngrok authtoken used to be hard-coded in this cell. Anything
# committed to a notebook must be treated as leaked - revoke that token in the
# ngrok dashboard and supply a fresh one at runtime instead.
import os
from getpass import getpass

from pyngrok import ngrok

# Prefer the NGROK_AUTHTOKEN environment variable; otherwise prompt without
# echoing the value into the notebook output.
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("Enter your ngrok authtoken: ")
if not ngrok_authtoken:
    raise RuntimeError("An ngrok authtoken is required to open the tunnel")

ngrok.set_auth_token(ngrok_authtoken)

In [None]:
# Install dependencies and start Flask server

import os
import json
from flask import Flask, request, jsonify
from flask_cors import CORS
import whisper
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from pyngrok import ngrok
import tempfile

# Flask application object; CORS(app) is applied with no extra options.
app = Flask(__name__)
CORS(app)

# Module-level model slots. They stay None until the /load_whisper and
# /load_llm endpoints assign the loaded objects.
whisper_model = llm_model = llm_tokenizer = None

@app.route('/health', methods=['GET'])
def health():
    '''Report server status, GPU visibility and which models are loaded.'''
    has_gpu = torch.cuda.is_available()
    if has_gpu:
        device_label = torch.cuda.get_device_name(0)
    else:
        device_label = "No GPU"

    # A model counts as loaded once its module-level slot is no longer None.
    status_report = {
        'status': 'healthy',
        'gpu_available': has_gpu,
        'gpu_name': device_label,
        'whisper_loaded': whisper_model is not None,
        'llm_loaded': llm_model is not None,
    }
    return jsonify(status_report)

@app.route('/load_whisper', methods=['POST'])
def load_whisper_model():
    '''Load a Whisper model into the module-level ``whisper_model`` slot.

    Reads an optional ``model_size`` field (default ``'medium'``) from the
    JSON body. Returns a 400 JSON error for an unknown model size, otherwise
    a success message naming the device the model was loaded on.
    '''
    global whisper_model

    # Tolerate a missing / non-JSON / non-object body and fall back to the
    # defaults, instead of failing on `request.json` or on `.get`.
    payload = request.get_json(silent=True)
    data = payload if isinstance(payload, dict) else {}
    model_size = data.get('model_size', 'medium')

    # whisper.load_model() also accepts a filesystem path to a checkpoint;
    # this endpoint is exposed publicly through ngrok, so only the official
    # model names are accepted here.
    if model_size not in whisper.available_models():
        return jsonify({'error': f'Unknown Whisper model size: {model_size!r}'}), 400

    # Use the GPU when present, otherwise fall back to the CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    device_label = "GPU" if device == "cuda" else "CPU"

    print(f"Loading Whisper {model_size}...")
    whisper_model = whisper.load_model(model_size, device=device)

    return jsonify({
        'status': 'success',
        'message': f'Whisper {model_size} loaded on {device_label}'
    })

@app.route('/load_llm', methods=['POST'])
def load_llm_model():
    '''Load an LLM and its tokenizer into the module-level slots.

    Reads an optional ``model_name`` field (default
    ``'mistralai/Mistral-7B-Instruct-v0.2'``) from the JSON body. Returns a
    400 JSON error when ``model_name`` is not a non-empty string.
    '''
    global llm_model, llm_tokenizer

    # Tolerate a missing / non-JSON / non-object body and fall back to the
    # defaults, instead of failing on `request.json` or on `.get`.
    payload = request.get_json(silent=True)
    data = payload if isinstance(payload, dict) else {}
    model_name = data.get('model_name', 'mistralai/Mistral-7B-Instruct-v0.2')

    if not isinstance(model_name, str) or not model_name:
        return jsonify({'error': 'model_name must be a non-empty string'}), 400

    print(f"Loading {model_name}...")
    llm_tokenizer = AutoTokenizer.from_pretrained(model_name)
    llm_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        load_in_8bit=True
    )

    return jsonify({
        'status': 'success',
        'message': f'{model_name} loaded on GPU'
    })

@app.route('/whisper/transcribe', methods=['POST'])
def transcribe():
    '''Transcribe an uploaded audio file using the loaded Whisper model.

    Expects a multipart form with an ``audio`` file and an optional
    ``language`` field; ``'auto'`` or an empty value means auto-detect.
    Returns 400 when the model is not loaded or no file was sent, 500 with
    the error text when processing fails, otherwise the Whisper result.
    '''
    if whisper_model is None:
        return jsonify({'error': 'Whisper model not loaded. Call /load_whisper first'}), 400

    # Get audio file
    if 'audio' not in request.files:
        return jsonify({'error': 'No audio file provided'}), 400

    audio_file = request.files['audio']

    # Treat a missing, empty or 'auto' language as "let Whisper detect it".
    language = request.form.get('language')
    if language in ('', 'auto'):
        language = None

    temp_path = None
    try:
        # Save to temp file. delete=False keeps the file after the handle is
        # closed so Whisper can open it by path.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_audio:
            temp_path = temp_audio.name
            audio_file.save(temp_audio)

        print(f"Transcribing {temp_path}...")
        result = whisper_model.transcribe(
            temp_path,
            language=language,
            verbose=False
        )

        return jsonify({
            'status': 'success',
            'result': result
        })

    except Exception as e:
        return jsonify({'error': str(e)}), 500

    finally:
        # Single clean-up point. The original unlinked in both the try and
        # the except branch, so a failure after the first unlink raised a
        # second error from the handler.
        if temp_path is not None and os.path.exists(temp_path):
            os.unlink(temp_path)

@app.route('/llm/translate', methods=['POST'])
def translate():
    '''Translate transcript segments using the loaded LLM.

    Expects a JSON body with ``segments`` (a list of objects carrying
    ``start``, ``end`` and ``text``), an optional ``target_language``
    (default ``'Spanish'``) and an optional ``visual_context`` string.
    Returns 400 when the model is not loaded or the segments are malformed,
    otherwise the list of translated segments.
    '''
    if llm_model is None or llm_tokenizer is None:
        return jsonify({'error': 'LLM model not loaded. Call /load_llm first'}), 400

    # Tolerate a missing / non-JSON / non-object body instead of failing on
    # `request.json` or on `.get`.
    payload = request.get_json(silent=True)
    data = payload if isinstance(payload, dict) else {}
    segments = data.get('segments', [])
    target_language = data.get('target_language', 'Spanish')
    visual_context = data.get('visual_context', None)

    # Validate up front so a malformed segment is reported as a client error
    # before any GPU time is spent.
    if not isinstance(segments, list):
        return jsonify({'error': 'segments must be a list'}), 400
    required_keys = ('start', 'end', 'text')
    for index, segment in enumerate(segments):
        if not isinstance(segment, dict) or any(key not in segment for key in required_keys):
            return jsonify({
                'error': f'Segment {index} must be an object with start, end and text'
            }), 400

    print(f"Translating {len(segments)} segments to {target_language}...")

    # The visual-context suffix is the same for every segment.
    context_info = f"\n\nVisual Context: {visual_context}" if visual_context else ""

    translated_segments = []

    for i, segment in enumerate(segments):
        # Build prompt
        prompt = f"""[INST] You are a professional translator. Translate the following text to {target_language}.
Only provide the translation, nothing else.{context_info}

Text to translate: {segment['text']}

Translation: [/INST]"""

        # Put the inputs on the device the model was placed on
        # (device_map="auto" decides that at load time).
        inputs = llm_tokenizer(prompt, return_tensors="pt").to(llm_model.device)
        prompt_length = inputs["input_ids"].shape[1]

        # Generate
        with torch.no_grad():
            outputs = llm_model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.7,
                do_sample=True,
                pad_token_id=llm_tokenizer.eos_token_id
            )

        # generate() returns prompt + continuation. Decode only the new tokens:
        # stripping the prompt with str.replace() silently leaves it in the
        # output whenever the decoded text does not match the prompt exactly.
        generated_tokens = outputs[0][prompt_length:]
        translation = llm_tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

        translated_segments.append({
            'start': segment['start'],
            'end': segment['end'],
            'original': segment['text'],
            'translated': translation
        })

        if (i + 1) % 5 == 0:
            print(f"  Translated {i + 1}/{len(segments)}")

    return jsonify({
        'status': 'success',
        'translated_segments': translated_segments
    })

# Start ngrok tunnel
print("Starting ngrok tunnel...")
public_url = ngrok.connect(5000)

# pyngrok >= 5 returns an NgrokTunnel object whose string form is a
# descriptive label rather than the bare URL; older releases returned the URL
# string itself. Extract the plain URL so the printed .env line is usable.
tunnel_url = getattr(public_url, "public_url", public_url)

banner = "=" * 60
print(f"\n{banner}")
print("🚀 Colab GPU Server is running!")
print(banner)
print(f"Public URL: {tunnel_url}")
print("\nAdd this to your local .env file:")
print(f"COLAB_API_URL={tunnel_url}")
print("USE_COLAB_GPU=True")
print(f"{banner}\n")

# Run Flask (blocks this cell) on the same port the tunnel forwards to.
app.run(port=5000)

## Optional: Quick Test with Sample Text

In [None]:
# Quick test without uploading files
# Uncomment to test the translation function directly

# import json
# 
# sample_transcription = {
#     "text": "Hello, how are you today?",
#     "language": "en",
#     "segments": [
#         {"start": 0.0, "end": 2.0, "text": "Hello, how are you today?"}
#     ]
# }
# 
# with open("test_transcription.json", "w") as f:
#     json.dump(sample_transcription, f)
# 
# test_result = llm_translate_gpu(
#     input_json="test_transcription.json",
#     output_json="test_translation.json",
#     output_txt="test_translation.txt",
#     target_language="French"
# )
# 
# print("Test translation:", test_result[0]['translated'])