## Step 1️⃣: Mount Google Drive & Setup Cache

In [None]:
from google.colab import drive
import os
from pathlib import Path

# Attach Google Drive to the Colab filesystem.
drive.mount('/content/drive')
print("‚úÖ Google Drive mounted!")

# The model cache lives on Drive so that downloaded models persist between
# sessions. Missing parent folders are created; an existing folder is reused.
model_cache = Path('/content/drive/MyDrive/Stock-Price-Extractor-Cache')
model_cache.mkdir(parents=True, exist_ok=True)

print(f"‚úÖ Model cache directory: {model_cache}")

# Add up the size of every regular file currently stored under the cache folder.
cached_bytes = 0
for cache_entry in model_cache.rglob('*'):
    if cache_entry.is_file():
        cached_bytes += cache_entry.stat().st_size
cache_size_gb = cached_bytes / (1024**3)

print(f"   üìä Cache size: {cache_size_gb:.2f} GB")

## Step 2️⃣: Install Dependencies

In [None]:
# Install required packages
!pip install -q transformers torch librosa soundfile accelerate huggingface-hub vllm

print("‚úÖ Dependencies installed!")

## Step 3️⃣: Clone Repository & Set Model Cache

In [None]:
import os
from pathlib import Path

# Clone the project
!git clone https://github.com/hasipfaruk/Stock-Price-Extractor.git

# Navigate to project
os.chdir("Stock-Price-Extractor")
print("‚úÖ Project cloned!")
print(f"üìÅ Current directory: {os.getcwd()}")

# Set HuggingFace cache to Google Drive for persistence
model_cache = '/content/drive/MyDrive/Stock-Price-Extractor-Cache'
os.environ['HF_HOME'] = model_cache
os.environ['TRANSFORMERS_CACHE'] = model_cache

print(f"\n‚úÖ Model cache set to Google Drive:")
print(f"   üìÅ {model_cache}")
print(f"\nüí° Models will persist between sessions!")
print(f"   First run: Download & save to Google Drive")
print(f"   Next runs: Reuse from Google Drive (instant!)")

## Step 4️⃣: HuggingFace Authentication

In [None]:
import os

from huggingface_hub import login

# Set your token (replace with yours), or leave the placeholder untouched and
# provide the token through the HF_TOKEN environment variable instead - that
# way the secret never has to be saved inside the notebook.
HF_TOKEN = "hf_YOUR_TOKEN_HERE"

# Fall back to the environment only when nothing was pasted above.
if HF_TOKEN == "hf_YOUR_TOKEN_HERE":
    HF_TOKEN = os.environ.get("HF_TOKEN") or HF_TOKEN

if HF_TOKEN == "hf_YOUR_TOKEN_HERE":
    # No token from either source: explain how to get one and skip login.
    print("‚ö†Ô∏è IMPORTANT: Replace with your actual HuggingFace token!")
    print("üìñ Get token from: https://huggingface.co/settings/tokens")
    print("1. Go to the link above")
    print("2. Create new token (read access is fine)")
    print("3. Copy and paste in the line above")
    print("\n‚úÖ Mistral models are open - no license acceptance needed!")
    print("   Using: mistralai/Mistral-7B-Instruct-v0.2")
else:
    login(token=HF_TOKEN)
    print("‚úÖ Authenticated with HuggingFace!")
    print("   Using Mistral-7B-Instruct-v0.2 (optimized for <6s processing)")

## Step 5️⃣: Verify GPU & Check Cached Models

In [None]:
import torch
from pathlib import Path

def cached_model_sizes(cache_root):
    """Return ``{model_name: total_bytes}`` for safetensors weights under ``cache_root``.

    Looks for files matching ``models--*/snapshots/*/model*.safetensors`` and
    sums their sizes per model, so a model whose weights are split across
    several matching files is reported once with its combined size instead of
    once per file.

    Args:
        cache_root: ``pathlib.Path`` of the cache directory to scan.

    Returns:
        dict mapping the model name (the ``models--org--name`` folder rendered
        as ``org/name``) to the total size in bytes of its matching files.
        Empty when nothing matches.
    """
    sizes = {}
    for weight_file in cache_root.glob('models--*/snapshots/*/model*.safetensors'):
        # <cache>/models--org--name/snapshots/<rev>/<file> -> three levels up
        model_name = weight_file.parent.parent.parent.name.replace('models--', '').replace('--', '/')
        sizes[model_name] = sizes.get(model_name, 0) + weight_file.stat().st_size
    return sizes


print("=" * 70)
print("GPU STATUS")
print("=" * 70)

gpu_available = torch.cuda.is_available()
print(f"‚úÖ GPU Available: {gpu_available}")

if gpu_available:
    print(f"üìä GPU Device: {torch.cuda.get_device_name(0)}")
    print(f"üíæ GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    print("\n‚ú® GPU enabled - processing optimized for <6s per file!")
    print("   ‚ö° Transcription: <2s (Distil-Whisper)")
    print("   ‚ö° LLM Extraction: <3s (Mistral-7B)")
else:
    print("\n‚ö†Ô∏è GPU not available - will use CPU (slower)")
    print("üí° To enable GPU: Runtime ‚Üí Change runtime type ‚Üí GPU")
    print("   ‚ö†Ô∏è CPU mode will be slower than <6s target")

# Check cached models in Google Drive
print("\n" + "=" * 70)
print("CACHED MODELS (Google Drive)")
print("=" * 70)

cache_dir = Path('/content/drive/MyDrive/Stock-Price-Extractor-Cache')
if cache_dir.exists():
    # One entry per model, with the sizes of all its weight files combined.
    models = cached_model_sizes(cache_dir)
    if models:
        print(f"\n‚úÖ Found {len(models)} cached models:")
        for model_name, size_bytes in sorted(models.items()):
            size_mb = size_bytes / (1024**2)
            print(f"  üì¶ {model_name}: {size_mb:.0f} MB")
        total_size = sum(models.values())
        print(f"\nüìä Total cached: {total_size / (1024**3):.2f} GB")
    else:
        print("\nüÜï No cached models yet. Will download on first run.")
        print("   First run: ~10-15 minutes (includes model downloads)")
        print("   - Distil-Whisper: ~1GB (fast transcription)")
        print("   - Mistral-7B: ~14GB (fast extraction)")
        print("   Next runs: Instant (models cached)")
        print("   ‚ö° Optimized for <6s processing per file!")
else:
    print("\nüÜï Cache directory created. Ready for downloads!")

## Step 6️⃣: Upload Audio Files

In [None]:
from google.colab import files
import os

# Make sure the folder that will receive the uploads exists.
os.makedirs('uploaded_audio', exist_ok=True)

# Instructions shown before the upload prompt.
upload_instructions = [
    "üìÅ Upload audio files:",
    "1. Click 'Choose Files'",
    "2. Select multiple audio files (WAV, MP3, FLAC, M4A)",
    "3. Wait for upload to complete\n",
]
print("\n".join(upload_instructions))

uploaded = files.upload()

print(f"\n‚úÖ {len(uploaded)} files uploaded:")
# Move each uploaded file from the working directory into uploaded_audio/
# and report its size in MB.
for uploaded_name in uploaded:
    destination = f'uploaded_audio/{uploaded_name}'
    os.rename(uploaded_name, destination)
    size_mb = os.path.getsize(destination) / (1024 * 1024)
    print(f"  üìÑ {uploaded_name} ({size_mb:.1f} MB)")

## Step 7️⃣: Upload Extraction Prompt

In [None]:
from google.colab import files

# Upload your prompt file
print("üìù Upload extraction prompt file:")
print("Click 'Choose Files' and select your prompt.txt file\n")

prompt_files = files.upload()

if prompt_files:
    # Only the first uploaded file becomes the prompt; say so when more than
    # one was selected instead of silently ignoring the rest.
    uploaded_names = list(prompt_files.keys())
    prompt_filename = uploaded_names[0]
    if len(uploaded_names) > 1:
        print(f"Note: {len(uploaded_names)} files uploaded; only '{prompt_filename}' is used as the prompt.")
    os.rename(prompt_filename, 'colab_prompt.txt')
    print(f"‚úÖ Prompt uploaded: {prompt_filename}")
    
    # Show first 200 chars (explicit UTF-8 so the result does not depend on
    # the platform's default text encoding)
    with open('colab_prompt.txt', 'r', encoding='utf-8') as f:
        content = f.read()
    print(f"\nüìñ Prompt preview ({len(content)} chars):")
    print(content[:200] + "...\n" if len(content) > 200 else content)
else:
    print("‚ö†Ô∏è No prompt file uploaded. Using default prompt.")
    # Create default prompt
    default_prompt = """Extract stock price information from the transcript.

Return JSON with these fields:
- index_name: Stock index name (e.g., \"S&P 500\")
- price: Current price
- change: Change in points
- change_percent: Percent change

Return ONLY valid JSON, no explanation."""
    
    with open('colab_prompt.txt', 'w', encoding='utf-8') as f:
        f.write(default_prompt)
    print("‚úÖ Default prompt created")

## Step 8️⃣: Import Functions

In [None]:
import sys
from pathlib import Path

# Put the current working directory at the front of the module search path
# so the `app` package below can be imported from it.
project_path = Path.cwd()
search_entry = str(project_path)
sys.path.insert(0, search_entry)

# Pull in the two project functions used by the batch-processing step.
from app.models.transcribe import transcribe
from app.models.llm_extract import extract_with_long_prompt

# Reaching this point means both imports succeeded.
print("‚úÖ Functions imported successfully!")
print("  üìç transcribe() - ready")
print("  üìç extract_with_long_prompt() - ready")

## Step 9️⃣: Batch Processing (with Timing & Drive Cache)

In [None]:
import json
from pathlib import Path
import torch
import gc
import time

# Find all audio files: everything in uploaded_audio/ with a supported
# extension, in sorted order so runs are processed deterministically.
AUDIO_EXTENSIONS = {'.wav', '.mp3', '.flac', '.m4a', '.ogg'}
audio_files = sorted(
    f for f in Path('uploaded_audio').glob('*')
    if f.suffix.lower() in AUDIO_EXTENSIONS
)

print(f"üìÅ Found {len(audio_files)} audio files\n")

if not audio_files:
    print("‚ùå No audio files to process")
else:
    # filename -> {"status", "data" | "error", "timing"}
    all_results = {}
    batch_start_time = time.time()
    
    print("üîÑ PROCESSING AUDIO FILES (Optimized for <6s per file)")
    print("   ‚ö° Using Distil-Whisper + Mistral-7B")
    print("   üíæ Models cached in Google Drive\n")
    
    for i, audio_file in enumerate(audio_files, 1):
        filename = audio_file.name
        file_start_time = time.time()
        print(f"[{i}/{len(audio_files)}] Processing {filename}...", end=' ', flush=True)
        
        try:
            # Clear memory
            torch.cuda.empty_cache()
            gc.collect()
            
            # Transcribe
            trans_start = time.time()
            result = transcribe(str(audio_file))
            trans_duration = time.time() - trans_start
            
            # transcribe() may return either a dict (text under 'result',
            # optionally its own 'time') or the transcript itself.
            if isinstance(result, dict):
                transcript = result.get('result')
                reported_time = result.get('time')
            else:
                transcript = result
                reported_time = None
            # Use the self-reported time only when it is a real number;
            # otherwise fall back to the wall-clock measurement so a missing
            # or None value cannot make round() fail below.
            if isinstance(reported_time, (int, float)) and not isinstance(reported_time, bool):
                trans_time = reported_time
            else:
                trans_time = trans_duration
            
            # Extract
            extract_start = time.time()
            extraction = extract_with_long_prompt(transcript, prompt_file='colab_prompt.txt')
            extract_duration = time.time() - extract_start
            
            file_total = time.time() - file_start_time
            
            all_results[filename] = {
                "status": "success",
                "data": extraction,
                "timing": {
                    "transcription_s": round(trans_time, 3),
                    "extraction_s": round(extract_duration, 3),
                    "total_s": round(file_total, 3)
                }
            }
            # Check if within target
            if file_total < 6.0:
                print(f"‚úÖ ({file_total:.2f}s) ‚ö°")
            else:
                print(f"‚úÖ ({file_total:.2f}s) ‚ö†Ô∏è (target: <6s)")
            
        except Exception as e:
            # Record the failure and keep going with the remaining files.
            file_total = time.time() - file_start_time
            all_results[filename] = {
                "status": "error",
                "error": str(e)[:100],
                "timing": {
                    "total_s": round(file_total, 3)
                }
            }
            print(f"‚ùå ({file_total:.2f}s)")
    
    # Save results. default=str keeps one non-JSON-serialisable value in an
    # extraction from aborting the save after all files were processed.
    with open('batch_results.json', 'w', encoding='utf-8') as f:
        json.dump(all_results, f, indent=2, default=str)
    
    batch_total_time = time.time() - batch_start_time
    print(f"\n‚úÖ All results saved to: batch_results.json")
    
    # Summary with timing
    successful = [(name, r) for name, r in all_results.items() if r["status"] == "success"]
    success = len(successful)
    failed = len(all_results) - success
    # max(..., 1) avoids dividing by zero when every file failed.
    avg_time = sum(r.get("timing", {}).get("total_s", 0) for _, r in successful) / max(success, 1)
    
    print(f"\nüìä Summary:")
    print(f"  ‚úÖ Successful: {success}/{len(all_results)}")
    print(f"  ‚ùå Failed: {failed}/{len(all_results)}")
    print(f"  ‚è±Ô∏è Batch Total: {batch_total_time:.2f}s")
    print(f"  ‚è±Ô∏è Average per file: {avg_time:.2f}s")
    
    # Performance check
    if avg_time < 6.0:
        print(f"  ‚ö° Performance: EXCELLENT (target: <6s)")
    elif avg_time < 8.0:
        print(f"  ‚ö†Ô∏è Performance: GOOD (target: <6s, actual: {avg_time:.2f}s)")
    else:
        print(f"  ‚ö†Ô∏è Performance: SLOW (target: <6s, actual: {avg_time:.2f}s)")
        print(f"     üí° Check GPU is enabled and has enough memory")
    
    print(f"\nüíæ Google Drive Cache Info:")
    cache_dir = Path('/content/drive/MyDrive/Stock-Price-Extractor-Cache')
    if cache_dir.exists():
        total_cache = sum(f.stat().st_size for f in cache_dir.rglob('*') if f.is_file())
        print(f"  üìÅ Cache Directory: {cache_dir}")
        print(f"  üìä Total Cached: {total_cache / (1024**3):.2f} GB")
        print(f"  ‚úÖ Persisted across sessions!")
    
    # Show sample results with timing: the first three *successful* files
    # (filtering before slicing, so early failures do not hide the samples).
    if successful:
        print(f"\nüìà Sample Results:")
        for filename, result in successful[:3]:
            print(f"\n  {filename}:")
            data = result["data"]
            timing = result.get("timing", {})
            if isinstance(data, dict):
                print(f"    Index: {data.get('index_name')}")
                print(f"    Price: {data.get('price')}")
                print(f"    Change: {data.get('change')} ({data.get('change_percent')})")
            else:
                # Extraction did not return a dict; show it as-is rather
                # than crashing on .get().
                print(f"    Raw output: {data!r}")
            print(f"    üé§ Transcription: {timing.get('transcription_s', 0):.3f}s")
            print(f"    ü§ñ Extraction: {timing.get('extraction_s', 0):.3f}s")
            print(f"    ‚è±Ô∏è Total: {timing.get('total_s', 0):.3f}s")

## Step 🔟: Download Results

In [None]:
from google.colab import files
from pathlib import Path

print("üì• Download your results:\n")

# Download batch results if exists; otherwise say so instead of claiming
# that files are ready when nothing was produced.
if Path('batch_results.json').exists():
    print("1. Downloading batch_results.json (with timing data)...")
    files.download('batch_results.json')
    print("   ‚úÖ Downloaded!")
    print("\n‚úÖ All files ready for download!")
else:
    print("batch_results.json not found - run the batch processing step first.")

print("\nCheck the 'Files' tab (left panel) to download any files.")
print("\nüìä Results include:")
print("  - Index name, price, change, change%")
print("  - üé§ Transcription time (target: <2s)")
print("  - ü§ñ Extraction time (target: <3s)")
print("  - ‚è±Ô∏è Total time per file (target: <6s)")

print("\n‚ö° Speed Optimizations:")
print("  - Distil-Whisper: Fast transcription (~1-1.5s)")
print("  - Mistral-7B: Fast extraction (~2-3s)")
print("  - Optimized settings: Zero temperature, reduced tokens")

print("\nüíæ Google Drive Model Cache:")
print("  - Models saved in: /MyDrive/Stock-Price-Extractor-Cache")
print("  - Reused in next Colab sessions (no re-download!)")
print("  - Saves 10-15 minutes on next run! üöÄ")

## 🎉 Complete!

Your stock price extraction is done! 🚀

**✨ What's Special:**
1. ✅ Stock prices extracted from audio using **Mistral-7B**
2. ⚡ **Optimized for <6s processing per file**
3. 🎤 Transcription: <2s (Distil-Whisper)
4. 🤖 LLM extraction: <3s (Mistral-7B)
5. ⏱️ Total processing time tracked for each file
6. 💾 **Models saved to Google Drive (persistent!)**

**Performance:**
- **Target**: <6 seconds per file
- **Transcription**: Distil-Whisper (fast, accurate)
- **Extraction**: Mistral-7B (fast, open-source)
- **GPU Required**: For <6s target (T4 or better)

**Next Session:**
- Run this notebook again with new audio files
- Models are already in Google Drive
- **Skip model downloads (saves 10-15 minutes!)**
- Just upload new audio → Process → Download results

**Next steps:**
1. Download `batch_results.json` from the Files panel
2. Check timing data - should be <6s per file
3. Import results into your spreadsheet or database

**For more info:**
- See `README.md` for general documentation
- See `SPEED_OPTIMIZATIONS.md` for speed optimization details
- See `USAGE.md` for usage examples