#  Stock Price Extractor - Kaggle Notebook

Extract stock price information from audio files using LLM-based extraction.

**Features:**
-  Whisper: Audio transcription to text
-  Mistral-3B LLM: Financial data extraction (smallest, fastest model)
-  GPU T4: Fast processing (<5 seconds per audio file)
-  JSON output: Structured results ready to download

**Prerequisites:**
-  GPU T4 enabled (Settings → Accelerator → GPU T4)
-  HuggingFace account (Mistral models are open, no license acceptance needed)
-  HuggingFace API token created

**Performance:**
-  Mistral-3B: Smallest model (~6GB memory, faster than 7B)
-  Target: <5 seconds per file
-  Transcription: <2s, Extraction: <2s

## Step 1️⃣: Verify GPU and Find Files

In [None]:
import sys
import torch
from pathlib import Path
import json

print("=" * 70)
print(" KAGGLE STOCK PRICE EXTRACTOR")
print("=" * 70)

# 1. Verify GPU
# Report whether CUDA is visible to torch and, if so, which device and how
# much memory it has.
print("\n✓ GPU Check:")
gpu_available = torch.cuda.is_available()
print(f"  GPU Available: {gpu_available}")
if gpu_available:
    print(f"  Device: {torch.cuda.get_device_name(0)}")
    print(f"  Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    print("    GPU not enabled - enable in Settings → Accelerator → GPU T4")

# 2. Find uploaded files
# Both searches are recursive over every dataset mounted under /kaggle/input.
# NOTE(review): any .txt file is treated as a prompt candidate and the first
# one in sorted order is used below - confirm the datasets contain no
# unrelated .txt files.
print("\n✓ Finding uploaded files:")
input_root = Path('/kaggle/input')
audio_files = sorted(input_root.glob('**/*.wav'))
prompt_files = sorted(input_root.glob('**/*.txt'))

print(f"  Audio files found: {len(audio_files)}")
for f in audio_files[:3]:
    size_mb = f.stat().st_size / (1024 * 1024)
    print(f"    • {f.name} ({size_mb:.1f} MB)")

print(f"  Prompt files found: {len(prompt_files)}")
for f in prompt_files[:2]:
    print(f"    • {f.name}")

# 3. Set paths
# Later cells read `audio_path` / `prompt_path`; both are None unless at least
# one audio file AND one prompt file were found.
if audio_files and prompt_files:
    audio_path = str(audio_files[0])
    prompt_path = str(prompt_files[0])
    print("\n Ready to process!")
    print(f"  Audio: {Path(audio_path).name}")
    print(f"  Prompt: {Path(prompt_path).name}")
else:
    print("\n Missing files - cannot proceed")
    # Tell the user which kind of file is missing instead of a generic message
    missing_kinds = [
        label
        for label, found in (("audio (*.wav)", audio_files), ("prompt (*.txt)", prompt_files))
        if not found
    ]
    print(f"  Not found under /kaggle/input: {', '.join(missing_kinds)}")
    audio_path = None
    prompt_path = None

## Step 2️⃣: Configure HuggingFace Authentication

In [None]:
from huggingface_hub import login

import os

#  REPLACE WITH YOUR TOKEN
# Safer alternative: leave the placeholder untouched and provide the token via
# the HF_TOKEN environment variable, so the secret is never stored in the
# notebook source. A token typed in here still takes precedence.
HF_TOKEN = "hf_YOUR_TOKEN_HERE"
if HF_TOKEN == "hf_YOUR_TOKEN_HERE":
    # Blank / whitespace-only environment values fall back to the placeholder
    HF_TOKEN = os.environ.get("HF_TOKEN", "").strip() or "hf_YOUR_TOKEN_HERE"

print("\n" + "=" * 70)
print(" HUGGINGFACE AUTHENTICATION")
print("=" * 70)

if HF_TOKEN == "hf_YOUR_TOKEN_HERE":
    # Neither the literal above nor the environment supplied a token
    print("\n  Token not configured!")
    print("\n Setup Instructions:")
    print("  1. Go to: https://huggingface.co/mistralai/Mistral-3B-Instruct-v0.1")
    print("  2. Mistral models are open - no license acceptance needed!")
    print("  3. Get your token: https://huggingface.co/settings/tokens")
    print("  4. Replace 'hf_YOUR_TOKEN_HERE' above with your actual token")
    print("     (or set the HF_TOKEN environment variable instead)")
    print("  5. Re-run this cell\n")
else:
    try:
        login(token=HF_TOKEN)
        print("\n✅ Successfully logged into HuggingFace")
        print("   Using Mistral-3B-Instruct-v0.1 (smallest, fastest model)")
        print("   💡 3B model: ~6GB memory, faster than 7B!")
    except Exception as e:
        # Notebook boundary: report the failure and let the user fix the token
        print(f"\n❌ Authentication failed: {e}")
        print("  Check that your token is correct")

## Step 3️⃣: Load Models (Transcribe & Extract)

In [None]:
import sys
import os
import shutil
from pathlib import Path

print("\n" + "=" * 70)
print(" LOADING MODELS")
print("=" * 70)

# Locate the uploaded `app` package: the first transcribe.py under
# /kaggle/input whose parent folders are .../app/models/ identifies it.
root_dir = Path("/kaggle/input")

print("\n Searching for app/models/transcribe.py under /kaggle/input...")
app_source = next(
    (
        candidate.parent.parent
        for candidate in root_dir.rglob("transcribe.py")
        if candidate.name == "transcribe.py"
        and candidate.parent.name == "models"
        and candidate.parent.parent.name == "app"
    ),
    None,
)

if app_source is None:
    print(" app folder not found!")
    raise FileNotFoundError("app/models/transcribe.py not found under /kaggle/input. Check your dataset structure.")
print(f" Found app folder at: {app_source}")

# The directory that CONTAINS `app` must be importable for `from app.models...`
sys.path.insert(0, str(app_source.parent))

# Keep a copy of the package in the working directory; the copy is only made
# (and /kaggle/working only added to sys.path) when no copy exists yet.
app_working = Path('/kaggle/working') / 'app'
if not app_working.exists():
    try:
        shutil.copytree(app_source, app_working)
        sys.path.insert(0, '/kaggle/working')
    except Exception as copy_error:
        # Best effort: the dataset location added above is still on sys.path
        print(f"  Copy failed: {copy_error}")

# Bring the two entry points used by the following cells into scope
print("\n Importing functions...")
try:
    from app.models.transcribe import transcribe
    from app.models.llm_extract import extract_with_long_prompt
except ImportError as import_error:
    print(f" Import failed: {import_error}")
    print(f"   sys.path[0] = {sys.path[0]}")
    raise
else:
    print(" transcribe() - ready")
    print(" extract_with_long_prompt() - ready")

## Step 4️⃣: Transcribe Audio to Text

In [None]:
import gc
import time
import os

# Release cached GPU memory and run a garbage-collection pass first
torch.cuda.empty_cache()
gc.collect()

print("\n" + "=" * 70)
print(" TRANSCRIPTION")
print("=" * 70)


def _show_transcript(text):
    """Print the character count of `text` and its first 300 characters."""
    full_text = str(text)
    suffix = "..." if len(full_text) > 300 else ""
    print(f" Transcript ({len(full_text)} chars):\n{full_text[:300]}{suffix}\n")


if audio_path:
    print(f"\n File: {Path(audio_path).name}")

    try:
        start = time.time()
        print(" Processing... (5-10 seconds)")

        # Workaround for a precision error: keep torch's default dtype at float32
        torch.set_default_dtype(torch.float32)
        # NOTE(review): presumably read by the app code to pick its dtype -
        # nothing in this notebook consumes TORCH_DTYPE; confirm.
        os.environ['TORCH_DTYPE'] = 'float32'

        result = transcribe(audio_path)

        # transcribe() may return a dict (text under 'result') or the text itself;
        # a dict without that key ends up stringified.
        transcript = result.get('result', result) if isinstance(result, dict) else result
        if isinstance(transcript, dict):
            transcript = str(transcript)

        elapsed = time.time() - start
        print(f" Done in {elapsed:.1f}s\n")

        if transcript and len(str(transcript)) > 0:
            _show_transcript(transcript)
        else:
            print("  Empty transcript - audio may be silent or corrupted")

    except RuntimeError as e:
        error_text = str(e)
        if any(marker in error_text for marker in ("float", "Half", "dtype")):
            # The message looks like a dtype mismatch: retry with a Whisper
            # pipeline built directly in float32, bypassing transcribe().
            print("  GPU precision issue detected")
            print("   Attempting direct transcription...\n")

            try:
                import librosa
                from transformers import pipeline

                # Load the audio resampled to 16 kHz
                audio_data, sr = librosa.load(audio_path, sr=16000)

                pipe = pipeline(
                    "automatic-speech-recognition",
                    model="openai/whisper-small",
                    device=0 if torch.cuda.is_available() else -1,
                    torch_dtype=torch.float32
                )

                result = pipe(audio_data)
                transcript = result.get('text', str(result))

                # Elapsed time still counts from the first attempt's start
                elapsed = time.time() - start
                print(f" Transcription complete in {elapsed:.1f}s\n")

                _show_transcript(transcript)

            except Exception as direct_e:
                print(f" Direct transcription also failed: {direct_e}")
                print("\n Audio file may be corrupted or invalid")
                print("   Please check the audio file and try again")
                transcript = None
        else:
            print(f" Error: {e}")
            transcript = None

    except Exception as e:
        print(f" Error: {e}")
        print(f"   Type: {type(e).__name__}")
        transcript = None
else:
    # Step 1 found no usable audio/prompt pair
    print(" No audio file available - skipping transcription")
    transcript = None

## Step 5️⃣: Extract Stock Price Data with LLM

In [None]:
print("\n" + "=" * 70)
print(" LLM EXTRACTION")
print("=" * 70)

# Reset first: `result` was also used for the raw transcription output, and a
# skipped or failed extraction must leave it as None for the save step.
result = None

if not transcript:
    print("\n No transcript available - cannot extract")
elif not prompt_path:
    print("\n No prompt file available - cannot extract")
else:
    print(f"\n Prompt: {Path(prompt_path).name}")

    try:
        start = time.time()
        print(" Extracting... (2-5 seconds with GPU)")

        # Extract using LLM: the transcript plus the prompt file go to the app helper
        result = extract_with_long_prompt(transcript, prompt_file=prompt_path)

        elapsed = time.time() - start
        print(f" Done in {elapsed:.1f}s\n")

        # Display results
        if result:
            print(" Extracted Data:")
            print(json.dumps(result, indent=2))
        else:
            print("  No data extracted")

    except Exception as e:
        print(f" Error: {e}")
        print("\n Troubleshooting:")
        print("  • Check HuggingFace token is valid")
        print("  • Verify Mistral model can be accessed")
        print("  • Check internet connection")
        # NOTE(review): figure aligned with the notebook's own statements
        # (Mistral-3B, ~6GB); the previous text quoted the 7B requirement.
        # Confirm against the model actually loaded by app.models.llm_extract.
        print("  • Ensure GPU has enough memory (Mistral-3B needs ~6GB)")

# Clear GPU memory
torch.cuda.empty_cache()
gc.collect()

## Step 6️⃣: Save and Download Results

In [None]:
print("\n" + "=" * 70)
print(" SAVING RESULTS")
print("=" * 70)


def _write_json(target, payload):
    """Serialize `payload` to the file `target` as JSON with a 2-space indent."""
    with open(target, 'w') as handle:
        json.dump(payload, handle, indent=2)


output_dir = Path('/kaggle/working')
output_dir.mkdir(parents=True, exist_ok=True)

if not result:
    print("\n  No results to save (extraction may have failed)")

    # Record which pipeline stages produced something, to help debugging
    error_info = {
        'audio_found': audio_path is not None,
        'prompt_found': prompt_path is not None,
        'transcript_generated': transcript is not None,
        'gpu_available': torch.cuda.is_available()
    }

    error_file = output_dir / 'error_info.json'
    _write_json(error_file, error_info)

    print("   Saved debug info to: error_info.json")
else:
    # Extraction output
    results_file = output_dir / 'stock_price_extraction.json'
    _write_json(results_file, result)

    print("\n Results saved: stock_price_extraction.json")
    print(f"   Size: {results_file.stat().st_size} bytes")

    # Run metadata: input file names and the device that was used
    metadata = {
        'audio_file': Path(audio_path).name if audio_path else None,
        'prompt_file': Path(prompt_path).name if prompt_path else None,
        'gpu_device': torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU',
        'extraction_status': 'success'
    }

    metadata_file = output_dir / 'metadata.json'
    _write_json(metadata_file, metadata)

    print(" Metadata saved: metadata.json")

    print("\n Download your files:")
    print("  1. Click 'Files' (right panel)")
    print("  2. Download 'stock_price_extraction.json'")
    print("  3. Download 'metadata.json'")

print("\n" + "=" * 70)
print(" COMPLETE")
print("=" * 70)

## Step 7️⃣: Process Multiple Audio Files (Optional)

Process all audio files in the dataset and save individual results.

In [None]:
print("\n" + "=" * 70)
print(" BATCH PROCESSING")
print("=" * 70)


def normalize_transcript(raw):
    """Return the transcript carried by a transcribe() result.

    Applies the same rules as the single-file transcription step: a dict
    yields its 'result' entry (or the dict itself when that key is absent),
    and a value that is still a dict afterwards is converted with str().
    Any other value is returned unchanged.
    """
    text = raw.get('result', raw) if isinstance(raw, dict) else raw
    if isinstance(text, dict):
        text = str(text)
    return text


# Find all audio files
input_root = Path('/kaggle/input')
all_audio_files = sorted(input_root.glob('**/*.wav'))

if not all_audio_files:
    print("\n✓ No audio files found - nothing to process")
elif len(all_audio_files) == 1:
    print("\n✓ Only 1 audio file - already processed above")
elif not prompt_path:
    # Without a prompt file every extraction call would fail; stop up front
    print("\n No prompt file available - cannot extract")
else:
    print(f"\n✓ Found {len(all_audio_files)} audio files")
    print("  Processing all of them...\n")

    all_results = {}

    for i, audio_file in enumerate(all_audio_files, 1):
        filename = audio_file.name
        # Results are keyed by file name; a same-named file from another folder
        # is keyed by its full path so it cannot overwrite the earlier entry.
        result_key = filename if filename not in all_results else str(audio_file)
        print(f"[{i}/{len(all_audio_files)}] {filename}...", end=' ', flush=True)

        try:
            # Clear memory
            torch.cuda.empty_cache()
            gc.collect()

            # Transcribe
            trans = normalize_transcript(transcribe(str(audio_file)))

            if not trans:
                # Nothing to send to the LLM - record it instead of extracting
                all_results[result_key] = {
                    'status': 'error',
                    'error': 'Empty transcript - audio may be silent or corrupted'
                }
                print(" (empty transcript)")
                continue

            # Extract
            extract_result = extract_with_long_prompt(trans, prompt_file=prompt_path)

            all_results[result_key] = {
                'status': 'success',
                'data': extract_result
            }
            print("")

        except Exception as e:
            # One bad file must not abort the batch; keep the message for review
            all_results[result_key] = {
                'status': 'error',
                'error': str(e)
            }
            print(f" ({str(e)[:30]}...)")

    # Save batch results
    batch_file = Path('/kaggle/working') / 'batch_results.json'
    with open(batch_file, 'w') as f:
        json.dump(all_results, f, indent=2)

    print("\n All results saved to: batch_results.json")

    # Summary
    success_count = sum(1 for v in all_results.values() if v['status'] == 'success')
    error_count = len(all_results) - success_count

    print("\n Summary:")
    print(f"   Successful: {success_count}/{len(all_results)}")
    print(f"   Failed: {error_count}/{len(all_results)}")