In [2]:
# Install required packages if not already installed
import subprocess
import sys

def install_package(package):
    """Install ``package`` with pip, using the interpreter that runs this notebook.

    Executes ``<sys.executable> -m pip install <package>`` as a subprocess.
    ``subprocess.check_call`` raises ``subprocess.CalledProcessError`` when
    the pip process exits with a non-zero status.

    Args:
        package: Name passed verbatim to ``pip install``.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

# Required packages. Each entry is used both as the pip name to install and as
# the module name to import, so the two must be identical for every entry.
import importlib

required_packages = [
    "pdfplumber",
    "pandas",
    "matplotlib",
    "seaborn"
]

for package in required_packages:
    try:
        # Import by name to find out whether the package is available.
        importlib.import_module(package)
        print(f"✓ {package} already installed")
    except ImportError:
        print(f"Installing {package}...")
        install_package(package)
        # pip has just added files while this kernel is running; refresh the
        # import system's finder caches so the `import` statements in later
        # cells are guaranteed to see the newly installed package.
        importlib.invalidate_caches()
        print(f"✓ {package} installed")


✓ pdfplumber already installed
✓ pandas already installed
✓ matplotlib already installed
✓ seaborn already installed


In [3]:
# Import required libraries
import re
import pdfplumber
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from collections import defaultdict, Counter
import datetime as dt
import csv

# Candidate font families for plots, chosen so that Hungarian accented
# characters display properly
plt.rcParams.update({
    'font.family': ['DejaVu Sans', 'Arial Unicode MS', 'Segoe UI'],
})

print("All libraries imported successfully!")


All libraries imported successfully!


In [4]:
# Configuration: location of the MÁV PDF. The path is relative, so it is
# resolved against the notebook's current working directory.
PDF_PATH = Path("../mav/2024-2025._evi_belfoldi_kozforgalmu_menetrend_06.21_-_12.13-ig_v06.25.pdf")

# Report up front whether the PDF is present (this only prints; it does not
# stop execution when the file is missing).
if not PDF_PATH.exists():
    print(f"❌ PDF not found at: {PDF_PATH}")
    print("Please check the file path and try again.")
else:
    print(f"✓ PDF found: {PDF_PATH}")
    print(f"File size: {PDF_PATH.stat().st_size / (1024*1024):.1f} MB")


✓ PDF found: ..\mav\2024-2025._evi_belfoldi_kozforgalmu_menetrend_06.21_-_12.13-ig_v06.25.pdf
File size: 3.7 MB


In [5]:
# Pre-compiled patterns used by the route-header helpers below.
_ROUTE_DASH_RE = re.compile(r'[—–]')          # em-dash or en-dash
# Separators between stations. A literal '|' is treated as a separator too.
_ROUTE_SPLIT_RE = re.compile(r'[—–|]')
_ANY_DIGIT_RE = re.compile(r'\d')
_LEADING_NUMBER_RE = re.compile(r'^\d+\s+')   # e.g. the "1 " in "1 Budapest"
# One or more whitespace-separated words made of ASCII/Hungarian letters,
# anchored at the start of the string.
_STATION_NAME_RE = re.compile(
    r'^([A-Za-záéíóöőúüűÁÉÍÓÖŐÚÜŰ]+(?:\s+[A-Za-záéíóöőúüűÁÉÍÓÖŐÚÜŰ]+)*)'
)


def _is_route_header(line):
    """Return True when ``line`` contains an em/en dash and at least one digit."""
    return bool(_ROUTE_DASH_RE.search(line)) and bool(_ANY_DIGIT_RE.search(line))


def _parse_route_header(line):
    """Parse one candidate header line into its station fields.

    Returns a dict with the keys ``source``, ``destination``,
    ``total_stations`` and ``intermediate_stations``, or ``None`` when the
    line has fewer than two dash-separated parts, an endpoint does not start
    with a letter-only name, the endpoints are identical, or an endpoint
    name is 2 characters or shorter.
    """
    parts = [part.strip() for part in _ROUTE_SPLIT_RE.split(line) if part.strip()]
    if len(parts) < 2:
        return None

    # Drop a leading number (and the whitespace after it) from the first part.
    first_part = _LEADING_NUMBER_RE.sub('', parts[0]).strip()
    last_part = parts[-1]

    # Keep only the leading run of letter-only words of each endpoint.
    start_match = _STATION_NAME_RE.match(first_part)
    end_match = _STATION_NAME_RE.match(last_part)
    if not (start_match and end_match):
        return None

    start = start_match.group(1).strip()
    end = end_match.group(1).strip()

    # Only keep the pair if both names are longer than 2 chars and differ.
    if start == end or len(start) <= 2 or len(end) <= 2:
        return None

    return {
        'source': start,
        'destination': end,
        'total_stations': len(parts),
        # Everything between the first and last part (empty for two parts).
        'intermediate_stations': parts[1:-1],
    }


def extract_route_pairs_from_pdf(pdf_path, max_pages=10, debug=True):
    """
    Extract station pairs from MÁV PDF by parsing route headers.

    A line is treated as a route header when it contains an em-dash or
    en-dash and at least one digit. The header is split on the dashes; the
    first and last parts provide the source and destination names.

    Args:
        pdf_path: Path to the PDF file
        max_pages: Maximum number of pages to process (use None for all
            pages). ``0`` processes no pages.
        debug: Whether to print progress and summary information. When
            False, page-level errors are reported through ``warnings.warn``
            (``RuntimeWarning``) instead of being printed.

    Returns:
        List of dictionaries, one per accepted header, with the keys
        ``page`` (1-based position among the processed pages),
        ``route_header``, ``source``, ``destination``, ``total_stations``
        and ``intermediate_stations``.
    """
    # Imported locally because the notebook's import cell does not provide it.
    import warnings

    # `is not None` rather than truthiness, so that max_pages=0 means
    # "no pages" instead of silently falling through to "all pages".
    limit_pages = max_pages is not None

    if debug:
        print(f"🔍 Starting extraction from: {pdf_path}")
        if limit_pages:
            print(f"📄 Processing first {max_pages} pages for testing")
        else:
            print(f"📄 Processing ALL pages")

    all_routes = []

    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)
        pages_to_process = pdf.pages[:max_pages] if limit_pages else pdf.pages

        if debug:
            print(f"📖 Total pages in PDF: {total_pages}")
            print(f"🔄 Will process: {len(pages_to_process)} pages")

        for page_number, page in enumerate(pages_to_process, 1):
            # Per-header debug output is limited to the first 3 pages.
            verbose = debug and page_number <= 3
            try:
                text = page.extract_text()
                if not text:
                    continue

                for raw_line in text.splitlines():
                    line = raw_line.strip()
                    if not line or not _is_route_header(line):
                        continue

                    if verbose:
                        print(f"\n🔍 Page {page_number}, Found route header: '{line}'")

                    parsed = _parse_route_header(line)
                    if parsed is None:
                        continue

                    all_routes.append({
                        'page': page_number,
                        'route_header': line,
                        **parsed,
                    })

                    if verbose:
                        print(f"   ✅ Extracted: {parsed['source']} → {parsed['destination']} ({parsed['total_stations']} stations)")

            except Exception as e:
                # Best effort: a failing page must not abort the whole run,
                # but the failure is always surfaced so it cannot go unnoticed.
                if debug:
                    print(f"   ⚠️  Error on page {page_number}: {e}")
                else:
                    warnings.warn(
                        f"Error on page {page_number}: {e}",
                        RuntimeWarning,
                        stacklevel=2,
                    )
                continue

    if debug:
        print(f"\n🎯 EXTRACTION COMPLETE!")
        print(f"   📊 Total routes found: {len(all_routes)}")
        print(f"   📄 Pages with routes: {len(set(r['page'] for r in all_routes))}")

        if all_routes:
            print(f"\n📋 SAMPLE RESULTS:")
            for rank, route in enumerate(all_routes[:5], 1):
                print(f"   {rank}. {route['source']} → {route['destination']} (Page {route['page']})")

    return all_routes

# Smoke test: run the extractor on the first 5 pages only, with debug output on
print("🧪 TESTING EXTRACTION ON FIRST 5 PAGES")
print("=" * 50)
test_routes = extract_route_pairs_from_pdf(
    pdf_path=PDF_PATH,
    max_pages=5,
    debug=True,
)


🧪 TESTING EXTRACTION ON FIRST 5 PAGES
🔍 Starting extraction from: ..\mav\2024-2025._evi_belfoldi_kozforgalmu_menetrend_06.21_-_12.13-ig_v06.25.pdf
📄 Processing first 5 pages for testing
📖 Total pages in PDF: 724
🔄 Will process: 5 pages

🔍 Page 2, Found route header: '1 Budapest — Hegyeshalom — Rajka'
   ✅ Extracted: Budapest → Rajka (3 stations)

🎯 EXTRACTION COMPLETE!
   📊 Total routes found: 2
   📄 Pages with routes: 2

📋 SAMPLE RESULTS:
   1. Budapest → Rajka (Page 2)
   2. Budapest → Rajka (Page 4)


In [6]:
# Run extraction on ALL pages
# (the banner no longer hard-codes a page count, which only held for one
# specific version of the PDF)
print("🚀 RUNNING FULL EXTRACTION ON ALL PAGES")
print("="*50)
print("⏳ This may take 5-10 minutes...")

import time

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; unlike time.time() it is unaffected by system clock changes.
# Note: start_time / end_time are therefore NOT epoch timestamps.
start_time = time.perf_counter()

# Extract from all pages (max_pages=None), without per-page debug output
all_routes = extract_route_pairs_from_pdf(PDF_PATH, max_pages=None, debug=False)

end_time = time.perf_counter()
processing_time = end_time - start_time

print(f"\n✅ EXTRACTION COMPLETED!")
print(f"   ⏱️  Time: {processing_time:.1f} seconds")
print(f"   📊 Routes found: {len(all_routes)}")
print(f"   📄 Pages with data: {len(set(r['page'] for r in all_routes))}")

# Show the first 10 extracted routes as a sample
print(f"\n📋 SAMPLE RESULTS:")
for i, route in enumerate(all_routes[:10], 1):
    print(f"   {i:2d}. {route['source']} → {route['destination']} (Page {route['page']})")


🚀 RUNNING FULL EXTRACTION ON ALL 724 PAGES
⏳ This may take 5-10 minutes...

✅ EXTRACTION COMPLETED!
   ⏱️  Time: 59.8 seconds
   📊 Routes found: 398
   📄 Pages with data: 352

📋 SAMPLE RESULTS:
    1. Budapest → Rajka (Page 2)
    2. Budapest → Rajka (Page 4)
    3. Budapest → Rajka (Page 6)
    4. Budapest → Rajka (Page 8)
    5. Budapest → Rajka (Page 10)
    6. Budapest → Rajka (Page 12)
    7. Budapest → Rajka (Page 14)
    8. Budapest → Rajka (Page 16)
    9. Budapest → Rajka (Page 18)
   10. Budapest → Rajka (Page 20)


In [7]:
# Persist the extracted routes as timestamped CSV files in the working directory
print("\n💾 SAVING RESULTS TO CSV")
print("=" * 30)

if not all_routes:
    print("❌ No routes to save")
else:
    # One DataFrame row per extracted route dictionary
    df = pd.DataFrame(all_routes)

    # A single timestamp is shared by every file written in this run
    timestamp = dt.datetime.now().strftime("%Y%m%d_%H%M%S")

    # 1. Every extracted route
    filename_all = f"mav_all_routes_{timestamp}.csv"
    df.to_csv(filename_all, index=False, encoding='utf-8')
    print(f"✅ All routes: {filename_all} ({len(df)} routes)")

    # 2. First occurrence of each (source, destination) pair
    df_unique = (
        df
        .drop_duplicates(subset=['source', 'destination'])
        .loc[:, ['source', 'destination', 'total_stations']]
    )
    filename_unique = f"mav_unique_pairs_{timestamp}.csv"
    df_unique.to_csv(filename_unique, index=False, encoding='utf-8')
    print(f"✅ Unique pairs: {filename_unique} ({len(df_unique)} pairs)")

    # 3. Routes whose header mentions Budapest together with Rajka or
    #    Hegyeshalom (case-insensitive match on the raw header text)
    mentions_budapest = df['route_header'].str.contains('Budapest', case=False)
    mentions_rajka_or_hegyeshalom = df['route_header'].str.contains('Rajka|Hegyeshalom', case=False)
    budapest_routes = df[mentions_budapest & mentions_rajka_or_hegyeshalom]

    if len(budapest_routes):
        filename_budapest = f"budapest_rajka_routes_{timestamp}.csv"
        budapest_routes.to_csv(filename_budapest, index=False, encoding='utf-8')
        print(f"✅ Budapest-Rajka: {filename_budapest} ({len(budapest_routes)} routes)")

        # List each distinct source → destination pair once
        print(f"\n🎯 BUDAPEST-RAJKA PAIRS FOUND:")
        unique_budapest = budapest_routes.drop_duplicates(subset=['source', 'destination'])
        for i, (_, row) in enumerate(unique_budapest.iterrows(), 1):
            print(f"   {i}. {row['source']} → {row['destination']}")

    print(f"\n🎉 SUCCESS! Files saved with timestamp: {timestamp}")



💾 SAVING RESULTS TO CSV
✅ All routes: mav_all_routes_20250715_221609.csv (398 routes)
✅ Unique pairs: mav_unique_pairs_20250715_221609.csv (132 pairs)
✅ Budapest-Rajka: budapest_rajka_routes_20250715_221609.csv (24 routes)

🎯 BUDAPEST-RAJKA PAIRS FOUND:
   1. Budapest → Rajka
   2. Rajka → Budapest

🎉 SUCCESS! Files saved with timestamp: 20250715_221609
