### Try get a list of all gas stations in Hungary

In [9]:
import os
import requests
import json
from bs4 import BeautifulSoup
from pathlib import Path

In [10]:
def collect_station_htmls():
    """
    Collect HTML responses for all fuel types (uatip values)
    and save them to an htmls folder.
    """
    # Create htmls directory if it doesn't exist
    os.makedirs('htmls', exist_ok=True)
    
    # Define fuel types and their IDs
    fuel_types = {
        '1': '95-ös Benzin E10',
        '2': 'Gázolaj',
        '3': 'Lpg',
        '4': '100-As Benzin E5',
        '5': 'Cng',
        '6': 'Prémium Gázolaj',
        '7': 'Prémium Benzin E10',
        '8': '100-As Benzin E10',
        '10': 'Adblue Töltőpisztolyos'
    }
    
    # Base request configuration
    url = 'https://holtankoljak.hu/station_result'
    
    headers = {
        'User-Agent': os.getenv('USER_AGENT'),
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'Referer': 'https://holtankoljak.hu/station_result'
    }
    
    cookies = {
        'PHPSESSID': '5p0c1andikbctchp64hqibko8j',
        'latitude': '47.53585242506067',
        'longitude': '19.035004609912807',
        'scr_w': '1440',
        'scr_h': '900'
    }
    
    # Loop through each fuel type
    for uatip, fuel_name in fuel_types.items():
        print(f"Fetching {fuel_name} (uatip={uatip})...")
        
        data = {
            'uatip': uatip,
            'irsz': '',
            'distance': '5000'
        }
        
        response = requests.post(url, headers=headers, cookies=cookies, data=data)
        
        if response.status_code == 200:
            # Save the HTML response
            filename = f"htmls/stations_uatip_{uatip}.html"
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(response.text)
            print(f"  Saved to {filename}")
        else:
            print(f"  Failed with status code {response.status_code}")
    
    print("All fuel types collected!")

In [21]:
def parse_station_htmls():
    """
    Parse all HTML files in the htmls folder and create a deduplicated
    JSON based on station URLs.
    """
    all_stations = {}  # Use a dictionary for deduplication by URL
    htmls_dir = Path('htmls')
    
    # Loop through all HTML files in the htmls directory
    for html_file in htmls_dir.glob('stations_uatip_*.html'):
        uatip = html_file.name.split('_')[2].split('.')[0]
        
        print(f"Parsing {html_file.name}...")
        
        with open(html_file, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')
        
        # Find all gas stations in the results div
        results_div = soup.find('div', id='results')
        if not results_div:
            print(f"  No results found in {html_file.name}")
            continue
        
        station_count = 0
        for station_div in results_div.find_all('div', class_='d-flex mb-3'):
            station = {}
            
            # Get station brand - check all possible attribute names
            logo_img = station_div.find('img', attrs={'data-bs-toggle': 'tooltip'})
            if logo_img:
                # Try all possible attributes for the brand name
                for attr in ['title', 'data-bs-original-title', 'aria-label']:
                    if attr in logo_img.attrs:
                        station['brand'] = logo_img[attr]
                        break
            
            # Get distance
            distance_span = station_div.find('span', class_='ar_list')
            if distance_span:
                station['distance'] = distance_span.text.strip()
            
            # Get address
            address_link = station_div.select('div.w-100.d-flex.atl.ps-5.align-items-center span.ar_list a')
            if address_link:
                station['address'] = address_link[0].text.strip()
            
            # Get OpenStreetMap link with coordinates - extract only the station coordinates
            osm_link = station_div.find('a', class_='link_none_white', href=lambda x: x and 'openstreetmap.org' in x)
            if osm_link and 'href' in osm_link.attrs:
                import re
                coords_match = re.search(r'route=[^;]+;([0-9.]+),([0-9.]+)', osm_link['href'])
                if coords_match:
                    station['lat'] = coords_match.group(1)
                    station['lon'] = coords_match.group(2)
            
            # Get price and date
            price_div = station_div.find('div', class_='price position-relative')
            if price_div:
                date_span = price_div.find('span', class_='badge')
                if date_span:
                    station['date'] = date_span.text.strip()
                
                price_span = price_div.find('span', class_='ar')
                if price_span:
                    station['price'] = price_span.text.strip()
            
            # Get details URL - this is our deduplication key
            details_link = station_div.find('a', href=lambda x: x and '#tartalom' in x)
            if details_link:
                url = 'https://holtankoljak.hu/' + details_link['href']
                station['url'] = url
                
                # Add fuel type information
                if 'fuels' not in all_stations.get(url, {}):
                    station['fuels'] = {}
                
                # Store this specific fuel's price
                fuel_type = next((name for id, name in {
                    '1': '95-ös Benzin E10',
                    '2': 'Gázolaj',
                    '3': 'Lpg',
                    '4': '100-As Benzin E5',
                    '5': 'Cng',
                    '6': 'Prémium Gázolaj',
                    '7': 'Prémium Benzin E10',
                    '8': '100-As Benzin E10',
                    '10': 'Adblue Töltőpisztolyos'
                }.items() if id == uatip), f"Unknown-{uatip}")
                
                if price_div and price_span:
                    price_value = price_span.text.strip()
                    if url not in all_stations:
                        station['fuels'][fuel_type] = {
                            'price': price_value,
                            'date': station.get('date', '')
                        }
                        all_stations[url] = station
                    else:
                        # Add this fuel type to existing station
                        if 'fuels' not in all_stations[url]:
                            all_stations[url]['fuels'] = {}
                        
                        all_stations[url]['fuels'][fuel_type] = {
                            'price': price_value,
                            'date': station.get('date', '')
                        }
                        
                        # Update coordinates if they weren't captured before
                        if 'lat' in station and 'lat' not in all_stations[url]:
                            all_stations[url]['lat'] = station['lat']
                            all_stations[url]['lon'] = station['lon']
                        
                station_count += 1
        
        print(f"  Parsed {station_count} stations from {html_file.name}")
    
    # Convert dictionary to list for final JSON
    stations_list = list(all_stations.values())
    
    # Save results to JSON file
    with open('all_stations.json', 'w', encoding='utf-8') as f:
        json.dump(stations_list, f, ensure_ascii=False, indent=2)
    
    print(f"Created deduplicated JSON with {len(stations_list)} unique stations!")
    return stations_list

In [11]:
# Step 1: Collect all HTML responses for different fuel types
collect_station_htmls()

Fetching 95-ös Benzin E10 (uatip=1)...
  Saved to htmls/stations_uatip_1.html
Fetching Gázolaj (uatip=2)...
  Saved to htmls/stations_uatip_2.html
Fetching Lpg (uatip=3)...
  Saved to htmls/stations_uatip_3.html
Fetching 100-As Benzin E5 (uatip=4)...
  Saved to htmls/stations_uatip_4.html
Fetching Cng (uatip=5)...
  Saved to htmls/stations_uatip_5.html
Fetching Prémium Gázolaj (uatip=6)...
  Saved to htmls/stations_uatip_6.html
Fetching Prémium Benzin E10 (uatip=7)...
  Saved to htmls/stations_uatip_7.html
Fetching 100-As Benzin E10 (uatip=8)...
  Saved to htmls/stations_uatip_8.html
Fetching Adblue Töltőpisztolyos (uatip=10)...
  Saved to htmls/stations_uatip_10.html
All fuel types collected!


In [22]:
# Step 2: Parse the HTML files and create a deduplicated JSON
stations = parse_station_htmls()

Parsing stations_uatip_2.html...
  Parsed 1274 stations from stations_uatip_2.html
Parsing stations_uatip_10.html...
  Parsed 174 stations from stations_uatip_10.html
Parsing stations_uatip_3.html...
  Parsed 354 stations from stations_uatip_3.html
Parsing stations_uatip_4.html...
  Parsed 914 stations from stations_uatip_4.html
Parsing stations_uatip_8.html...
  Parsed 131 stations from stations_uatip_8.html
Parsing stations_uatip_5.html...
  Parsed 12 stations from stations_uatip_5.html
Parsing stations_uatip_6.html...
  Parsed 956 stations from stations_uatip_6.html
Parsing stations_uatip_7.html...
  Parsed 306 stations from stations_uatip_7.html
Parsing stations_uatip_1.html...
  Parsed 1254 stations from stations_uatip_1.html
Created deduplicated JSON with 1286 unique stations!


In [23]:
# Example: Print some statistics
fuel_counts = {}
for station in stations:
    if 'fuels' in station:
        for fuel_type in station['fuels'].keys():
            fuel_counts[fuel_type] = fuel_counts.get(fuel_type, 0) + 1

print("\nFuel type availability:")
for fuel_type, count in sorted(fuel_counts.items()):
    # Find the longest fuel type name to properly align output
    max_length = max(len(ft) for ft in fuel_counts.keys())
    print(f"{fuel_type:{max_length}} : {count} stations")


Fuel type availability:
100-As Benzin E10      : 131 stations
100-As Benzin E5       : 914 stations
95-ös Benzin E10       : 1254 stations
Adblue Töltőpisztolyos : 174 stations
Cng                    : 12 stations
Gázolaj                : 1274 stations
Lpg                    : 354 stations
Prémium Benzin E10     : 306 stations
Prémium Gázolaj        : 956 stations
