### Scrape holtankoljak.hu data for visualization, analyysis and possible modeling of gas prices

In [1]:
from bs4 import BeautifulSoup
import requests
import pickle

import pandas as pd
import numpy as np

from multiprocessing.pool import ThreadPool as Pool

import warnings
warnings.filterwarnings('ignore')

Link needs to know my location so it can parse all stations around it with given radius. 

Steps:
1. In Chrome open this: https://holtankoljak.hu/index.php?ua_map=1&uz_tip=2&myrad=1000#tartalom
2. Save HTML to data folder
3. Open with BS here in Python

In [2]:
with open("data/3_arstop_eltorlese/Holtankoljak.hu.html", encoding='utf8') as page:
    soup = BeautifulSoup(page)

In [3]:
# data_table = soup.find_all('table', class_ = 'table width=')[0]
# pd.read_html(str(data_table))[0]

### Get links

In [3]:
href_list = soup.find_all('table', class_ = 'table width=')[0].find_all('a', href = True)

links = []
for i in href_list:
    links.append(i['href'])

links = list(set(links))

In [4]:
len(links)

1275

### Get data by station

In [5]:
def scrape_station_data(link):
    
    page = requests.get(link)
    page_soup = BeautifulSoup(page.text)

    div_cards = page_soup.find_all('div', class_ = 'card')

    address = np.nan
    geo_location = np.nan
    services = np.nan
    hours = np.nan

    for i in div_cards:
        
        if 'Útvonal' in i.text:
            address = i.find('a').text
            geo_location = i.find('a')['href'].split(';')[1].split('#')[0]

        if 'Szolgáltatások' in i.text:
            services = i.find('div', class_ = 'card-body').text.strip().split(',')

        if 'Nyitvatartás' in i.text:
            hours = i.find('table').text.strip().split('\n')
            

    gas_data = page_soup.find('table', class_ = 'table table-hover').find('tbody').find_all('tr')

    gas_dict = {}
    for i in gas_data:

        gas_type = i.find_all('td')[1].text.strip()
        gas_price = i.find_all('td')[2].text.split('/liter')[0].strip()
        gas_dict[gas_type] = gas_price

    scraped_data = pd.DataFrame.from_dict({'link' : link,
                                            'address' : address, 
                                            'geo_location' : geo_location,
                                            'services' : services,
                                            'open_hours' : hours,
                                            'prices' : gas_dict}, orient = 'index')

    return scraped_data   


In [6]:
%%time

data_collector = []

pool = Pool(4)
for link in links:
    pool.apply_async(scrape_station_data, (link,), callback = data_collector.append)
pool.close()
pool.join()

Wall time: 5min 54s


### Concat data

In [7]:
%%time

data = pd.concat((df for df in data_collector), axis = 1, join = 'outer').T

Wall time: 198 ms


In [8]:
data.head(3)

Unnamed: 0,link,address,geo_location,services,open_hours,prices
0,https://holtankoljak.net/mol_vajszlo_szechenyi...,"Vajszló, Széchenyi István út 34.","45.86290359,17.98775482","[nagynyomású kútoszlop, UTA-kártya, bankkárt...","[Hétfő:06:00 - 20:00, \t\t\tKedd:06:00 - 20:00...","{'95-ös Benzin E10': '641.0,- Ft', 'Gázolaj': ..."
0,https://holtankoljak.net/mol_jaszbereny_nagyka...,"Jászberény, Nagykátai út","47.48664474,19.88292694","[autópálya-matrica, kávé, kézi autómosó, PB...","[Hétfő:06:00 - 22:00, \t\t\tKedd:06:00 - 22:00...","{'100-As Benzin E5': '678.0,- Ft', 'Lpg': '380..."
0,https://holtankoljak.net/shell_szekszard_palan...,"Szekszárd, Palánki u. 2.","46.36576080,18.70693970","[HU-GO feltöltés, nagynyomású kútoszlop, büf...","[Hétfő:0-24 óráig, \t\t\tKedd:0-24 óráig, \t\t...","{'100-As Benzin E5': '724.9,- Ft', '95-ös Benz..."


In [9]:
data.to_csv('data/scraped_data_20221211.csv', index = False)