### Scrape holtankoljak.hu data for visualization, analysis, and possible modeling of gas prices

In [1]:
from bs4 import BeautifulSoup
import requests
import pickle

import pandas as pd
import numpy as np

from multiprocessing.pool import ThreadPool as Pool

import warnings
warnings.filterwarnings('ignore')

The link needs to know my location so that it can list all stations within the given radius around it.

Steps:
1. In Chrome open this: https://holtankoljak.hu/index.php?ua_map=1&uz_tip=2&myrad=1000#tartalom
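2. Save the page's HTML into the data folder (the code below reads it from `data/2/Holtankoljak.hu.html`)
3. Open the saved file with BeautifulSoup (BS) here in Python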

In [2]:
# Parse the manually saved results page. No parser is named here, so
# BeautifulSoup picks the best parser installed in this environment.
saved_page_path = 'data/2/Holtankoljak.hu.html'
with open(saved_page_path, encoding='utf8') as page:
    soup = BeautifulSoup(page)

In [3]:
# data_table = soup.find_all('table', class_ = 'table width=')[0]
# pd.read_html(str(data_table))[0]

### Get links

In [3]:
# The station list is the first table whose class attribute is 'table width=';
# collect every anchor inside it that carries an href.
station_table = soup.find_all('table', class_='table width=')[0]
href_list = station_table.find_all('a', href=True)

# Going through a set drops any href that appears more than once.
links = list(set(anchor['href'] for anchor in href_list))

In [4]:
# Sanity check: number of unique station links collected from the saved page.
len(links)

1287

### Get data by station

In [5]:
def scrape_station_data(link, timeout=30):
    """Download one station page and extract its details into a one-column frame.

    Parameters
    ----------
    link : str
        URL of the station's detail page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        A single-column frame indexed by 'link', 'address', 'geo_location',
        'services', 'open_hours' and 'prices'. Fields that cannot be found on
        the page stay NaN; 'prices' is a dict mapping fuel name to the price
        text and is empty when no price rows are found.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or the server answers with an HTTP
        error status.
    """
    # requests waits indefinitely by default; the timeout keeps one stalled
    # connection from blocking a worker thread forever.
    page = requests.get(link, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error page.
    page.raise_for_status()
    page_soup = BeautifulSoup(page.text)

    address = np.nan
    geo_location = np.nan
    services = np.nan
    hours = np.nan

    for card in page_soup.find_all('div', class_='card'):
        card_text = card.text

        # 'Útvonal' (route) card: the first anchor's text is the address and
        # its href holds the coordinates after the first ';' and before '#'.
        if 'Útvonal' in card_text:
            anchor = card.find('a')
            if anchor is not None:
                address = anchor.text
                href_parts = anchor.get('href', '').split(';')
                if len(href_parts) > 1:
                    geo_location = href_parts[1].split('#')[0]

        # 'Szolgáltatások' (services) card: comma-separated list in the body.
        if 'Szolgáltatások' in card_text:
            card_body = card.find('div', class_='card-body')
            if card_body is not None:
                services = card_body.text.strip().split(',')

        # 'Nyitvatartás' (opening hours) card: one table line per entry.
        if 'Nyitvatartás' in card_text:
            hours_table = card.find('table')
            if hours_table is not None:
                hours = hours_table.text.strip().split('\n')

    # Locate the price rows; tolerate a missing table or a table without an
    # explicit <tbody> instead of raising AttributeError.
    price_rows = []
    price_table = page_soup.find('table', class_='table table-hover')
    if price_table is not None:
        table_body = price_table.find('tbody')
        if table_body is None:
            table_body = price_table
        price_rows = table_body.find_all('tr')

    gas_dict = {}
    for row in price_rows:
        cells = row.find_all('td')
        # Header or malformed rows do not have the fuel-name and price cells.
        if len(cells) < 3:
            continue
        gas_type = cells[1].text.strip()
        gas_price = cells[2].text.split('/liter')[0].strip()
        gas_dict[gas_type] = gas_price

    scraped_data = pd.DataFrame.from_dict({'link': link,
                                           'address': address,
                                           'geo_location': geo_location,
                                           'services': services,
                                           'open_hours': hours,
                                           'prices': gas_dict}, orient='index')

    return scraped_data


In [6]:
%%time

data_collector = []
failed_links = []  # (link, exception) pairs for stations that could not be scraped

# apply_async drops worker exceptions unless an error_callback is supplied, so
# failures are recorded here instead of silently vanishing from the data.
# Leaving the with-block also shuts the pool down if the loop is interrupted.
with Pool(4) as pool:
    for link in links:
        pool.apply_async(
            scrape_station_data,
            (link,),
            callback=data_collector.append,
            # Bind the current link as a default so each callback reports its own URL.
            error_callback=lambda exc, link=link: failed_links.append((link, exc)),
        )
    pool.close()
    pool.join()

if failed_links:
    print(len(failed_links), 'of', len(links), 'stations could not be scraped; see failed_links')

Wall time: 1min 39s


### Concat data

In [7]:
%%time

# Each collected frame holds one station as a single column: place them side
# by side, then transpose so that every station becomes one row.
stations_as_columns = pd.concat(data_collector, axis=1, join='outer')
data = stations_as_columns.T

Wall time: 212 ms


In [8]:
# Preview the first three stations to check that the columns were assembled as expected.
data.head(3)

Unnamed: 0,link,address,geo_location,services,open_hours,prices
0,https://holtankoljak.hu/mol_siklos_felszabadul...,"Siklós, Felszabadulás út 1211/9 hrsz.","45.85673523,18.29270935","[UTA-kártya, kávé, bankkártya elfogadás, hű...","[Hétfő:06:00 - 22:00, Kedd:06:00 - 22:00, Szer...","{'95-ös Benzin E10': '477.9,- Ft', 'Gázolaj': ..."
0,https://holtankoljak.hu/canada_petrol_kft__kec...,"Kecskemét, Felsőcsalános u. 54.","46.88845825,19.62286186","[autópálya-matrica, bankkártya elfogadás, ke...","[Hétfő:0-24 óráig, Kedd:0-24 óráig, Szerda:0-2...","{'95-ös Benzin E10': '479.9,- Ft', 'Gázolaj': ..."
0,https://holtankoljak.hu/ersto_kft__tokol_ledin...,"Tököl, Ledina dűlő 4.","47.31358337,18.97490311","[PB-gáz, mobiltelefon egyenlegfeltöltés, kéz...","[Hétfő:05:00 - 22:00, Kedd:05:00 - 22:00, Szer...","{'Gázolaj': '480.0,- Ft', '95-ös Benzin E10': ..."


In [10]:
# Persist the scraped snapshot; the row index is left out of the file.
output_csv_path = 'data/scraped_data_20220113.csv'
data.to_csv(output_csv_path, index=False)