In [None]:
import pandas as pd
import sys
import requests

from bs4 import BeautifulSoup
from pathlib import Path

# Path cfg.
# When `__file__` exists (script execution) the base directory is taken two
# levels above the directory containing this file. In a notebook kernel
# `__file__` is undefined, so the current working directory is used instead.
try:
    _this_file = Path(__file__)
except NameError:
    base_dir = Path().resolve()
else:
    base_dir = _this_file.resolve().parents[2]

# Put <base_dir>/src at the front of the import path, at most once.
src_path = base_dir / 'src'
_src_entry = str(src_path)
if _src_entry not in sys.path:
    sys.path.insert(0, _src_entry)


## Projects finder.
# Smoke test against the 2016 search page for one hard-coded project id:
# fetch the page, parse it, and print the href of every <a class="lnkTitle">.
project_id = 3
url = f'https://www.wroclaw.pl/budzet-obywatelski-wroclaw/projekty-2016/szukaj,id,{project_id},name,,rejon,,kategoria,,prog,,selected,2'
# A timeout keeps the cell from hanging forever on an unresponsive server.
response = requests.get(url, timeout=10)

# Start empty so the loop below is safe even when the request fails
# (previously `soup` was unbound on a non-200 response -> NameError).
links = []

if response.status_code == 200:
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    print("HTML obtenido y parseado con éxito.")
    # `soup.title` is None when the document has no <title> element.
    print(soup.title.string if soup.title else None)

    # Find all <a> with class lnkTitle inside each project.
    links = [a["href"] for a in soup.find_all("a", class_="lnkTitle") if a.has_attr("href")]
else:
    print(f"Error al acceder a la página. Código: {response.status_code}")

for link in links:
    print(link)

In [None]:
import pandas as pd
# The 2016 base project list is read from the `data` folder located two
# levels above the current working directory.
file_path = Path().resolve().parents[1] / 'data/PL16-base-projects.csv'

# The file is semicolon-delimited.
df = pd.read_csv(file_path, sep=";")

### URL Extraction Wrocław 2016

In [None]:
# Resolve the detail-page URL of every project id in `df` (expected to hold
# the 2016 base project list) by querying the 2016 search page per id.
# A project is kept only when the search page yields exactly one
# <a class="lnkTitle"> link.
url_data = []
# Iterate the column directly: `iterrows` builds a mixed-dtype row, which
# can turn an integer id into a float and produce 'id,3.0' in the URL.
for project_id in df['project_id']:
    url = f'https://www.wroclaw.pl/budzet-obywatelski-wroclaw/projekty-2016/szukaj,id,{project_id},name,,rejon,,kategoria,,prog,,selected,2'
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException as e:
        # One unreachable page must not abort the whole crawl.
        print(f"Error al acceder a la página. Excepción: {e}")
        continue

    if response.status_code != 200:
        print(f"Error al acceder a la página. Código: {response.status_code}")
        # Skip this id: falling through would parse the `soup` left over
        # from the previous iteration and store another project's URL.
        continue

    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    print("HTML obtenido y parseado con éxito.")
    # `soup.title` is None when the document has no <title> element.
    print(soup.title.string if soup.title else None)

    # Find all <a> with class lnkTitle inside each project.
    links = [a["href"] for a in soup.find_all("a", class_="lnkTitle") if a.has_attr("href")]

    if len(links) == 1:
        url_data.append({'project_id': project_id, 'url': links[0]})
    else:
        # Zero or several matches are ambiguous; report instead of dropping silently.
        print(f"Proyecto {project_id}: se esperaba 1 enlace, se encontraron {len(links)}; omitido.")

# Explicit columns keep the CSV header even when nothing was collected,
# so a later `read_csv` does not fail on an empty file.
url_df = pd.DataFrame(url_data, columns=['project_id', 'url'])
url_df.to_csv('pl-projects-2016.csv', sep = ";", index=False)

### URL Extraction Wrocław 2017

In [None]:
import pandas as pd
# The 2017 base project list is read from the `data` folder located two
# levels above the current working directory.
file_path = Path().resolve().parents[1] / 'data/PL17-base-projects.csv'

# The file is semicolon-delimited. Note that this rebinds `df`.
df = pd.read_csv(file_path, sep=";")

In [None]:
# Resolve the detail-page URL of every project id in `df` (expected to hold
# the 2017 base project list) by querying the 2017 search page per id.
# A project is kept only when the search page yields exactly one
# <a class="lnkTitle"> link.
url_data = []
# Iterate the column directly: `iterrows` builds a mixed-dtype row, which
# can turn an integer id into a float and produce 'id,3.0' in the URL.
for project_id in df['project_id']:
    url = f'https://www.wroclaw.pl/budzet-obywatelski-wroclaw/projekty-2017/szukaj,id,{project_id},name,,rejon,,osiedle,,kategoria,,status,1,prog,,selected,1'
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException as e:
        # One unreachable page must not abort the whole crawl.
        print(f"Error al acceder a la página. Excepción: {e}")
        continue

    if response.status_code != 200:
        print(f"Error al acceder a la página. Código: {response.status_code}")
        # Skip this id: falling through would parse the `soup` left over
        # from the previous iteration and store another project's URL.
        continue

    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    print("HTML obtenido y parseado con éxito.")
    # `soup.title` is None when the document has no <title> element.
    print(soup.title.string if soup.title else None)

    # Find all <a> with class lnkTitle inside each project.
    links = [a["href"] for a in soup.find_all("a", class_="lnkTitle") if a.has_attr("href")]

    if len(links) == 1:
        url_data.append({'project_id': project_id, 'url': links[0]})
    else:
        # Zero or several matches are ambiguous; report instead of dropping silently.
        print(f"Proyecto {project_id}: se esperaba 1 enlace, se encontraron {len(links)}; omitido.")

# Explicit columns keep the CSV header even when nothing was collected,
# so a later `read_csv` does not fail on an empty file.
url_df = pd.DataFrame(url_data, columns=['project_id', 'url'])
url_df.to_csv('pl-projects-2017.csv', sep = ";", index=False)

#### Extract Poland HTML

In [12]:
import pandas as pd
import sys
import requests

from bs4 import BeautifulSoup
from pathlib import Path

# Path cfg.
# When `__file__` exists (script execution) the base directory is taken two
# levels above the directory containing this file. In a notebook kernel
# `__file__` is undefined, so the current working directory is used instead.
try:
    _this_file = Path(__file__)
except NameError:
    base_dir = Path().resolve()
else:
    base_dir = _this_file.resolve().parents[2]

# Put <base_dir>/src at the front of the import path, at most once.
src_path = base_dir / 'src'
_src_entry = str(src_path)
if _src_entry not in sys.path:
    sys.path.insert(0, _src_entry)

In [14]:
# Input
# Per-year URL tables (columns: project_id, url) written by the URL
# extraction cells; both are semicolon-delimited.
path_16 = base_dir / 'pl-projects-2016.csv'
path_17 = base_dir / 'pl-projects-2017.csv'

df16 = pd.read_csv(path_16, sep=";")
df17 = pd.read_csv(path_17, sep=";")

# Tag each row with its edition so the same project id from different
# years ends up in different output files.
df16['year'] = 2016
df17['year'] = 2017

# Output
output_dir = base_dir / 'pl-html'
output_dir.mkdir(parents=True, exist_ok=True)

# Data Load
project_urls = pd.concat([df16, df17]).reset_index(drop=True)

# HTML Loader
# Download every project page and store its <html> element as
# project_id_<id>_<year>.html inside `output_dir`.
for row in project_urls.itertuples(index=False):
    try:
        response = requests.get(row.url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        # Report the URL of THIS row; the previous message interpolated a
        # leftover global `url` from an earlier cell.
        print(f"❌ Error downloading {row.url}: {e}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    html_content = soup.find('html')

    if html_content is not None:
        output_file = output_dir / 'project_id_{}_{}.html'.format(row.project_id, row.year)
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(str(html_content))
        # Announce success only after the write has completed.
        print(f'✅ Saved: {output_file}')
    else:
        print(f"⚠️ No HTML content found in {row.url}")

✅ Saved: /home/juano/pb-project/scripts/scrapper/pl-html/project_id_710_2016.html
✅ Saved: /home/juano/pb-project/scripts/scrapper/pl-html/project_id_15_2016.html
✅ Saved: /home/juano/pb-project/scripts/scrapper/pl-html/project_id_764_2016.html
✅ Saved: /home/juano/pb-project/scripts/scrapper/pl-html/project_id_685_2016.html
✅ Saved: /home/juano/pb-project/scripts/scrapper/pl-html/project_id_379_2016.html
✅ Saved: /home/juano/pb-project/scripts/scrapper/pl-html/project_id_155_2016.html
✅ Saved: /home/juano/pb-project/scripts/scrapper/pl-html/project_id_731_2016.html
✅ Saved: /home/juano/pb-project/scripts/scrapper/pl-html/project_id_201_2016.html
✅ Saved: /home/juano/pb-project/scripts/scrapper/pl-html/project_id_18_2016.html
✅ Saved: /home/juano/pb-project/scripts/scrapper/pl-html/project_id_4_2016.html
✅ Saved: /home/juano/pb-project/scripts/scrapper/pl-html/project_id_564_2016.html
✅ Saved: /home/juano/pb-project/scripts/scrapper/pl-html/project_id_337_2016.html
✅ Saved: /home/juano

#### Analyze HTML for scraping.

In [64]:
def get_project_data(filepath):
    """Extract the id, title and description of one saved project page.

    Parameters
    ----------
    filepath : str or os.PathLike
        Path to an HTML file whose name has the form
        ``project_id_<id>_<year>.html``.

    Returns
    -------
    dict
        ``project_id`` (int, parsed from the file name), ``project_title``
        and ``description`` (str, or None when the corresponding element
        is missing or empty).

    Raises
    ------
    IndexError, ValueError
        If the file name does not follow the expected pattern.
    OSError
        If the file cannot be opened.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # Parse the id from the file name via pathlib so that Path objects and
    # non-POSIX separators work: 'project_id_108_2016' -> [..., '108', '2016'].
    project_id = int(Path(filepath).stem.split("_")[2])

    soup = BeautifulSoup(content, 'html.parser')

    # The title is the first child of <h1 class="txtTitle">; only use it
    # when it is a text node (NavigableString subclasses str).
    h1 = soup.find('h1', class_='txtTitle')
    first_child = h1.contents[0] if h1 is not None and h1.contents else None
    project_title = first_child.strip() if isinstance(first_child, str) else None

    # The description is the whole text of <div class="boxProjectDesc">,
    # one line per text fragment.
    desc_div = soup.find('div', class_='boxProjectDesc')
    description = (
        desc_div.get_text(separator="\n", strip=True) if desc_div is not None else None
    )

    return {
        # Always an int here; the former truthiness check turned id 0 into None.
        'project_id': project_id,
        # Empty strings are normalised to None.
        'project_title': project_title or None,
        'description': description or None,
    }

In [65]:
import os 

# Collect every regular file in the download folder as a plain string path.
# `os.listdir` returns entries in arbitrary order, so the list is sorted to
# make `data[0]` (and any later iteration order) reproducible across runs.
html_folder = base_dir /'pl-html/'
data = sorted(
    os.path.join(html_folder, f)
    for f in os.listdir(html_folder)
    if os.path.isfile(os.path.join(html_folder, f))
)

In [66]:
# Take the first listed HTML file as a sample input; the bare name on the
# last line makes the notebook display it.
file_path = data[0]
file_path

'/home/juano/pb-project/scripts/scrapper/pl-html/project_id_108_2016.html'

In [67]:
# Parse the sample file and print the resulting dict.
print(get_project_data(filepath=file_path))

{'project_id': 108, 'project_title': 'Budowa sygnalizacji świetlnej na skrzyżowaniu przy Skwerze Pionierów Wrocławskich', 'description': 'Uzasadnienie\nCelem projektu jest budowa sygnalizacji świetlnej na skrzyżowaniu przy Skwerze Pionierów Wrocławskich (Wrocław/Ołbin), przy wlocie ulic Poniatowskiego i Oleśnickiej.\nW powyższym miejscu kumuluje się - szczególnie w szczycie komunikacyjnym - ruch pieszych, tramwajów i samochodów. Niejednokrotnie dochodzi tam do zdarzeń drogowych i nierzadko poważnych wypadków. Zachodzi więc potrzeba budowy sygnalizacji, która miała by na celu poprawę bezpieczeństwa i płynności ruchu na tym skrzyżowaniu. Wiele razy o rozwiązanie problemu prosiła min rada osiedla Ołbin.\nNiemal wszystkie skrzyżowania - także te o mniejszym ruchu i mniej "wypadkowe" w ciągu ulic Bema-Poniatowskiego-Jedności Narodowej w ostatnich latach zostały wyposażone w sygnalizację świetlną. Tym bardziej jest istotne i ważne, aby także i to miejsce - tak niebezpieczne - wreszcie doczek

In [51]:
# Source URL of project 108 in the 2016 table (first matching row).
df16.loc[df16['project_id'] == 108, 'url'].iloc[0]

'https://www.wroclaw.pl/budzet-obywatelski-wroclaw/projekty-2016/projekt,id,108'