In [None]:
import re
from datetime import datetime
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
def get_sme_blog_links(first_page: int, last_page: int, subpage: str = '', timeout: float = 30.0):
    """Collect post URLs from the paginated listing pages of blog.sme.sk.

    Args:
        first_page: Number of the first listing page to fetch.
        last_page: Page number at which to stop. As with ``range``, this
            page itself is NOT fetched.
        subpage: Path segment placed after the host in the listing URL;
            the empty default requests the site root.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list of absolute URLs, in the order they were found. Duplicates
        are not removed.

    Raises:
        requests.exceptions.RequestException: If a request fails to connect
            or exceeds ``timeout``.
    """
    base_url = 'https://blog.sme.sk'
    sme_blog_links = []

    # A single session lets all page requests reuse the same connection.
    with requests.Session() as session:
        for i in range(first_page, last_page):
            # Without a timeout a stalled connection would block forever.
            source = session.get(f'{base_url}/{subpage}?page={i}', timeout=timeout).text
            soup = BeautifulSoup(source, 'lxml')

            # Select anchors that have class "title" and an href attribute.
            # urljoin resolves relative hrefs against the base URL and leaves
            # already-absolute hrefs intact (plain concatenation would
            # corrupt them).
            sme_blog_links.extend(
                urljoin(base_url, element['href'])
                for element in soup.select('a.title[href]')
            )

    return sme_blog_links

# Gather post links from listing pages 1 up to (not including) 300 of the
# 'najnovsie' section.
blogs = get_sme_blog_links(first_page=1, last_page=300, subpage='najnovsie')

In [None]:
# Lookup from Slovak month abbreviations to the English abbreviations;
# parse_sme_blog uses it to rewrite the month in a scraped timestamp before
# handing the string to datetime.strptime.
slovak_to_english_months = dict(zip(
    ('jan', 'feb', 'mar', 'apr', 'máj', 'jún', 'júl', 'aug', 'sep', 'okt', 'nov', 'dec'),
    ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'),
))

def _text_or_none(element):
    """Return the element's text, or None when the element was not found."""
    return element.text if element is not None else None


def _count_or_none(text):
    """Return the digits contained in ``text`` as an int, or None if there are none."""
    # Raw string: "\D" in a normal string literal is an invalid escape sequence.
    digits = re.sub(r"\D", "", text or "")
    return int(digits) if digits else None


def _sme_datetime_or_none(text):
    """Parse a timestamp shaped like '5. jan 2023 o 10:30'; None when ``text`` is None."""
    if text is None:
        return None
    # Surrounding whitespace would make strptime reject the string.
    normalized = text.strip()
    for sk, en in slovak_to_english_months.items():
        normalized = normalized.replace(f" {sk} ", f" {en} ")
    # NOTE: %b matches the *locale's* abbreviated month names, so this relies
    # on the English names being valid under the active LC_TIME locale.
    return datetime.strptime(normalized, "%d. %b %Y o %H:%M")


def parse_sme_blog(url: str, timeout: float = 30.0):
    """Download one blog post and extract its content and metadata.

    Args:
        url: Address of the post page.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        A dict with the keys ``url``, ``title``, ``perex``, ``article_body``,
        ``publication_datetime`` (a ``datetime``), ``likes``, ``reads``,
        ``comments`` (ints), ``author_link`` and ``author_name``. A field
        whose HTML element is absent from the page (or, for the counters,
        contains no digits) is returned as None instead of raising.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
        ValueError: If the timestamp is present but does not match the
            expected ``"%d. %b %Y o %H:%M"`` layout.
    """
    # Without a timeout a stalled connection would block forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx rather than parsing an error page into a row.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    # Article body: the paragraphs inside the content container, one per line.
    article_body_element = soup.find('div', {'class': 'article-body-content'})
    if article_body_element is None:
        article_body = None
    else:
        article_body = "\n".join(
            element.text for element in article_body_element.find_all('p')
        ).strip()

    # Author: first anchor with class "name" and an href, if any.
    author_matches = soup.select('a.name[href]')
    author_element = author_matches[0] if author_matches else None

    publication_datetime_str = _text_or_none(
        soup.find('span', {"class": "datetime datetime-long"})
    )

    return {
        "url": url,
        "title": _text_or_none(soup.find('h1')),
        "perex": _text_or_none(soup.find('p', {'class': 'perex'})),
        "article_body": article_body,
        "publication_datetime": _sme_datetime_or_none(publication_datetime_str),
        "likes": _count_or_none(_text_or_none(soup.find('span', {"class": "likes"}))),
        "reads": _count_or_none(_text_or_none(soup.find('span', {"class": "read-count"}))),
        "comments": _count_or_none(_text_or_none(soup.find('small', {"class": "count"}))),
        "author_link": author_element["href"] if author_element is not None else None,
        "author_name": _text_or_none(author_element),
    }


# Parse only the first 100 collected links; each parsed post becomes one row.
df = pd.DataFrame(list(map(parse_sme_blog, blogs[:100])))

In [None]:
# Show the DataFrame of parsed posts as this cell's output.
df