This notebook scrapes transcripts of ‘Wiadomości’, TVP’s main news programme, covering February 2022 to December 2023. Because all content up to 20 December 2023 has been deleted from the official website, the transcripts are instead scraped from a GitHub repository that saved them daily (codziennatranskrypcjatvpis, 2024). The data is then subset to LGBTQ+-related coverage only.

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Two sample transcript URLs, built from a single template so the only
# varying part (the broadcast date, DD.MM.YYYY) is visible at a glance.
urls = [
    "https://raw.githubusercontent.com/codziennatranskrypcjatvpis/codziennatranskrypcjatvpis.github.io/main/transcriptions_txt/Wiadomosci_{}.txt".format(day)
    for day in ("06.02.2022", "07.02.2022")
]

def main():
    """Download every transcript in the module-level ``urls`` list and save them.

    Each URL is fetched as plain text. A URL that fails is reported on stdout
    and left out of the result, so every row written to ``wiadomosci.csv``
    pairs a URL with the text that was actually downloaded from it.
    """
    # Keep each URL together with its text. Building the two columns from
    # separate lists made DataFrame construction fail with a length mismatch
    # as soon as a single download failed.
    records = []

    for url in urls:

        try:
            # Timeout so a stalled connection cannot hang the cell indefinitely.
            response = requests.get(url, timeout=30)
            response.raise_for_status()  # This line checks for any HTTP errors
            text = response.text.strip()  # No need for BeautifulSoup as it's a text file, not HTML
            records.append((url, text))

        except Exception as e:
            print(f"Error scraping URL: {url}. {e}")

    # Explicit column names keep the CSV header stable even when nothing was fetched.
    df = pd.DataFrame(records, columns=["URLs", "Texts"])
    df.to_csv("wiadomosci.csv", encoding="utf-8", header=True, index=False)

# Run the scrape only when this code executes as the top-level script.
if __name__ == "__main__":
    main()




  from pandas.core import (


In [2]:
# Read back the CSV written by main() and preview its first rows as a sanity check.
wiadomosci_df = pd.read_csv("wiadomosci.csv")
wiadomosci_df.head()

Unnamed: 0,URLs,Texts
0,https://raw.githubusercontent.com/codziennatra...,Mamy pierwszy medal. Dawid Kubacki wywalczył b...
1,https://raw.githubusercontent.com/codziennatra...,W Brukseli prezydent Andrzej Duda wraz z szefa...


In [6]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

def generate_urls(start_date, end_date):
    """Build one transcript URL per day from ``start_date`` to ``end_date``, inclusive.

    Args:
        start_date: First day to include (a ``datetime``).
        end_date: Last day to include (a ``datetime``).

    Returns:
        A list of URLs in chronological order, each carrying the day formatted
        as ``DD.MM.YYYY``. The list is empty when ``end_date`` precedes
        ``start_date``.
    """
    url_template = "https://raw.githubusercontent.com/codziennatranskrypcjatvpis/codziennatranskrypcjatvpis.github.io/main/transcriptions_txt/Wiadomosci_{}.txt"
    # Whole days separating the bounds, plus one so the start day itself is
    # included; the value is non-positive (empty range) for a reversed interval.
    day_count = (end_date - start_date).days + 1
    return [
        url_template.format((start_date + timedelta(days=offset)).strftime("%d.%m.%Y"))
        for offset in range(day_count)
    ]

def scrape_text_from_url(url, timeout=30):
    """Fetch ``url`` and return its body as whitespace-stripped text.

    Args:
        url: Address of the plain-text transcript to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The response text with leading/trailing whitespace removed, or ``None``
        when the request fails for any reason. Failures are reported on stdout
        rather than raised, so a caller looping over many URLs keeps going.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Check for HTTP errors
        return response.text.strip()
    except requests.HTTPError as e:
        # Use the response attached to the exception rather than relying on
        # the local variable from the try body.
        status_code = e.response.status_code if e.response is not None else None
        if status_code == 404:
            print(f"URL not found: {url}")
        else:
            print(f"HTTP error occurred while scraping URL: {url}. {e}")
        return None
    except Exception as e:
        # Connection errors, timeouts, etc. are reported and skipped.
        print(f"Error occurred while scraping URL: {url}. {e}")
        return None

def main():
    """Scrape every daily transcript in the study window and save them to CSV.

    Generates one URL per day from 6 February 2022 to 19 December 2023,
    downloads each, and writes the successful downloads to ``wiadomosci.csv``
    with columns ``URLs`` and ``Texts``. Days whose transcript could not be
    fetched are skipped. Prints a message instead of writing a file when
    nothing was downloaded.
    """
    start_date = datetime(2022, 2, 6)
    end_date = datetime(2023, 12, 19)
    urls = generate_urls(start_date, end_date)

    # Keep each text paired with the URL it came from. Truncating the URL list
    # to the number of successful downloads shifted every row after the first
    # missing day onto the wrong URL.
    records = []
    for url in urls:
        text = scrape_text_from_url(url)
        if text is not None:
            records.append((url, text))

    if records:  # Only create DataFrame if there's data
        df = pd.DataFrame(records, columns=["URLs", "Texts"])
        df.to_csv("wiadomosci.csv", encoding="utf-8", header=True, index=False)
    else:
        print("No data scraped.")

# Run the scrape only when this code executes as the top-level script.
if __name__ == "__main__":
    main()


URL not found: https://raw.githubusercontent.com/codziennatranskrypcjatvpis/codziennatranskrypcjatvpis.github.io/main/transcriptions_txt/Wiadomosci_15.11.2023.txt
URL not found: https://raw.githubusercontent.com/codziennatranskrypcjatvpis/codziennatranskrypcjatvpis.github.io/main/transcriptions_txt/Wiadomosci_16.11.2023.txt


In [8]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

def generate_urls(start_date, end_date):
    """Build the transcript URL and ISO date string for each day in a range.

    Args:
        start_date: First day to include (a ``datetime``).
        end_date: Last day to include (a ``datetime``).

    Returns:
        A ``(urls, dates)`` tuple of equal-length lists in chronological order.
        ``urls`` embeds each day as ``DD.MM.YYYY``; ``dates`` holds the same
        days as ``YYYY-MM-DD`` strings. Both are empty when ``end_date``
        precedes ``start_date``.
    """
    url_template = "https://raw.githubusercontent.com/codziennatranskrypcjatvpis/codziennatranskrypcjatvpis.github.io/main/transcriptions_txt/Wiadomosci_{}.txt"
    # Whole days separating the bounds, plus one so the start day itself is
    # included; the value is non-positive (empty range) for a reversed interval.
    day_count = (end_date - start_date).days + 1
    days = [start_date + timedelta(days=offset) for offset in range(day_count)]
    urls = [url_template.format(day.strftime("%d.%m.%Y")) for day in days]
    dates = [day.strftime("%Y-%m-%d") for day in days]
    return urls, dates

def scrape_text_from_url(url, timeout=30):
    """Fetch ``url`` and return its body as whitespace-stripped text.

    Args:
        url: Address of the plain-text transcript to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The response text with leading/trailing whitespace removed, or ``None``
        when the request fails for any reason. Failures are reported on stdout
        rather than raised, so a caller looping over many URLs keeps going.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Check for HTTP errors
        return response.text.strip()
    except requests.HTTPError as e:
        # Use the response attached to the exception rather than relying on
        # the local variable from the try body.
        status_code = e.response.status_code if e.response is not None else None
        if status_code == 404:
            print(f"URL not found: {url}")
        else:
            print(f"HTTP error occurred while scraping URL: {url}. {e}")
        return None
    except Exception as e:
        # Connection errors, timeouts, etc. are reported and skipped.
        print(f"Error occurred while scraping URL: {url}. {e}")
        return None

def main():
    """Scrape every daily transcript in the study window and save them to CSV.

    Generates one URL and date per day from 6 February 2022 to 19 December
    2023, downloads each, and writes the successful downloads to
    ``wiadomosci.csv`` with columns ``Date``, ``URLs`` and ``Texts``. Days
    whose transcript could not be fetched are skipped. Prints a message
    instead of writing a file when nothing was downloaded.
    """
    start_date = datetime(2022, 2, 6)
    end_date = datetime(2023, 12, 19)
    urls, dates = generate_urls(start_date, end_date)

    # Keep each text paired with its own date and URL. Truncating the date and
    # URL lists to the number of successful downloads shifted every row after
    # the first missing day onto the wrong date and URL.
    records = []
    for date, url in zip(dates, urls):
        text = scrape_text_from_url(url)
        if text is not None:
            records.append((date, url, text))

    if records:  # Only create DataFrame if there's data
        df = pd.DataFrame(records, columns=["Date", "URLs", "Texts"])
        df.to_csv("wiadomosci.csv", encoding="utf-8", header=True, index=False)
    else:
        print("No data scraped.")

# Run the scrape only when this code executes as the top-level script.
if __name__ == "__main__":
    main()


URL not found: https://raw.githubusercontent.com/codziennatranskrypcjatvpis/codziennatranskrypcjatvpis.github.io/main/transcriptions_txt/Wiadomosci_15.11.2023.txt
URL not found: https://raw.githubusercontent.com/codziennatranskrypcjatvpis/codziennatranskrypcjatvpis.github.io/main/transcriptions_txt/Wiadomosci_16.11.2023.txt


In [9]:
# Read back the CSV written by main() and preview its first rows as a sanity check.
wiadomosci_df = pd.read_csv("wiadomosci.csv")
wiadomosci_df.head()

Unnamed: 0,Date,URLs,Texts
0,2022-02-06,https://raw.githubusercontent.com/codziennatra...,Mamy pierwszy medal. Dawid Kubacki wywalczył b...
1,2022-02-07,https://raw.githubusercontent.com/codziennatra...,W Brukseli prezydent Andrzej Duda wraz z szefa...
2,2022-02-08,https://raw.githubusercontent.com/codziennatra...,Rada Polityki Pieniężnej reaguje na inflację. ...
3,2022-02-09,https://raw.githubusercontent.com/codziennatra...,Apogeum pi@tej fali koronawirusa ju¼ za nami. ...
4,2022-02-10,https://raw.githubusercontent.com/codziennatra...,Ważne słowa i deklaracje brytyjskiego premiera...
