In [5]:
# Importy potrzebnych bibliotek
import json
import os
from openai import OpenAI
import pandas as pd

In [None]:
# Download the auto-generated Polish subtitles (json3 format) for the video
# below with yt-dlp; --skip-download means the video itself is not fetched.
# NOTE(review): no -o/--output template is passed, so yt-dlp chooses the
# subtitle file name itself, while the next cell opens "debata.json3" --
# presumably the downloaded file is renamed by hand in between; confirm.
!yt-dlp --skip-download \
       --write-auto-sub \
       --sub-lang pl \
       --sub-format json3 \
       "https://www.youtube.com/watch?v=bvlzQvdgqLU"

In [None]:
# Convert the raw JSON3 subtitles into a clean JSON list whose entries keep
# only the caption text, its start and its duration (milliseconds / 1000).
input_file = "debata.json3"
output_file = "debata.json"

with open(input_file, encoding="utf-8") as raw_fh:
    raw_data = json.load(raw_fh)

captions = []
for event in raw_data.get("events", []):
    # Events without text segments or without timing fields are skipped.
    if not ("segs" in event and "tStartMs" in event and "dDurationMs" in event):
        continue

    pieces = [piece.get("utf8", "") for piece in event["segs"]]
    text = "".join(pieces).strip()
    if not text:
        # Nothing left after stripping whitespace: not a real caption.
        continue

    caption = {
        "start": round(event["tStartMs"] / 1000, 3),
        "duration": round(event["dDurationMs"] / 1000, 3),
        "text": text,
    }
    captions.append(caption)

with open(output_file, "w", encoding="utf-8") as out_fh:
    json.dump(captions, out_fh, indent=2, ensure_ascii=False)

print(f"Transkrypt w JSON zapisany do {output_file}")


In [None]:
# Merge all caption segments into a single plain-text file.
# json is already imported in the first cell; the import is repeated here so
# that this cell also runs on its own.
import json

with open('debata.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Keep every non-empty caption, skipping entries that consist only of the
# "[Muzyka]" (music) marker.
combined_text = [
    stripped
    for stripped in (entry.get('text', '').strip() for entry in data)
    if stripped and stripped != '[Muzyka]'
]

final_text = ' '.join(combined_text)

# The handle is called txt_file so that it does not replace the `output_file`
# path string set by the conversion cell with a (closed) file object.
with open('debata.txt', 'w', encoding='utf-8') as txt_file:
    txt_file.write(final_text)

print("Tekst jednolity zapisany pomyslnie do debata.txt")

In [None]:

# Translate the transcript from Polish to English with an LLM.
# The client talks to the OpenRouter endpoint; the API key is read from the
# OPENROUTER_API_KEY environment variable.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.getenv("OPENROUTER_API_KEY"),
)

try:
    with open('debata.txt', 'r', encoding='utf-8') as file:
        text = file.read()

    # The system prompt carries the translation instructions; the whole
    # transcript is sent as a single user message.
    response = client.chat.completions.create(
        model="google/gemini-2.5-flash-preview-05-20",
        messages=[
            {
                "role": "system",
                "content": "You are a professional translator. Translate the following text from Polish to English. Maintain the original meaning and tone."
            },
            {
                "role": "user",
                "content": text
            }
        ]
    )

    choice = response.choices[0]
    translated_text = choice.message.content
    # An empty/None completion would otherwise surface as an opaque TypeError
    # from file.write(); fail with a message that names the real problem.
    if not translated_text:
        raise ValueError("model returned no translation text")
    # finish_reason "length" means the output hit the token limit, i.e. the
    # translation stops part-way through the transcript.
    if choice.finish_reason == "length":
        print("Warning: translation was cut off by the model's output token limit; the saved file is incomplete.")

    output_file = 'debata_translated.txt'
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(translated_text)

    print(f"Tlumaczenie zapisane do {output_file}")
except FileNotFoundError:
    print("Error: debata.txt nie istnieje.")
except Exception as e:
    # Best-effort cell: API and I/O failures are reported, not re-raised.
    print(f"Error: {str(e)}")

In [None]:
# Identify the speakers in the debate with an LLM, based on context.
# Uses the `client` created in the translation cell.
try:
    with open('debata_translated.txt', 'r', encoding='utf-8') as file:
        text = file.read()

    # The system prompt lists the candidates and asks for a JSON answer; the
    # translated transcript is sent as a single user message.
    # NOTE(review): the cell that later loads the result reads the 'speaker'
    # and 'segment' keys of each entry, but the prompt does not name the keys
    # -- confirm the model output matches.
    response = client.chat.completions.create(
        model="google/gemini-2.5-flash-preview-05-20",
        messages=[
            {
                "role": "system",
                "content": """You are a debate analysis expert. Identify who is speaking in each segment of this Polish presidential debate transcript. 

The candidates include:
1. Krzysztof Stanowski
2. Joanna Senyszyn
3. Marek Woch
4. Marek Jakubiak
5. Artur Bartoszewicz
6. Magdalena Biejat
7. Karol Nawrocki
8. Rafał Trzaskowski
9. Szymon Hołownia
10. Maciej Maciak
11. Adrian Zandberg
12. Grzegorz Braun
13. Sławomir Mentzen

Plus moderators/presenters from various TV stations.

For each segment, determine if it's a presenter/moderator speaking or one of the candidates. If it's a candidate, identify which one based on context, name mentions, or speaking patterns.

When candidates interrupt each other or speak out of turn, mark these interruptions separately. For example, if Candidate A is speaking and Candidate B interrupts, identify this as an interruption and attribute it to the correct speaker. Include these interruptions in your JSON structure with a field indicating it's an interruption.

Return your analysis in JSON format with each segment containing the text and the identified speaker.
"""
            },
            {
                "role": "user",
                "content": text
            }
        ]
    )

    choice = response.choices[0]
    identified_speakers = choice.message.content
    # An empty/None completion would otherwise surface as an opaque TypeError
    # from file.write(); fail with a message that names the real problem.
    if not identified_speakers:
        raise ValueError("model returned no speaker analysis")
    # finish_reason "length" means the output hit the token limit, so the
    # JSON is cut off and will not parse.
    if choice.finish_reason == "length":
        print("Warning: speaker analysis was cut off by the model's output token limit; the saved JSON is incomplete.")

    # File name fixed from 'debate_speakers.json': the processing cell further
    # down opens "debata_speakers.json", matching the other debata* files.
    output_file = 'debata_speakers.json'
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(identified_speakers)

    print(f"Identyfikacja rozmowcow zapisana do {output_file}")    
except FileNotFoundError:
    print("Error: 'debata_translated.txt' nie istnieje.")
except Exception as e:
    # Best-effort cell: API and I/O failures are reported, not re-raised.
    print(f"Error: {str(e)}")

In [None]:
# Re-save the speaker identification result as pretty-printed JSON.
# Uses `identified_speakers` and `output_file` from the identification cell.
def _strip_code_fence(raw: str) -> str:
    """Return ``raw`` without a surrounding Markdown code fence.

    Chat models often wrap JSON answers in a fenced block (three backticks,
    optionally followed by a language tag such as ``json``). Text that does
    not start with a fence is returned unchanged apart from surrounding
    whitespace.
    """
    cleaned = raw.strip()
    if not cleaned.startswith("```"):
        return cleaned
    # Drop the whole opening fence line, including any language tag.
    _, _, cleaned = cleaned.partition("\n")
    cleaned = cleaned.rstrip()
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    return cleaned.strip()


try:
    parsed_json = json.loads(_strip_code_fence(identified_speakers))
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(parsed_json, file, indent=2, ensure_ascii=False)
    print(f"Identyfikacja rozmowcow zapisana w JSON do {output_file}")
except json.JSONDecodeError as e:
    # The raw model output stays on disk untouched when it is not valid JSON.
    print(f"Error: {e}")

## Sprawdzenie wyników transkrypcji i identyfikacji rozmówców debaty

In [6]:
# Process the speaker data: write one text file per speaker and build a
# DataFrame with one row per segment.
with open("debata_speakers.json", 'r', encoding='utf-8') as file:
    content = file.read()
    chunks = json.loads(content)

# One [speaker, segment] pair per chunk, in transcript order.
speakers_array = [[chunk['speaker'], chunk['segment']] for chunk in chunks]

# Collect each speaker's segments, then join them once with single spaces.
segments_by_speaker = {}
for speaker, segment in speakers_array:
    segments_by_speaker.setdefault(speaker, []).append(segment)
speakers_map = {
    speaker: " ".join(parts) for speaker, parts in segments_by_speaker.items()
}

# Speaker labels come from the model output, so they may contain path
# separators or other characters that are not valid in file names; those are
# replaced with "_" (ordinary names are left unchanged).
unsafe_chars = str.maketrans({ch: "_" for ch in '/\\:*?"<>|'})
for speaker, text in speakers_map.items():
    safe_name = str(speaker).translate(unsafe_chars)
    with open(f"speaker_{safe_name}.txt", 'w', encoding='utf-8') as file:
        file.write(text)

# Passing the column names to the constructor also works when there are no
# segments at all; assigning .columns afterwards fails on an empty frame.
speakers_df = pd.DataFrame(speakers_array, columns=['speaker', 'text'])

In [7]:
# Count how many segments each speaker has, most frequent speaker first.
(
    speakers_df
    .groupby('speaker')
    .count()
    .sort_values(by='text', ascending=False)
)

Unnamed: 0_level_0,text
speaker,Unnamed: 1_level_1
Presenter,124
Szymon Hołownia,22
Rafał Trzaskowski,21
Karol Nawrocki,20
Sławomir Mentzen,20
Krzysztof Stanowski,16
Magdalena Biejat,16
Grzegorz Braun,15
Adrian Zandberg,13
Artur Bartoszewicz,13
