In [2]:
import nltk
# Fetch the NLTK "punkt" and "punkt_tab" data packages; the recorded output
# shows NLTK reporting them as already up to date when nothing needs fetching.
# NOTE(review): presumably required by the sumy Tokenizer("english") used in
# the later cells -- confirm.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Elina\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Elina\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
import re
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

# === Step 1: read the .srt file and turn it into subtitle_blocks ===
def load_subtitle_blocks(srt_path):
    """Parse an SRT file into a list of ``{"text": ...}`` dicts, one per cue.

    A cue is expected to look like ``index / timing line / text lines``; cues
    without the index line are accepted as well. The timing line is the first
    of the cue's two leading lines that contains ``-->``; every line after it
    is cue text and is joined with single spaces. When no timing line is
    found, the text is taken by position (from the third line of a cue with
    three or more lines, from the second line of a two-line cue).

    Args:
        srt_path: Path to a UTF-8 encoded ``.srt`` file (a BOM is tolerated).

    Returns:
        List of ``{"text": str}`` dicts in file order. Cues that carry no text
        (for example an index and a timing line only) are skipped, so a timing
        line is never stored as subtitle text.
    """
    # "utf-8-sig" decodes plain UTF-8 too, but drops a leading BOM if present.
    with open(srt_path, "r", encoding="utf-8-sig") as f:
        raw_text = f.read()

    subtitle_blocks = []

    # Cues are separated by blank lines; the separator may hold stray whitespace.
    for block in re.split(r"\n\s*\n", raw_text.strip()):
        lines = block.strip().split("\n")

        # Only the first two lines can be the timing line (index is optional).
        timing_idx = next(
            (i for i, line in enumerate(lines[:2]) if "-->" in line), None
        )
        if timing_idx is not None:
            text_lines = lines[timing_idx + 1:]
        elif len(lines) >= 3:
            text_lines = lines[2:]
        elif len(lines) == 2:
            text_lines = lines[1:]
        else:
            text_lines = []

        text = " ".join(text_lines).strip()
        if text:
            subtitle_blocks.append({"text": text})

    return subtitle_blocks

# === Step 2: summarization with sumy ===
def summarize_blocks(blocks, sentence_count=10):
    """Summarize subtitle blocks with sumy's ``LsaSummarizer``.

    Args:
        blocks: Iterable of dicts, each with a ``"text"`` entry.
        sentence_count: Sentence count passed to the summarizer (default 10).

    Returns:
        The sentences returned by the summarizer, converted with ``str`` and
        joined with newlines.
    """
    full_text = " ".join(entry["text"] for entry in blocks)
    document = PlaintextParser.from_string(full_text, Tokenizer("english")).document

    lsa = LsaSummarizer()
    # Stop words are disabled (the original author did this to avoid errors).
    lsa.stop_words = []

    picked = lsa(document, sentence_count)
    return "\n".join(map(str, picked))

# === Main block ===
if __name__ == "__main__":
    srt_file = "14194_1_1.srt"
    output_file = "recap_sumy_2.txt"

    subtitle_blocks = load_subtitle_blocks(srt_file)
    summary = summarize_blocks(subtitle_blocks, sentence_count=10)

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(summary)

    # Report the file that was actually written; the message previously named
    # "recap_sumy.txt" although the summary goes to "recap_sumy_2.txt".
    print(f"{output_file} создан.")


recap_sumy.txt создан.


In [6]:
import re
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

# === Step 1: read the .srt file and extract blocks with their start timecodes ===
def load_subtitle_blocks(srt_path):
    """Parse an SRT file into ``{"time": ..., "text": ...}`` dicts, one per cue.

    A cue is expected to look like ``index / timing line / text lines``; cues
    without the index line are accepted as well. The timing line is the first
    of the cue's two leading lines that contains ``-->``. ``"time"`` is the
    part of that line before the arrow (the cue start), and every following
    line is cue text, joined with single spaces. When no timing line is found,
    the fields are taken by position (second line / remaining lines for cues
    of three or more lines, first line / second line for two-line cues).

    Args:
        srt_path: Path to a UTF-8 encoded ``.srt`` file (a BOM is tolerated).

    Returns:
        List of ``{"time": str, "text": str}`` dicts in file order. Cues that
        carry no text (for example an index and a timing line only) are
        skipped.
    """
    # "utf-8-sig" decodes plain UTF-8 too, but drops a leading BOM so it can
    # never end up inside a "time" value.
    with open(srt_path, "r", encoding="utf-8-sig") as f:
        raw_text = f.read()

    subtitle_blocks = []

    # Cues are separated by blank lines; the separator may hold stray whitespace.
    for block in re.split(r"\n\s*\n", raw_text.strip()):
        lines = block.strip().split("\n")

        # Only the first two lines can be the timing line (index is optional).
        timing_idx = next(
            (i for i, line in enumerate(lines[:2]) if "-->" in line), None
        )
        if timing_idx is not None:
            # Keep only the start time, with or without spaces around the arrow.
            timecode = lines[timing_idx].split("-->")[0].strip()
            text_lines = lines[timing_idx + 1:]
        elif len(lines) >= 3:
            timecode = lines[1]
            text_lines = lines[2:]
        elif len(lines) == 2:
            timecode = lines[0]
            text_lines = lines[1:]
        else:
            continue

        text = " ".join(text_lines).strip()
        if text:
            subtitle_blocks.append({"time": timecode, "text": text})

    return subtitle_blocks

# === Step 2: summarization with timecodes ===
def summarize_blocks_with_time(blocks, sentence_count=10):
    """Summarize subtitle blocks and prefix each sentence with its timecode.

    The block texts are joined with single spaces and summarized with sumy's
    ``LsaSummarizer``. Each summary sentence is then located inside the joined
    text, and the block that contains the sentence's first character supplies
    the timecode. This also covers sentences that begin in the middle of a
    block, which a comparison against block prefixes cannot match.

    Args:
        blocks: Iterable of dicts with ``"time"`` and ``"text"`` entries.
        sentence_count: Sentence count passed to the summarizer (default 10).

    Returns:
        Newline-joined lines of the form ``[<time>] <sentence>``.
        NOTE(review): assumes the summarizer returns sentences as verbatim
        substrings of the joined text. A sentence that cannot be found that
        way falls back to matching the first 10 characters of a block, and is
        omitted if that fails too.
    """
    from bisect import bisect_right  # local import keeps the cell self-contained

    blocks = list(blocks)
    texts = [block["text"] for block in blocks]
    subtitle_text = " ".join(texts)

    # Character offset at which each block's text starts inside subtitle_text.
    block_offsets = []
    position = 0
    for text in texts:
        block_offsets.append(position)
        position += len(text) + 1  # +1 for the joining space

    parser = PlaintextParser.from_string(subtitle_text, Tokenizer("english"))
    summarizer = LsaSummarizer()
    summarizer.stop_words = []

    summary_texts = [str(s) for s in summarizer(parser.document, sentence_count)]

    # Map summary sentences back to blocks.
    results = []
    search_from = 0
    for s in summary_texts:
        sentence = s.strip()
        if not sentence:
            continue

        # Search forward from the previous hit first so that repeated
        # sentences resolve to successive occurrences, then from the start.
        pos = subtitle_text.find(sentence, search_from)
        if pos == -1:
            pos = subtitle_text.find(sentence)

        if pos != -1:
            block = blocks[bisect_right(block_offsets, pos) - 1]
            search_from = pos + len(sentence)
        else:
            # Legacy heuristic; empty texts are skipped because "" is a prefix
            # of every sentence.
            block = next(
                (
                    b for b in blocks
                    if b["text"] and sentence.startswith(b["text"][:10])
                ),
                None,
            )

        if block is not None:
            results.append(f"[{block['time']}] {s}")

    return "\n".join(results)

# === Main block ===
# The recorded cell output shows the final message, i.e. this guard was true
# when the cell was executed.
if __name__ == "__main__":
    # Input subtitle file, given relative to the working directory.
    srt_file = "14194_1_1.srt"
    subtitle_blocks = load_subtitle_blocks(srt_file)
    summary = summarize_blocks_with_time(subtitle_blocks, sentence_count=10)

    # Write the timecoded summary as UTF-8 text.
    with open("recap_with_timecodes.txt", "w", encoding="utf-8") as f:
        f.write(summary)

    # Prints "<file> created." (in Russian).
    print("recap_with_timecodes.txt создан.")


recap_with_timecodes.txt создан.


In [8]:
import re
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

# === Step 1: read the .srt file and extract blocks with start and end timecodes ===
def load_subtitle_blocks(srt_path):
    """Parse an SRT file into ``{"start", "end", "text"}`` dicts, one per cue.

    A cue is expected to look like ``index / timing line / text lines``; cues
    without the index line are accepted as well. The timing line is the first
    of the cue's two leading lines that contains ``-->``. ``"start"`` is the
    part before the arrow and ``"end"`` is the first whitespace-delimited
    token after it, so spacing around the arrow and extra tokens after the end
    time do not raise. Every line after the timing line is cue text, joined
    with single spaces.

    Cues without a recognisable timing line get the placeholder
    ``"00:00:00,000"`` for both times, and their text is taken by position
    (from the third line of a cue with three or more lines, from the second
    line of a two-line cue).

    Args:
        srt_path: Path to a UTF-8 encoded ``.srt`` file (a BOM is tolerated).

    Returns:
        List of ``{"start": str, "end": str, "text": str}`` dicts in file
        order. Cues that carry no text are skipped.
    """
    placeholder = "00:00:00,000"

    # "utf-8-sig" decodes plain UTF-8 too, but drops a leading BOM if present.
    with open(srt_path, "r", encoding="utf-8-sig") as f:
        raw_text = f.read()

    subtitle_blocks = []

    # Cues are separated by blank lines; the separator may hold stray whitespace.
    for block in re.split(r"\n\s*\n", raw_text.strip()):
        lines = block.strip().split("\n")

        # Only the first two lines can be the timing line (index is optional).
        timing_idx = next(
            (i for i, line in enumerate(lines[:2]) if "-->" in line), None
        )
        if timing_idx is not None:
            start, _, rest = lines[timing_idx].partition("-->")
            start = start.strip()
            # Tokens after the end time are ignored; a missing end time falls
            # back to the start time instead of raising.
            rest_tokens = rest.split()
            end = rest_tokens[0] if rest_tokens else start
            text_lines = lines[timing_idx + 1:]
        else:
            start = end = placeholder
            text_lines = lines[2:] if len(lines) >= 3 else lines[1:]

        text = " ".join(text_lines).strip()
        if text:
            subtitle_blocks.append({"start": start, "end": end, "text": text})

    return subtitle_blocks

# === Step 2: summarization with start and end timecodes ===
def summarize_blocks_with_time(blocks, sentence_count=10):
    """Summarize subtitle blocks and prefix each sentence with its time range.

    The block texts are joined with single spaces and summarized with sumy's
    ``LsaSummarizer``. Each summary sentence is then located inside the joined
    text, and the block that contains the sentence's first character supplies
    the ``start``/``end`` pair. This also covers sentences that begin in the
    middle of a block, which a comparison against block prefixes cannot match.

    Args:
        blocks: Iterable of dicts with ``"start"``, ``"end"`` and ``"text"``
            entries.
        sentence_count: Sentence count passed to the summarizer (default 10).

    Returns:
        Newline-joined lines of the form ``[<start>] - [<end>] <sentence>``.
        NOTE(review): assumes the summarizer returns sentences as verbatim
        substrings of the joined text. A sentence that cannot be found that
        way falls back to matching the first 10 characters of a block, and is
        omitted if that fails too.
    """
    from bisect import bisect_right  # local import keeps the cell self-contained

    blocks = list(blocks)
    texts = [block["text"] for block in blocks]
    subtitle_text = " ".join(texts)

    # Character offset at which each block's text starts inside subtitle_text.
    block_offsets = []
    position = 0
    for text in texts:
        block_offsets.append(position)
        position += len(text) + 1  # +1 for the joining space

    parser = PlaintextParser.from_string(subtitle_text, Tokenizer("english"))
    summarizer = LsaSummarizer()
    summarizer.stop_words = []

    summary_texts = [str(s) for s in summarizer(parser.document, sentence_count)]

    results = []
    search_from = 0
    for s in summary_texts:
        sentence = s.strip()
        if not sentence:
            continue

        # Search forward from the previous hit first so that repeated
        # sentences resolve to successive occurrences, then from the start.
        pos = subtitle_text.find(sentence, search_from)
        if pos == -1:
            pos = subtitle_text.find(sentence)

        if pos != -1:
            block = blocks[bisect_right(block_offsets, pos) - 1]
            search_from = pos + len(sentence)
        else:
            # Legacy heuristic; empty texts are skipped because "" is a prefix
            # of every sentence.
            block = next(
                (
                    b for b in blocks
                    if b["text"] and sentence.startswith(b["text"][:10])
                ),
                None,
            )

        if block is not None:
            results.append(f"[{block['start']}] - [{block['end']}] {s}")

    return "\n".join(results)

# === Main block ===
# The recorded cell output shows the final message, i.e. this guard was true
# when the cell was executed.
if __name__ == "__main__":
    # Input subtitle file, given relative to the working directory.
    srt_file = "14194_1_1.srt"
    subtitle_blocks = load_subtitle_blocks(srt_file)
    summary = summarize_blocks_with_time(subtitle_blocks, sentence_count=10)

    # Write the summary with start/end time ranges as UTF-8 text.
    with open("recap_with_ranges.txt", "w", encoding="utf-8") as f:
        f.write(summary)

    # Prints "<file> created." (in Russian).
    print("recap_with_ranges.txt создан.")


recap_with_ranges.txt создан.
