In [1]:
# Subtitle Generator & Video Summarizer

## Objective
#Build an end-to-end NLP pipeline to convert videos into subtitles and generate a concise summary of the video content.

## Workflow
#1. Load video (files are read from the local `Video/` folder)
#2. Convert video to audio
#3. Speech-to-text conversion
#4. Subtitle generation
#5. Text summarization

## Tools Used
#- Python
#- Speech Recognition (OpenAI Whisper)
#- NLP (Text Processing & Summarization)
#- Jupyter Notebook


In [2]:
# Basic Python libraries
import os
import math

# Audio & video processing
from moviepy.editor import VideoFileClip

# Speech to Text (Whisper)
import whisper

# NLP Summarization
from transformers import pipeline

# Evaluation metrics
import jiwer
from rouge_score import rouge_scorer

In [3]:
from moviepy.editor import VideoFileClip
import os

# Extract the audio track of every input video into Audio/<name>.wav.
os.makedirs("Audio", exist_ok=True)

videos = ["Video1", "Video2", "Video3"]

for video_name in videos:
    # The context manager closes the underlying reader even when the export
    # fails part-way, so no file handle or subprocess is left behind.
    with VideoFileClip(f"Video/{video_name}.mp4") as clip:
        if clip.audio is None:
            # Fail with an explicit message instead of an AttributeError on None.
            raise ValueError(f"Video/{video_name}.mp4 has no audio track")
        clip.audio.write_audiofile(f"Audio/{video_name}.wav")

print("Audio extraction done")


MoviePy - Writing audio in Audio/Video1.wav


                                                                                                                       

MoviePy - Done.
MoviePy - Writing audio in Audio/Video2.wav


                                                                                                                       

MoviePy - Done.
MoviePy - Writing audio in Audio/Video3.wav


                                                                                                                       

MoviePy - Done.
Audio extraction done


In [4]:
# Load the Whisper checkpoint once; the transcription cell reuses `model`.
whisper_model_name = "base"
model = whisper.load_model(whisper_model_name)
print("Whisper model loaded")


Whisper model loaded


In [5]:
# Transcribe the three extracted audio files in order; each result is the
# object returned by `model.transcribe` (later cells read "text" and "segments").
result1, result2, result3 = [
    model.transcribe(audio_path)
    for audio_path in ("Audio/Video1.wav", "Audio/Video2.wav", "Audio/Video3.wav")
]

print("Whisper transcription completed")


Whisper transcription completed


In [6]:
# Persist the plain-text transcript of every video.
# The output folder must exist before open(..., "w") is called; without this
# a fresh checkout fails with FileNotFoundError.
os.makedirs("transcripts", exist_ok=True)

for video_name, transcription in (
    ("Video1", result1),
    ("Video2", result2),
    ("Video3", result3),
):
    with open(f"transcripts/{video_name}.txt", "w", encoding="utf-8") as f:
        f.write(transcription["text"])

print("Transcripts saved")


Transcripts saved


In [7]:
def save_as_srt(result, output_path):
    """Write the segments of a transcription result as an SRT subtitle file.

    Parameters
    ----------
    result : dict
        Must provide a "segments" iterable. Each segment is read for its
        "start" and "end" values (seconds) and its "text" value.
    output_path : str
        Destination file, written as UTF-8. Missing parent directories are
        created so the call also works on a fresh checkout.
    """
    def format_time(t):
        """Render ``t`` seconds as an ``HH:MM:SS,mmm`` timestamp."""
        # Convert to whole milliseconds first and round: truncating the
        # fractional part (e.g. 1.001 - 1 -> 0.000999...) silently drops a
        # millisecond, and rounding here lets 59.9996 carry over to 00:01:00,000.
        total_ms = int(round(t * 1000))
        hours, remainder = divmod(total_ms, 3600000)
        minutes, remainder = divmod(remainder, 60000)
        seconds, milliseconds = divmod(remainder, 1000)
        return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"

    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        # SRT cue numbers are 1-based.
        for i, seg in enumerate(result["segments"], 1):
            f.write(f"{i}\n")
            f.write(f"{format_time(seg['start'])} --> {format_time(seg['end'])}\n")
            # A blank line terminates each cue.
            f.write(seg["text"].strip() + "\n\n")

# Make sure the target folder exists before writing; otherwise opening the
# first .srt file fails on a fresh checkout.
os.makedirs("subtitles", exist_ok=True)

for transcription, srt_path in (
    (result1, "subtitles/Video1.srt"),
    (result2, "subtitles/Video2.srt"),
    (result3, "subtitles/Video3.srt"),
):
    save_as_srt(transcription, srt_path)

print("SRT subtitle files created")


SRT subtitle files created


In [8]:
import re

def extractive_summary(text, max_sentences=5):
    """Return the first ``max_sentences`` sentences of ``text`` longer than six words.

    Parameters
    ----------
    text : str
        Raw text; runs of whitespace are collapsed and the ends are trimmed.
    max_sentences : int or None
        Upper bound on the number of sentences kept. ``0`` or a negative
        value yields an empty summary; ``None`` keeps every qualifying sentence.

    Returns
    -------
    str
        The selected sentences joined by single spaces ("" if none qualify).
    """
    if max_sentences is not None and max_sentences <= 0:
        return ""

    # Collapse whitespace and trim so the summary does not start with a space.
    text = re.sub(r'\s+', ' ', text).strip()

    # Split after sentence-ending punctuation that is followed by spaces.
    sentences = re.split(r'(?<=[.!?]) +', text)

    summary_sentences = []
    for s in sentences:
        # Skip very short sentences (six words or fewer).
        if len(s.split()) > 6:
            summary_sentences.append(s)
            if max_sentences is not None and len(summary_sentences) >= max_sentences:
                break

    return " ".join(summary_sentences)


In [9]:
# Build one extractive summary per transcript, in video order.
summary1, summary2, summary3 = [
    extractive_summary(transcription["text"])
    for transcription in (result1, result2, result3)
]

print("Summaries generated successfully")


Summaries generated successfully


In [10]:
# Show the extractive summary built from the Video1 transcript.
print(summary1)

 I'm going to ask you to participate in an experiment, which is that when you leave this room, when you go out into the world today, tomorrow, whenever you feel like it, I'd like you to ask and answer one question of someone who's a stranger. You might meet them on the bus, you might meet them walking down the street, and I'm going to show you the question that I'm going to ask you to ask and answer. The question is, when was the last time you cried in front of someone? Now just out of curiosity, how many of you are really excited about this experiment? Because there can be nothing that seems more intimidating or less fun than finding a stranger asking them when they've cried in front of someone else and then telling them about the time you cried in front of someone else.


In [11]:
# Show the extractive summary built from the Video2 transcript.
print(summary2)


 How does your brain fall in love? Is it something magical that happens to your brain? Or is there something biological that happens to your brain that causes us to fall in love? This is what we know about love. We know that certain neurotransmitters increase in some drop.


In [12]:
# Show the extractive summary built from the Video3 transcript.
# NOTE(review): the recorded output for this cell is mostly garbled, mixed-script
# text, which suggests the transcription of Video3 is unreliable — check the
# audio and the transcription settings before trusting this summary.
print(summary3)

 Pon weith Weith Mae o'r f gratedi president Y armed part Huethodd'm What's this idea that goes on spreading? You may say there are no new ideas about love and i would say this isn't romance, it's science. ATTASHMENT gehenu sees love as part of our evolutionary design am toaint jynniau garad 무� Fu, am aug yn wir cryyr dat ein felly? A y fel dat ein sy'n perfsig deg defeat am bodol cywedig dyfu ddiddos ar fitnesseur and87 Lilàn Kangethol�로, phenkblpfu o teu ddar, gallorwch felуй nhw itن y Ddoch, yn ei bod yn gallu cypannoma esg, gallabedos yn yDING. 26 syn i stwyg update o'r oelion Ahmadis.


In [13]:
# Persist every summary as a UTF-8 text file.
# Create the output folder first; open(..., "w") does not create missing
# directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs("summaries", exist_ok=True)

for video_name, summary_text in (
    ("Video1", summary1),
    ("Video2", summary2),
    ("Video3", summary3),
):
    with open(f"summaries/{video_name}_summary.txt", "w", encoding="utf-8") as f:
        f.write(summary_text)

print("Summary files saved successfully")


Summary files saved successfully


In [14]:
import jiwer

# Word error rate per video.
# NOTE(review): each transcript is compared with itself (reference and
# hypothesis are the same string), so the result is 0.0 by construction and
# says nothing about transcription quality. A human-made reference transcript
# is needed as the first argument for this metric to be meaningful.
wer1, wer2, wer3 = [
    jiwer.wer(transcription["text"], transcription["text"])
    for transcription in (result1, result2, result3)
]

for label, score in (
    ("WER Video1:", wer1),
    ("WER Video2:", wer2),
    ("WER Video3:", wer3),
):
    print(label, score)


WER Video1: 0.0
WER Video2: 0.0
WER Video3: 0.0


In [15]:
from rouge_score import rouge_scorer

# ROUGE-1 / ROUGE-L of each summary, scored against the full transcript.
# NOTE(review): the summaries are sentences copied verbatim from the same
# transcript that is used as the reference here, so precision is 1.0 by
# construction and recall only reflects how short the summary is. An
# independent reference summary is required for an informative score.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

r1, r2, r3 = [
    scorer.score(transcription["text"], summary_text)
    for transcription, summary_text in (
        (result1, summary1),
        (result2, summary2),
        (result3, summary3),
    )
]

for label, scores in (
    ("ROUGE Video1:", r1),
    ("ROUGE Video2:", r2),
    ("ROUGE Video3:", r3),
):
    print(label, scores)

ROUGE Video1: {'rouge1': Score(precision=1.0, recall=0.06006240249609984, fmeasure=0.1133186166298749), 'rougeL': Score(precision=1.0, recall=0.06006240249609984, fmeasure=0.1133186166298749)}
ROUGE Video2: {'rouge1': Score(precision=1.0, recall=0.026982378854625552, fmeasure=0.05254691689008043), 'rougeL': Score(precision=1.0, recall=0.026982378854625552, fmeasure=0.05254691689008043)}
ROUGE Video3: {'rouge1': Score(precision=1.0, recall=0.08695652173913043, fmeasure=0.16), 'rougeL': Score(precision=1.0, recall=0.08695652173913043, fmeasure=0.16)}


In [16]:
## Output
#The project successfully generates subtitles and a summarized version of the video content, demonstrating practical automation using NLP techniques.

## Conclusion
#This project showcases an end-to-end NLP workflow useful for video content analysis, accessibility, and automated content understanding.
