<a href="https://colab.research.google.com/github/kasodeep/inheritance-project/blob/main/final_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Retrieve transcript**

In [None]:
!pip install fpdf
!pip install pytube
!pip install youtube-transcript-api
!pip install fpdf
!pip install youtube_transcript_api

In [3]:
from youtube_transcript_api import YouTubeTranscriptApi
from fpdf import *

In [4]:
def get_transcript(youtube_url, output_pdf_path):
    """Fetch a video's transcript, save it to a PDF and return it.

    A manually created transcript is preferred; when the video has none, the
    first auto-generated transcript is used instead.

    Args:
        youtube_url: A YouTube watch URL (``...watch?v=<id>``, extra query
            parameters allowed), a ``https://youtu.be/<id>`` short link, or a
            bare video id.
        output_pdf_path: Destination path handed to ``save_to_pdf``.

    Returns:
        tuple: ``(full_transcript, language_code)`` where ``full_transcript``
        is the transcript text joined with single spaces.

    Raises:
        Exception: If the video exposes no transcript at all. Errors raised by
            ``YouTubeTranscriptApi.list_transcripts`` propagate unchanged.
    """
    from urllib.parse import parse_qs, urlparse

    # Pasted URLs often carry stray whitespace or extra parameters (&t=, &list=).
    youtube_url = youtube_url.strip()
    parsed_url = urlparse(youtube_url)
    query_ids = parse_qs(parsed_url.query).get("v")
    if query_ids:
        video_id = query_ids[0]
    elif parsed_url.netloc.lower().endswith("youtu.be"):
        video_id = parsed_url.path.strip("/").split("/")[0]
    else:
        # Anything else (e.g. a bare video id) keeps the previous behaviour.
        video_id = youtube_url.split("v=")[-1]

    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

    # Select on the `is_generated` flag instead of calling
    # `find_manually_created_transcript()` with no arguments: that call needs a
    # list of language codes and, without one, failed on every video.
    available = list(transcript_list)
    manual_transcripts = [trans for trans in available if not trans.is_generated]
    generated_transcripts = [trans for trans in available if trans.is_generated]
    candidates = manual_transcripts + generated_transcripts
    if not candidates:
        raise Exception("No suitable transcript found.")

    transcript = candidates[0]
    language_code = transcript.language_code  # Save the detected language

    # NOTE(review): snippets are dicts in the library version this notebook was
    # written for; newer releases appear to return objects with a `.text`
    # attribute instead, so both shapes are accepted - confirm against the
    # installed version.
    full_transcript = " ".join(
        part["text"] if isinstance(part, dict) else part.text
        for part in transcript.fetch()
    )

    # Save the transcript to a PDF file
    save_to_pdf(full_transcript, language_code, output_pdf_path)

    return full_transcript, language_code  # Return both the transcript and detected language

In [5]:
def save_to_pdf(transcript, language_code, output_pdf_path):
    """Write ``transcript`` to a one-column PDF with a language-code header.

    Args:
        transcript: Body text to write.
        language_code: Value shown in the centred ``Language Code: ...`` header
            line at the top of the first page.
        output_pdf_path: Path of the PDF file to create.

    Characters that cannot be encoded as latin-1 are written as ``?``.
    """
    def latin1_safe(value):
        # The built-in "Arial" font is limited to latin-1; without this step a
        # transcript containing other characters (curly quotes, non-Latin
        # scripts, symbols) makes the PDF generation fail on encoding.
        return value.encode("latin-1", "replace").decode("latin-1")

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    # Header line, followed by a 10-unit vertical gap before the body.
    pdf.cell(200, 10, txt=latin1_safe(f"Language Code: {language_code}"), ln=True, align='C')
    pdf.ln(10)

    # Width 0 lets the cell extend to the right margin; text wraps automatically.
    pdf.multi_cell(0, 10, txt=latin1_safe(transcript))

    pdf.output(output_pdf_path)

**Implement Summarizer**

In [None]:
# Install PyMuPDF into the running kernel's environment.
# NOTE(review): presumably this is what provides the `fitz` module used for PDF text extraction below - confirm.
%pip install PyMuPDF

In [None]:
%pip install transformers
%pip install pytorch

In [12]:
import fitz
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``, concatenated.

    Best-effort: any error is printed instead of raised, and whatever text was
    collected before the failure (possibly an empty string) is returned.
    """
    page_texts = []
    try:
        with fitz.open(pdf_path) as doc:
            for page_num in range(doc.page_count):
                page_texts.append(doc[page_num].get_text())
    except Exception as e:
        print(f"Error extracting text: {str(e)}")
    return "".join(page_texts)

In [14]:
from transformers import BartTokenizer, BartForConditionalGeneration

In [None]:
# Load the tokenizer first, then the model, both from the same pretrained checkpoint.
tokenizer, model = (
    loader.from_pretrained("facebook/bart-large-cnn")
    for loader in (BartTokenizer, BartForConditionalGeneration)
)

In [16]:
# text: the transcript text that will be summarized

In [41]:
def generate_summary_t5(text, tokenizer, model, language_code=None, output_pdf_path="summary.pdf"):
    """Summarize ``text`` with a seq2seq model and save the summary as a PDF.

    Args:
        text: Text to summarize. Input beyond 1024 tokens is truncated.
        tokenizer: Tokenizer providing the ``encode`` / ``decode`` calls used below.
        model: Model providing the ``generate`` call used below.
        language_code: Language code to print in the PDF header. When omitted,
            it is read from a leading ``Language Code: <code>`` header in
            ``text`` (the header ``save_to_pdf`` writes); only if that is
            missing is the transcript re-fetched for the notebook-level
            ``url`` variable, as before.
        output_pdf_path: Where the summary PDF is written.

    Returns:
        str: The generated summary.
    """
    import re

    # Resolve the language code before the expensive generation step so that a
    # failure here (e.g. no `url` defined) surfaces immediately, and so that the
    # transcript is not downloaded a second time just to learn its language.
    if language_code is None:
        header = re.match(r"\s*Language Code:\s*(\S+)", text)
        if header:
            language_code = header.group(1)
        else:
            _, language_code = get_transcript(url, output_pdf_path)

    # NOTE(review): the "summarize: " prefix is a T5 convention, yet this
    # notebook passes a BART model - confirm whether the prefix is wanted.
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
    # NOTE(review): min_length=600 forces a long summary even for short inputs.
    summary_ids = model.generate(inputs, max_length=1000, min_length=600, length_penalty=1.0, num_beams=4, early_stopping=True)

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    save_to_pdf(summary, language_code, output_pdf_path)
    return summary

**Download Video**

In [None]:
%pip install pytube

In [34]:
from pytube import YouTube
import re

def clean_filename(title):
    """Turn a video title into a simple file name.

    Only word characters and whitespace are kept; every space character is
    then turned into an underscore (other whitespace is left as it is).
    """
    kept_characters = re.findall(r'[\w\s]', title)
    without_specials = "".join(kept_characters)
    return "_".join(without_specials.split(' '))

def download_youtube_video(video_url, output_path='.'):
    """Download the highest-resolution stream of a YouTube video.

    Best-effort: any failure is printed rather than raised.

    Args:
        video_url: URL of the video to download.
        output_path: Directory the file is written to.

    Returns:
        The value returned by the stream's ``download`` call (the saved file's
        path) on success, or ``None`` when the download failed.
    """
    try:
        # Create a YouTube object
        yt = YouTube(video_url)

        # Get the highest resolution stream
        video_stream = yt.streams.get_highest_resolution()
        if video_stream is None:
            print("No downloadable stream found")
            return None

        cleaned_title = clean_filename(yt.title)
        print(f"Downloading: {cleaned_title}")

        # NOTE(review): current pytube releases use `filename` verbatim, so the
        # extension has to be part of it or the file is saved without one -
        # confirm against the installed pytube version.
        # An empty cleaned title falls back to pytube's own default file name.
        filename = f"{cleaned_title}.{video_stream.subtype}" if cleaned_title else None

        # Download the video
        saved_path = video_stream.download(output_path, filename=filename)
        print("Download complete")
        return saved_path

    except Exception as e:
        print(e)
        return None

In [None]:
# Example run: download one video into the current working directory.
download_youtube_video(
    video_url="https://www.youtube.com/watch?v=reUZRyXxUs4",
    output_path='.',
)

**Running Functions**

In [None]:
# End-to-end run: fetch the transcript into a PDF, read the text back from that
# PDF, summarize it, and display the summary as the cell output.
output_pdf_path = "transcript_output.pdf"
# Strip stray whitespace from the pasted URL: the video id is derived from this
# string, so a trailing space or newline would end up inside the id.
url=input("Enter the url of the video").strip()
get_transcript(url, output_pdf_path)
text = extract_text_from_pdf(output_pdf_path)
summary = generate_summary_t5(text,tokenizer,model)
summary