In [None]:

#install whisper model
!pip install whisper
""" whisper is specifically designed for speech recognition and trained on large and diverse dataset.
    In addition to that, it can be used for multiple languages not only english. On top of that, it provides finest accuracy.
    I also tried wav2vec2 model for speech recognition but it didn't produced accurate output.
    As accuracy is more important I chose this model to work with."""

#preprocess the audio:
#install ffmpeg - This converts the audio file into standard format which supports whisper
!pip install ffmpeg
"""Although there are other ways to preprocess the audio, ffmpeg is faster and standalone framework to preprocess the audio.
    This resamples the audio and to the standard formart by handling audio format conversion.
    I also tried pydub to preprocess the audio but it was not as user-friendly as ffmpeg for advanced preprocessing.
    So, I chose ffmpeg over pydub for preprocessing."""

#language detection:
# load audio and pad/trim it to fit 30 seconds
import whisper
model=whisper.load_model("base")
audio = whisper.load_audio("path_for_the_audio")
audio = whisper.pad_or_trim(audio)

# make log-Mel spectrogram and move to the same device as the model
mel = whisper.log_mel_spectrogram(audio).to(model.device)

# detect the spoken language
_, probs = model.detect_language(mel)
lang=max(probs, key=probs.get)
""" It will detect the language that is used in the audio and store that value in lang variable.
    This will be used to set language attribute while transcribing to ensure that it will use correct language for transcription."""

#Transcription using whisper model:
#The above line takes the audio file path, model size which is medium, stores the output in a json format by creating the file in the same directory and language. This also creates timestamps of the audio.

!whisper "path_for_the_audio" --model medium --output_format json --language lang

#semantic chunking:
pip install sentence_transformers

import json       #to read and write the json file
from sentence_transformers import SentenceTransformer   # to create text embeddings
from sklearn.metrics.pairwise import cosine_similarity  # to calculate the semantic similarity between text embeddings
# Tunable thresholds for the chunking pass.
SIMILARITY_THRESHOLD = 0.8  # neighbouring segments below this cosine similarity start a new chunk
MAX_CHUNK_SECONDS = 15      # a chunk is closed once its duration exceeds this many seconds

# Read the JSON transcript (explicit encoding so non-English text is read the
# same way on every platform).
with open('output_audio.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Transcript segments extracted from the JSON file.
segments = data['segments']

# This model is lightweight, accurate and fast, which is why it is used.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Default to an empty string if 'text' is missing, consistently for both the
# embeddings and the chunk text.
texts = [segment.get('text', '') for segment in segments]

# Convert each text into a fixed-size embedding vector (skipped when there are
# no segments at all).
embeddings = model.encode(texts) if texts else []

chunks = []
current_chunk = None  # chunk being accumulated; None means "start a new one"
chunk_id = 1

# Iterate through the segments to create semantic chunks.
for i, segment in enumerate(segments):
    if current_chunk is None:
        # A new chunk starts at the start time of its first segment.
        current_chunk = {"start_time": segment['start'], "text": ""}

    current_chunk["text"] += texts[i]
    current_chunk_duration = segment['end'] - current_chunk["start_time"]

    is_last_segment = i == len(segments) - 1
    if is_last_segment:
        close_chunk = True
    else:
        # Compare this segment with the next one; a low similarity means the
        # topic changed, so the chunk ends here. The duration check runs after
        # the segment was added, so a chunk may be longer than MAX_CHUNK_SECONDS.
        similarity = cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0][0]
        close_chunk = (
            similarity < SIMILARITY_THRESHOLD
            or current_chunk_duration > MAX_CHUNK_SECONDS
        )

    if close_chunk:
        # Finalize the current chunk.
        current_chunk["chunk_id"] = chunk_id
        current_chunk["chunk_length"] = current_chunk_duration
        current_chunk["end_time"] = segment['end']
        chunks.append(current_chunk)

        # Prepare for the next chunk.
        chunk_id += 1
        current_chunk = None

# Show every chunk as a brace-delimited block with one "key:value," line per field.
for entry in chunks:
    field_lines = [f"    {name}:{val}," for name, val in entry.items()]
    print("\n".join(["{", *field_lines, "},"]))



""" The whisper provided accurate transcription for English language but when I tested it against telugu song, although it gave transcription but it was not as accurate as english.
    Since the test audio was a song containing different dialects, I think it performed quite well compared to other models.
    For Whisper, I decided to use the medium-size model. I also tested the large model, expecting it to produce more accurate transcripts, but it produced the same output as medium.
    So it is not necessary to use the large model, as it consumes more space. I think medium provides a good balance between accuracy and space constraints.
    Although Whisper supports multiple languages, it did not handle overlapping speech, and fine-tuning would be needed to improve transcription accuracy for non-English languages."""



  checkpoint = torch.load(fp, map_location=device)
usage: whisper [-h] [--model MODEL] [--model_dir MODEL_DIR] [--device DEVICE]
               [--output_dir OUTPUT_DIR]
               [--output_format {txt,vtt,srt,tsv,json,all}]
               [--verbose VERBOSE] [--task {transcribe,translate}]
               [--language {af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,yue,zh,Afrikaans,Albanian,Amharic,Arabic,Armenian,Assamese,Azerbaijani,Bashkir,Basque,Belarusian,Bengali,Bosnian,Breton,Bulgarian,Burmese,Cantonese,Castilian,Catalan,Chinese,Croatian,Czech,Danish,Dutch,English,Estonian,Faroese,Finnish,Flemish,French,Galician,Georgian,German,Greek,Gujarati,Haitian,Haitian Creole,Hausa,Hawaiian,Hebrew,Hindi,Hungarian,Icelandic,Indonesian,Italian,

{
    start_time:0.0,
    text: Hi everyone, so let us start with lecture 1 of this course where we will be talking,
    chunk_id:1,
    chunk_length:23.080000000000002,
    end_time:23.080000000000002,
},
{
    start_time:23.080000000000002,
    text: about a brief and maybe a bit selective partial history of deep learning right.,
    chunk_id:2,
    chunk_length:5.919999999999998,
    end_time:29.0,
},
{
    start_time:29.0,
    text: So when we talk about deep learning right, so most of this material, the early material,
    chunk_id:3,
    chunk_length:4.640000000000001,
    end_time:33.64,
},
{
    start_time:33.64,
    text: that is there at least there in these slides is taken by from this article on deep learning,
    chunk_id:4,
    chunk_length:5.759999999999998,
    end_time:39.4,
},
{
    start_time:39.4,
    text: in neural networks and overview by Shmidubar.,
    chunk_id:5,
    chunk_length:4.039999999999999,
    end_time:43.44,
},
{
    start_time:43.44,
    text: There

In [None]:
from pytube import YouTube
from pydub import AudioSegment
import os
def download(url):
    """Download the audio of a YouTube video and convert it to a WAV file.

    Parameters
    ----------
    url : str
        Address of the YouTube video.

    Returns
    -------
    str
        Path of the WAV file written next to the downloaded audio file.

    Raises
    ------
    ValueError
        If the video exposes no audio-only stream.
    """
    yt = YouTube(url)
    ys = yt.streams.filter(only_audio=True).first()
    if ys is None:
        raise ValueError(f"No audio-only stream found for {url}")

    # `download()` must be called (it is a method) and returns the saved file path.
    audio_file = ys.download()
    base, _ = os.path.splitext(audio_file)
    wav_file = base + '.wav'

    # Load the downloaded file itself and re-encode it as WAV.
    AudioSegment.from_file(audio_file).export(wav_file, format='wav')

    # Remove the intermediate download, but never the WAV that was just written
    # (the two paths coincide when the download already has a .wav extension).
    if os.path.abspath(audio_file) != os.path.abspath(wav_file):
        os.remove(audio_file)
    return wav_file
# Address of the video passed to the download helper defined above.
video_url="https://youtu.be/EhrEdHMt5GY?si=xwMbKS7r9MoYOuaO"
# Run the helper on that address.
download(video_url)

enter https://youtu.be/EhrEdHMt5GY?si=xwMbKS7r9MoYOuaO


HTTPError: HTTP Error 403: Forbidden

In [None]:
pip install pytube

Collecting pytube
  Downloading pytube-15.0.0-py3-none-any.whl.metadata (5.0 kB)
Downloading pytube-15.0.0-py3-none-any.whl (57 kB)
Installing collected packages: pytube
Successfully installed pytube-15.0.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
import os

# `setx` is a Windows shell command, not Python, so it cannot be run as plain code
# in a cell. Prepending the folder to PATH from Python makes ffmpeg visible to
# this kernel and to the subprocesses it starts. The change lasts only for the
# current session; for a permanent system-wide change run
# `setx /M PATH ...` from an elevated Command Prompt instead.
FFMPEG_BIN_DIR = r"path\to\ffmpeg\bin"  # placeholder: replace with the real ffmpeg "bin" folder

_path_entries = [entry for entry in os.environ.get("PATH", "").split(os.pathsep) if entry]
if FFMPEG_BIN_DIR not in _path_entries:
    # Re-running the cell does not add the folder a second time.
    os.environ["PATH"] = os.pathsep.join([FFMPEG_BIN_DIR, *_path_entries])

SyntaxError: invalid syntax (4225797406.py, line 1)

In [None]:
pip install moviepy

Collecting moviepyNote: you may need to restart the kernel to use updated packages.

  Downloading moviepy-2.1.1-py3-none-any.whl.metadata (6.9 kB)
Collecting imageio_ffmpeg>=0.2.0 (from moviepy)
  Downloading imageio_ffmpeg-0.5.1-py3-none-win_amd64.whl.metadata (1.6 kB)
Collecting proglog<=1.0.0 (from moviepy)
  Downloading proglog-0.1.10-py3-none-any.whl.metadata (639 bytes)
Downloading moviepy-2.1.1-py3-none-any.whl (123 kB)
Downloading imageio_ffmpeg-0.5.1-py3-none-win_amd64.whl (22.6 MB)
   ---------------------------------------- 0.0/22.6 MB ? eta -:--:--
   - -------------------------------------- 0.8/22.6 MB 4.8 MB/s eta 0:00:05
   --- ------------------------------------ 1.8/22.6 MB 4.6 MB/s eta 0:00:05
   ----- ---------------------------------- 2.9/22.6 MB 4.8 MB/s eta 0:00:05
   ------ --------------------------------- 3.9/22.6 MB 4.7 MB/s eta 0:00:04
   -------- ------------------------------- 5.0/22.6 MB 5.0 MB/s eta 0:00:04
   ----------- ---------------------------- 6.3

In [None]:
from pytube import YouTube
import ffmpeg

youtube_url = 'https://youtu.be/jULGCJzyRXc?si=kXLSs0rSDY6ycjTj'

yt = YouTube(youtube_url)

# https://github.com/pytube/pytube/issues/301
# Select an audio-only stream explicitly instead of taking the first entry of
# the deprecated `streams.all()` list, which is not guaranteed to be audio.
audio_stream = yt.streams.filter(only_audio=True).first()
if audio_stream is None:
    raise RuntimeError(f"No audio-only stream found for {youtube_url}")
stream_url = audio_stream.url  # direct URL of the selected stream

# Probe the audio streams (use it in case you need information like sample rate):
#probe = ffmpeg.probe(stream_url)
#audio_streams = next((stream for stream in probe['streams'] if stream['codec_type'] == 'audio'), None)
#sample_rate = audio_streams['sample_rate']

# Read the audio into a memory buffer.
# The audio is obtained through the stdout pipe of the ffmpeg sub-process and is
# transcoded to the PCM codec in a WAV container.
audio, err = (
    ffmpeg
    .input(stream_url)
    .output("pipe:", format='wav', acodec='pcm_s16le')  # WAV output format with the pcm_s16le audio codec; may add ar=sample_rate
    .run(capture_stdout=True)
)

# Write the audio buffer to a file for testing.
with open('audio.wav', 'wb') as f:
    f.write(audio)

HTTPError: HTTP Error 403: Forbidden