#### Download Video from YouTube

In [None]:
#Library for downloading youtube vid 
!pip install pytube 

In [None]:
from pytube import YouTube 

In [None]:
# URL of the YouTube video whose audio is downloaded, transcribed and summarized below.
youtube_video = "https://www.youtube.com/watch?v=5-rCKo4CBgM"

In [None]:
# Create the pytube YouTube object for the URL stored in youtube_video.
yt = YouTube(youtube_video)

In [None]:
# Download the audio track from YouTube.
# .filter keeps only audio-only streams in an mp4 container.
# .first returns the first stream that satisfies the filter, or None when
# nothing matches, so that case is reported explicitly instead of failing
# with an AttributeError on None.
audio_stream = (
    yt.streams
    .filter(only_audio=True, file_extension='mp4')
    .first()
)
if audio_stream is None:
  raise RuntimeError("No audio-only mp4 stream is available for " + youtube_video)

# Kept as the last expression so the notebook still displays the download result.
audio_stream.download(filename='audio.mp4')

In [None]:
import IPython.display as display

In [None]:
# Location of the downloaded audio file used for playback.
# NOTE(review): this is an absolute /content path, while the download cell passes only
# filename='audio.mp4' — presumably the two match only when the working directory is
# /content (e.g. Google Colab); confirm before running elsewhere.
path = "/content/audio.mp4"

In [None]:
# Show an inline audio player for the downloaded file (autoplay is requested).
display.Audio(path, autoplay=True)

In [None]:
#convert mp4 to wav
! ffmpeg -i audio.mp4 -acodec pcm_s16le -ar 16000 audio.wav

#### ASR

In [None]:
!pip install torch --upgrade

In [None]:
!pip install huggingsound --upgrade

In [None]:
from huggingsound import SpeechRecognitionModel
import torch

In [None]:
# Choose the compute device: "cuda" when torch reports an available CUDA GPU,
# otherwise fall back to "cpu".
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"

In [None]:
# Display which device was selected.
device

In [None]:
# Load the pretrained wav2vec2 (XLSR-53, English) speech-recognition model on the selected device.
model = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-english", device=device)

#### Audio Slicing

In [None]:
import librosa
import soundfile as sf

In [None]:
# WAV file to be sliced.
# NOTE(review): absolute /content path, while the ffmpeg cell writes audio.wav with a
# relative name — presumably these match only when the working directory is /content; confirm.
input_file ='/content/audio.wav'

In [None]:
# Report the sampling rate of the input audio file.
print(librosa.get_samplerate(input_file))

# Open the file as a block-wise stream: 30 frames per block, each frame 16000
# samples long, advancing 16000 samples between frames so consecutive frames
# do not overlap (i.e. 30 seconds per block for 16 kHz audio).
stream = librosa.stream(input_file, block_length=30, frame_length=16000, hop_length=16000)

In [None]:
# Write each block yielded by `stream` to its own numbered file (0.wav, 1.wav, ...)
# at a 16000 Hz sample rate.
# The loop variable `i` is read by later cells, where it holds the index of the last slice.
# NOTE(review): `stream` is presumably a one-shot generator, so re-running this cell
# without recreating `stream` would write nothing — confirm.
for i, speech in enumerate(stream):
  sf.write(f'{i}.wav', speech, 16000) # write one block of audio samples to disk

In [None]:
# Index of the last slice written by the loop above (number of slices minus one).
i

#### Audio Transcript (Speech to Text)

In [None]:
# Build the list of slice paths /content/0.wav .. /content/<i>.wav, where `i` is the
# last index left behind by the slicing loop.
audio_path = [f'/content/{a}.wav' for a in range(i+1)]

In [None]:
# Display the list of slice paths that will be transcribed.
audio_path

In [None]:
# Transcribe every file listed in audio_path; each result is read later through
# its 'transcription' key.
transcriptions = model.transcribe(audio_path)

In [None]:
# Start from an empty transcript; the next cell fills it from `transcriptions`.
full_transcript = ""

In [None]:
# Combine the transcripts of all slices into full_transcript.
# Segments are joined with a single space so the last word of one slice does not
# fuse with the first word of the next. The string is rebuilt from scratch, so
# re-running this cell does not append the transcript a second time, and the
# slice counter `i` is no longer overwritten by the loop.
full_transcript = " ".join(segment['transcription'] for segment in transcriptions)

In [None]:
# Number of characters in the combined transcript.
len(full_transcript)

In [None]:
import pprint

In [None]:
# Pretty-print the combined transcript.
pprint.pprint(full_transcript)

In [None]:
# Persist the combined transcript as a text file and report where it was written.
transcript_file = "/content/transcript.txt"
with open(transcript_file, "w") as out_file:
  out_file.write(full_transcript)
print("Transcript saved to", transcript_file)

In [None]:
# Read the saved transcript back as one string, dropping newlines and any
# byte-order-mark character.
with open("transcript.txt", "r") as file:
  trans = file.read().replace('\n', '').replace("\ufeff", "")

#### Text Summarizer

In [None]:
!pip install --upgrade transformers

In [None]:
#pipeline library in transformers, used for summaryzation
from transformers import pipeline

In [None]:
# Build a summarization pipeline backed by the pretrained facebook/bart-large-cnn
# checkpoint (BART trained on the CNN/Daily Mail dataset, per the original note).
bart_model ="facebook/bart-large-cnn"
summarization = pipeline('summarization', model=bart_model)

In [None]:
# summarized_text = summarization(full_transcript)

In [None]:
# summarized_text[0]['summary_text']

In [None]:
# Summarize the transcript in consecutive chunks of 500 characters.
# Stepping over the text with range() never produces a trailing empty chunk, which the
# previous int(len/500) + 1 iteration count did whenever the length was a multiple of
# 500 (including an empty transcript).
# NOTE: chunks are cut by character count, so a word can be split across two chunks.
chunk_size = 500
summarized_text = []
for start in range(0, len(full_transcript), chunk_size):
  chunk = full_transcript[start:start + chunk_size]
  if not chunk.strip():
    # Nothing to summarize in a whitespace-only chunk.
    continue
  print("INPUT TEXT")
  pprint.pprint(chunk)
  # The pipeline returns a list with one dict per input; keep its 'summary_text'.
  out = summarization(chunk, min_length = 5, max_length=50)[0]['summary_text']
  print("SUMMARY TEXT")
  pprint.pprint(out)
  summarized_text.append(out)

In [None]:
# Length of the string form of the list (this includes brackets, quotes and separators).
# NOTE(review): this is not the character count of the joined summary text — confirm intent.
len(str(summarized_text))

In [None]:
# Join the per-chunk summaries into one string, separated by single spaces, and pretty-print it.
summary = " ".join(summarized_text)
pprint.pprint(summary)

In [None]:
# Persist the combined summary as a text file and report where it was written.
result = "/content/summary_result.txt"
with open(result, "w") as out_file:
  out_file.write(summary)
print("Summary Result saved to", result)

## Evaluation

Evaluation metric: ROUGE, applied to the pretrained BART model.

This evaluation compares the summary produced by the BART model with a human-written summary.

In [None]:
# Load the summary produced with the plain transformers pipeline (no explicitly chosen
# pretrained model), flattened to one line with any byte-order-mark character removed.
with open("summary_Pip.txt", 'r') as file:
  data1 = file.read().replace('\n', '').replace("\ufeff","")

In [None]:
# Load the summary produced with the BART model, flattened to one line with any
# byte-order-mark character removed.
with open("summary_result.txt", 'r') as file:
  data2 = file.read().replace('\n', '').replace("\ufeff","")

In [None]:
!pip install rouge 

In [None]:
from rouge import Rouge

In [None]:
# ROUGE scorer instance used to compare the two summaries.
rouge1 = Rouge()

In [None]:
# Averaged ROUGE scores for the two summaries (avg=True).
# NOTE(review): the rouge package presumably takes (hypotheses, references) in that order,
# which would make data1 the hypothesis and data2 (the BART summary) the reference —
# confirm this is the intended direction.
score = rouge1.get_scores(data1, data2, avg=True)

In [None]:
# Show the ROUGE result.
print(score)

Evaluation metric: WER (word error rate), applied to the pretrained wav2vec2 model.

This compares the transcript obtained from the API with the transcript produced by wav2vec2.

In [None]:
import nltk

In [None]:
# Transcript obtained from the YouTube API, flattened to one line.
with open("transcript_api.txt", 'r') as file:
  data3 = file.read().replace('\n', '')
# Remove any byte-order-mark character from the text read from this file
# (the cleanup must operate on data3, not on the summary text in data1).
data3 = data3.replace("\ufeff","")

In [None]:
# Transcript produced by wav2vec2, flattened to one line.
with open("transcript.txt", 'r') as file:
  data4 = file.read().replace('\n', '')
# Remove any byte-order-mark character from the text read from this file
# (the cleanup must operate on data4, not on the summary text in data1).
data4 = data4.replace("\ufeff","")

In [None]:
def _read_words(path):
  """Return the lower-cased, whitespace-separated words of the text file at `path`."""
  with open(path, 'r') as file:
    # Drop any byte-order-mark character so it cannot stick to the first word.
    return file.read().replace("\ufeff", "").strip().lower().split()


def calculate_wer(data3="transcript_api.txt", data4="transcript.txt"):
  """Compute the word error rate (WER) between two transcript files.

  The arguments are now actually used: each one is the path of a text file.
  The defaults are the file names that were previously hard-coded, so calling
  the function without arguments still compares the same two files.

  Args:
    data3: path of the reference transcript file.
    data4: path of the hypothesis transcript file.

  Returns:
    The word-level edit distance between the two files divided by the number
    of words in the reference.

  Raises:
    ValueError: if the reference file contains no words (the rate would
      require a division by zero).
  """
  reference = _read_words(data3)
  hypothesis = _read_words(data4)
  if not reference:
    raise ValueError("reference transcript is empty: " + str(data3))

  # nltk.edit_distance is applied to the word lists, so the distance is counted in words.
  wer = nltk.edit_distance(reference, hypothesis) / len(reference)
  return wer

In [None]:
# File names passed to calculate_wer: the API transcript as reference and the
# wav2vec2 transcript as hypothesis.
ref = "transcript_api.txt"
hyp = "transcript.txt"

In [None]:
# Compute the WER. Note that this reuses the name `score`, replacing the ROUGE result stored earlier.
score = calculate_wer(ref, hyp)

In [None]:
# Show the WER result.
print("WER Score :", score)