In [None]:
# Install pytube into the environment of the running kernel.
# The explicit %pip magic is used so the cell does not depend on IPython's
# automagic being enabled.
# NOTE(review): the version is unpinned - consider pinning for reproducibility.
%pip install pytube

In [None]:
# Install Whisper from its GitHub repository into the environment of the
# running kernel. The explicit %pip magic is used so the cell does not depend
# on IPython's automagic being enabled.
# NOTE(review): no commit/tag is pinned - consider pinning for reproducibility.
%pip install git+https://github.com/openai/whisper.git

In [None]:
import gc
import pickle
from collections import Counter

import nltk
import pandas as pd
import whisper
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pytube import YouTube
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
def _title_to_file_name(title: str) -> str:
    """Turn a video title into a file name that is safe to write to disk.

    The title is lower-cased and spaces become underscores. Path separators
    and other characters that are reserved on common file systems are also
    replaced with underscores, so a title such as "part 1/2" cannot point
    into a sub-directory that does not exist.

    Args:
        title: The video title.

    Returns:
        The sanitised title followed by the '.mp3' suffix.
    """
    unsafe_to_underscore = str.maketrans({char: '_' for char in '\\/:*?"<>|'})
    return title.lower().replace(' ', '_').translate(unsafe_to_underscore) + '.mp3'


def download_video_and_return_file_name(url: str) -> str:
    """Download the audio of a YouTube video and return the file name used.

    Args:
        url: URL of the YouTube video.

    Returns:
        The name of the file the audio stream was downloaded to. It is derived
        from the video title (see `_title_to_file_name`).

    NOTE(review): the '.mp3' suffix is kept so existing file names stay the
    same; the actual container is whatever the selected stream provides -
    confirm that nothing downstream relies on the extension.
    """
    yt = YouTube(url=url, use_oauth=True, allow_oauth_cache=True)
    # Same selection as before: the last entry among the audio-only streams.
    video_audio = yt.streams.filter(only_audio=True)[-1]

    file_name = _title_to_file_name(yt.title)
    video_audio.download(filename=file_name)
    return file_name

In [None]:
# Videos to process, grouped by candidate (candidate keys in alphabetical order).
#   KK:  Kemal Kilicdaroglu
#   RTE: Recep Tayyip Erdogan
# The trailing comment on each URL is a short label for that video.
video_urls = {
    "KK": [
        "https://www.youtube.com/watch?v=szeLxIEIt7M",  # kutahya
        "https://www.youtube.com/watch?v=7DbtD5BVwyA",  # sivas
        "https://www.youtube.com/watch?v=LGYkYR0FFzU",  # bolu
        "https://www.youtube.com/watch?v=WYf9zAMgyCY",  # adana
        "https://www.youtube.com/watch?v=LXyQjm9bZOU",  # antalya
        "https://www.youtube.com/watch?v=8IGPNse2GVo",  # mersin
        "https://www.youtube.com/watch?v=OUJgTcEvJVY",  # denizli
        "https://www.youtube.com/watch?v=_xJlMpvPOoE",  # nigde
    ],
    "RTE": [
        "https://www.youtube.com/watch?v=X4NWdr106ZA",  # adana
        "https://www.youtube.com/watch?v=d8BnyX1YvIo",  # tekirdag
        "https://www.youtube.com/watch?v=_lEVK65qUJs",  # mersin
        "https://www.youtube.com/watch?v=CtLEoB2htZw",  # kayseri
        "https://www.youtube.com/watch?v=TL0k8q9V_Hg",  # samsun
        "https://www.youtube.com/watch?v=ur11uH45dL0",  # ordu
        "https://www.youtube.com/watch?v=4NyvpsPg2UQ",  # rize
        "https://www.youtube.com/watch?v=h4U8LHmHfRE",  # ankara
    ],
}

In [None]:
# Names of the downloaded audio files, one independent list per candidate.
downloaded_content = {candidate: [] for candidate in ("KK", "RTE")}

In [None]:
# Download the audio of every listed video and record the resulting file
# names per candidate.
for candidate, urls in video_urls.items():
    # Start from a fresh list so that re-running this cell replaces the
    # previous results instead of appending duplicate file names. The list is
    # stored immediately, so files downloaded before a failure stay recorded.
    file_names = downloaded_content[candidate] = []
    for url in urls:
        print(f"For the candidate: {candidate} the video {url} started!")
        file_names.append(download_video_and_return_file_name(url))
        print(f"For the candidate: {candidate} the video {url} completed!")

        # Run a garbage-collection pass between downloads.
        gc.collect()

In [None]:
# Load the Whisper checkpoint named "large"; `model` is the module-level
# object that transcribe_audio_to_scripts() calls.
model = whisper.load_model(name="large")

In [None]:
def transcribe_audio_to_scripts(file_name: str) -> list:
    """Transcribe an audio file and return its segments as plain dicts.

    Uses the module-level `model` to transcribe `file_name`.

    Args:
        file_name: Path of the audio file handed to `model.transcribe`.

    Returns:
        One dict per entry of the transcription's 'segments' list, holding
        only that segment's 'start', 'end' and 'text' values.
    """
    kept_fields = ('start', 'end', 'text')
    transcription = model.transcribe(file_name)

    segments = []
    for segment in transcription['segments']:
        segments.append({field: segment[field] for field in kept_fields})
    return segments

In [None]:
# Transcription results, one independent list per candidate.
language_analysis = {candidate: [] for candidate in ("KK", "RTE")}

In [None]:
# Transcribe every downloaded audio file and record the segments per candidate.
for candidate, file_names in downloaded_content.items():
    # Start from a fresh list so that re-running this cell replaces the
    # previous results instead of appending duplicate entries. The list is
    # stored immediately, so transcripts finished before a failure stay
    # recorded.
    analyses = language_analysis[candidate] = []
    for file_name in file_names:
        print(f"For the candidate: {candidate} the audio {file_name} started for generating the scripts.")
        raw_scripts = transcribe_audio_to_scripts(file_name)
        analyses.append({'name': file_name, 'raw_scripts': raw_scripts})
        print(f"For the candidate: {candidate} the audio {file_name} completed to generate the scripts.")

        # Run a garbage-collection pass between transcriptions.
        gc.collect()

In [None]:
# Write the collected transcripts to disk as a pickle file.
# `pickle` is imported in the notebook's import cell.
# NOTE: loading a pickle can execute arbitrary code - only unpickle this file
# when it comes from a trusted source.
with open('KK_and_RTE_language_analysis.pkl', 'wb') as pickle_file:
    pickle.dump(language_analysis, pickle_file)