In [None]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import os
from yt_dlp import YoutubeDL
from youtube_transcript_api import YouTubeTranscriptApi
import stopwordsiso as stopwords
import spacy
from pathlib import Path
import json
import shutil
from pydub import AudioSegment


# Initialize YouTube API
# The key is read from the environment so the credential never lives in the
# notebook (or its version history). Any key that was previously committed
# here should be treated as compromised and rotated.
_youtube_api_key = os.environ.get("YOUTUBE_API_KEY")
if not _youtube_api_key:
    raise RuntimeError(
        "Set the YOUTUBE_API_KEY environment variable to a YouTube Data API v3 key."
    )
youtube = build('youtube', 'v3', developerKey=_youtube_api_key)

In [None]:
def search_videos(query, max_results=100, client=None):
    """Search YouTube for videos matching ``query`` and return their video IDs.

    The ``search.list`` endpoint only accepts ``maxResults`` values up to 50,
    so requests for more results are served by following ``nextPageToken``
    across several calls instead of asking for everything at once.

    Args:
        query: Free-text search string.
        max_results: Upper bound on the number of video IDs to return.
        client: Optional API client exposing ``search().list(...).execute()``.
            Defaults to the module-level ``youtube`` client.

    Returns:
        A list of at most ``max_results`` video ID strings. If an HTTP error
        occurs it is printed and the IDs collected before the failure are
        returned (an empty list when the very first request fails).
    """
    api = youtube if client is None else client
    page_size_limit = 50  # hard cap imposed by the API on a single request
    video_ids = []
    page_token = None

    try:
        while len(video_ids) < max_results:
            request_kwargs = {
                'q': query,
                'part': 'id',
                'type': 'video',
                # Never ask for more than is still needed or than the API allows
                'maxResults': min(page_size_limit, max_results - len(video_ids)),
            }
            if page_token:
                request_kwargs['pageToken'] = page_token

            search_response = api.search().list(**request_kwargs).execute()

            items = search_response.get('items', [])
            video_ids.extend(item['id']['videoId'] for item in items)

            page_token = search_response.get('nextPageToken')
            # Stop when the result set is exhausted; the empty-page check also
            # protects against looping forever on a token that yields nothing.
            if not items or not page_token:
                break
    except HttpError as e:
        print(f"An HTTP error {e.resp.status} occurred:\n{e.content}")

    return video_ids[:max_results]


In [None]:
# Example usage: run the search and report how many video IDs came back.
query = "Gaeilge i mo chroí"  # Adjust your search query as needed
video_ids = search_videos(query)
print(video_ids)
# The search does not filter on caption language, so the summary only states
# how many results matched the query rather than claiming Irish subtitles.
print(f"Found {len(video_ids)} videos for query {query!r}.")

In [None]:
# Build the watch-page URL for every video ID returned by the search
video_url = list(map("https://www.youtube.com/watch?v={}".format, video_ids))

# Show the URLs, one per line
for url in video_url:
    print(url)


In [None]:
# Persist the search results: one watch URL per line and one video ID per line.
# NOTE: `video_urls` and `video_id` hold output *file paths* (the URL list
# itself is `video_url`). Later cells reuse `video_id` as a loop variable,
# which overwrites the path stored here.
video_urls='/Users/kritikajavali/Documents/CAMB/video_urls.txt'
video_id='/Users/kritikajavali/Documents/CAMB/video_id.txt'

for list_path, entries in ((video_urls, video_url), (video_id, video_ids)):
    # open() does not create missing folders, so make sure the target exists
    os.makedirs(os.path.dirname(list_path), exist_ok=True)
    with open(list_path, 'w', encoding='utf-8') as file:
        # writelines() expects an iterable of lines; feed it one line per entry
        file.writelines(f"{entry}\n" for entry in entries)



In [None]:
# Download the best available audio stream of every video and convert it to WAV.
audio_download_options = {
    'format': 'bestaudio/best',
    # Output template: files are written to this absolute folder, named <video id>.<ext>
    'outtmpl': '/Users/kritikajavali/Documents/CAMB/audios/%(id)s.%(ext)s',
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'wav',  # or mp3
        'preferredquality': '192',
    }],
    # When used as a library, yt-dlp stops at the first failing video unless
    # told otherwise; keep going so one private/removed video does not abort
    # the whole batch. Failures are still reported in yt-dlp's own output.
    'ignoreerrors': True,
    'quiet': False
}

with YoutubeDL(audio_download_options) as ydl:
    ydl.download(video_url)


In [None]:
# Fetch an Irish ('ga') translation of each video's English transcript and
# store it as <video id>.json.
# NOTE(review): this folder is relative to the working directory, whereas the
# organizing step looks for transcripts in an absolute .../CAMB/transcripts
# folder; the two only coincide when the notebook runs from that CAMB folder.
transcripts_output_dir = "transcripts"

# open() does not create missing folders; without this every save would fail
# and be swallowed by the broad except below.
os.makedirs(transcripts_output_dir, exist_ok=True)

for video in video_ids:
    # Path where the JSON file for this video is saved
    output_file_path = f"transcripts/{video}.json"

    try:
        # Retrieve the available transcripts
        transcript_list = YouTubeTranscriptApi.list_transcripts(video)

        # Take the English transcript and translate it to Irish
        transcript = transcript_list.find_transcript(['en']).translate('ga').fetch()

        # Newer youtube-transcript-api releases return a transcript object
        # instead of a plain list of dicts; convert it so json.dumps accepts it.
        if hasattr(transcript, 'to_raw_data'):
            transcript = transcript.to_raw_data()

        # Serialize the transcript data to JSON (keep Irish characters readable)
        transcript_json = json.dumps(transcript, ensure_ascii=False, indent=4)

        # Write the JSON data to a file
        with open(output_file_path, 'w', encoding='utf-8') as f:
            f.write(transcript_json)

        print(f"Transcript saved to {output_file_path}")

    except Exception as e:
        # Best effort: report which video failed and continue with the rest
        print(f"An error occurred for video {video}: {e}")


In [None]:
# Folders that currently hold the downloaded audio and the saved transcripts
audio_files_dir = '/Users/kritikajavali/Documents/CAMB/audios'
transcripts_dir = '/Users/kritikajavali/Documents/CAMB/transcripts'

# Destination root: one sub-folder per video ID is created underneath it
ORGANIZED_DIR = '/Users/kritikajavali/Documents/CAMB/audios_transcripts'

# Create the destination root if it is not there yet
os.makedirs(ORGANIZED_DIR, exist_ok=True)


for video_id in video_ids:
    # Each video gets its own folder
    video_id_dir = os.path.join(ORGANIZED_DIR, video_id)
    os.makedirs(video_id_dir, exist_ok=True)

    # Audio first, then transcript; update the extensions here if necessary
    for source_dir, extension in ((audio_files_dir, "wav"), (transcripts_dir, "json")):
        source_path = os.path.join(source_dir, f"{video_id}.{extension}")
        # Files that were never produced (failed download/transcript) are skipped
        if not os.path.exists(source_path):
            continue
        destination_path = os.path.join(video_id_dir, os.path.basename(source_path))
        shutil.move(source_path, destination_path)

print("Files have been organized.")

In [None]:
# Cut every recording into one clip per transcript entry and save the entry's
# text next to the clip under the same base name.
for video_id in os.listdir(ORGANIZED_DIR):
    dir_path = os.path.join(ORGANIZED_DIR, video_id)
    if not os.path.isdir(dir_path):
        continue  # ignore stray files in the root folder

    for file in os.listdir(dir_path):
        if not file.endswith(".wav"):
            continue  # only audio files are split

        base_name = os.path.splitext(file)[0]
        audio_file_path = os.path.join(dir_path, file)

        # The transcript is expected to share the audio's base name (.json)
        transcript_file_path = os.path.join(dir_path, f"{base_name}.json")
        if not os.path.exists(transcript_file_path):
            continue  # nothing to split against

        # Load the full recording and its transcript entries
        audio = AudioSegment.from_file(audio_file_path)
        with open(transcript_file_path, 'r', encoding='utf-8') as f:
            transcript_data = json.load(f)

        for i, entry in enumerate(transcript_data):
            # Transcript times are multiplied by 1000 to get the
            # millisecond offsets used for slicing the audio
            start_time = int(entry['start'] * 1000)
            end_time = int((entry['start'] + entry['duration']) * 1000)

            # Shared base name for the clip and its text file
            segment_filename = f"{base_name}_{i:03d}"

            # Write the audio clip
            segment_path = os.path.join(dir_path, f"{segment_filename}.wav")
            audio[start_time:end_time].export(segment_path, format="wav")

            # Write the matching transcript text
            transcript_path = os.path.join(dir_path, f"{segment_filename}.txt")
            with open(transcript_path, 'w', encoding='utf-8') as f:
                f.write(entry['text'])

print("Audio segments and transcripts have been saved.")


In [None]:
# Remove clips whose transcript text contains the "[Ceol]" marker, together
# with their audio.
# ORGANIZED_DIR is a plain string, so wrap it in Path before using the
# pathlib API (str has no .iterdir()).
for dir_path in Path(ORGANIZED_DIR).iterdir():
    if not dir_path.is_dir():
        continue

    # Snapshot the matches first so deleting files cannot disturb the listing
    for txt_file_path in list(dir_path.glob("*.txt")):
        with open(txt_file_path, "r", encoding="utf-8") as f:
            text = f.read()

        # Keep everything that does not carry the marker
        if "[Ceol]" not in text:
            continue

        # The clip shares the text file's base name
        wav_file_path = dir_path / f"{txt_file_path.stem}.wav"

        # Delete the text file
        txt_file_path.unlink()

        # Delete the .wav file if it exists
        if wav_file_path.exists():
            wav_file_path.unlink()

        print(f"Deleted {txt_file_path.name} and {wav_file_path.name} due to the presence of '[Ceol]'.")

print("Processing complete.")


In [None]:
# Language code shared by the tokenizer and the stop-word lookup
irish_language_code = "ga"

# Blank spaCy pipeline for Irish, used for tokenizing the transcript text
nlp = spacy.blank(irish_language_code)

# Irish stop words as provided by stopwordsiso
irish_stopwords = stopwords.stopwords(irish_language_code)

# Function to normalize text
def normalize_text(text, nlp, stopwords):
    """Return ``text`` with stop words and punctuation tokens removed.

    Args:
        text: Raw text to normalize.
        nlp: Callable that tokenizes ``text`` into tokens exposing ``.text``
            and ``.is_punct`` (e.g. a spaCy pipeline).
        stopwords: Collection of lower-case stop words to drop.

    Returns:
        The remaining tokens, in their original casing, joined by single
        spaces. An empty string is returned when nothing remains.
    """
    doc = nlp(text)
    # Compare in lower case so capitalised stop words (for example at the
    # start of a sentence) are removed too; kept tokens retain their casing.
    filtered_tokens = [
        token.text
        for token in doc
        if not token.is_punct and token.text.lower() not in stopwords
    ]
    return " ".join(filtered_tokens)

# Write a normalized copy (<name>_normalized.txt) of every segment transcript.
# ORGANIZED_DIR is a plain string, so wrap it in Path before using the
# pathlib API (str has no .iterdir()).
for dir_path in Path(ORGANIZED_DIR).iterdir():
    if not dir_path.is_dir():  # Make sure it's a directory
        continue

    # Snapshot the matches first: this loop creates new .txt files in the
    # same folder while it runs.
    for txt_file_path in list(dir_path.glob("*.txt")):
        # Skip outputs of an earlier run; otherwise re-running this cell would
        # normalize them again and create *_normalized_normalized.txt files.
        if txt_file_path.stem.endswith("_normalized"):
            continue

        with open(txt_file_path, "r", encoding="utf-8") as f:
            text = f.read()

        # Normalize the text
        normalized_text = normalize_text(text, nlp, irish_stopwords)

        # Save the result next to the source file
        normalized_file_path = txt_file_path.parent / f"{txt_file_path.stem}_normalized.txt"
        with open(normalized_file_path, "w", encoding="utf-8") as nf:
            nf.write(normalized_text)

        print(f"Processed and saved normalized text for {txt_file_path.name}.")

print("Processing complete.")