<a href="https://colab.research.google.com/github/kenwaldek/Whisper/blob/main/YT_dwld_search_date_upload_V3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1: Install necessary dependencies
!pip install yt-dlp
!pip install google-api-python-client

# Cell 2: Import libraries
import os
from yt_dlp import YoutubeDL
from googleapiclient.discovery import build

# Cell 3: Function to search YouTube using the YouTube Data API
def search_youtube_videos(query, api_key, max_results=10):
    """Search YouTube for videos matching *query*, newest first.

    Args:
        query: Free-text search string sent to the API as ``q``.
        api_key: YouTube Data API v3 developer key.
        max_results: Maximum number of results to request.

    Returns:
        A list of dicts, one per result, with the keys ``"url"`` (watch URL),
        ``"title"`` (HTML entities decoded) and ``"published_at"`` (the first
        ten characters of the API's ``publishedAt`` value, i.e. YYYY-MM-DD).
        The list is empty when the response carries no ``items``.
    """
    # Function-scope import so this block does not depend on the import cell.
    from html import unescape

    youtube = build('youtube', 'v3', developerKey=api_key)
    request = youtube.search().list(
        q=query,
        part='snippet',
        type='video',
        order='date',
        maxResults=max_results
    )
    response = request.execute()

    video_data = []
    for item in response.get('items', []):
        snippet = item['snippet']
        video_data.append({
            "url": f"https://www.youtube.com/watch?v={item['id']['videoId']}",
            # Titles arrive HTML-escaped (e.g. "&#39;", "&amp;"); decode them
            # so the entities do not end up in filenames and transcripts.
            "title": unescape(snippet['title']),
            "published_at": snippet['publishedAt'][:10]  # Extract date (YYYY-MM-DD)
        })

    return video_data

# Cell 4: Function to download audio using yt-dlp
def download_audio(video_data, download_folder):
    """Download each video's audio track as an MP3 into *download_folder*.

    Args:
        video_data: Iterable of dicts with the keys ``"url"``, ``"title"`` and
            ``"published_at"``.
        download_folder: Target directory; created (with parents) if missing.

    Files are named ``"<published_at> - <title>.mp3"``. Path separators in
    the title are replaced by ``_`` so it cannot create sub-directories.
    """
    os.makedirs(download_folder, exist_ok=True)

    for video in video_data:
        # The title is free text that gets embedded in yt-dlp's output
        # template: '/' or '\\' would be treated as directory separators and
        # a bare '%' as the start of a template field, so neutralise both
        # ('%%' is the template's literal percent sign).
        safe_title = (
            video['title']
            .replace('/', '_')
            .replace('\\', '_')
            .replace('%', '%%')
        )
        ydl_opts = {
            'outtmpl': os.path.join(download_folder, f"{video['published_at']} - {safe_title}.%(ext)s"),
            'format': 'bestaudio/best',
            # Re-encode whatever audio stream was fetched to 192 kbit/s MP3.
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }],
            'noplaylist': True
        }

        with YoutubeDL(ydl_opts) as ydl:
            ydl.download([video["url"]])

# Cell 5: Main function to perform the search and download
def main(search_query="rick rule", max_videos=3, download_folder="./Audio", api_key=None):
    """Search YouTube for the latest matching videos and download their audio.

    Args:
        search_query: Text to search for.
        max_videos: Maximum number of search results to download.
        download_folder: Directory that receives the MP3 files.
        api_key: YouTube Data API key. When omitted, the value of the
            ``YOUTUBE_API_KEY`` environment variable is used.

    Raises:
        RuntimeError: If no API key is supplied by either route.
    """
    # Credentials must not live in source control. The key that used to be
    # hard-coded here has been published with this file and should be revoked.
    if api_key is None:
        api_key = os.environ.get("YOUTUBE_API_KEY")
    if not api_key:
        raise RuntimeError(
            "No YouTube API key provided: pass api_key or set the "
            "YOUTUBE_API_KEY environment variable."
        )

    print(f"Searching YouTube for latest videos: '{search_query}'...")
    video_data = search_youtube_videos(search_query, api_key, max_results=max_videos)

    print("Found videos:")
    for video in video_data:
        print(f"{video['published_at']} - {video['title']} ({video['url']})")

    print("\nDownloading audio...")
    download_audio(video_data, download_folder)
    print(f"Audio files downloaded to: {download_folder}")

# Cell 6: Execute the script
# Run the search-and-download pipeline only when this code is executed as the
# top-level program, not when it is imported as a module.
if __name__ == "__main__":
    main()


Collecting yt-dlp
  Downloading yt_dlp-2024.11.18-py3-none-any.whl.metadata (172 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/172.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.1/172.1 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading yt_dlp-2024.11.18-py3-none-any.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m57.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: yt-dlp
Successfully installed yt-dlp-2024.11.18
Searching YouTube for latest videos: 'rick rule'...
Found videos:
2024-11-18 - New Trump Era, 10 Commodities Analysis, Enriched Uranium Export Ban to the U.S - Rick Rule (https://www.youtube.com/watch?v=i8HPbeWcLH0)
2024-11-18 - Rick Rule&#39;s Wisdom &amp; a New Copper Porphyry Exploration Program in BC (https://www.youtube.com/watch?v=DzYIJgFNkTE)
2024-11-18 - Rick Rule: &quot;Everyone Who Owns Gold Needs To

In [None]:
import time  # Import the time module for delay

# Delay execution by 10 seconds
# NOTE(review): this is a fixed pause; nothing is polled or awaited here.
# Presumably it gives the previous cell time to finish writing its files;
# confirm it is still needed.
print("Starting in 10 seconds...")
time.sleep(10)

# Install Whisper from the specified repository
!pip install git+https://github.com/openai/whisper.git
# Whisper decodes audio by invoking the ffmpeg command-line tool. The PyPI
# package named "ffmpeg" does not ship that binary, so install it from the
# system package manager instead (assumes a Debian/Ubuntu host such as Colab).
!apt-get -qq install -y ffmpeg

# Import required libraries
import os
import whisper

# Define input and output folders
audio_folder = "./Audio"  # MP3 files to transcribe are read from here
txt_folder = "./TXT"      # one .txt transcript per MP3 is written here

# Create the TXT folder if it doesn't exist. exist_ok=True makes this a single
# call instead of a separate existence check followed by a create.
os.makedirs(txt_folder, exist_ok=True)

# Load Whisper medium model (kept at module level so it is loaded only once
# and shared by the transcription function below)
model = whisper.load_model("medium")  # Use the medium model for more accuracy

# Function to transcribe audio files and save as .txt in English
def transcribe_audio_files(audio_folder, txt_folder, whisper_model=None):
    """Transcribe every ``.mp3`` in *audio_folder* to an English ``.txt`` file.

    Args:
        audio_folder: Directory scanned (non-recursively) for ``.mp3`` files.
        txt_folder: Directory receiving one ``<audio name>.txt`` per audio
            file; created if it does not exist.
        whisper_model: Object with a ``transcribe(path, language=, task=)``
            method returning a mapping that has a ``"text"`` key. Defaults to
            the module-level ``model``.

    Returns:
        The paths of the text files written, in processing order
        (audio files are processed in sorted filename order).
    """
    if whisper_model is None:
        # Fall back to the model loaded once at module level.
        whisper_model = model

    # Do not rely on the caller having created the output folder.
    os.makedirs(txt_folder, exist_ok=True)

    written_paths = []
    # os.listdir order is arbitrary; sort for reproducible runs and logs.
    for audio_file in sorted(os.listdir(audio_folder)):
        if not audio_file.endswith(".mp3"):
            continue

        audio_path = os.path.join(audio_folder, audio_file)
        output_txt_path = os.path.join(txt_folder, f"{os.path.splitext(audio_file)[0]}.txt")

        print(f"Transcribing {audio_file}...")

        # task="translate" asks for English output; language=None leaves the
        # source language to be detected by the model.
        result = whisper_model.transcribe(audio_path, language=None, task="translate")

        # Save the transcription to a .txt file
        with open(output_txt_path, "w", encoding="utf-8") as txt_file:
            txt_file.write(result["text"])

        print(f"Saved transcription to: {output_txt_path}")
        written_paths.append(output_txt_path)

    return written_paths

# Transcribe all audio files
# Runs when the cell executes, reading from audio_folder and writing to
# txt_folder as defined earlier in this cell.
transcribe_audio_files(audio_folder, txt_folder)

# Final status line naming the folder that now holds the transcripts
print(f"Transcriptions saved in: {txt_folder}")


Starting in 10 seconds...
Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-d_ngx817
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-d_ngx817
  Resolved https://github.com/openai/whisper.git to commit 173ff7dd1d9fb1c4fddea0d41d704cfefeb8908c
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper==20240930)
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting triton>=2.0.0 (from openai-whisper==20240930)
  Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (209.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━

100%|█████████████████████████████████████| 1.42G/1.42G [00:18<00:00, 81.1MiB/s]
  checkpoint = torch.load(fp, map_location=device)


Transcribing 2024-11-18 - New Trump Era, 10 Commodities Analysis, Enriched Uranium Export Ban to the U.S - Rick Rule.mp3...
Saved transcription to: ./TXT/2024-11-18 - New Trump Era, 10 Commodities Analysis, Enriched Uranium Export Ban to the U.S - Rick Rule.txt
Transcribing 2024-11-18 - Rick Rule&#39;s Wisdom &amp; a New Copper Porphyry Exploration Program in BC.mp3...
Saved transcription to: ./TXT/2024-11-18 - Rick Rule&#39;s Wisdom &amp; a New Copper Porphyry Exploration Program in BC.txt
Transcribing 2024-11-18 - Rick Rule: &quot;Everyone Who Owns Gold Needs To Know It&#39;s Going To $10.000&quot; 2025 Prediction.mp3...
Saved transcription to: ./TXT/2024-11-18 - Rick Rule: &quot;Everyone Who Owns Gold Needs To Know It&#39;s Going To $10.000&quot; 2025 Prediction.txt
Transcriptions saved in: ./TXT


In [None]:
import os
import time

# Fixed 10-second pause before touching the transcripts
print("Waiting for 10 seconds to ensure all files are ready...")
time.sleep(10)

# Source folder (raw transcripts) and destination folder (titled copies)
txt_folder = "./TXT"
txt_mod_folder = "./TXT_mod"

# Make sure the destination folder is there
if not os.path.exists(txt_mod_folder):
    os.makedirs(txt_mod_folder)

# Copy every .txt transcript into TXT_mod, prefixed with its own file name
# (minus extension) as a heading followed by a blank line
for entry in os.listdir(txt_folder):
    if not entry.endswith(".txt"):
        continue

    source_path = os.path.join(txt_folder, entry)
    target_path = os.path.join(txt_mod_folder, entry)

    with open(source_path, "r", encoding="utf-8") as reader:
        body = reader.read()

    heading, _extension = os.path.splitext(entry)

    with open(target_path, "w", encoding="utf-8") as writer:
        writer.write(f"{heading}\n\n{body}")

    print(f"Processed and saved: {target_path}")

print(f"All files have been processed and saved in: {txt_mod_folder}")


Waiting for 10 seconds to ensure all files are ready...
Processed and saved: ./TXT_mod/2024-11-18 - New Trump Era, 10 Commodities Analysis, Enriched Uranium Export Ban to the U.S - Rick Rule.txt
Processed and saved: ./TXT_mod/2024-11-18 - Rick Rule&#39;s Wisdom &amp; a New Copper Porphyry Exploration Program in BC.txt
Processed and saved: ./TXT_mod/2024-11-18 - Rick Rule: &quot;Everyone Who Owns Gold Needs To Know It&#39;s Going To $10.000&quot; 2025 Prediction.txt
All files have been processed and saved in: ./TXT_mod


In [None]:
import os
import time
import zipfile
from datetime import datetime

# Delay execution by 4 seconds to ensure previous operations are complete
print("Waiting for 4 seconds to ensure previous operations are complete...")
time.sleep(4)

# Define folders
audio_folder = "./Audio"
txt_mod_folder = "./TXT_mod"

# Ensure the folders exist before proceeding.
# exit() is a helper meant for the interactive interpreter and reports
# success (status 0); raising SystemExit with a non-zero status is the
# portable way to abort on an error.
for required_folder in (audio_folder, txt_mod_folder):
    if not os.path.exists(required_folder):
        print(f"Folder '{required_folder}' does not exist.")
        raise SystemExit(1)

# Get current date in yyyy-mm-dd format
current_date = datetime.now().strftime("%Y-%m-%d")

# Define the search string used (replace 'Sewer data' with the actual search term)
# NOTE(review): this placeholder differs from the query used by the download
# cell ("rick rule"); confirm which value the archive name should carry.
search_string = "Sewer data"
formatted_search_string = search_string.replace(" ", "_")

# Function to compress a single file
def compress_file(file_path, output_zip):
    """Create the deflate-compressed archive *output_zip* holding *file_path*.

    Any existing archive at *output_zip* is overwritten. The single member is
    stored under the file's base name, without directory components.
    """
    with zipfile.ZipFile(output_zip, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
        archive.write(file_path, arcname=os.path.basename(file_path))

# Zip every MP3 from the Audio folder into its own archive, named after the
# audio file and written to the current working directory
for mp3_name in os.listdir(audio_folder):
    if not mp3_name.endswith(".mp3"):
        continue
    stem = os.path.splitext(mp3_name)[0]
    audio_zip_filename = stem + ".zip"
    compress_file(os.path.join(audio_folder, mp3_name), audio_zip_filename)
    print(f"Compressed '{mp3_name}' into '{audio_zip_filename}'.")

# Archive name for the whole TXT_mod folder: <date>_<search term>_TXT_mod.zip
txt_mod_zip_filename = "_".join((current_date, formatted_search_string, "TXT_mod.zip"))

def compress_folder(folder_path, output_zip):
    """Zip the files under *folder_path* (recursively) into *output_zip*.

    Args:
        folder_path: Directory to archive. A trailing path separator is
            ignored.
        output_zip: Path of the archive to create; overwritten if it exists.

    Member names are relative to the parent of *folder_path*, so each one
    starts with the folder's own name (e.g. ``TXT_mod/file.txt``). Only files
    are stored; empty directories are not recorded. Members are added in
    sorted order.
    """
    # Without normalisation, "dir/" has dirname "dir" rather than its parent,
    # which silently drops the top-level folder from every member name.
    folder_path = os.path.normpath(folder_path)
    parent = os.path.dirname(folder_path) or os.curdir

    with zipfile.ZipFile(output_zip, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(folder_path):
            dirs.sort()  # in-place sort steers os.walk's traversal order
            for file in sorted(files):
                file_path = os.path.join(root, file)
                arcname = os.path.relpath(file_path, start=parent)
                zipf.write(file_path, arcname)

# Archive the whole TXT_mod folder into the dated zip file named above
compress_folder(txt_mod_folder, txt_mod_zip_filename)
print(f"Compressed '{txt_mod_folder}' into '{txt_mod_zip_filename}'.")
