In [44]:
import os
import whisper
import yt_dlp
from pydub import AudioSegment


In [46]:
def download_youtube_audio(url, output_dir="downloads"):
    """Download the best audio stream of a video and convert it to WAV.

    Args:
        url: Address of the video to fetch.
        output_dir: Directory that receives the audio file; created if it
            does not exist yet.

    Returns:
        Path of the extracted ``.wav`` file, or ``None`` when that path could
        not be confirmed on disk (for example when the URL expanded to a
        playlist rather than a single video). Returning the path spares the
        caller from guessing which file in ``output_dir`` was just written.
    """
    os.makedirs(output_dir, exist_ok=True)
    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": f"{output_dir}/%(title)s.%(ext)s",
        "postprocessors": [{
            "key": "FFmpegExtractAudio",
            "preferredcodec": "wav",
            "preferredquality": "192",
        }],
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)
        if not info or "entries" in info:
            # No single-video metadata to derive one file name from.
            return None
        # Name yt-dlp derived from the output template for this video.
        templated_path = ydl.prepare_filename(info)

    # The audio post-processor keeps the file stem and swaps the extension
    # for the preferred codec, so the final file is "<stem>.wav".
    wav_path = os.path.splitext(templated_path)[0] + ".wav"
    return wav_path if os.path.isfile(wav_path) else None

In [48]:
# Whisper checkpoint used for transcription.
# Can switch to "small" or "medium" for better accuracy.
WHISPER_MODEL_NAME = "base"
model = whisper.load_model(WHISPER_MODEL_NAME)

def find_hello_segments(audio_path, keyword="hello"):
    """Transcribe an audio file and locate the segments that mention a keyword.

    Args:
        audio_path: Audio file handed to the module-level Whisper ``model``.
        keyword: Text to search for (default ``"hello"``). Matching is a
            case-insensitive substring test on each segment's text, so longer
            words that contain the keyword match as well.

    Returns:
        List of ``(start, end)`` tuples copied from the matching segments, in
        transcript order. Each pair spans the whole transcript segment that
        contains the keyword, not just the keyword itself.

    Raises:
        ValueError: If ``keyword`` is empty; an empty string is a substring of
            every text and would silently match all segments.
    """
    if not keyword:
        raise ValueError("keyword must be a non-empty string")

    needle = keyword.lower()
    result = model.transcribe(audio_path)
    return [
        (seg["start"], seg["end"])
        for seg in result["segments"]
        if needle in seg["text"].lower()
    ]


In [49]:
def extract_clips(audio_path, segments, output_dir="hello_dataset"):
    """Cut each (start, end) span out of a WAV file and save it as its own clip.

    Clips are written to ``output_dir`` (created when absent) and named
    ``<source file stem>_hello_clip_<n>.wav``, numbered from 1 in the order
    the segments are given. Segment bounds are expressed in seconds.
    """
    os.makedirs(output_dir, exist_ok=True)
    recording = AudioSegment.from_wav(audio_path)

    # File stem of the source recording, reused as the clip-name prefix.
    stem, _ext = os.path.splitext(os.path.basename(audio_path))

    for clip_number, (start, end) in enumerate(segments, start=1):
        # Slices of the recording are addressed in milliseconds.
        start_ms = start * 1000
        end_ms = end * 1000
        clip = recording[start_ms:end_ms]
        clip_name = os.path.join(output_dir, f"{stem}_hello_clip_{clip_number}.wav")
        clip.export(clip_name, format="wav")
        print(f"Saved: {clip_name}")


In [41]:
def _newest_wav(directory):
    """Return the most recently modified ``.wav`` file in ``directory``, or None."""
    if not os.path.isdir(directory):
        return None
    wav_paths = [
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.lower().endswith(".wav")
    ]
    return max(wav_paths, key=os.path.getmtime, default=None)


def process_youtube_videos(video_urls, download_dir="downloads"):
    """Download each video's audio and export the segments that contain "hello".

    Args:
        video_urls: Iterable of video URLs, processed in order.
        download_dir: Directory the audio is downloaded into and, when
            needed, searched for the resulting WAV file.

    For every URL the audio is downloaded, transcribed, and each matching
    segment is written out as a clip. A video for which no WAV file can be
    located is skipped with a message instead of raising.
    """
    for url in video_urls:
        print(f"\n🔹 Processing: {url}")

        # Use the path handed back by the downloader when there is one;
        # when it returns nothing, fall back to the newest .wav in the
        # download directory. Only .wav files are considered, so leftover
        # partial or container files are never picked up by mistake.
        audio_path = (
            download_youtube_audio(url, output_dir=download_dir)
            or _newest_wav(download_dir)
        )
        if audio_path is None:
            print("No downloaded .wav file found for this video!")
            continue

        # Find hello segments
        hello_segments = find_hello_segments(audio_path)

        if hello_segments:
            extract_clips(audio_path, hello_segments)
        else:
            print("No 'hello' detected in this video!")

# Example usage: list one video URL per line. Keep the trailing comma so that
# adding another entry cannot silently concatenate two adjacent strings.
urls = [
    "https://www.youtube.com/watch?v=EYt6uDr-PHQ",
]
process_youtube_videos(urls)



🔹 Processing: https://www.youtube.com/watch?v=EYt6uDr-PHQ
[youtube] Extracting URL: https://www.youtube.com/watch?v=EYt6uDr-PHQ
[youtube] EYt6uDr-PHQ: Downloading webpage
[youtube] EYt6uDr-PHQ: Downloading tv simply player API JSON
[youtube] EYt6uDr-PHQ: Downloading tv client config
[youtube] EYt6uDr-PHQ: Downloading tv player API JSON
[info] EYt6uDr-PHQ: Downloading 1 format(s): 251
[download] Sleeping 3.00 seconds as required by the site...
[download] Destination: downloads\The Future of Game Development.webm
[download] 100% of    6.96MiB in 00:00:14 at 492.52KiB/s 
[ExtractAudio] Destination: downloads\The Future of Game Development.wav
Deleting original file downloads\The Future of Game Development.webm (pass -k to keep)




Saved: hello_dataset\The Future of Game Development_hello_clip_1.wav
Saved: hello_dataset\The Future of Game Development_hello_clip_2.wav


In [None]:
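# Video URLs kept for reference (commented out so that running this cell
# does not raise a SyntaxError):
# https://www.youtube.com/watch?v=wfzRJFNHnoQ
# https://www.youtube.com/watch?v=EXatnflnJKU
# https://www.youtube.com/watch?v=H_UxyQ7cb3I
# https://www.youtube.com/watch?v=_tWh4cYCTv0
# https://www.youtube.com/watch?v=EYt6uDr-PHQ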

In [None]:
import os
import pandas as pd
from tqdm import tqdm  # progress bar


# --- Select the clips that have no usable gender label ---

raw_path = 'training_dataset/cv-corpus-16.1-delta-2023-12-06/en/clips'
label_path = 'training_dataset/cv-corpus-16.1-delta-2023-12-06/en/validated.tsv'

df = pd.read_csv(label_path, sep='\t')

# A clip counts as unlabeled when its gender is anything other than
# male/female.
is_gender_labeled = df['gender'].isin(['male', 'female'])
unlabeled_df = df[~is_gender_labeled].copy()
unlabeled_df['mp3_path'] = unlabeled_df['path'].apply(
    lambda clip_name: os.path.join(raw_path, clip_name)
)

print(f"Found {len(unlabeled_df)} unlabeled clips")


# --- Extract features (with progress bar) ---

unlabeled_features = []

for file_path in tqdm(unlabeled_df['mp3_path'], desc="Extracting features"):
    # add_features is defined outside this cell; the item assignment below
    # presumes it returns a dict-like record -- TODO confirm.
    feat = add_features(file_path)
    feat['file_path'] = file_path
    unlabeled_features.append(feat)

unlabeled_audio_df = pd.DataFrame(unlabeled_features)


# --- Keep only the training features and scale them ---
# NOTE(review): `x`, `scaler`, `labelencoder` and the classifier `model` are
# not defined in this cell and must already exist in the kernel. `model` is
# also the name the Whisper cell binds, so confirm the trained classifier is
# the object in scope before running this.

feature_columns = x.columns  # features used in SVM/KNN training
X_unlabeled = unlabeled_audio_df[feature_columns]
X_unlabeled_scaled = scaler.transform(X_unlabeled)


# --- Predict with the trained SVM and decode the labels ---

y_unlabeled_pred = model.predict(X_unlabeled_scaled)
y_unlabeled_labels = labelencoder.inverse_transform(y_unlabeled_pred)

unlabeled_audio_df['predicted_gender'] = y_unlabeled_labels


# --- Save the predictions ---

output_csv = "unlabeled_audio_predictions.csv"
unlabeled_predictions = unlabeled_audio_df[['file_path', 'predicted_gender']]
unlabeled_predictions.to_csv(output_csv, index=False)
print(f"✅ Predictions saved to {output_csv}")