In [7]:
import pandas as pd
import os
import requests
import subprocess
from faster_whisper import WhisperModel


In [8]:
# --------------------------------------------------
# STEP 1: Read CSV and get audio URL
# --------------------------------------------------

cols = ["Podcast_Name", "Episode_Name", "Audio_URL", "Source", "Index"]
df = pd.read_csv("../audio_links.csv", header=None, names=cols)

print("CSV Loaded:")
print(df.head())

urls = df["Audio_URL"].tolist()
print("\nFirst 3 URLs:")
print(urls[:3])


CSV Loaded:
                                        Podcast_Name  \
0  He_Stutters_Podcast_–_Make_Room_For_The_Stutte...   
1  He_Stutters_Podcast_–_Make_Room_For_The_Stutte...   
2  He_Stutters_Podcast_–_Make_Room_For_The_Stutte...   
3  He_Stutters_Podcast_–_Make_Room_For_The_Stutte...   
4  He_Stutters_Podcast_–_Make_Room_For_The_Stutte...   

                 Episode_Name  \
0   episode-208-with-kelsey-h   
1   episode-208-with-kelsey-h   
2   episode-208-with-kelsey-h   
3   episode-208-with-kelsey-h   
4   episode-208-with-kelsey-h   

                                           Audio_URL       Source  Index  
0   https://stutterrockstar.files.wordpress.com/2...   HeStutters      0  
1   https://stutterrockstar.files.wordpress.com/2...   HeStutters      1  
2   https://stutterrockstar.files.wordpress.com/2...   HeStutters      2  
3   https://stutterrockstar.files.wordpress.com/2...   HeStutters      3  
4   https://stutterrockstar.files.wordpress.com/2...   HeStutters      4  

F

In [9]:

# --------------------------------------------------
# STEP 2: Download MP3
# --------------------------------------------------

os.makedirs("../audio/downloaded", exist_ok=True)

url = urls[0]
mp3_path = "../audio/downloaded/test.mp3"

response = requests.get(url)

with open(mp3_path, "wb") as f:
    f.write(response.content)

print("\nDownloaded test.mp3")


Downloaded test.mp3


In [10]:

# --------------------------------------------------
# STEP 3: Convert MP3 → WAV (16kHz, mono)
# --------------------------------------------------

wav_path = "../audio/downloaded/test.wav"

subprocess.run(
    [
        "ffmpeg", "-y",
        "-i", mp3_path,
        "-ac", "1",
        "-ar", "16000",
        wav_path
    ],
    check=True
)

print("Converted to WAV successfully")

Converted to WAV successfully


In [11]:

# --------------------------------------------------
# STEP 4: Load Whisper model
# --------------------------------------------------

MODEL_PATH = r"C:\whisper_models\faster-whisper-small"

model = WhisperModel(
    MODEL_PATH,
    device="cpu",
    compute_type="int8",
    cpu_threads=4,
    num_workers=1
)


In [12]:
# --------------------------------------------------
# STEP 5: Transcribe with word timestamps
# --------------------------------------------------

segments, info = model.transcribe(
    wav_path,
    word_timestamps=True
)

# IMPORTANT: convert generator to list
segments = list(segments)

print("\nDetected Language:", info.language)

# Segment-level debug
print("\nSEGMENTS:")
for segment in segments:
    print(segment.start, segment.end, segment.text)


Detected Language: en

SEGMENTS:
16.039999999999996 25.98  Hi everybody, this is Pam coming to you from the debut episode of a new addition to the
25.98 34.8  blog make room for this uttering. Today I'm going to introduce his stories. She
34.8 45.9  asks him and my goal is to introduce a little bit of the male perspective on
45.9 54.32  this crazy thing that we all know as stuttering. So for my debut male guest
54.32 64.32  I'm really excited to say hello to Alan. Hi Alan, how are you? Good afternoon. It's
64.32 72.0  good afternoon here in Wales in the United Kingdom. I suspect it's still morning
72.0 84.3  with you or just into the afternoon in the New York area. That is correct. It's
84.3 91.38  time and it's a beautiful Saturday afternoon here kind of chilly but it's
91.38 96.68  one of those days where I don't mind at all staying in talking with good
96.68 105.38  friends and probably later on getting some housework done. Good for you. So Alan,
105.7 112.4  for our listeners coul

In [13]:
# --------------------------------------------------
# STEP 6: Extract word-level timestamps
# --------------------------------------------------

words = []

for segment in segments:
    if segment.words is None:
        continue

    for w in segment.words:
        words.append({
            "word": w.word,
            "start": w.start,
            "end": w.end
        })

print("\nFIRST 10 WORDS WITH TIMESTAMPS:")
print(words[:10])


FIRST 10 WORDS WITH TIMESTAMPS:
[{'word': ' Hi', 'start': np.float64(16.039999999999996), 'end': np.float64(16.919999999999998)}, {'word': ' everybody,', 'start': np.float64(16.919999999999998), 'end': np.float64(17.8)}, {'word': ' this', 'start': np.float64(18.1), 'end': np.float64(18.26)}, {'word': ' is', 'start': np.float64(18.26), 'end': np.float64(18.42)}, {'word': ' Pam', 'start': np.float64(18.42), 'end': np.float64(18.7)}, {'word': ' coming', 'start': np.float64(18.7), 'end': np.float64(19.56)}, {'word': ' to', 'start': np.float64(19.56), 'end': np.float64(19.9)}, {'word': ' you', 'start': np.float64(19.9), 'end': np.float64(20.16)}, {'word': ' from', 'start': np.float64(20.16), 'end': np.float64(20.5)}, {'word': ' the', 'start': np.float64(20.5), 'end': np.float64(20.74)}]


In [14]:
# --------------------------------------------------
# STEP 7: Generate RAW transcript
# --------------------------------------------------

raw_text = " ".join(w["word"] for w in words)

print("\nRAW TRANSCRIPT:")
print(raw_text)


RAW TRANSCRIPT:
 Hi  everybody,  this  is  Pam  coming  to  you  from  the  debut  episode  of  a  new  addition  to  the  blog  make  room  for  this  uttering.  Today  I'm  going  to  introduce  his  stories.  She  asks  him  and  my  goal  is  to  introduce  a  little  bit  of  the  male  perspective  on  this  crazy  thing  that  we  all  know  as  stuttering.  So  for  my  debut  male  guest  I'm  really  excited  to  say  hello  to  Alan.  Hi  Alan,  how  are  you?  Good  afternoon.  It's  good  afternoon  here  in  Wales  in  the  United  Kingdom.  I  suspect  it's  still  morning  with  you  or  just  into  the  afternoon  in  the  New  York  area.  That  is  correct.  It's  time  and  it's  a  beautiful  Saturday  afternoon  here  kind  of  chilly  but  it's  one  of  those  days  where  I  don't  mind  at  all  staying  in  talking  with  good  friends  and  probably  later  on  getting  some  housework  done.  Good  for  you.  So  Alan,  for  our  listeners  could  you  for

In [None]:
import pandas as pd
import os
import requests
import subprocess
from faster_whisper import WhisperModel
from tqdm import tqdm


In [None]:

# --------------------------------------------------
# CONFIG
# --------------------------------------------------

CSV_PATH = "../audio_links.csv"
DOWNLOAD_DIR = "../audio/downloaded"
OUTPUT_CSV = "../output/raw_transcripts.csv"

MODEL_PATH = r"C:\whisper_models\faster-whisper-small"

os.makedirs(DOWNLOAD_DIR, exist_ok=True)
os.makedirs("../output", exist_ok=True)


In [None]:

# --------------------------------------------------
# LOAD CSV
# --------------------------------------------------

cols = ["Podcast_Name", "Episode_Name", "Audio_URL", "Source", "Index"]
df = pd.read_csv(CSV_PATH, header=None, names=cols)


In [None]:
# --------------------------------------------------
# LOAD WHISPER MODEL (ONCE)
# --------------------------------------------------

model = WhisperModel(
    MODEL_PATH,
    device="cpu",
    compute_type="int8"
)

results = []

In [None]:

# --------------------------------------------------
# PROCESS EACH AUDIO FILE
# --------------------------------------------------

for idx, row in tqdm(df.iterrows(), total=len(df)):
    try:
        audio_url = row["Audio_URL"]
        index_id = row["Index"]

        mp3_path = f"{DOWNLOAD_DIR}/{index_id}.mp3"
        wav_path = f"{DOWNLOAD_DIR}/{index_id}.wav"

        # -------- Download audio --------
        if not os.path.exists(mp3_path):
            r = requests.get(audio_url, timeout=30)
            with open(mp3_path, "wb") as f:
                f.write(r.content)
                   # -------- Convert to WAV --------
        subprocess.run(
            [
                "ffmpeg", "-y",
                "-i", mp3_path,
                "-ac", "1",
                "-ar", "16000",
                wav_path
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True
        )

        # -------- Transcribe --------
        segments, info = model.transcribe(wav_path)
        segments = list(segments)

        raw_text = " ".join(segment.text.strip() for segment in segments)

        # -------- Save result --------
        results.append({
            "Podcast_Name": row["Podcast_Name"],
            "Episode_Name": row["Episode_Name"],
            "Audio_URL": audio_url,
            "Index": index_id,
            "Language": info.language,
            "Raw_Transcript": raw_text
        })

    except Exception as e:
        print(f"❌ Failed for index {row['Index']}: {e}")
        continue


In [None]:




     
# --------------------------------------------------
# SAVE OUTPUT
# --------------------------------------------------

output_df = pd.DataFrame(results)
output_df.to_csv(OUTPUT_CSV, index=False)

print(f"\n✅ Saved raw transcripts to {OUTPUT_CSV}")