# Chat Podcast

### Mount Drive in Colab

In [1]:
# Mount Google drive (since MP3 files are saved in Drive)
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
%cd content/drive/MyDrive/Data Vault/GitHub/Chat-Podcast

/content/drive/MyDrive/Data Vault/GitHub/Chat-Podcast


___
### Install and Import Dependencies

In [4]:
!pip install langchain
!pip install openai
!pip install -U openai-whisper

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [50]:
import json
import os
import pandas as pd
import time
import torch
import whisper
from pathlib import Path

In [6]:
torch. __version__

'1.13.1+cu116'

In [7]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [8]:
os.environ["OPENAI_API_KEY"] = "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"

___
### Initial Demo Run

In [30]:
whisper_model = whisper.load_model("medium.en").to(device)

In [12]:
text = whisper_model.transcribe("demo/Liam Neeson - Taken.mp3")

In [13]:
text['text']

" I don't know who you are. I don't know what you want. If you are looking for ransom, I can tell you I don't have money. But what I do have are a very particular set of skills. Skills I have acquired over a very long career. Skills that make me a nightmare for people like you. If you let my daughter go now, that will be the end of it. I will not look for you. I will not pursue you. But if you don't, I will look for you. I will find you. And I will kill you. Good luck."

In [None]:
text['segments']

___
### Transcribe All Podcast Audio Files

In [26]:
# Read podcast metadata (saved in a CSV file)
metadata = pd.read_csv('audio/podcast_metadata.csv')

In [27]:
metadata.head()

Unnamed: 0,Title,URL,Date
0,A Third Path to Talent Development - Delta's M...,https://open.spotify.com/episode/50oRprIC6z0wJ...,Mar-23
1,AI in Aerospace - Boeing's Helen Lee,https://open.spotify.com/episode/4jGgx7BnQqIHA...,May-22
2,AI in Your Living Room - Peloton's Sanjay Nichani,https://open.spotify.com/episode/162i8MiLbebbI...,Mar-22
3,Big Data in Agriculture - Land O'Lakes' Teddy ...,https://open.spotify.com/episode/0fOkXwX2eKORj...,Aug-22
4,Choreographing Human-Machine Collaboration - S...,https://open.spotify.com/episode/2YHcwnxS2bc0J...,Feb-22


In [49]:
paths = sorted([str(x) for x in Path('audio').glob('*.mp3')])
paths

["audio/A Third Path to Talent Development - Delta's Michelle McCrackin.mp3",
 "audio/AI in Aerospace - Boeing's Helen Lee.mp3",
 "audio/AI in Your Living Room - Peloton's Sanjay Nichani.mp3",
 "audio/Big Data in Agriculture - Land O'Lakes' Teddy Bekele.mp3",
 "audio/Choreographing Human-Machine Collaboration - Spotify's Sidney Madison Prescott.mp3",
 "audio/Digital First, Physical Second - Wayfair's Fiona Tan.mp3",
 "audio/Extreme Innovation with AI - Stanley Black and Decker's Mark Maybury.mp3",
 "audio/From Data to Wisdom - Novo Nordisk's Tonia Sideri.mp3",
 "audio/From Journalism to Jeans - Levi Strauss' Katia Walsh.mp3",
 "audio/Helping Doctors Make Better Decisions with Data - UC Berkley's Ziad Obermeyer.mp3",
 "audio/Imagining Furniture (and the Future) with AI - IKEA Retail's Barbara Martin Coppola.mp3",
 "audio/Inventing the Beauty of the Future - L'Oreal's Stephane Lannuzel.mp3",
 "audio/Investing in the Last Mile - PayPal's Khatereh Khodavirdi.mp3",
 "audio/Keeping Humans in

In [21]:
# text = whisper_model.transcribe("audio/Transforming a Technology Organization for the Future - Starbucks' Gerri Martin-Flickinger.mp3")

In [None]:
text

In [53]:
len(text['segments'])

427

### Automate Transcription and Save Transcript and Metadata

In [48]:
def save_transcript_json(content, title):
  with open(f"transcripts/{title}.jsonl", "w", encoding="utf-8") as fp:
      for line in content:
          json.dump(line, fp)
          fp.write('\n')

In [None]:
for i, path in enumerate(paths):
    
    episode_content = []

    # Get info of podcast episode
    title = path.split('/')[-1][:-4]

    # Skip if transcript already exists
    existing_transcripts = [str(x).split('/')[-1].split('.')[0] for x in \
                            Path('transcripts').glob('*')]
    if title in existing_transcripts:
      break

    date = metadata[metadata.Title == title]["Date"].values[0]
    url = metadata[metadata.Title == title]["URL"].values[0]
    
    # Initiate timer
    print(f'Begin transcription for {title}')
    start = time.time()

    # Transcribe MP3 audio
    result = whisper_model.transcribe(path)
    segments = result['segments']

    for segment in segments:
        # Merge segments data and podcast metadata
        segment_content = {
                          'title': title,
                          'date': date,
                          'url': url,
                          'id': f"{title}-t{segment['start']}",
                          'text': segment['text'].strip(),
                          'start': segment['start'],
                          'end': segment['end']
                        }
        episode_content.append(segment_content)

    # Save contents as JSON
    save_transcript_json(episode_content, title)
    
    # Show time taken
    stop = time.time()
    duration = stop-start
    print(f"{duration/60} minutes taken for episode: {title}")

Begin transcription for A Third Path to Talent Development - Delta's Michelle McCrackin


In [None]:
# Then combine all JSONs into one master compilation

In [None]:
# DO chunking and striding

In [None]:
# Vector base embedding with Weaviate

In [None]:
# References
# https://github.com/jamescalam/ask-youtube/blob/main/youtube-search/01-openai-whisper.ipynb
# https://www.pinecone.io/learn/openai-whisper/