In [1]:
import dropbox
from pinecone import Pinecone, ServerlessSpec, list_indexes, Index
import pandas as pd
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv, find_dotenv
import os
import re



### Create indices

In [1]:
load_dotenv(find_dotenv())

APP_KEY = os.getenv("APP_KEY")
APP_SECRET = os.getenv("APP_SECRET")
REDIRECT_URI = os.getenv("REDIRECT_URI")


auth_flow = dropbox.DropboxOAuth2FlowNoRedirect(APP_KEY, APP_SECRET, locale=REDIRECT_URI, token_access_type='offline')
# Get the authorization URL
authorize_url = auth_flow.start()
print("1. Go to: " + authorize_url)
print("2. Click 'Allow' (you might have to log in first).")
print("3. Copy the authorization code, paste it in next block")

auth_flow.finish()

In [None]:
# Exchange the authorization code for an access token and refresh token
oauth_result = auth_flow.finish(os.getenv("AUTHORIZATION_CODE"))
print("Access token:", oauth_result.access_token)
print("Refresh token:", oauth_result.refresh_token)

In [4]:
PINECONE_KEY = os.getenv("PINECONE_KEY")
DROPBOX_KEY = os.getenv("DROPBOX_KEY")

pc = Pinecone(api_key=PINECONE_KEY)
dbx = dropbox.Dropbox(app_key=APP_KEY, app_secret=APP_SECRET, oauth2_refresh_token=oauth_result.refresh_token)


SHOWS = dbx.files_list_folder('/Apps/Basketball Podcast Transcript').entries
NUM_SHOWS = len(SHOWS)
show = SHOWS[0].path_display
episodes = dbx.files_list_folder(show).entries
episode = episodes[1].path_display

dbx.files_list_folder(episode).entries

In [5]:
def list_tsv_files(folder_path):
    try:
        # List files in the specified folder
        response = dbx.files_list_folder(folder_path)
        # Filter for .tsv files
        tsv_files = [entry for entry in response.entries if isinstance(entry, dropbox.files.FileMetadata) and entry.name.endswith('.tsv')]
        return tsv_files
    except dropbox.exceptions.ApiError as err:
        print('Failed to list .tsv files:', err)
        return []
    

def download_file(file_path, local_path):
    try:
        dbx.files_download_to_file(local_path, file_path)
        print(f"Downloaded {file_path} to {local_path}")
    except dropbox.exceptions.ApiError as err:
        print(f"Failed to download {file_path}: {err}")

Downloaded /Apps/Basketball Podcast Transcript/All The Smoke - 20240404-093725/All-Star Recap, Caitlin Clark, Lakers or Warriors ft Gillie Da K/All The Smoke - All-Star Recap, Caitlin Clark, Lakers or Warrior.tsv to transcripts/All The Smoke - All-Star Recap, Caitlin Clark, Lakers or Warrior.tsv


In [None]:
# model to encode sentences
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
if 'transcripts' not in pc.list_indexes().names():
    pc.create_index(
        name="transcripts",
        dimension=model.get_sentence_embedding_dimension(),
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        ) 
    )

In [None]:
index = pc.Index('transcripts')

In [None]:
# check transcript not already uploaded
def check_id_exists(id_to_check):
    result = index.fetch(ids=[id_to_check], namespace='')

    if len(result['vectors'].keys()):
        return True
    return False

# upload set of vectors in batches
def upload_in_batches(vectors, batch_size=100):
    for i in range(0, len(vectors), batch_size):
        batch = vectors[i:i + batch_size]
        index.upsert(vectors=batch)

# remove whitespaces in id and replace special characters with _
def clean_id(text):
    text = text.replace(' ', '')
    text = re.sub(r"[^\x00-\x7F]+", '_', text)
    return text

In [None]:
def create_index():
    for show in SHOWS:
        podcast_name = show.name
        episodes = dbx.files_list_folder(show.path_display).entries
        num_episodes = len(episodes)
        count = 0
        for episode in episodes:
            count += 1
            episode_name = episode.name
            files = list_tsv_files(episode.path_display)
            for file in files:
                filename = 'transcripts/' + file.name
                id = clean_id(podcast_name + episode_name + str(0))
                if check_id_exists(id):
                    break
                download_file(file.path_display, filename)
                df = pd.read_csv(filename, sep='\t', keep_default_na=False)
                df['vector'] = df['text'].apply(lambda x: model.encode(str(x)))
                upload_data = [{
                    'id': clean_id(podcast_name + episode_name + str(i)),
                    'values': row.vector.tolist(),
                    'metadata': {
                        'start_time': row.start,
                        'stop_time': row.end,
                        'podcast': podcast_name,
                        'episode': episode_name,
                        'text': row.text,
                    },
                    } for i, row in df.iterrows()]
                upload_in_batches(upload_data)
                os.remove(filename)
                break
            print(f"{count}/{num_episodes}")
# create_index()

### Query

In [8]:
# Re-initialize helpers & index from scraper portion (assuming data all loaded)

# model to encode sentences
model = SentenceTransformer('all-MiniLM-L6-v2')

PINECONE_KEY = os.getenv("PINECONE_KEY")
pc = Pinecone(api_key=PINECONE_KEY)

# remove whitespaces in id and replace special characters with _
def clean_id(text):
    text = text.replace(' ', '')
    text = re.sub(r"[^\x00-\x7F]+", '_', text)
    return text
index = pc.Index('transcripts')


In [6]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def find_start_end_sims(start_i, end_i, clip, clip_keys, num_clips, clips):
        start_sim = end_sim = -1
        start_v = np.array(clips[clip_keys[start_i]].values).reshape(1, -1)
        end_v = np.array(clips[clip_keys[end_i]].values).reshape(1, -1)

        if (start_i - 1) >= 0:
            prev_v = np.array(clips[clip_keys[start_i - 1]].values).reshape(1, -1)
            start_sim = cosine_similarity(start_v, prev_v)
        if (end_i + 1) < num_clips:
            next_v = np.array(clips[clip_keys[end_i + 1]].values).reshape(1, -1)
            end_sim = cosine_similarity(end_v, next_v)
        return start_sim, end_sim

In [9]:
import json

MIN_LEN = 1000 * 20
MAX_LEN = 1000 * 300
SCAN_RANGE = 40
THRESHOLD = 0.8

def search(query):
    query_vector = model.encode(query)
    results = index.query(vector=query_vector.tolist(), top_k=10, include_metadata=True)['matches']
    filtered_results = [match for match in results if match['score'] >= THRESHOLD]
    json_res = []
    for result in filtered_results:
        id_name = clean_id(result['metadata']['podcast'] + result['metadata']['episode'])
        id_num = int(result['id'].replace(id_name, ''))
        # print(result['id'], id_num)
        full_clip = index.fetch(ids=[id_name + str(i) for i in range(id_num-SCAN_RANGE, id_num+SCAN_RANGE)])
        clips = full_clip['vectors']
        clip_keys = np.sort(list(clips.keys()))
        num_clips = len(clip_keys)
        start_i, end_i = SCAN_RANGE, SCAN_RANGE
        min_sim = float('inf')
        
        # build the clip until it reaches the minimum length
        while (clips[clip_keys[end_i]].metadata['stop_time'] - clips[clip_keys[start_i]].metadata['start_time']) < MIN_LEN:
            start_sim, end_sim = find_start_end_sims(start_i, end_i, clips[clip_keys[start_i]], clip_keys, num_clips, clips)
            min_sim = min(min_sim, start_sim, end_sim)
            if start_sim >= end_sim:
                start_i -= 1
            else:
                end_i += 1

        # expand the clip until it reaches the maximum length or the similarity is below the threshold
        while (clips[clip_keys[end_i]].metadata['stop_time'] - clips[clip_keys[start_i]].metadata['start_time']) < MAX_LEN:
            start_sim, end_sim = find_start_end_sims(start_i, end_i, clips[clip_keys[start_i]], clip_keys, num_clips, clips)

            if max(start_sim, end_sim) < min_sim:
                break
            if start_sim > end_sim:
                start_i -= 1
            else:
                end_i += 1
        res = {'show': result['metadata']['podcast'],
               'episode': result['metadata']['episode'],
               'start_time': clips[clip_keys[start_i]].metadata['start_time'],
               'start_clip_end': clips[clip_keys[start_i]].metadata['stop_time'],
               'end_time': clips[clip_keys[end_i]].metadata['stop_time']}
        json_res.append(res)

        # # DEBUG: print the clip transcript
        # print('============ INDICES', start_i, end_i)
        # for clip in clip_keys[start_i:end_i+1]:
        #     text = clips[clip].metadata['text'] + ' '
        #     print(text)

    
    return json.dumps({'results': json_res})

search("warriors winning")

'{"results": [{"show": "First Things First - 20240404-093806", "episode": "Lakers beat Suns, Under Duress List, Refs the biggest reason Mav", "start_time": 564000.0, "start_clip_end": 565000.0, "end_time": 671000.0}, {"show": "First Things First - 20240404-093806", "episode": "Lamar Jackson continues to Tweet, Warriors comeback against Peli", "start_time": 949000.0, "start_clip_end": 953000.0, "end_time": 1044000.0}, {"show": "The Mark Titus Show - 20240404-093622", "episode": "Episode 18 Draymond Green SUSPENDED For Stomping on Domantas Sab", "start_time": 1394520.0, "start_clip_end": 1398400.0, "end_time": 1608460.0}, {"show": "First Things First - 20240404-093806", "episode": "Lamar Jackson continues to Tweet, Warriors comeback against Peli", "start_time": 10000.0, "start_clip_end": 13000.0, "end_time": 88000.0}, {"show": "First Things First - 20240404-093806", "episode": "Aaron Rodgers and Lamar Jackson news, Raiders free agent signing", "start_time": 64000.0, "start_clip_end": 670