In [1]:
import os

# Jupyter runs cells inside an event loop, and Django refuses async-unsafe calls
# (such as the ORM) from that context unless this variable is set.
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"


In [2]:
# Goal: grab all of Jay's podcast episodes. For each episode, determine whether it has a guest or is a solo episode.
# If it has a guest: who is the guest, and what is their expertise? What is their info (website, podcast, YouTube channel, etc.)?
# For each episode, summarize the valuable information. Also, perhaps extract stories that can be used as examples? TBD...

In [21]:
from decouple import config
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone  import PineconeVectorStore
from pinecone import Pinecone
import uuid 
from langchain_experimental.text_splitter import SemanticChunker
from langchain_text_splitters import RecursiveCharacterTextSplitter
import json
from urllib.parse import urlparse, urljoin
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import re
from langchain.document_loaders import BSHTMLLoader
from langchain_chroma import Chroma
import requests
from langchain_anthropic import ChatAnthropic

import chromadb
# Chroma client (default settings) and the collection that transcript chunks are written to.
persistent_client = chromadb.PersistentClient()
collection = persistent_client.get_or_create_collection("creator_science")

# OpenAI embedding model; every request carries a Helicone auth header.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=config("OPENAI_API_KEY"),
    openai_api_base=config('OPENAI_API_BASE'),
    headers={"Helicone-Auth": f"Bearer {config('HELICONE_API_KEY')}"},
)

# LangChain wrapper over the same Chroma client, using the OpenAI embeddings above.
# NOTE(review): this targets "testing_manifestos", not the "creator_science"
# collection created above — confirm that is intended.
langchain_chroma = Chroma(
    client=persistent_client,
    collection_name="testing_manifestos",
    embedding_function=embeddings,
)

# Claude chat model, addressed through the hconeai.com endpoint with Helicone
# auth and property headers attached to each request.
model_claude = ChatAnthropic(
    model_name="claude-3-5-sonnet-20240620",
    anthropic_api_key=config("ANTHROPIC_API_KEY"),
    anthropic_api_url="https://anthropic.hconeai.com/",
    max_tokens=4096,
    model_kwargs={
        "extra_headers": {
            "Helicone-Auth": f"Bearer {config('HELICONE_API_KEY')}",
            "Helicone-Property-Step": "Manifesto Bot",
            "Helicone-Property-UUID": "Rob Hardy, Manifesto King",
        },
    },
)

# Podscan API base URL and credential, read from the decouple configuration.
podscan_host = config("PODSCAN_HOST")
podscan_api_key = config("PODSCAN_API_KEY")

def _build_chunk_metadata(transcript_guid, episode_name, episode_audio_url, snippet_id, text):
    """Build the metadata dict stored alongside one transcript chunk."""
    metadata = {
        "transcript_id": transcript_guid,
        "episode_name": episode_name,
        "source_type": "podcast_episode",
        "snippet_id": snippet_id,
        "text": text,
    }
    # Omit the key when Podscan returns no audio URL rather than storing None
    # (metadata values are expected to be scalars — TODO confirm for the
    # installed Chroma version).
    if episode_audio_url:
        metadata["episode_audio_url"] = episode_audio_url
    return metadata


def fetch_all_podcast_episodes(podcast_id, per_page=10, pages=1):
    """
    Fetch a podcast's episodes from Podscan, chunk each transcript, and store
    the chunks in the module-level Chroma ``collection``.

    Each transcript is split with ``SemanticChunker``. Any chunk whose
    JSON-serialized metadata exceeds 40960 bytes is re-split with a
    ``RecursiveCharacterTextSplitter`` (1000 characters, 200 overlap). Every
    stored chunk gets a fresh UUID as its id and ``snippet_id``.

    Args:
        podcast_id: Podscan podcast identifier used in the request path.
        per_page: Currently unused.
        pages: Currently unused.

    Returns:
        None. Results are written to ``collection`` as a side effect.

    Raises:
        Exception: If Podscan responds with a non-200 status code.

    NOTE(review): the request always asks for ``per_page=100`` and reads a
    single page, so ``per_page`` and ``pages`` have no effect. Honouring them
    would change how many episodes a default call ingests — confirm the
    intended behaviour and Podscan's paging parameters before wiring them in.
    """
    # Size threshold for a chunk's serialized metadata, in bytes.
    # Presumably inherited from a vector store with a 40 KB metadata cap — confirm.
    max_metadata_bytes = 40960

    text_splitter = SemanticChunker(embeddings)
    backup_text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        is_separator_regex=False,
    )

    print("Making request to Podscan")
    podscan_search = f'{podscan_host}/podcasts/{podcast_id}/episodes?per_page=100'
    print(f'podcast search url: {podscan_search}')
    headers = {
        "Authorization": f"Bearer {podscan_api_key}"
    }
    response = requests.get(podscan_search, headers=headers)
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        raise Exception(f"Error from podscan: {response.status_code}")
    episodes = response.json()
    print(episodes['pagination'])
    raw_episodes = list(episodes['episodes'])
    print(f"Found {len(raw_episodes)} episodes")

    for episode in raw_episodes:
        transcript = episode['episode_transcript']
        episode_audio_url = episode['episode_audio_url']
        name = episode['episode_title']
        transcript_guid = episode['episode_guid'] or str(uuid.uuid4())

        # Nothing to chunk or embed when the transcript is empty or missing.
        if not transcript:
            print(f"Skipping episode without transcript: {name}")
            continue

        metadatas = []
        ids = []
        chunks_to_save = []
        for chunk in text_splitter.split_text(transcript):
            chunk_id = str(uuid.uuid4())
            metadata = _build_chunk_metadata(
                transcript_guid, name, episode_audio_url, chunk_id, chunk
            )

            # Measure the serialized payload itself. sys.getsizeof would report
            # the Python object's memory footprint instead (and `sys` is not
            # imported in this notebook).
            size = len(json.dumps(metadata).encode("utf-8"))

            if size > max_metadata_bytes:
                # The chunk is too large to carry in metadata: re-split it and
                # store each smaller piece under its own id.
                for smaller_chunk in backup_text_splitter.split_text(chunk):
                    smaller_id = str(uuid.uuid4())
                    metadatas.append(
                        _build_chunk_metadata(
                            transcript_guid, name, episode_audio_url, smaller_id, smaller_chunk
                        )
                    )
                    ids.append(smaller_id)
                    chunks_to_save.append(smaller_chunk)
            else:
                chunks_to_save.append(chunk)
                metadatas.append(metadata)
                ids.append(chunk_id)

        # Skip the write when the splitter produced no chunks.
        if not ids:
            continue

        collection.add(documents=chunks_to_save, metadatas=metadatas, ids=ids)
        

       

def find_podcast(name):
    """
    Search Podscan for podcasts matching ``name`` and print the JSON response.

    Args:
        name: Free-text search query.

    Returns:
        None. The parsed JSON response is printed.

    Raises:
        Exception: If Podscan responds with a non-200 status code.
    """
    podscan_search = f'{podscan_host}/podcasts/search'
    headers = {
        "Authorization": f"Bearer {podscan_api_key}"
    }
    # Pass the query through ``params`` so requests URL-encodes it; interpolating
    # it into the URL breaks for names containing '&', '#', '+' and similar.
    response = requests.get(podscan_search, headers=headers, params={"query": name})
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        raise Exception(f"Error from podscan: {response.status_code}")
    print(response.json())

In [28]:
# Podscan identifiers recorded for reference; the podcast_id matches the
# disabled ingestion call at the bottom of this cell.
#   podcast_guid=864ac56d-fd59-5769-9fd0-8a5e9f6276fd
#   podcast_id=pd_eym7vj47633j43wp
# Search Podscan by free text; find_podcast prints the JSON response.
find_podcast("LinkedIn Algorithm Insights 2024: What You Need to Know The Content 10x Podcast")
# Ingestion is kept disabled; uncomment to fetch, chunk and store that podcast's episodes.
# fetch_all_podcast_episodes("pd_eym7vj47633j43wp")

{'podcasts': [{'podcast_id': 'pd_42yajr2746w9p8ow', 'podcast_guid': '0879dcc4-6f1f-537e-940d-fc22e94139b6', 'podcast_name': 'Social Media Explained', 'podcast_url': 'https://www.spreaker.com/podcast/social-media-explained--6079022', 'podcast_description': '"Social Media Explained" is an insightful and engaging podcast that demystifies the complex world of social media for its listeners. Each episode breaks down the latest trends, algorithms, and strategies across various platforms, from Facebook and Instagram to Twitter and LinkedIn. The podcast features expert guests, including social media influencers, digital marketers, and platform creators, who share their experiences, tips, and predictions for the future of social networking. Whether you\'re a social media novice looking to understand the basics or a seasoned professional seeking to enhance your strategies, "Social Media Explained" offers valuable insights into making the most of the digital landscape. Through in-depth analysis, 

In [26]:
# Retrieval check: query the Chroma collection with a free-text question.
# The recorded output below lists ten matches with their distances and metadata.
collection.query(query_texts=["who is the guest?"])

{'ids': [['2fbd1c7b-d6e5-4528-8ce5-6ab3f88d1f51',
   '69e06d6c-9165-44ac-b0d3-0e87a673dc5c',
   '6df2bbe1-932f-4bef-bb1d-227a3b97156a',
   'd99fba73-61cf-440d-b9e6-84532d57d4a1',
   'a0bad74a-969a-410c-8ae0-3dd8517366cc',
   '13596ab9-f082-43f8-b923-88376ddd8b02',
   '3b04be73-b39a-4896-8a47-f303aae44a0c',
   'dc223005-ce17-4098-9d7e-4ccc2391696e',
   '5710d586-5708-461a-8464-ebe0e72d2a7f',
   '984d5c5e-0bac-4135-8def-c9f35c22b4f6']],
 'distances': [[1.2752716541290283,
   1.335811734199524,
   1.3624145984649658,
   1.3624145984649658,
   1.3624145984649658,
   1.379756216347017,
   1.3874602317810059,
   1.3976866006851196,
   1.4050308465957642,
   1.4050308465957642]],
 'metadatas': [[{'episode_audio_url': 'https://pdst.fm/e/chrt.fm/track/3C7AB4/traffic.megaphone.fm/ULROC1899373334.mp3?updated=1711574120',
    'episode_name': '#186: Listener Q&amp;A (Part 2) – Selling without feeling salesy, choosing what products to create, my future plans, and more.',
    'snippet_id': '2fbd1c7b-