In [8]:
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
import nest_asyncio
nest_asyncio.apply()
# Load HTML

loader = AsyncChromiumLoader(["https://ideasupplychain.com"])
html = loader.load()


In [9]:
html

[Document(page_content='<!DOCTYPE html><html lang="en" class="has-dark-text"><head>\n\n    <title>Idea Supply Chain</title>\n    <meta charset="utf-8">\n    <meta name="viewport" content="width=device-width, initial-scale=1.0">\n    \n    <link rel="preload" as="style" href="/assets/built/screen.css?v=1be1ca579d">\n    <link rel="preload" as="script" href="/assets/built/source.js?v=1be1ca579d">\n\n    <link rel="stylesheet" type="text/css" href="/assets/built/screen.css?v=1be1ca579d">\n\n    <style>\n        :root {\n            --background-color: #ffffff\n        }\n    </style>\n\n    <script>\n        /* The script for calculating the color contrast has been taken from\n        https://gomakethings.com/dynamically-changing-the-text-color-based-on-background-color-contrast-with-vanilla-js/ */\n        var accentColor = getComputedStyle(document.documentElement).getPropertyValue(\'--background-color\');\n        accentColor = accentColor.trim().slice(1);\n        var r = parseInt(acc

In [10]:
bs_transformer = BeautifulSoupTransformer()
docs_transformed = bs_transformer.transform_documents(html)

In [14]:
from submind.llms.submind import SubmindModelFactory


llm = SubmindModelFactory.get_model("testing-scraper", "scraping", 0)
from langchain.chains import create_extraction_chain

schema = {
    "properties": {
        "blog_title": {"type": "string"},
        "blog_excerpt": {"type": "string"},
        "blog_post": {"type": "string"},
    },
    "required": ["blog_title", "blog_post", "blog_excerpt"],
}


def extract(content: str, schema: dict):
    return create_extraction_chain(schema=schema, llm=llm).run(content)

    
import pprint

from langchain_text_splitters import RecursiveCharacterTextSplitter


def scrape_with_playwright(urls, schema):
    loader = AsyncChromiumLoader(urls)
    docs = loader.load()
    bs_transformer = BeautifulSoupTransformer()
    docs_transformed = bs_transformer.transform_documents(
        docs, tags_to_extract=["h1", "p"]
    )
    print("Extracting content with LLM")

    # Grab the first 1000 tokens of the site
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=1000, chunk_overlap=0
    )
    splits = splitter.split_documents(docs_transformed)

    # Process the first split
    extracted_content = extract(schema=schema, content=splits[0].page_content)
    pprint.pprint(extracted_content)
    return extracted_content


urls = ["https://ideasupplychain.com/the-cost-of-being-weird/"]
extracted_content = scrape_with_playwright(urls, schema=schema)

Extracting content with LLM
[{'blog_excerpt': "What is your value if you don't fit into an understandable "
                  "category? There's a definite cost to being an outlier "
                  'before you are a good outlier.',
  'blog_post': "What is your value if you don't fit into an understandable "
               "category? There's a definite cost to being an outlier before "
               'you are a good outlier. I’ve seen all sorts of things about '
               'why it’s good to be weird. It’s important to stand out from '
               'the crowd if you want to be noticed. That makes sense. In '
               'order to be an outlier, you need to be an outlier, right? But '
               'how weird is too weird? There’s something I’ve noticed. The '
               'explanation has eluded me thus far, but I think I might have '
               'it now, so let’s take a shot at describing it. When you are '
               'too weird, you are really hard to measure. Thi

In [18]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.output_parsers.openai_functions import JsonOutputFunctionsParser, JsonKeyOutputFunctionsParser
from langchain_core.prompts import ChatPromptTemplate
QUESTION_PROMPT = """ You are an expert question writer. Based on this blog post: {post}

what are the questions that can be answered by it?

"""

functions = [
    {
        "name": "extract_questions",
        "description": "decide which questions can be answered",
        "parameters": {
            "type": "object",
            "properties": {
                "questions": {
                    "type": "array",
                    "description": "which questions can be answered",
                    "items": {
                        "type": "string",
                        "description": "the question"
                        
                        

                    }
                },
            },
            "required": ["questions"],
        },
    }
]
    



prompt = ChatPromptTemplate.from_template(QUESTION_PROMPT)
chain = prompt | llm.bind(function_call={"name": "extract_questions"},
                                functions=functions) | JsonOutputFunctionsParser()


response = chain.invoke(
    {"post": extracted_content[0]['blog_post'],
     
    })

print(response)


{'questions': ['What is the value of being an outlier?', 'How can you measure new knowledge?', 'Why is it important to stand out from the crowd?', 'What are OKRs and how do they relate to performance reviews?', "How do you measure personal growth that doesn't fit into traditional metrics?", 'What challenges do outliers face in the job market?', 'How can you determine the market value of a multi-skilled individual?', 'Why do people want to listen to individuals with weird ideas?', 'What is A/B testing and how does it relate to user behavior?', "How can you help people who can't afford to pay for your services?"]}


In [19]:
from podcast.tasks import detailed_search
for question in response['questions']:
    detailed_search(question)
    

  from tqdm.autonotebook import tqdm
  warn_deprecated(
  warn_deprecated(


Making request to Podscan
Found 5 episodes
Found 5 episodes
Making request to Podscan
Found 5 episodes
Found 5 episodes
Making request to Podscan
Found 5 episodes
Found 5 episodes
Making request to Podscan
Found 5 episodes
Found 5 episodes
Making request to Podscan
Found 5 episodes
Found 5 episodes
Making request to Podscan
Found 5 episodes
Found 5 episodes
Making request to Podscan
Found 5 episodes
Found 5 episodes
Making request to Podscan
Found 5 episodes
Found 5 episodes
Making request to Podscan
Found 5 episodes
Found 5 episodes
Making request to Podscan
Found 5 episodes
Found 5 episodes
