In [None]:
# # Disable SSL certificate verification (insecure workaround; keep commented out unless needed)
# import ssl
# ssl._create_default_https_context = ssl._create_unverified_context

In [None]:
#For huggingface embeddings, run without this comment in cell
pip install sentence-transformers

In [None]:
#Was not originally included
pip install faiss-cpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp310-cp310-macosx_14_0_arm64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp310-cp310-macosx_14_0_arm64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
from langchain_community.document_loaders import YoutubeLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import Ollama
from langchain.schema import Document
import json
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA


In [2]:
# Transcript loader for the source video; the extra video-info lookup is switched off.
loader = YoutubeLoader.from_youtube_url(
    "https://www.youtube.com/watch?v=KhFlD54nQrY&t=1205s",
    add_video_info=False,
)

In [3]:
# Run the loader; the result is what split_pages chunks in a later cell.
youtube_transcript = loader.load()

In [4]:
def split_pages(pages, chunk_size=2000, chunk_overlap=300):
    """Split documents into overlapping chunks.

    Args:
        pages: Documents to split; handed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Maximum chunk length passed to the splitter. Defaults to
            2000, the value that used to be hard-coded here.
        chunk_overlap: Overlap between neighbouring chunks passed to the
            splitter. Defaults to 300, the value that used to be hard-coded.

    Returns:
        The chunked documents produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Separators are tried in order: paragraph break, line break, space,
        # and finally the empty string as a last resort.
        separators=["\n\n", "\n", " ", ""],
    )
    return text_splitter.split_documents(pages)

In [5]:
# Chunk the loaded transcript; `pages` feeds the extraction loop further down.
pages = split_pages(youtube_transcript)

In [6]:
# Peek at the first five chunks.
pages[:5]

[Document(page_content="I've been in business for 13 years I've sold nine companies my last company I sold for 46.2 million I own acquisition. comom which right now does about $17 million a month across our portfolio I'm going to compress 13 years of brutal business truths and lessons into this one video brutal business truth number one sell to rich people until you have the money to sell all the poor people the middle is where you get killed Elon said you can either do a lot of good for a small amount of people or a little good for a large amount of people and I will tell you personally it's way hard to do a little good for a lot of people than a lot of good for a small amount of people there is a reason that Tesla started at the top they started only selling to rich people they had the Roadster which was $250,000 and so they sold those and then he was able to go down Market one and sell $100,000 cars to just upper Ash line of society the wealthy and then after that he made the X whic

In [7]:
# LLM handle shared by the extraction step (process_chunk_to_alpaca) and the RetrievalQA chain below.
llm = Ollama(model="qwen2.5-coder:7b")

In [8]:
def local_llm(prompt: str, llm) -> str:
    """
    Run ``prompt`` through ``llm.invoke`` and return the reply with
    surrounding whitespace removed.

    Best-effort by design: any exception raised while invoking the model or
    stripping its reply is printed, and an empty string is returned instead.
    """
    reply = ""
    try:
        raw_reply = llm.invoke(prompt)
        reply = raw_reply.strip()
    except Exception as err:
        print("Error invoking model:", err)
    return reply


In [9]:
def process_chunk_to_alpaca(doc: Document, llm) -> dict:
    """Turn one transcript chunk into an Alpaca-style record.

    The chunk's ``source`` metadata (or "Unknown Name" when the key is absent)
    is embedded in a fixed extraction instruction. The instruction, followed by
    the stripped chunk text, is sent to ``llm`` through ``local_llm``.

    Returns:
        A dict with the keys ``instruction``, ``input`` (stripped chunk text),
        ``output`` (the model reply) and ``metadata`` (the chunk's own
        metadata object, not a copy).
    """
    chunk_text = doc.page_content.strip()
    origin = doc.metadata.get("source", "Unknown Name")

    # The source name is injected into the instruction; the text itself is fixed.
    instruction = f"""
You are a business assistant analyzing raw business content from the following source:
SOURCE NAME: {origin}

Your task is to extract the following from the provided transcript:
1. Frameworks (e.g., naming, advertising, validation models).
2. Bullet points for key ideas or steps.
3. Q&A (any implied or stated questions with answers).
4. Case Examples or stories.
5. Copywriting formulas (AIDA, PAS, etc.)
6. Classify this content into high-level topics: e.g., Naming, Ads, Psychology, Copywriting.
7. Convert suitable content into a step-by-step guide.

Return your output in clearly labeled sections, and only include sections with relevant content. Do not include a preamble.
""".strip()

    return {
        "instruction": instruction,
        "input": chunk_text,
        "output": local_llm(f"{instruction}\n\n{chunk_text}", llm),
        "metadata": doc.metadata,
    }


In [10]:
# Build Alpaca-style records for the first five chunks only.
alpaca_data = []
alpaca_data.extend(process_chunk_to_alpaca(chunk, llm) for chunk in pages[0:5])

In [11]:
# Display the generated records.
alpaca_data

[{'instruction': 'You are a business assistant analyzing raw business content from the following source:\nSOURCE NAME: KhFlD54nQrY\n\nYour task is to extract the following from the provided transcript:\n1. Frameworks (e.g., naming, advertising, validation models).\n2. Bullet points for key ideas or steps.\n3. Q&A (any implied or stated questions with answers).\n4. Case Examples or stories.\n5. Copywriting formulas (AIDA, PAS, etc.)\n6. Classify this content into high-level topics: e.g., Naming, Ads, Psychology, Copywriting.\n7. Convert suitable content into a step-by-step guide.\n\nReturn your output in clearly labeled sections, and only include sections with relevant content. Do not include a preamble.',
  'input': "I've been in business for 13 years I've sold nine companies my last company I sold for 46.2 million I own acquisition. comom which right now does about $17 million a month across our portfolio I'm going to compress 13 years of brutal business truths and lessons into this o

In [12]:
def alpaca_to_documents(alpaca_data):
    """Convert Alpaca-style records into LangChain ``Document`` objects.

    Each record's ``output`` becomes the document text (a record without that
    key raises ``KeyError``). Its ``instruction`` and ``input`` are carried
    along as metadata, defaulting to an empty string when missing.
    """
    return [
        Document(
            page_content=record["output"],
            metadata={
                "instruction": record.get("instruction", ""),
                "input": record.get("input", ""),
            },
        )
        for record in alpaca_data
    ]

In [13]:
# Embedding model used to vectorise documents for the FAISS index.
embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# Turn the Alpaca records into Documents and index them with FAISS, using the
# embedding model created in the previous cell.
docs = alpaca_to_documents(alpaca_data)
vectorstore = FAISS.from_documents(docs, embedding)

In [15]:
# Retrieval QA chain: documents fetched from the FAISS store are passed to the
# local LLM using the "stuff" chain type.
qa_chain = RetrievalQA.from_chain_type(
    retriever=vectorstore.as_retriever(),
    chain_type="stuff",
    llm=llm,
)


In [None]:
"""
Qwen is the slowest of all the models tried, taking about 2.3 minutes.
"""

In [16]:
query = "Should we target to sell to rich people or otherwise?"
# Chain.run is deprecated; invoke takes the chain's input dict ("query" for
# RetrievalQA) and returns a dict whose "result" entry holds the answer text.
response = qa_chain.invoke({"query": query})["result"]

print("\n🔍 Answer from local model:")
print(response)


  warn_deprecated(



🔍 Answer from local model:
You should target both rich people and other market segments. The strategy involves focusing on the wealthy first to establish a strong business foundation, then moving downmarket to capture a broader customer base at an affordable price point of $99/month. This approach allows you to leverage high-value products for initial profitability while scaling to reach more customers through efficient operations.


In [17]:
query = "What does it mean to sell to the rich until you can afford to sell to the poor?"
# Chain.run is deprecated; invoke takes the chain's input dict ("query" for
# RetrievalQA) and returns a dict whose "result" entry holds the answer text.
response = qa_chain.invoke({"query": query})["result"]

print("\n🔍 Answer from local model:")
print(response)



🔍 Answer from local model:
Selling to the wealthy first means targeting individuals with higher income levels, typically those who have the financial capability to pay more for products or services. The goal is to establish a strong market presence and build a reputation for excellence in your industry. Once you've built a solid customer base among the rich, you can then start scaling down to lower-income customers by offering lower-priced alternatives that still provide value. This approach leverages the fact that wealthy individuals are often more willing to pay higher prices for premium products or services, allowing you to generate initial revenue and invest in infrastructure before moving into mass-market strategies.


In [None]:
# Persist the records as JSON Lines: one JSON object per line, UTF-8 encoded,
# with non-ASCII characters written as-is rather than escaped.
with open("alpaca_dataset.jsonl", "w", encoding="utf-8") as out_file:
    out_file.writelines(
        json.dumps(record, ensure_ascii=False) + "\n" for record in alpaca_data
    )
