In [1]:
# Environment setup cell: shell escapes that install the third-party packages
# used by this notebook (search, scraping, LangChain, Chroma, Transformers).
# NOTE(review): no versions are pinned, so a fresh run may resolve different
# releases than the ones this notebook was developed against.
!pip install requests beautifulsoup4 duckduckgo-search langchain chromadb tiktoken
!pip install transformers
!pip install langchain-community
# Log in to the Hugging Face Hub from the command line.
# NOTE(review): presumably required to download the Llama weights used later — confirm.
!huggingface-cli login
# bitsandbytes backs the BitsAndBytesConfig (8-bit loading) used in the generation cell.
!pip install bitsandbytes

Collecting langchain-community
  Downloading langchain_community-0.3.24-py3-none-any.whl.metadata (2.5 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading mypy_extensions-1.1.0-py3-no

In [2]:
from duckduckgo_search import DDGS #all necessary imports
import requests
from bs4 import BeautifulSoup
import json, time
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
import json

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import ChatPromptTemplate
from transformers import pipeline, BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer
from langchain_community.llms import HuggingFacePipeline
import torch
import chromadb
import gc

In [3]:

def crawl(topic, max_results=10, output_dir='data'):
    """Search DuckDuckGo for ``topic`` and save the paragraph text of each hit.

    Each search result is downloaded, the text of its ``<p>`` elements is
    joined with spaces, truncated to 10000 characters and collected. The
    collected records are written to ``<output_dir>/crawled.json``.

    Args:
        topic: Search query passed to DuckDuckGo.
        max_results: Maximum number of search results to fetch.
        output_dir: Directory that receives ``crawled.json`` (created if missing).

    Returns:
        The list of ``{'title', 'url', 'text'}`` dicts that was written to disk.
    """
    results = DDGS().text(topic, max_results=max_results) or []
    crawled = []

    for r in results:
        url = r.get('href')
        if not url:
            # Nothing to download for a result without a link.
            continue
        title = r.get('title', url)

        try:
            page = requests.get(url, timeout=10)
            # Treat 4xx/5xx as failures so error pages are not indexed as content.
            page.raise_for_status()
            soup = BeautifulSoup(page.text, 'html.parser')
            text = ' '.join(p.get_text() for p in soup.find_all('p'))
        except Exception as exc:
            # Best-effort crawl: one bad page must not abort the run. Catching
            # Exception (not a bare except) still lets Ctrl-C interrupt the loop.
            print(f"Failed to fetch: {url} ({exc})")
        else:
            if text.strip():
                crawled.append({'title': title, 'url': url, 'text': text[:10000]})
                print(f"Fetched: {title}")
            else:
                # A page without paragraph text would yield no chunks downstream.
                print(f"Skipped (no paragraph text): {url}")
        time.sleep(1)  # pause between requests

    os.makedirs(output_dir, exist_ok=True)
    file_path = os.path.join(output_dir, 'crawled.json')

    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(crawled, f, ensure_ascii=False, indent=2)

    return crawled

In [4]:

def chunk_documents(input_path='data/crawled.json', output_path='data/chunks.json',
                    chunk_size=500, chunk_overlap=50):
    """Split crawled documents into overlapping text chunks and save them as JSON.

    Args:
        input_path: JSON file holding a list of dicts with ``title``, ``url``
            and ``text`` keys.
        output_path: Destination JSON file for the chunk records.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to ``RecursiveCharacterTextSplitter``.

    Returns:
        The list of ``{'title', 'url', 'content'}`` dicts written to ``output_path``.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        docs = json.load(f)

    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    # One record per chunk; every chunk keeps its source document's title and URL.
    chunks = [
        {"title": doc['title'], "url": doc['url'], "content": piece}
        for doc in docs
        for piece in splitter.split_text(doc['text'])
    ]

    # Make sure the destination directory exists before writing.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(chunks, f, ensure_ascii=False, indent=2)

    return chunks


In [5]:
def embed_and_index(chunk_path='data/chunks.json', collection_name="wikipe", batch_size=10):
    """Embed the saved chunks and store them in a fresh Chroma collection.

    Any existing collection called ``collection_name`` is dropped first, so
    re-running the function rebuilds the index from scratch.

    Args:
        chunk_path: JSON file holding a list of dicts with ``title``, ``url``
            and ``content`` keys.
        collection_name: Name of the Chroma collection to (re)create.
        batch_size: Number of chunks embedded and added per step; kept small
            because of RAM limits.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    with open(chunk_path, 'r', encoding='utf-8') as f:
        chunks = json.load(f)

    # NOTE(review): chromadb.Client() is created without a persistence path;
    # presumably the index only lives for the current process — confirm.
    client = chromadb.Client()

    try:
        client.delete_collection(collection_name)
        print(f"Deleted existing collection '{collection_name}'")
    except Exception:
        # Deleting a collection that does not exist raises; there is then
        # nothing to clean up. Exception (not a bare except) keeps Ctrl-C working.
        pass

    collection = client.create_collection(collection_name)
    embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    for i in range(0, len(chunks), batch_size):
        batch_chunks = chunks[i:i + batch_size]
        texts = [chunk['content'] for chunk in batch_chunks]
        metadata = [{"title": chunk['title'], "url": chunk['url']} for chunk in batch_chunks]

        # Ids are the chunk's position in the full list, so they stay unique across batches.
        ids = [str(j) for j in range(i, i + len(batch_chunks))]

        embeddings = embedder.embed_documents(texts)

        collection.add(
            documents=texts,
            metadatas=metadata,
            embeddings=embeddings,
            ids=ids
        )
        print(f"Indexed batch {i//batch_size + 1}")

        # Drop the batch data before the next iteration to keep peak memory low.
        del batch_chunks, texts, metadata, embeddings, ids
        gc.collect()

In [6]:



def retrieve_and_generate(topic, sections, model_id="meta-llama/Meta-Llama-3-8B",
                          output_path='article_output.md', n_results=5):
    """Generate a Markdown article about ``topic``, one retrieved-context section at a time.

    For every section name, the stored chunks closest to "<topic> <section>"
    are fetched from the ``wikipe`` Chroma collection and handed to the
    language model as context. The sections are concatenated under a
    top-level heading and written to ``output_path``.

    Args:
        topic: Article subject; used as the title and in every retrieval query.
        sections: Iterable of section names to generate, in order.
        model_id: Hugging Face model identifier of the causal LM to load.
        output_path: Markdown file that receives the finished article.
        n_results: Number of chunks retrieved per section.

    Returns:
        The article text, or ``None`` when the ``wikipe`` collection is missing.
    """
    client = chromadb.Client()
    try:
        collection = client.get_collection("wikipe")
    except Exception:
        print("Error: Collection 'wikipe' not found")
        return

    embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    # The 8-bit config used to be built but never handed to from_pretrained, so
    # the model loaded unquantized. It is applied only when CUDA is present;
    # without a GPU the model is loaded with the library defaults.
    load_kwargs = {"device_map": "auto"}
    if torch.cuda.is_available():
        load_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
        load_kwargs["torch_dtype"] = torch.float16
    model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token

    # The dtype is chosen at load time above, so it is not repeated here.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=500,
        return_full_text=False,
    )

    # Wrap the pipeline so it can be invoked through the LangChain interface.
    llm = HuggingFacePipeline(pipeline=pipe)

    # The template is a plain string with placeholders. Scraped context is
    # supplied as a *value* at format time, so braces inside it are never
    # parsed as template variables.
    prompt = ChatPromptTemplate.from_template(
        "Write a Wikipedia-style '{section}' section for '{topic}' using this context:\n{context}\n\n"
        "Maintain a neutral, encyclopedic style, clearly structure the content with substantial detail, and do not repeat this prompt"
    )

    article = f"# {topic}\n\n"

    for section in sections:
        query = f"{topic} {section}"
        query_emb = embedder.embed_query(query)

        results = collection.query(query_embeddings=[query_emb], n_results=n_results)
        # Fall back to a placeholder when nothing was retrieved.
        if results and results.get('documents') and results['documents'][0]:
            context = "\n\n".join(results['documents'][0])
        else:
            context = "No relevant context for this section."

        prompt_text = prompt.format(section=section, topic=topic, context=context)
        generated_text = llm.invoke(prompt_text)

        article += f"## {section}\n\n{generated_text}\n\n"  # concat sections

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(article)

    print("article generated!")
    return article

if __name__ == '__main__':
    # Run the full pipeline: crawl -> chunk -> index -> generate.
    query = input("SEARCH: ").strip()
    if not query:
        # A blank query has nothing to search for; skip the pipeline entirely.
        print("No search topic entered; nothing to do.")
    else:
        crawl(query)
        chunk_documents()
        embed_and_index()
        sections = ["Summary", "History", "Applications", "Society and Culture", "References"] #typical Wiki sections
        retrieve_and_generate(query, sections)

SEARCH: CRISPR Gene editing
Fetched: CRISPR gene editing - Wikipedia
Fetched: What Is CRISPR Gene Editing and How Does It Work?
Fetched: Stanford explainer: CRISPR, gene editing, and beyond
Fetched: World's First Patient Treated with Personalized CRISPR Gene Editing ...
Fetched: Basic Principles and Clinical Applications of CRISPR-Based Genome Editing
Fetched: CRISPR | Definition, Gene Editing, Technology, Uses, & Ethics | Britannica
Fetched: CRISPR technology: A decade of genome editing is only the beginning - AAAS
Fetched: What is CRISPR? Understanding the Revolutionary Gene Editing Technology
Fetched: How does CRISPR work? - Live Science
Fetched: What is CRISPR? - New Scientist


  embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Indexed batch 1
Indexed batch 2
Indexed batch 3
Indexed batch 4
Indexed batch 5
Indexed batch 6
Indexed batch 7
Indexed batch 8
Indexed batch 9
Indexed batch 10
Indexed batch 11
Indexed batch 12
Indexed batch 13
Indexed batch 14
Indexed batch 15
Indexed batch 16
Indexed batch 17
Indexed batch 18


config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

OSError: TheBloke/Llama-2-7B-GGML does not appear to have a file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt or flax_model.msgpack.