In [1]:
#Main

import base64
import string
import requests
from bs4 import BeautifulSoup
from flask import Flask, request, jsonify
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from pymilvus import connections, CollectionSchema, FieldSchema, DataType, Collection
import nest_asyncio
import uvicorn
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

# Web Scraper class definition
class WebScraper:
    """Fetch a single web page and extract the text of its ``<p>`` elements.

    Args:
        url: Address of the page to download.
        headers: Optional mapping of HTTP request headers sent with the
            request (``None`` sends the ``requests`` defaults).
    """

    def __init__(self, url, headers=None):
        self.url = url
        # Previously accepted but dropped; keep it so callers can set e.g. a
        # User-Agent for sites that reject the default one.
        self.headers = headers

    @staticmethod
    def extract_paragraphs(html_content):
        """Return the text of every ``<p>`` tag in *html_content*.

        Declared static because it reads no instance state; it remains
        callable as ``self.extract_paragraphs(html)``.

        Args:
            html_content: HTML markup as a string; ``None`` or ``""`` is
                treated as "no content".

        Returns:
            A list of paragraph strings, empty when there is no content.
        """
        if not html_content:
            return []
        soup = BeautifulSoup(html_content, 'html.parser')
        return [p.text for p in soup.find_all('p')]

    def fetch_and_extract_p(self, timeout=10):
        """Download ``self.url`` and return its paragraph text as one string.

        Args:
            timeout: Seconds to wait for the server before giving up, so a
                stalled server cannot block the caller indefinitely.

        Returns:
            The page's paragraphs joined with single spaces, or ``None`` when
            the request fails or the response status is not 200.
        """
        try:
            response = requests.get(
                self.url, headers=self.headers, timeout=timeout
            )
        except requests.RequestException as exc:
            # Same failure contract as a non-200 status: report, return None.
            print(f"Failed to fetch the page. Error: {exc}")
            return None

        if response.status_code != 200:
            print(f"Failed to fetch the page. Status code: {response.status_code}")
            return None

        return " ".join(self.extract_paragraphs(response.text))

# Function to split paragraphs into sentences
def paragraph_to_sentences(paragraph):
    """Split *paragraph* into a flat list of normalised sentences.

    Each sentence is lower-cased and its internal whitespace collapsed to
    single spaces. The previous implementation flattened the per-sentence
    word lists, so it returned individual words rather than sentences.

    Args:
        paragraph: Text to split. ``None``, ``""`` and whitespace-only input
            yield an empty list.

    Returns:
        A list of sentence strings, with empty sentences dropped.
    """
    if not paragraph or not paragraph.strip():
        return []

    normalised = (
        " ".join(sentence.lower().split())
        for sentence in sent_tokenize(paragraph)
    )
    return [sentence for sentence in normalised if sentence]

# Load the sentence-transformer model once at import time; get_embeddings()
# reuses this single instance for every call.
# NOTE(review): the Milvus "embedding" field is declared with dim=384, so the
# model chosen here must produce vectors of that size - confirm if it changes.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to get embeddings
def get_embeddings(sentences):
    """Encode *sentences* with the module-level ``embedding_model``.

    A progress bar is shown while encoding. The encoder's result is returned
    unchanged.
    """
    return embedding_model.encode(sentences, show_progress_bar=True)

# Register the "default" connection to the Milvus server on localhost.
connections.connect(alias="default", host="localhost", port="19530")

# Collection layout: a caller-assigned integer primary key, the embedding
# vector, and the text the vector was computed from.
id_field = FieldSchema(
    name="id", dtype=DataType.INT64, is_primary=True, auto_id=False
)
embedding_field = FieldSchema(
    name="embedding", dtype=DataType.FLOAT_VECTOR, dim=384
)
text_field = FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=512)

fields = [id_field, embedding_field, text_field]
schema = CollectionSchema(fields=fields, description="Text embeddings")

# Collection handle shared by store_in_milvus() and fetch_from_milvus().
collection_name = "text_embedding_collection"
collection = Collection(collection_name, schema=schema)

# IVF_FLAT index over the vector field using L2 distance; the same metric is
# named again in the search parameters used by fetch_from_milvus().
index_params = dict(
    metric_type="L2",
    index_type="IVF_FLAT",
    params=dict(nlist=100),
)
collection.create_index("embedding", index_params=index_params)

def store_in_milvus(sentences, embeddings):
    """Insert *sentences* and their *embeddings* into the Milvus collection.

    Args:
        sentences: Sequence of text strings.
        embeddings: One vector per sentence, either an array exposing
            ``tolist()`` or a plain sequence of float sequences.

    Returns:
        None. Empty input is a no-op.

    Raises:
        ValueError: If the number of embeddings differs from the number of
            sentences.
    """
    count = len(sentences)
    if count == 0:
        # Nothing to insert; avoid sending an empty batch to Milvus.
        return
    if len(embeddings) != count:
        raise ValueError(
            f"Got {count} sentences but {len(embeddings)} embeddings"
        )

    # Accept both NumPy-style arrays and plain Python sequences.
    if hasattr(embeddings, "tolist"):
        vectors = embeddings.tolist()
    else:
        vectors = [list(vector) for vector in embeddings]

    # The "text" field is declared with max_length=512. Truncating on the
    # UTF-8 byte length is safe whether that limit is counted in bytes or in
    # characters; errors="ignore" drops a multi-byte character cut in half.
    max_text_bytes = 512
    texts = [
        sentence.encode("utf-8")[:max_text_bytes].decode("utf-8", errors="ignore")
        for sentence in sentences
    ]

    # Continue numbering after the rows already stored. Starting from 0 on
    # every call would reuse primary keys across successive loads.
    # NOTE(review): relies on every insert being flushed (as done below) so
    # that num_entities reflects earlier batches - confirm for your setup.
    first_id = collection.num_entities
    ids = list(range(first_id, first_id + count))

    # Column order must match the schema: id, embedding, text.
    collection.insert([ids, vectors, texts])
    collection.flush()

def fetch_from_milvus(query_embedding, top_k=5):
    """Search the collection for the vectors nearest to *query_embedding*.

    Args:
        query_embedding: A single query vector (an array exposing
            ``tolist()`` or a plain sequence of floats).
        top_k: Maximum number of hits to return.

    Returns:
        The search result from ``collection.search``. It holds one hit list
        for the single query, and each hit carries the stored ``text`` field.
    """
    collection.load()

    # Send a plain list of floats rather than an array object.
    if hasattr(query_embedding, "tolist"):
        query_embedding = query_embedding.tolist()

    search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
    return collection.search(
        data=[query_embedding],
        anns_field="embedding",
        param=search_params,
        limit=top_k,
        expr=None,
        # Without this, hits carry no scalar fields and callers reading
        # hit.entity.get("text") never receive the stored text.
        output_fields=["text"],
    )

# Load the GPT-2 tokenizer and language model once at import time from the
# "gpt2" checkpoint; generate_answer() reuses both for every request.
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")

def generate_answer(query, context, max_length=100):
    """Generate a GPT-2 continuation of ``query + " " + context``.

    Args:
        query: The user's question; it is placed first in the prompt.
        context: Retrieved text appended after the query (may be empty).
        max_length: Maximum number of NEW tokens to generate. It used to be
            passed as the total length including the prompt, which left no
            room to generate once the prompt itself reached that length.

    Returns:
        The decoded output sequence (prompt followed by the continuation)
        with special tokens removed.
    """
    prompt = query + " " + context

    # Keep prompt + continuation inside the model's position window.
    window = gpt2_model.config.n_positions
    new_tokens = max(1, min(max_length, window - 1))
    prompt_budget = window - new_tokens

    # Truncation drops tokens from the end by default, so an over-long
    # context is cut while the query at the front of the prompt is kept.
    encoded = gpt2_tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=prompt_budget,
    )

    outputs = gpt2_model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_new_tokens=new_tokens,
        num_return_sequences=1,
        no_repeat_ngram_size=3,  # Prevent repeating trigrams
        repetition_penalty=2.0,  # Penalize repeated tokens more heavily
        do_sample=True,  # Required for temperature/top_p to take effect
        temperature=0.7,  # Sampling temperature
        top_p=0.9,  # Top-p (nucleus) sampling
        # GPT-2 defines no pad token; reuse EOS so generate() has one.
        pad_token_id=gpt2_tokenizer.eos_token_id,
    )
    return gpt2_tokenizer.decode(outputs[0], skip_special_tokens=True)

# Apply the nest_asyncio patch.
# NOTE(review): presumably needed because this script runs inside a notebook
# whose event loop is already running when uvicorn.run() is called - confirm.
nest_asyncio.apply()

# FastAPI application on which the /load and /query routes are registered.
app = FastAPI()

# Request body for POST /load.
class LoadRequest(BaseModel):
    # Address of the web page to scrape, embed and store.
    url: str

# Request body for POST /query.
class QueryRequest(BaseModel):
    # The question to answer.
    query: str
    # Whether to retrieve context from Milvus before generating (default on).
    use_milvus: bool = True

@app.post("/load")
async def load_website_content(request: LoadRequest):
    """Scrape ``request.url``, embed its text and store it in Milvus.

    Args:
        request: Body carrying the URL of the page to load.

    Returns:
        ``{"message": "Content loaded successfully"}`` on success.

    Raises:
        HTTPException: 502 when the page cannot be fetched, 422 when the
            page yields no text to index.
    """
    scraper = WebScraper(request.url)
    try:
        page_text = scraper.fetch_and_extract_p()
    except requests.RequestException as exc:
        raise HTTPException(
            status_code=502, detail=f"Failed to fetch {request.url}: {exc}"
        ) from exc

    # The scraper signals a non-200 response by returning None.
    if page_text is None:
        raise HTTPException(
            status_code=502, detail=f"Failed to fetch {request.url}"
        )

    flat_sentences = paragraph_to_sentences(page_text)
    if not flat_sentences:
        # Nothing to embed; report it instead of inserting an empty batch.
        raise HTTPException(
            status_code=422, detail="No paragraph text found at the given URL"
        )

    embeddings = get_embeddings(flat_sentences)

    # Store in Milvus
    store_in_milvus(flat_sentences, embeddings)

    return {"message": "Content loaded successfully"}

@app.post("/query")
async def query_content(request: QueryRequest):
    """Answer ``request.query``, optionally grounded in text from Milvus.

    Args:
        request: Body carrying the query and the ``use_milvus`` switch.

    Returns:
        ``{"answer": <generated text>}``.
    """
    # Default to no retrieved context so the name is always bound; it was
    # previously undefined when use_milvus was False.
    context = []

    if request.use_milvus:
        # Embed the query and look up its nearest stored texts.
        query_embedding = get_embeddings([request.query])[0]
        results = fetch_from_milvus(query_embedding)
        # One query was sent, so results[0] holds its hits.
        hit_texts = (hit.entity.get("text") for hit in results[0])
        context = [text for text in hit_texts if text is not None]

    # generate_answer is a plain synchronous call.
    answer = generate_answer(request.query, " ".join(context))

    return {"answer": answer}

# Start the API server only when this file is run as the main module; it
# listens on the loopback interface, port 8000.
if __name__ == "__main__":
    uvicorn.run(app, host="127.0.0.1", port=8000)


  from tqdm.autonotebook import tqdm, trange





INFO:     Started server process [2516]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)
INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [2516]


KeyboardInterrupt: 