<a href="https://colab.research.google.com/github/lalithkumar12345/sithafal/blob/main/sithafalTask2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss  # Ensure FAISS is installed
from transformers import pipeline

class DataIngestion:
    def __init__(self, urls):
        self.urls = urls
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.embeddings = []
        self.metadata = []

    def crawl_and_scrape(self):
        for url in self.urls:
            response = requests.get(url)
            soup = BeautifulSoup(response.text, 'html.parser')
            text = soup.get_text()
            self.process_content(text, url)

    def process_content(self, text, url):
        chunks = self.segment_content(text)
        for chunk in chunks:
            embedding = self.model.encode(chunk)
            self.embeddings.append(embedding)
            self.metadata.append(chunk)

    def segment_content(self, text):
        return text.split('\n\n')  # Simple segmentation by paragraphs

    def store_embeddings(self):
        embedding_matrix = np.array(self.embeddings).astype('float32')
        dimension = embedding_matrix.shape[1]
        index = faiss.IndexFlatL2(dimension)
        index.add(embedding_matrix)
        faiss.write_index(index, 'embeddings.index')

class QueryHandler:
    def __init__(self, index, model):
        self.index = index
        self.model = model

    def handle_query(self, query):
        query_embedding = self.model.encode(query)
        D, I = self.index.search(np.array([query_embedding]).astype('float32'), k=5)
        return I

class ResponseGenerator:
    def __init__(self):
        self.llm = pipeline('text-generation', model='gpt2')

    def generate_response(self, relevant_chunks, user_query):
        context = " ".join(relevant_chunks)
        prompt = f"Context: {context}\nQuestion: {user_query}\nAnswer:"
        response = self.llm(prompt, max_length=150, num_return_sequences=1)
        return response[0]['generated_text']

if __name__ == "__main__":
    urls = [
        "https://www.uchicago.edu/",
        "https://www.washington.edu/",
        "https://www.stanford.edu/",
        "https://und.edu/"
    ]

    # Step 1: Ingest Data
    ingestion = DataIngestion(urls)
    ingestion.crawl_and_scrape()
    ingestion.store_embeddings()

    # Step 2: Load FAISS index
    index = faiss.read_index('embeddings.index')
    query_handler = QueryHandler(index, ingestion.model)

    # Step 3: Handle User Query
    user_query = "What is the focus of research at the University of Chicago?"
    relevant_indices = query_handler.handle_query(user_query)

    # Step 4: Generate Response
    response_generator = ResponseGenerator()
    relevant_chunks = [ingestion.metadata[i] for i in relevant_indices[0]]
    response = response_generator.generate_response(relevant_chunks, user_query)

    print(response)


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Context: 
More about research  Research 
Stanford University 
More about Stanford University of North Dakota
Question: What is the focus of research at the University of Chicago?
Answer: The focus of research at the University of Chicago is building an innovative global information technology company. To date, the company is developing four new products and three different technology products: data centers, Internet providers, and Internet companies.
In 2013, the company released the first of six strategic partnerships, including three to partner with companies like the Internet of Things (IoT). These partnerships allow Stanford to expand and improve its own information technologies and innovation process (such as using smart cities to connect to other technologies).
About the University of Chicago  
