In [None]:
# # MyScheme Portal QA System - Demo Notebook
# 
# This notebook demonstrates the complete workflow of the MyScheme Portal QA System, from data collection to query answering.

# ## 1. Setup and Dependencies

# In[1]:

# Install required packages (uncomment to run; %pip installs into the active kernel's environment)
# %pip install beautifulsoup4 requests pandas tqdm nltk scikit-learn sentence-transformers faiss-cpu torch transformers

# Import necessary libraries
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import torch
from tqdm.notebook import tqdm

# Report the installed PyTorch build and whether a CUDA device can be used
print(f"PyTorch version: {torch.__version__}")
cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    # Index 0 refers to the first CUDA device
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

In [None]:
# ## 2. Data Collection: Web Scraping

# In[2]:

# Import the scraper module
from scraper import MySchemePortalScraper

# Build the scraper for the MyScheme portal
scraper = MySchemePortalScraper()

# Keep the demo quick by capping how many schemes are scraped;
# a real run would scrape more schemes than this
demo_limit = 20

# Run the scrape, bounded by the demo cap
schemes = scraper.scrape(limit=demo_limit)

# Save the data through the scraper's JSON and CSV writers
scraper.save_to_json("myscheme_data_demo.json")
scraper.save_to_csv("myscheme_data_demo.csv")

# Preview the first few scraped schemes as a table
preview_rows = schemes[:5]
pd.DataFrame(preview_rows).head()

In [None]:
# ## 3. Data Processing

# In[3]:

# Import the data processor
from data_processor import SchemeDataProcessor

# Initialize the processor with the demo data file
processor = SchemeDataProcessor("myscheme_data_demo.json")

# Process the data; the call returns the processed schemes and their chunks
processed_data, chunks = processor.process()

# Save the processed data and the chunks to their own JSON files
processor.save_processed_data("processed_schemes_demo.json")
processor.save_chunks("scheme_chunks_demo.json")

# Display summary counts
print(f"Number of processed schemes: {len(processed_data)}")
print(f"Number of chunks created: {len(chunks)}")

# Display a sample chunk. The lookup is guarded because chunks[0] raises
# IndexError when processing produced no chunks, which would abort a full
# Restart & Run All instead of reporting the empty result.
print("\nSample chunk:")
if len(chunks) > 0:
    print(json.dumps(chunks[0], indent=2))
else:
    print("(no chunks were produced)")