In [1]:
!pip install PyPDF2



In [2]:
from PyPDF2 import PdfReader

def extract_text_from_pdf(file_path: str) -> str:
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order. A page whose
        ``extract_text()`` call yields ``None`` or an empty string contributes
        nothing to the result.

    Raises:
        ValueError: If the file cannot be opened or any page fails to parse.
            The underlying exception is attached as ``__cause__``.
    """
    try:
        reader = PdfReader(file_path)
        # `or ""` guards against a page yielding None instead of a string;
        # join() avoids repeated string concatenation on long documents.
        return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        # Chain the original error so the real traceback is not lost.
        raise ValueError(f"Error extracting text from PDF: {e}") from e


In [3]:
from transformers import pipeline

# Load a Hugging Face text-classification pipeline once at module level so
# classify_text() can reuse it on every call.
# NOTE(review): no `device` argument is passed, so (per the warning this cell
# prints) the model is placed on CPU even when an accelerator is available.
classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")

def classify_text(text):
    """Classify text with the module-level ``classifier`` pipeline.

    Only the first 1024 *characters* of ``text`` are sent to the model. This
    is a character cap, not a token cap, so tokenizer truncation is also
    enabled: a snippet that still tokenizes to more tokens than the model
    accepts is cut by the tokenizer instead of failing inside the model.

    Args:
        text: The text to classify.

    Returns:
        A ``(label, score)`` tuple taken from the pipeline's first prediction.
    """
    snippet = text[:1024]  # cheap character-level cap to bound inference cost
    result = classifier(snippet, truncation=True)
    return result[0]["label"], result[0]["score"]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [4]:
# Load a Hugging Face summarization pipeline once at module level so
# summarize_text() can reuse it on every call.
# NOTE(review): no `device` argument is passed, so (per the warning this cell
# prints) the model is placed on CPU even when an accelerator is available.
summarizer = pipeline("summarization", model="google/flan-t5-small")

def summarize_text(text):
    """Summarize text with the module-level ``summarizer`` pipeline.

    Only the first 2048 characters of ``text`` are considered. If that snippet
    is empty or whitespace-only (for example a PDF with no extractable text),
    an empty string is returned without calling the model, because generating
    a "summary" of nothing only produces made-up text.

    Args:
        text: The text to summarize.

    Returns:
        The generated summary string, or ``""`` when there is nothing to
        summarize.
    """
    snippet = text[:2048]
    if not snippet.strip():
        return ""
    outputs = summarizer(
        snippet,
        max_length=100,
        min_length=30,
        do_sample=False,  # deterministic decoding
        truncation=True,  # let the tokenizer cut inputs beyond the model limit
    )
    return outputs[0]["summary_text"]


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [5]:
!pip install fastapi sentence-transformers faiss-cpu uvicorn nest_asyncio pyngrok



In [6]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load the embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# FAISS index. The vector size is read from the loaded model (384 for
# all-MiniLM-L6-v2) instead of being hard-coded, so changing the model name
# above cannot leave the index with a mismatched dimension.
dimension = embedding_model.get_sentence_embedding_dimension()
index = faiss.IndexFlatL2(dimension)  # L2 distance metric

# In-memory metadata store. add_document_to_index() appends one entry per
# vector added to `index`, so list position i describes FAISS id i.
# Re-running this cell resets both the index and this list together.
document_db = []

def add_document_to_index(doc_id, text):
    """Embed ``text``, store the vector in the FAISS index and record the document.

    The vector is appended to the module-level ``index`` and a matching
    ``{"id", "text"}`` entry is appended to ``document_db``, keeping the two
    aligned by position.
    """
    vector = embedding_model.encode([text])[0]
    # The index is fed a 2-D batch; wrap the single vector as one row.
    batch = np.array([vector])
    index.add(batch)
    record = {"id": doc_id, "text": text}
    document_db.append(record)

def search_documents(query, top_k=5):
    """Return the stored documents most similar to ``query``.

    Args:
        query: Free-text search query.
        top_k: Maximum number of documents to return.

    Returns:
        A list of ``document_db`` entries (``{"id", "text"}`` dicts) in the
        order FAISS returns them. The list is shorter than ``top_k`` when
        fewer documents are indexed, and empty when the index is empty.
    """
    # Never ask for more neighbours than there are vectors: FAISS pads missing
    # results with id -1, which as a Python list index would silently return
    # the *last* document over and over.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []
    query_embedding = embedding_model.encode([query])[0]
    _distances, indices = index.search(np.array([query_embedding]), k)
    # Defensive filter: skip padding ids and anything outside document_db.
    return [document_db[i] for i in indices[0] if 0 <= i < len(document_db)]


In [7]:
!pip install python-multipart



In [None]:
# from fastapi import FastAPI

# app = FastAPI()

# @app.get("/")
# def read_root():
#     return {"Hello": "World"}

# @app.post("/upload/")
# async def upload_file():
#     return {"message": "File uploaded!"}


In [8]:


# Import necessary modules
from fastapi import FastAPI, UploadFile, File, HTTPException
import os
from PyPDF2 import PdfReader
import nest_asyncio
from pyngrok import ngrok
import uvicorn
import shutil
import uuid

# Create the FastAPI application object; the route decorators below register
# their handlers on it.
app = FastAPI()

# Directory where uploaded files are written. It is created up front
# (exist_ok=True makes re-running this cell harmless).
UPLOAD_DIR = "./uploads"
os.makedirs(UPLOAD_DIR, exist_ok=True)

# def extract_text_from_pdf(file_path: str) -> str:
#     try:
#         reader = PdfReader(file_path)
#         text = ""
#         for page in reader.pages:
#             text += page.extract_text()
#         return text
#     except Exception as e:
#         raise ValueError(f"Error extracting text from PDF: {e}")

@app.post("/upload/")
async def upload_document(file: UploadFile = File(...)):
    """Store an uploaded PDF, then extract, summarize, classify and index its text.

    Returns a JSON object with the generated document id, the original
    filename, the extracted text, its summary and its classification.
    Any failure is reported as HTTP 500 with the error message in `detail`.
    """
    try:
        doc_id = str(uuid.uuid4())
        # file.filename is client-controlled. Keep only its last path component
        # (also for Windows-style separators) so a name such as "../../x" cannot
        # escape UPLOAD_DIR, and prefix the unique doc_id so two uploads with
        # the same name do not overwrite each other.
        safe_name = os.path.basename((file.filename or "").replace("\\", "/")) or "upload.pdf"
        file_location = os.path.join(UPLOAD_DIR, f"{doc_id}_{safe_name}")
        with open(file_location, "wb") as f:
            f.write(await file.read())
        text = extract_text_from_pdf(file_location)
        # Summarize, classify, and embed
        summary = summarize_text(text)
        label, score = classify_text(text)
        add_document_to_index(doc_id, text)

        # Return results
        return {
            "id": doc_id,
            "filename": file.filename,
            "extracted_text": text,
            "summary": summary,
            "classification": {"label": label, "score": score}
        }
    except Exception as e:
        # Chain the original error so the server-side traceback keeps the cause.
        raise HTTPException(status_code=500, detail=f"Error: {e}") from e

# GET /search/?query=... : look up the indexed documents closest to `query`
# via search_documents() and wrap them in a {"results": [...]} payload.
# Any exception is surfaced to the client as an HTTP 500.
@app.get("/search/")
async def search_document(query: str):
    try:
        return {"results": search_documents(query)}
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"Error: {exc}")

# Run FastAPI in notebook
# nest_asyncio.apply() is called before starting uvicorn from inside the
# notebook kernel — presumably so uvicorn can run while the kernel's own
# event loop is already active (confirm against the nest_asyncio docs).
nest_asyncio.apply()

# # Optional: Expose your app to the internet using ngrok
# public_url = ngrok.connect(8000)
# print(f"Public URL: {public_url}")

# NOTE(review): this call keeps the cell busy until the server is stopped (the
# log below shows "Press CTRL+C to quit" followed by a shutdown). The request
# cells further down therefore only reach the API if it is being served
# elsewhere (another kernel/process) at the time they run; otherwise they fail
# with "Connection refused".
uvicorn.run(app, host="127.0.0.1", port=8000)


INFO:     Started server process [1114]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


INFO:     127.0.0.1:49477 - "GET / HTTP/1.1" 404 Not Found
INFO:     127.0.0.1:49477 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO:     127.0.0.1:49477 - "GET /docs HTTP/1.1" 200 OK
INFO:     127.0.0.1:49477 - "GET /openapi.json HTTP/1.1" 200 OK
INFO:     127.0.0.1:49478 - "POST /upload/ HTTP/1.1" 200 OK
INFO:     127.0.0.1:49550 - "POST /upload/ HTTP/1.1" 200 OK
INFO:     127.0.0.1:49563 - "GET /search/?query=pdf-sample HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [1114]


In [None]:
# from fastapi import FastAPI, UploadFile, File
# import uuid
# import shutil

# app = FastAPI()

# @app.get("/")
# def read_root():
#     return {"message": "Welcome to the FastAPI app!"}

# @app.post("/upload/")
# async def upload_document(file: UploadFile = File(...)):
#     """Upload and process a document."""
#     doc_id = str(uuid.uuid4())  # Generate unique ID
#     file_path = f"uploads/{doc_id}_{file.filename}"
    
#     # Save the uploaded file
#     with open(file_path, "wb") as buffer:
#         shutil.copyfileobj(file.file, buffer)
    
#     # Extract text from the document
#     text = extract_text_from_pdf(file_path)
    
#     # Summarize, classify, and embed
#     summary = summarize_text(text)
#     label, score = classify_text(text)
#     add_document_to_index(doc_id, text)
    
#     # Return results
#     return {
#         "id": doc_id,
#         "filename": file.filename,
#         "summary": summary,
#         "classification": {"label": label, "score": score}
#     }

# @app.get("/search/")
# async def search(query: str):
#     """Search for documents based on a query."""
#     results = search_documents(query)
#     return {"query": query, "results": results}

In [None]:
import requests

# Upload a file
# Target the loopback address the server is bound to (127.0.0.1); 0.0.0.0 is a
# bind address, not a portable destination address.
url = "http://127.0.0.1:8000/upload/"
pdf_path = "/Users/HP 1/Desktop/Project1/document_CAS_model/0000/0000009.pdf"
# Open the PDF in a context manager so the handle is closed after the request.
with open(pdf_path, "rb") as pdf_file:
    files = {"file": pdf_file}
    # Generous timeout: the endpoint runs summarization and classification
    # before answering, but the cell should not hang forever if the server dies.
    response = requests.post(url, files=files, timeout=120)
print(response.json())


ConnectionError: HTTPConnectionPool(host='127.0.0.1', port=8000): Max retries exceeded with url: /upload/ (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x3467051f0>: Failed to establish a new connection: [Errno 61] Connection refused'))

In [None]:
# Search for a term
url = "http://127.0.0.1:8000/search/"
params = {"query": "linear regression"}
# A timeout keeps the cell from hanging indefinitely if the server is unresponsive.
response = requests.get(url, params=params, timeout=30)
print(response.json())
