## NOVA IS IMPLEMENTED WITH DOCUMENT INTELLIGENCE

In [1]:
# Import necessary libraries
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import PDFPlumberLoader # Load the document
from langchain.text_splitter import RecursiveCharacterTextSplitter # split the document
from langchain_ollama import OllamaEmbeddings  # Embed the documents
from langchain_core.vectorstores import InMemoryVectorStore # store the data
from langchain_groq import ChatGroq

from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain # For combining all the documents
from langchain.chains import create_retrieval_chain
from langchain_core.output_parsers import StrOutputParser
# Load variables from a .env file (if one is found) into the process environment,
# so the os.getenv(...) lookups for the API keys further down can see them.
load_dotenv()

True

In [2]:
# Print the current working directory (`!` hands the command to the system shell)
!pwd

/Users/sumi/Desktop/NOVA/Notebooks


In [3]:
# Move from the Notebooks/ folder up to the project root.
# The guard makes this cell idempotent: an unconditional os.chdir('../') climbs one
# more level every time the cell is re-run (or when the kernel was already started
# at the project root), which silently breaks any path resolved from the cwd.
if os.path.basename(os.getcwd()) == "Notebooks":
    os.chdir("..")

# Pure-Python replacement for `!pwd`: works on every platform and shows the kernel's cwd.
os.getcwd()

/Users/sumi/Desktop/NOVA


In [4]:
# Read the service credentials from the process environment.
# Either value is None when the corresponding variable is not set.
groq_api_key = os.environ.get("GROQ_API_KEY")
tavily_api_key = os.environ.get("TAVILY_API_KEY")

In [5]:
# Load the document.
# The PDF is resolved against the current working directory (the project root once
# the earlier os.chdir cell has run) rather than a hard-coded, machine-specific
# absolute path, so the notebook also runs on other checkouts of the project.
pdf_path = os.path.join(os.getcwd(), "HSI-project-document.pdf")
loader = PDFPlumberLoader(pdf_path)
raw_docs = loader.load()

In [6]:
# Split the document into overlapping chunks.
CHUNK_SIZE = 1000     # upper bound on the length of one chunk
CHUNK_OVERLAP = 200   # amount of text repeated between neighbouring chunks

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    add_start_index=True,  # stores each chunk's offset as 'start_index' in its metadata
)

document_chunks = text_splitter.split_documents(raw_docs)

# Show only a short preview instead of dumping every chunk into the cell output.
document_chunks[:2]

[Document(metadata={'source': '/Users/sumi/Desktop/NOVA/HSI-project-document.pdf', 'file_path': '/Users/sumi/Desktop/NOVA/HSI-project-document.pdf', 'page': 0, 'total_pages': 3, 'Title': 'HSI-project-document', 'Producer': 'Skia/PDF m134 Google Docs Renderer', 'start_index': 0}, page_content='HyperSpectral Image Classification\nProject overview: A HyperSpectral Image(HSI) is a high-dimensionality\ndataset that captures images across numerous spectral bands, beyond the\nvisible spectrum. Unlike RGB images (3 bands), HSI consists of hundreds\nof contiguous bands spanning in ultraviolet, visible, infrared and beyond\nregions.\nThis High-dimensional nature of HSI data makes it challenging for\ntraditional machine learning models to capture intricate spectral-spatial\nrelationships. Hence we propose deep learning based approaches to\naccurately classify HSI data.'),
 Document(metadata={'source': '/Users/sumi/Desktop/NOVA/HSI-project-document.pdf', 'file_path': '/Users/sumi/Desktop/NOVA/HSI-

In [7]:
# Embed every chunk and keep the vectors in an in-memory store.
# NOTE(review): "deepseek-r1:1.5b" looks like a chat/reasoning model rather than a
# dedicated embedding model - confirm this choice is intentional.
EMBEDDING_MODEL = OllamaEmbeddings(model="deepseek-r1:1.5b")

# Any DB like FAISS, CHROMA, ASTRA can be used in place of the in-memory store.
vector_store = InMemoryVectorStore.from_documents(
    documents=document_chunks,
    embedding=EMBEDDING_MODEL,
)
vector_store

<langchain_core.vectorstores.in_memory.InMemoryVectorStore at 0x10d37bdc0>

In [8]:
# Ask the vector store directly for the four chunks closest to the question
query = "what is the major problem"
result = vector_store.similarity_search(query, k=4)

# Display the text of the best-ranked chunk
top_chunk = result[0]
top_chunk.page_content

'● After transformation, the model_trainer step needs to be executed\nagain to fine-tune the classification model using the transformed data.\nData Ingestion --> Model Training (AutoEncoder) --> Data Transformation\n--> Model Training (AutoEncoder+Classifier) --> Prediction\nProject structure:\nHyperSpec-AI\n│-- config/ # Configuration files (e.g., paths, parameters)\n│ └── config.yaml\n│-- DATA/ # Dataset storage\n│-- Notebooks/ # Jupyter Notebooks for research & experimentation\n│-- src/\n│ ├── components/ # Core components of the pipeline\n│ │ ├── data_ingestion.py # Handles dataset loading\n│ │ ├── data_transformation.py # Applies transformations (e.g.,\nAutoEncoder)\n│ │ ├── model_trainer.py # Trains models (CNN, AutoEncoder,\nClassifier)\n│ ├── models/ # Model architectures\n│ │ ├── cnn_model.py # Convolutional Neural Network\n│ │ ├── autoencoder_classifier_head.py # AutoEncoder + Classifier\n│ ├── pipeline/ # End-to-end pipeline execution\n│ │ ├── training_pipeline.py # Full tra

In [9]:
# Wrap the vector store as a retriever and run the same `query` through it
retriever = vector_store.as_retriever()
result = retriever.invoke(query)

# Display the text of the first retrieved chunk
first_hit = result[0]
first_hit.page_content

'● After transformation, the model_trainer step needs to be executed\nagain to fine-tune the classification model using the transformed data.\nData Ingestion --> Model Training (AutoEncoder) --> Data Transformation\n--> Model Training (AutoEncoder+Classifier) --> Prediction\nProject structure:\nHyperSpec-AI\n│-- config/ # Configuration files (e.g., paths, parameters)\n│ └── config.yaml\n│-- DATA/ # Dataset storage\n│-- Notebooks/ # Jupyter Notebooks for research & experimentation\n│-- src/\n│ ├── components/ # Core components of the pipeline\n│ │ ├── data_ingestion.py # Handles dataset loading\n│ │ ├── data_transformation.py # Applies transformations (e.g.,\nAutoEncoder)\n│ │ ├── model_trainer.py # Trains models (CNN, AutoEncoder,\nClassifier)\n│ ├── models/ # Model architectures\n│ │ ├── cnn_model.py # Convolutional Neural Network\n│ │ ├── autoencoder_classifier_head.py # AutoEncoder + Classifier\n│ ├── pipeline/ # End-to-end pipeline execution\n│ │ ├── training_pipeline.py # Full tra

In [10]:
# Set up the Groq-hosted chat model
llm_engine = ChatGroq(
    model="Deepseek-R1-Distill-Qwen-32b",
    groq_api_key=groq_api_key,
)

# Smoke test: send one sample question and display the raw message that comes back
result = llm_engine.invoke("what is generative AI")
result

AIMessage(content='<think>\n\n</think>\n\nGenerative AI refers to a type of artificial intelligence that can create or generate new content, such as text, images, music, or even code. Unlike traditional AI, which is typically designed to perform specific tasks or make predictions, generative AI models are capable of producing outputs that are similar to those created by humans.\n\n### Key Characteristics of Generative AI:\n1. **Content Creation**: Generative AI can produce new content, such as writing articles, composing music, or generating images.\n2. **Learning from Data**: These models are trained on large datasets to understand patterns, styles, and structures.\n3. **Creativity**: They can combine existing information in novel ways to create something unique.\n4. **Applications**: Generative AI is used in various fields, including art, entertainment, education, and business.\n\n### Examples of Generative AI:\n- **Text Generation**: Chatbots, automated essay writers, and language t

In [13]:
# Sample chain: chat prompt template -> LLM -> plain-string output
chat_messages = [
    ("system", "You are an Expert AI Engineer. Provide me the answer based on the questions"),
    ("user", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(chat_messages)

output_parser = StrOutputParser()

# .pipe(...) composes the same sequence as `prompt | llm_engine | output_parser`
chain = prompt.pipe(llm_engine, output_parser)

response = chain.invoke({"input": "can you tell me about LangSmith"})
response

'<think>\nOkay, so I\'m trying to understand what LangSmith is. I know it\'s related to machine learning and AI, but beyond that, I\'m a bit fuzzy. Let me start by breaking down the name: "Lang" probably stands for language, and "Smith" is a common last name, but in tech, sometimes it\'s used as a tool or framework name. Maybe it\'s a tool for working with languages or something related.\n\nI remember that there\'s a company called Hugging Face, which is known for their work in natural language processing (NLP). They have tools like Hugging Face Transformers, which are used for training and deploying NLP models. Maybe LangSmith is related to them? Or perhaps it\'s an open-source project.\n\nI think I\'ve heard of LangSmith in the context of creating and managing machine learning pipelines. Pipelines are sequences of data processing steps, right? So LangSmith might help in setting up these pipelines for different tasks, maybe not just NLP but other types of models too.\n\nWait, the user

In [None]:
# Combine the document chain and the retriever chain so the LLM answers with
# the retrieved document chunks as context.
#
# The template must contain BOTH placeholders:
#   {context} - filled with the retrieved documents by the stuff-documents chain
#   {input}   - the user's question, forwarded by the retrieval chain
# Without {input} the question is used only for retrieval and never reaches the
# LLM, so the model has to guess what is being asked.
prompt = ChatPromptTemplate.from_template(
    """
    Answer the following question based on the provided context:
    <context>
    {context}
    </context>

    Question: {input}
    """
)

document_chain = create_stuff_documents_chain(llm_engine, prompt)

# Create the retriever chain: retrieve chunks for the input, then pass them to document_chain
retrieval_chain = create_retrieval_chain(retriever, document_chain)

# Get the response; the generated text is stored under the "answer" key
response = retrieval_chain.invoke({"input": "what is the major problem faced in the project"})
response["answer"]

"<think>\nOkay, I need to answer the user's questions based on the provided context. Let me read through the context again to make sure I understand everything.\n\nThe project is about hyperpectral image classification using deep learning. They have a pipeline that includes data ingestion, model training with an AutoEncoder, data transformation using that AutoEncoder, further model training with both the AutoEncoder and a classifier, and finally prediction.\n\nThe structure of the project includes several files and folders, like config, DATA, Notebooks, src, and utils. The src folder has components for data ingestion, transformation, and model training, as well as models and pipeline scripts.\n\nThe problem they're facing is a circular dependency. The model_trainer step needs to run before data_transformation because the AutoEncoder has to be trained to transform the data. But in their current setup, the model_trainer is part of the pipeline, and the data_transformation is a separate s