# Install Dependencies

In [1]:
# Cell 1: Install Dependencies

!pip install -q -U \
    transformers \
    accelerate \
    bitsandbytes \
    langchain-community \
    langchain-huggingface \
    langchain-text-splitters \
    sentence-transformers \
    faiss-cpu \
    pymupdf

print("Installation complete.")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.3/10.3 MB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m44.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m458.9/458.9 kB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m60.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.9/24.9 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.5/66.5 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Import Libraries & Check Device

In [2]:
import os
import torch
import fitz  # PyMuPDF
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from google.colab import files

# Check Device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using Device: {device.upper()}")


Using Device: CUDA


# PDF Processing Function

In [4]:
def process_pdf(pdf_path):
    print(f" Processing: {pdf_path}...")

    doc = fitz.open(pdf_path)
    text = ""

    for page in doc:
        text += page.get_text()

    # Chunking
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n", "\n", ".", " ", ""]
    )

    chunks = text_splitter.split_text(text)

    print(f" Created {len(chunks)} chunks.")
    return chunks


# Vector Store Creation Function

In [5]:

def create_index(chunks):
    # Initialize Embeddings
    print("Loading Embedding Model (BGE-Base)...")
    embedding_model = HuggingFaceEmbeddings(
        model_name="BAAI/bge-base-en-v1.5",
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': True}
    )
    # Create Vector Store
    vector_store = FAISS.from_texts(chunks, embedding_model)
    print(" Vector Store Ready.")
    return vector_store

# Upload PDF

In [9]:
print("Please upload your Vehicle Manual PDF now:")
uploaded = files.upload()

if uploaded:
    pdf_filename = list(uploaded.keys())[0]
    print(f"\n🚀 Starting pipeline for: {pdf_filename}")
 # 1. Chunk the PDF
    chunks = process_pdf(pdf_filename)

    # 2. Create the Index (Defines vector_store globally)
    vector_store = create_index(chunks)

    print(" SUCCESS: 'vector_store' is now defined! You can run Cell 3 and 4.")
else:
    print(" No file uploaded. Please run this cell again and upload a file.")

Please upload your Vehicle Manual PDF now:


Saving sample-service-manual 1.pdf to sample-service-manual 1 (1).pdf

🚀 Starting pipeline for: sample-service-manual 1 (1).pdf
 Processing: sample-service-manual 1 (1).pdf...
 Created 1069 chunks.
Loading Embedding Model (BGE-Base)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: BAAI/bge-base-en-v1.5
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

 Vector Store Ready.
 SUCCESS: 'vector_store' is now defined! You can run Cell 3 and 4.


## Load the LLM (Mistral)

In [10]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from langchain_huggingface import HuggingFacePipeline

# Config for 4-bit quantization (Fits in Colab T4)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_id = "mistralai/Mistral-7B-Instruct-v0.3"

print(f" Loading {model_id}...")
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)

# Create Pipeline
text_generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=False,
    temperature=0.1,
    return_full_text=False
)

llm = HuggingFacePipeline(pipeline=text_generation_pipeline)
print("LLM Loaded Successfully.")

 Loading mistralai/Mistral-7B-Instruct-v0.3...


config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/291 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Passing `generation_config` together with generation-related arguments=({'do_sample', 'temperature', 'max_new_tokens'}) is deprecated and will be removed in future versions. Please pass either a `generation_config` object OR all generation parameters explicitly, but not both.


LLM Loaded Successfully.


# Run the Extraction (RAG)

In [11]:
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
import json
import csv

# Prompt
prompt_template = """
You are an expert automotive data extractor.
Use the Context below to answer the Query.

Context:
{context}

Query: {question}

Instructions:
1. Extract the specific TECHNICAL VALUE and UNIT.
2. Output ONLY a valid JSON object. Do not add intro/outro text.
3. Format: {{"component": "Component Name", "spec_type": "Torque/Capacity/Gap", "value": "Number", "unit": "Nm/Liters/mm"}}
4. If info is missing, return {{"error": "not found"}}.

JSON Output:
"""

PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

# Build the Chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(search_kwargs={"k": 5}),
    chain_type_kwargs={"prompt": PROMPT}
)

# Define queries to run
queries = [
    "What is the torque for the brake caliper bolts?",
    "What is the engine oil capacity?",
    "What is the spark plug gap?",
    "Wheel speed sensor bolt"
]

# Initialize a list to hold data for the next steps
extracted_data = []

# Result



In [12]:
print("--- Starting Extraction (JSON Mode) ---")

for query in queries:
    print(f"Query: {query}")
    try:

        res = qa_chain.invoke(query)


        json_str = res['result'].replace("```json", "").replace("```", "").strip()


        print(f"Extracted JSON: {json_str}\n")

        data_dict = json.loads(json_str)


        data_dict["original_query"] = query


        extracted_data.append(data_dict)

    except json.JSONDecodeError:
        print("  Error: LLM response was not valid JSON.")
    except Exception as e:
        print(f" Error: {e}")

print(f"Processed {len(queries)} queries.")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Starting Extraction (JSON Mode) ---
Query: What is the torque for the brake caliper bolts?


Both `max_new_tokens` (=512) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Both `max_new_tokens` (=512) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Extracted JSON: {
"component": "Brake caliper guide pin bolts",
"spec_type": "Torque",
"value": "33",
"unit": "Nm"
}
{
"component": "Brake caliper flow bolt",
"spec_type": "Torque",
"value": "35",
"unit": "Nm"
}
{
"component": "Brake caliper bleeder screw",
"spec_type": "Torque",
"value": "10",
"unit": "Nm"
}
{
"component": "Brake caliper support bracket bolts",
"spec_type": "Torque",
"value": "150",
"unit": "Nm"
}
{
"component": "Brake flexible hose bracket-to-axle bolt",
"spec_type": "Torque",
"value": "30",
"unit": "Nm"
}
{
"component": "Brake flexible hose bracket-to-frame bolt",
"spec_type": "Torque",
"value": "17",
"unit": "Nm"
}

  Error: LLM response was not valid JSON.
Query: What is the engine oil capacity?


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Both `max_new_tokens` (=512) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Extracted JSON: {"component": "Engine Oil Capacity", "spec_type": "Capacity", "value": "Not Found", "unit": "Liters"}

Query: What is the spark plug gap?


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Both `max_new_tokens` (=512) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Extracted JSON: {"error": "not found"}

Query: Wheel speed sensor bolt
Extracted JSON: {"component": "Wheel speed sensor bolt", "spec_type": "Torque", "value": "15", "unit": "Nm"}

Processed 4 queries.
