# This agri chat  bot project has 2 steps. Step 1 is chat bot with pretrained model and step 2 is chat bot with fine tuned model.

# STEP 1 chat bot with pretrained model

In [None]:
!pip install torch transformers sentence-transformers faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


## Data Preprocessing (downloading, cleaning, and saving books to JSON)


In [None]:
import requests
import re
import json
from tqdm import tqdm

# List of book URLs 
book_urls = [
    "https://www.gutenberg.org/ebooks/56640.txt.utf-8",
    "https://www.gutenberg.org/ebooks/67813.txt.utf-8",
    "https://www.gutenberg.org/ebooks/20772.txt.utf-8",
    "https://www.gutenberg.org/ebooks/40190.txt.utf-8",
    "https://www.gutenberg.org/ebooks/4924.txt.utf-8",
    "https://www.gutenberg.org/ebooks/4525.txt.utf-8",
    "https://www.gutenberg.org/ebooks/40190.txt.utf-8"
]

# Download books
for i, url in enumerate(book_urls):
    response = requests.get(url)
    with open(f"book_{i+1}.txt", "w", encoding="utf-8") as f:
        f.write(response.text)

print("Books downloaded successfully!")

# Clean Text Function
import re

def clean_text(text):
    # Remove everything before the main content
    text = re.sub(r"(?s)^.*?START OF (THE|THIS) PROJECT GUTENBERG EBOOK.*?\n", "", text)

    # Remove everything after the main content
    text = re.sub(r"(?s)END OF (THE|THIS) PROJECT GUTENBERG EBOOK.*$", "", text)

    # Remove common disclaimers
    text = re.sub(r"(?s)This ebook is for the use of anyone anywhere.*?restrictions whatsoever\.", "", text, flags=re.DOTALL)

    # Remove Distributed Proofreading and Transcriber's Notes
    text = re.sub(r"(?s)Produced by.*?www.pgdp.net.*?\n", "", text)
    text = re.sub(r"(?s)Transcriber’s Note:.*?\n", "", text)
    text = re.sub(r"(?s)Online Distributed Proofreading Team.*?\n", "", text)
    text = re.sub(r"(?s)Transcriber’s Notes.*?\n", "", text)

    # Remove email addresses and contributors
    text = re.sub(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b", "", text)

    # Remove unwanted characters (e.g., unnecessary punctuation or Unicode markers)
    text = text.replace("﻿", "").replace("\r", "").strip()

    return text





# Remove Unwanted Paragraphs
def filter_unwanted_paragraphs(paragraphs):
    unwanted_starts = [
        "the project gutenberg", "this ebook is for the use of", "release date:",
        "language:", "credits:", "produced by", "transcriber’s note:",
        "text printed in", "if you are not located in the united states"
    ]

    print(f"Before filtering: {len(paragraphs)} paragraphs")  # Debugging

    # Only check first 5 words instead of the entire paragraph
    filtered_paragraphs = [
        p for p in paragraphs
        if not any(phrase in p.lower().split()[:5] for phrase in unwanted_starts)
    ]

    print(f"After filtering: {len(filtered_paragraphs)} paragraphs")  # Debugging
    return filtered_paragraphs




# Process Books
processed_books = []
for i in range(1, 7):
    with open(f"book_{i}.txt", "r", encoding="utf-8") as f:
        text = f.read()
        cleaned_text = clean_text(text)
        processed_books.append(cleaned_text)

print("Books cleaned and preprocessed!")

for i, book in enumerate(processed_books):
    print(f"Book {i+1} length after cleaning: {len(book)} characters")


# Split into paragraphs
def split_into_paragraphs(text, min_length=300):
    # Split on two or more newlines (better for Gutenberg books)
    paragraphs = re.split(r"\n{2,}", text)

    # Remove short paragraphs (likely junk or headers)
    paragraphs = [p.strip() for p in paragraphs if len(p) > min_length]

    print(f"Extracted {len(paragraphs)} paragraphs from book")  # Debugging

    # Print first few paragraphs to verify
    for i, p in enumerate(paragraphs[:5]):
        print(f"Paragraph {i+1}: {p[:200]}...")  # Show first 200 characters

    return paragraphs






all_paragraphs = []
for book in processed_books:
    paragraphs = split_into_paragraphs(book)
    filtered_paragraphs = filter_unwanted_paragraphs(paragraphs)
    all_paragraphs.extend(filtered_paragraphs)

print(f"Total meaningful paragraphs extracted: {len(all_paragraphs)}")

# Save to JSON
with open("preprocessed_books.json", "w", encoding="utf-8") as f:
    json.dump({"paragraphs": all_paragraphs}, f, indent=4)

print("Preprocessed data saved!")


Books downloaded successfully!
Books cleaned and preprocessed!
Book 1 length after cleaning: 933369 characters
Book 2 length after cleaning: 460375 characters
Book 3 length after cleaning: 439848 characters
Book 4 length after cleaning: 585653 characters
Book 5 length after cleaning: 452872 characters
Book 6 length after cleaning: 82583 characters
Extracted 1095 paragraphs from book
Paragraph 1: If any of these characters do not display properly—in particular, if
the diacritic does not appear directly above the letter—or if the
apostrophes and quotation marks highlighted in this paragraph app...
Paragraph 2: The Preface to the first edition of this volume, which follows these
few words, will give some idea of the book’s origin. Much of the
material is of only passing importance, and is retained now rather...
Paragraph 3: No one of our readers will be half so curious to know what this book
contains as the author himself. For it is more than twelve years since
these pieces were begun, an

## Dense Passage Retrieval (DPR) for retrieving relevant passages

In [None]:
import json
import torch
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Load preprocessed data
with open("preprocessed_books.json", "r", encoding="utf-8") as f:
    data = json.load(f)

paragraphs = data["paragraphs"]

# Load a fast SentenceTransformer model for retrieval
model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
device = "cuda" if torch.cuda.is_available() else "cpu"
dpr_model = SentenceTransformer(model_name, device=device)

# File paths for embeddings and FAISS index
embedding_file = "paragraph_embeddings.npy"
index_file = "faiss_index.bin"

# Check if precomputed embeddings exist
try:
    paragraph_embeddings_np = np.load(embedding_file)
    index = faiss.read_index(index_file)
    print("✅ Loaded precomputed embeddings and FAISS index.")
except FileNotFoundError:
    print("⚡ Computing embeddings from scratch...")

    # Compute embeddings in batches (avoid memory overload)
    batch_size = 16  # Adjust as needed
    paragraph_embeddings = dpr_model.encode(
        paragraphs, convert_to_numpy=True, batch_size=batch_size, show_progress_bar=True
    )

    # Save embeddings
    paragraph_embeddings_np = np.array(paragraph_embeddings)
    np.save(embedding_file, paragraph_embeddings_np)

    # Build FAISS index (Inner Product for better retrieval)
    index = faiss.IndexFlatIP(paragraph_embeddings_np.shape[1])  # IP = Inner Product
    index.add(paragraph_embeddings_np)
    faiss.write_index(index, index_file)
    print("✅ FAISS index built and saved.")

# Function to retrieve top-k relevant paragraphs
def retrieve_relevant_paragraphs(query, top_k=3):
    query_embedding = dpr_model.encode(query, convert_to_numpy=True).reshape(1, -1)
    scores, indices = index.search(query_embedding, top_k)

    return [(paragraphs[i], scores[0][idx]) for idx, i in enumerate(indices[0])]

# Example query
query = "What is the importance of nitrogen in crop production?"
retrieved_paragraphs = retrieve_relevant_paragraphs(query)

print("\n🔍 Retrieved Paragraphs:")
for i, (para, score) in enumerate(retrieved_paragraphs):
    print(f"{i+1}. {para[:200]}... (Score: {score:.4f})")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.71k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

⚡ Computing embeddings from scratch...


Batches:   0%|          | 0/224 [00:00<?, ?it/s]

✅ FAISS index built and saved.

🔍 Retrieved Paragraphs:
1. In the earlier pages of this book you were told something about the food
of plants. One of the main elements of plant food, perhaps you remember,
is nitrogen. Just as soon as the roots of the legumino... (Score: 26.1981)
2. Practice and experiment have shown that such diminishing fertility
may be retarded or wholly avoided, first, by so working or
cultivating the soil as to set free much of the insoluble plant-food
and, ... (Score: 26.0686)
3. Second, the nitrogen-gathering crops, while helping to feed the stock,
also reduce the fertilizer bills by supplying one of the costly elements
of the fertilizer. The ordinary cotton fertilizer consis... (Score: 25.8705)


## Implement Summarization using Pretrained BART


In [None]:
!pip install torch transformers




In [None]:
import torch
from transformers import BartTokenizer, BartForConditionalGeneration

# Load Faster Summarization Model (BART instead of T5)
device = "cuda" if torch.cuda.is_available() else "cpu"
summarization_model = BartForConditionalGeneration.from_pretrained("facebook/bart-base").to(device)
summarization_model.half()  # Enable FP16 for faster execution
summarization_tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

# Optimized Summarization Function
def summarize_text(text, max_length=100):
    input_text = "summarize: " + text
    input_ids = summarization_tokenizer.encode(
        input_text, return_tensors="pt", max_length=512, truncation=True
    ).to(device)

    summary_ids = summarization_model.generate(
        input_ids,
        max_length=max_length,
        min_length=30,
        do_sample=False,  # Greedy decoding for speed
        num_beams=1  # No beam search (faster)
    )

    summary = summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Example usage: Retrieve a paragraph and summarize it
query = "soil fertility management"
retrieved_paragraphs = retrieve_relevant_paragraphs(query, top_k=2)  # Reduce retrieved text

if retrieved_paragraphs:
    sample_text = retrieved_paragraphs[0][0]  # Extract only the text (not the score)
    summary = summarize_text(sample_text)
    print("\n🔍 Original Text:", sample_text[:300], "...")  # Print first 300 chars
    print("\n📝 Generated Summary:", summary)
else:
    print("No relevant paragraphs found.")




config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]




🔍 Original Text: Soil chemists have generally attempted to arrive at a determination
of the fertility of soil by treating a carefully selected and
prepared sample with a certain amount of acid of definite strength.
The portion which dissolves under the influence of acids has been
looked upon as a rough measure of th ...

📝 Generated Summary: summarize: Soil chemists have generally attempted to arrive at a determinationof the fertility of soil by treating a carefully selected and well-prepared sample with a certain amount of acid of definite strength.The portion which dissolves under the influence of acids has been used in the laboratory to obtain a determination of the fertilityof soil. This islooked upon as a rough measure of the possible fertility of thesoil.


## Implement Extractive Question Answering using Pretrained BERT-QA

In [None]:
!pip install torch transformers




In [None]:
import torch
from transformers import BertTokenizer, BertForQuestionAnswering

# Load Pretrained BERT-QA Model on GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
qa_model = BertForQuestionAnswering.from_pretrained(
    "bert-large-uncased-whole-word-masking-finetuned-squad"
).to(device)
qa_tokenizer = BertTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

# Function for Question Answering
def answer_question(question, context):
    inputs = qa_tokenizer(
        question, context, return_tensors="pt", max_length=512, truncation=True
    ).to(device)

    with torch.no_grad():
        outputs = qa_model(**inputs)

    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1

    answer = qa_tokenizer.convert_tokens_to_string(
        qa_tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end])
    )

    return answer

# Example usage: Retrieve a paragraph and answer a question
query = "good"
retrieved_paragraphs = retrieve_relevant_paragraphs(query, top_k=1)

if retrieved_paragraphs:
    context = retrieved_paragraphs[0][0]  # Extract only the text (not the score)
    question = "What is the importance of good farming?"

    answer = answer_question(question, context)

    print("\n🔍 Context:", context[:300], "...")  # Print first 300 chars
    print("\n❓ Question:", question)
    print("\n📝 Extracted Answer:", answer)
else:
    print("No relevant paragraphs found.")



config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]


🔍 Context: Prosperous farming requires knowledge, tact at managing men, skill in
laying out work, incessant industry, very close calculations, good
judgment in buying, and a good capacity of selling. In short, the
qualities which go to make up a good merchant, a good manufacturer,
and a good scientist ought to ...

❓ Question: What is the importance of good farming?

📝 Extracted Answer: knowledge , tact at managing men , skill in laying out work , incessant industry , very close calculations , good judgment in buying , and a good capacity of selling


## Implement a Gradio-based chatbot interface that can intelligently switch between summarization and QA

In [None]:
!pip install gradio


Collecting gradio
  Downloading gradio-5.20.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.11-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.7.2 (from gradio)
  Downloading gradio_client-1.7.2-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3

In [None]:
import gradio as gr

# Chatbot Function
def chatbot(user_input):
    user_input = user_input.strip().lower()

    # Summarization Trigger Keywords
    summarization_keywords = ["summarize", "tell me about", "describe"]

    # Question Answering Trigger Keywords
    question_keywords = ["what", "how", "explain", "why", "when", "where", "who"]

    # Check if the input is a summarization request
    if any(user_input.startswith(keyword) for keyword in summarization_keywords):
        topic = " ".join(user_input.split(" ")[1:])  # Extract topic after keyword
        retrieved_paragraphs = retrieve_relevant_paragraphs(topic, top_k=3)

        if retrieved_paragraphs:
            context = " ".join([para[0] for para in retrieved_paragraphs])  # Extract text only
            summary = summarize_text(context)
            return f"**Summary:** {summary}"
        else:
            return "❌ Sorry, I couldn't find relevant information to summarize."

    # Check if the input is a question
    elif any(user_input.startswith(keyword) for keyword in question_keywords):
        retrieved_paragraphs = retrieve_relevant_paragraphs(user_input, top_k=1)

        if retrieved_paragraphs:
            context = retrieved_paragraphs[0][0]  # Extract only the text (not the score)
            answer = answer_question(user_input, context)
            return f"**Answer:** {answer}"
        else:
            return "❌ Sorry, I couldn't find an answer to your question."

    # Default Response
    else:
        return "🤔 I'm not sure what you're asking. Please ask a question or request a summary."

# Launch Gradio Interface
gr.Interface(
    fn=chatbot,
    inputs="text",
    outputs="text",
    title="🌱 Agriculture Chatbot",
    description=(
        "🔹 **Ask agricultural questions or request topic summaries!**\n\n"
        "**Usage Guide:**\n"
        "✅ **To get a summary**, start with: `summarize`, `tell me about`, or `describe`.\n"
        "✅ **To ask a question**, start with: `what`, `how`, `explain`, `why`, `when`, `where`, `who`."
    )
).launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://2e8827fc71779bd3b8.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




# STEP 2:  chat bot with fine tuned model

## Now, let's fine-tune a BART model on our preprocessed agricultural books for better summarization.

In [None]:
!pip install torch transformers datasets sentencepiece accelerate


Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting 

### Load the Preprocessed Data

In [None]:
import json

# Load preprocessed paragraphs
with open("preprocessed_books.json", "r", encoding="utf-8") as f:
    data = json.load(f)

paragraphs = data["paragraphs"]

# Example: Create artificial summaries (replace this with human-annotated summaries if available)
summaries = ["This passage discusses " + p[:100] + "..." for p in paragraphs]  # Dummy summaries


### Prepare Data for Fine-Tuning

In [None]:
from datasets import Dataset

# Create dataset dictionary
train_data = {
    "document": paragraphs,
    "summary": summaries
}

# Convert to Hugging Face Dataset
dataset = Dataset.from_dict(train_data)

# Split dataset (80% train, 20% validation)
dataset = dataset.train_test_split(test_size=0.2)

# Show example
dataset["train"][0]


{'document': 'We have said that bunt is not peculiar to any climate; we have, however,\nalways observed that employing seed from a warm district on a cold one,\nor using the finer white wheats in cold, exposed, or ill-drained\nsituations, is sure to produce a large quantity of this fungus.\nAutumn-sown wheat, too, is less liable to the infection than spring\nwheat, which we attribute to the fact that many of the weaker plants\nwill succumb to the cold rain and frost.',
 'summary': 'This passage discusses We have said that bunt is not peculiar to any climate; we have, however,\nalways observed that employ...'}

In [None]:
from transformers import BartTokenizer

# Load tokenizer
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

# Tokenization function
def tokenize_function(examples):
    inputs = tokenizer(examples["document"], max_length=512, truncation=True, padding="max_length")
    targets = tokenizer(examples["summary"], max_length=128, truncation=True, padding="max_length")
    inputs["labels"] = targets["input_ids"]
    return inputs

# Apply tokenization
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["document", "summary"])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Map:   0%|          | 0/2863 [00:00<?, ? examples/s]

Map:   0%|          | 0/716 [00:00<?, ? examples/s]

### Fine-Tune the Model

In [None]:
from transformers import BartForConditionalGeneration, Trainer, TrainingArguments

# Load model
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

# Training arguments
training_args = TrainingArguments(
    output_dir="./bart-finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,  # Adjust for Colab RAM
    per_device_eval_batch_size=4,
    num_train_epochs=3,  # Increase if needed
    weight_decay=0.01,
    save_total_limit=2,
    push_to_hub=False,
    report_to="none",
# Disable if not uploading to HF Hub
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# Start fine-tuning
trainer.train()


model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss
1,1.134,0.058898
2,0.0553,0.046347
3,0.0378,0.046442




TrainOutput(global_step=2148, training_loss=0.2988731474849765, metrics={'train_runtime': 1035.0268, 'train_samples_per_second': 8.298, 'train_steps_per_second': 2.075, 'total_flos': 2618513219911680.0, 'train_loss': 0.2988731474849765, 'epoch': 3.0})

In [None]:
model.save_pretrained("bart-finetuned-agriculture")
tokenizer.save_pretrained("bart-finetuned-agriculture")


('bart-finetuned-agriculture/tokenizer_config.json',
 'bart-finetuned-agriculture/special_tokens_map.json',
 'bart-finetuned-agriculture/vocab.json',
 'bart-finetuned-agriculture/merges.txt',
 'bart-finetuned-agriculture/added_tokens.json')

### fine-tuning BERT for question answering

In [None]:
!pip install torch transformers datasets accelerate




In [None]:
import json

# Load preprocessed paragraphs
with open("preprocessed_books.json", "r", encoding="utf-8") as f:
    data = json.load(f)

paragraphs = data["paragraphs"]

# Example: Create synthetic QA pairs (replace this with real annotations)
qa_data = [
    {
        "context": p,
        "question": "What is this passage about?",
        "answer": p[:50],  # Dummy answer (first 50 chars)
        "answer_start": 0
    }
    for p in paragraphs[:500]  # Use first 500 paragraphs for QA
]


In [None]:
from datasets import Dataset

# Create dataset dictionary
train_data = {
    "context": [d["context"] for d in qa_data],
    "question": [d["question"] for d in qa_data],
    "answers": [{"text": [d["answer"]], "answer_start": [d["answer_start"]]} for d in qa_data]
}

# Convert to Hugging Face Dataset
dataset = Dataset.from_dict(train_data)

# Split dataset (80% train, 20% validation)
dataset = dataset.train_test_split(test_size=0.2)

# Show example
dataset["train"][0]


{'context': 'Leaf-manure has always been held in high esteem by gardeners. But many\nregard it as a purely _vegetable substance_; whereas, it is the best\nmineral manure that can be applied to the soil. What are called\nvegetable loams (not peat soils, made up principally of decomposed\n_roots_), contain large quantities of earthy matter, being\nmineral-vegetable, rather than vegetable soils.',
 'question': 'What is this passage about?',
 'answers': {'answer_start': [0],
  'text': ['Leaf-manure has always been held in high esteem by']}}

In [None]:
from transformers import BertTokenizerFast

# Load tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Tokenization function for QA
def tokenize_function(examples):
    tokenized_inputs = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=512,
        padding="max_length",
        return_offsets_mapping=True,
    )

    start_positions = []
    end_positions = []

    for i, (offset, answer) in enumerate(zip(tokenized_inputs["offset_mapping"], examples["answers"])):
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        sequence_ids = tokenized_inputs.sequence_ids(i)
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

        start_pos = end_pos = 0
        for j in range(context_start, context_end):
            if offset[j][0] <= start_char and offset[j][1] >= start_char:
                start_pos = j
            if offset[j][0] <= end_char and offset[j][1] >= end_char:
                end_pos = j
                break

        start_positions.append(start_pos)
        end_positions.append(end_pos)

    tokenized_inputs["start_positions"] = start_positions
    tokenized_inputs["end_positions"] = end_positions
    tokenized_inputs.pop("offset_mapping")

    return tokenized_inputs

# Apply tokenization
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["context", "question", "answers"])


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
from transformers import BertForQuestionAnswering, Trainer, TrainingArguments

# Load model
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")

# Training arguments
training_args = TrainingArguments(
    output_dir="./bert-qa-finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,  # Adjust for Colab RAM
    per_device_eval_batch_size=4,
    num_train_epochs=3,  # Increase if needed
    weight_decay=0.01,
    save_total_limit=2,
    push_to_hub=False,
    report_to="none", # Disable if not uploading to HF Hub
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# Start fine-tuning
trainer.train()


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,1.176739
2,No log,1.047927
3,No log,1.040901


TrainOutput(global_step=300, training_loss=1.5950960286458333, metrics={'train_runtime': 244.7117, 'train_samples_per_second': 4.904, 'train_steps_per_second': 1.226, 'total_flos': 313556108083200.0, 'train_loss': 1.5950960286458333, 'epoch': 3.0})

In [None]:
model.save_pretrained("bert-qa-finetuned-agriculture")
tokenizer.save_pretrained("bert-qa-finetuned-agriculture")


('bert-qa-finetuned-agriculture/tokenizer_config.json',
 'bert-qa-finetuned-agriculture/special_tokens_map.json',
 'bert-qa-finetuned-agriculture/vocab.txt',
 'bert-qa-finetuned-agriculture/added_tokens.json',
 'bert-qa-finetuned-agriculture/tokenizer.json')

In [None]:
!pip install gradio
import json
import faiss
import torch
import gradio as gr
from sentence_transformers import SentenceTransformer
from transformers import BartForConditionalGeneration, BartTokenizer, BertForQuestionAnswering, BertTokenizer

# Load preprocessed agricultural data
with open("preprocessed_books.json", "r", encoding="utf-8") as f:
    data = json.load(f)

paragraphs = data["paragraphs"]

# Load FAISS index for retrieval
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embeddings = embedding_model.encode(paragraphs, convert_to_numpy=True)
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

Collecting gradio
  Downloading gradio-5.20.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.11-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.7.2 (from gradio)
  Downloading gradio_client-1.7.2-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
def retrieve_relevant_paragraphs(query, top_k=1):
    query_embedding = embedding_model.encode([query])
    _, retrieved_indices = index.search(query_embedding, top_k)
    return [paragraphs[i] for i in retrieved_indices[0]]

# Load fine-tuned models
device = "cuda" if torch.cuda.is_available() else "cpu"

summarization_model = BartForConditionalGeneration.from_pretrained("bart-finetuned-agriculture").to(device)
summarization_tokenizer = BartTokenizer.from_pretrained("bart-finetuned-agriculture")

qa_model = BertForQuestionAnswering.from_pretrained("bert-qa-finetuned-agriculture").to(device)
qa_tokenizer = BertTokenizer.from_pretrained("bert-qa-finetuned-agriculture")

In [None]:
def summarize_text(text, max_length=100):
    input_text = "summarize: " + text
    input_ids = summarization_tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
    summary_ids = summarization_model.generate(input_ids, max_length=max_length, min_length=30, do_sample=False, num_beams=1)
    return summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [None]:
def answer_question(question, context):
    inputs = qa_tokenizer(question, context, return_tensors="pt", max_length=384, truncation=True).to(device)
    outputs = qa_model(**inputs)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1
    answer = qa_tokenizer.convert_tokens_to_string(qa_tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))
    return answer

In [None]:
def chatbot(user_input):
    user_input = user_input.strip().lower()
    summarization_keywords = ["summarize", "tell me about", "describe"]
    question_keywords = ["what", "how", "explain", "why", "when", "where", "who"]

    if any(user_input.startswith(keyword) for keyword in summarization_keywords):
        topic = " ".join(user_input.split(" ")[1:])
        retrieved_paragraphs = retrieve_relevant_paragraphs(topic, top_k=1)
        if retrieved_paragraphs:
            summary = summarize_text(retrieved_paragraphs[0])
            return f"**Summary:** {summary}"
        return "❌ No relevant information found."

    elif any(user_input.startswith(keyword) for keyword in question_keywords):
        retrieved_paragraphs = retrieve_relevant_paragraphs(user_input, top_k=1)
        if retrieved_paragraphs:
            answer = answer_question(user_input, retrieved_paragraphs[0])
            return f"**Answer:** {answer}"
        return "❌ No answer found."

    return "🤔 Please ask a question or request a summary."

gr.Interface(
    fn=chatbot,
    inputs="text",
    outputs="text",
    title="🌱 Agriculture Chatbot",
    description=(
        "🔹 **Ask agricultural questions or request topic summaries!**\n\n"
        "**Usage Guide:**\n"
        "✅ **To get a summary**, start with: `summarize`, `tell me about`, or `describe`.\n"
        "✅ **To ask a question**, start with: `what`, `how`, `explain`, `why`, `when`, `where`, `who`."
    )
).launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://0dfd8bee4a1f592f60.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


