In [2]:
pip install transformers[torch]

Defaulting to user installation because normal site-packages is not writeable
Collecting accelerate>=0.26.0 (from transformers[torch])
  Downloading accelerate-1.10.1-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.10.1-py3-none-any.whl (374 kB)
Installing collected packages: accelerate
Successfully installed accelerate-1.10.1
Note: you may need to restart the kernel to use updated packages.




In [None]:
from datasets import load_dataset
from transformers import (
    DataCollatorWithPadding,
    DistilBertForSequenceClassification,
    DistilBertTokenizerFast,
    Trainer,
    TrainingArguments,
)

# Load data
# NOTE(review): this path is machine-specific; point it at your copy of faq.csv.
dataset = load_dataset("csv", data_files={"train": r"C:\Users\Lenovo\Ai_farmer_query_based\data\faq.csv"})

# Rows with a missing question or answer can be neither tokenized nor labelled.
dataset = dataset.filter(
    lambda row: row["questions"] is not None and row["answers"] is not None
)

# Label vocabulary: every distinct answer text becomes one class id.
# The classifier needs integer labels; the raw answer strings cannot be turned
# into tensors, and counting classes on a torch-formatted column counts tensor
# objects rather than distinct values. Sorting keeps the ids stable across runs.
answer_texts = sorted(set(dataset["train"]["answers"]))
label2id = {text: idx for idx, text in enumerate(answer_texts)}
id2label = {idx: text for text, idx in label2id.items()}
num_labels = len(label2id)

# Tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


def tokenize(batch):
    """Tokenize a batch of questions and attach the integer class labels.

    Args:
        batch: Mapping with a "questions" list and an "answers" list of equal length.

    Returns:
        The tokenizer output (unpadded, truncated to the model maximum) with an
        added "labels" entry holding the class id for each answer.
    """
    # No padding here: padding inside a batched map pads to the longest item of
    # each map chunk, not of each training batch. The collator pads per batch.
    encoded = tokenizer(batch["questions"], truncation=True)
    encoded["labels"] = [label2id[answer] for answer in batch["answers"]]
    return encoded


# Drop the raw text columns so only model inputs reach the collator.
dataset = dataset.map(
    tokenize,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

# Model
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    # Stored in the model config so predictions can be mapped back to answers.
    id2label=id2label,
    label2id=label2id,
)

# Training
training_args = TrainingArguments(
    output_dir="src/models/intent_model",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    num_train_epochs=2
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    # Pads each training batch to its own longest sequence and builds tensors.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

trainer.train()


In [7]:
pip install sentencepiece

Defaulting to user installation because normal site-packages is not writeable
Collecting sentencepiece
  Downloading sentencepiece-0.2.1-cp312-cp312-win_amd64.whl.metadata (10 kB)
Downloading sentencepiece-0.2.1-cp312-cp312-win_amd64.whl (1.1 MB)
   ---------------------------------------- 0.0/1.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.1 MB ? eta -:--:--
   --------- ------------------------------ 0.3/1.1 MB ? eta -:--:--
   --------- ------------------------------ 0.3/1.1 MB ? eta -:--:--
   ------------------- -------------------- 0.5/1.1 MB 560.1 kB/s eta 0:00:01
   ------------------- -------------------- 0.5/1.1 MB 560.1 kB/s eta 0:00:01
   ------------------- -------------------- 0.5/1.1 MB 560.1 kB/s eta 0:00:01
   ----------------------------- ---------- 0.8/1.1 MB 486.4 kB/s eta 0:00:01
   ------------------------



In [None]:
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

# Load data
# NOTE(review): this path is machine-specific; point it at your copy of faq.csv.
dataset = load_dataset("csv", data_files={"train": r"C:\Users\Lenovo\Ai_farmer_query_based\data\faq.csv"})

# Tokenizer & Model
model_name = "t5-small"  # you can also try "google/flan-t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


# Preprocessing
def preprocess(batch, max_length=128):
    """Tokenize question/answer pairs into seq2seq training features.

    Args:
        batch: Mapping with a "questions" list and an "answers" list of equal length.
        max_length: Fixed token length that both sides are padded/truncated to.

    Returns:
        The tokenized questions with an added "labels" entry containing the
        answer token ids, where every padding position is replaced by -100.
    """
    inputs = tokenizer(batch["questions"], padding="max_length", truncation=True, max_length=max_length)
    labels = tokenizer(batch["answers"], padding="max_length", truncation=True, max_length=max_length)

    # The loss ignores positions labelled -100. Without this masking the model
    # is also trained to predict the pad tokens that fill out each answer.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return inputs


dataset = dataset.map(preprocess, batched=True, remove_columns=["questions", "answers"])
dataset.set_format(type="torch")

# Training
# The evaluation-strategy argument is intentionally omitted: "no" is already
# the default, and the `evaluation_strategy` keyword has been renamed to
# `eval_strategy` in newer transformers releases.
training_args = TrainingArguments(
    output_dir="src/models/qa_model",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    num_train_epochs=2,
    logging_steps=50,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"]
)

trainer.train()

In [2]:
# # ===============================
# # 1. Install Required Libraries
# # ===============================
# !pip install transformers datasets sentence-transformers faiss-cpu torch pandas scikit-learn

# # ===============================
# # 2. Import Libraries
# # ===============================
# import pandas as pd
# from datasets import load_dataset
# from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments, pipeline
# from sentence_transformers import SentenceTransformer
# import faiss
# import torch

# # ===============================
# # 3. Load and Prepare Data
# # ===============================
# # Load FAQ dataset (combine faq.csv and faq2.csv if both exist)
# faq1 = pd.read_csv(r"C:\Users\Lenovo\Ai_farmer_query_based\data\faq.csv")
# faq2 = pd.read_csv(r"C:\Users\Lenovo\Ai_farmer_query_based\data\faq2.csv")
# faq = pd.concat([faq1, faq2]).dropna().reset_index(drop=True)

# # Ensure labels are integers
# faq["category"] = faq["category"].astype("category")
# faq["label"] = faq["category"].cat.codes

# faq.to_csv(r"C:\Users\Lenovo\Ai_farmer_query_based\data\faq_combined.csv", index=False)

# # Load into HuggingFace dataset
# dataset = load_dataset("csv", data_files={"train": r"C:\Users\Lenovo\Ai_farmer_query_based\data\faq_combined.csv"})

# # ===============================
# # 4. Tokenization
# # ===============================
# tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# def tokenize(batch):
#     return tokenizer(batch["questions"], padding=True, truncation=True)

# dataset = dataset.map(tokenize, batched=True)
# dataset = dataset.rename_column("label", "labels")
# dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# # ===============================
# # 5. Train Intent Classifier
# # ===============================
# num_labels = len(set(faq["label"]))  # number of categories (the DataFrame column is "label"; only the HF dataset copy is renamed to "labels")
# model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_labels)

# training_args = TrainingArguments(
#     output_dir="src/models/intent_model",
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     per_device_train_batch_size=8,
#     num_train_epochs=3,
#     logging_dir="logs",
#     logging_steps=50,
#     save_total_limit=2,
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=dataset["train"],
# )

# trainer.train()
# trainer.save_model("src/models/intent_model")

# # ===============================
# # 6. Build Retriever with FAISS
# # ===============================
# embedder = SentenceTransformer("all-MiniLM-L6-v2")

# faq_embeddings = embedder.encode(faq["questions"].tolist(), convert_to_numpy=True)

# index = faiss.IndexFlatL2(faq_embeddings.shape[1])
# index.add(faq_embeddings)

# def retrieve_answer(query, top_k=1):
#     query_vec = embedder.encode([query], convert_to_numpy=True)
#     D, I = index.search(query_vec, k=top_k)
#     return faq.iloc[I[0][0]]["answers"]

# # ===============================
# # 7. Connect Intent + Retriever
# # ===============================
# intent_classifier = pipeline("text-classification", model="src/models/intent_model", tokenizer=tokenizer)

# def get_response(query):
#     # Step 1: Detect intent
#     intent = intent_classifier(query)[0]["label"]

#     # Step 2: Retrieve best answer
#     try:
#         answer = retrieve_answer(query)
#     except:
#         answer = None

#     # Step 3: Fallback
#     if not answer:
#         answer = "Sorry, I don’t know the exact answer. Please consult an expert."
#     return f"Intent: {intent}\nAnswer: {answer}"

# # ===============================
# # 8. Test the System
# # ===============================
# print(get_response("How to treat wheat rust?"))
# print(get_response("Best fertilizer for rice?"))
# print(get_response("What is the ideal soil for sugarcane?"))


In [None]:
from datasets import load_dataset, concatenate_datasets
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

In [4]:
# # Load your CSV (faq.csv or faq2.csv)
# dataset = load_dataset("csv", data_files=r"C:\Users\Lenovo\Ai_farmer_query_based\data\faq.csv")["train"]

# print("Columns:", dataset.column_names)
# print("Sample row:", dataset[0])


Columns: ['questions', 'answersr"C:\\Users\\Lenovo\\Ai_farmer_query_based\\data']
Sample row: {'questions': 'What is the best time to plant rice?', 'answersr"C:\\Users\\Lenovo\\Ai_farmer_query_based\\data': 'The best time to plant rice depends on your region, but generally it is during the monsoon season between June and July.'}


In [None]:

# Read each FAQ CSV; load_dataset puts a single unnamed file under the "train" split
faq1, faq2 = (
    load_dataset("csv", data_files=csv_path)["train"]
    for csv_path in (
        r"C:\Users\Lenovo\Ai_farmer_query_based\data\faq.csv",
        r"C:\Users\Lenovo\Ai_farmer_query_based\data\faq2.csv",
    )
)

# Stack the two sets of rows into one dataset (faq1 rows first, then faq2)
dataset = concatenate_datasets([faq1, faq2])

# Quick sanity report: row count, column names, and the first record
print("Combined dataset size:", len(dataset))
print("Columns:", dataset.column_names)
print("Sample row:", dataset[0])


In [None]:
# # Use a pretrained model for embeddings
# model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# # Encode all FAQ questions
# question_embeddings = model.encode(dataset["questions"], convert_to_numpy=True)

# # Create FAISS index
# index = faiss.IndexFlatL2(question_embeddings.shape[1])
# index.add(question_embeddings)

# print("FAISS index built with", index.ntotal, "questions.")

In [None]:
from sentence_transformers import SentenceTransformer
import faiss

# Sentence encoder; get_answer() reuses this same `model` to embed queries
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# One embedding row per FAQ question, returned as a NumPy array
faq_questions = dataset["questions"]
question_embeddings = model.encode(faq_questions, convert_to_numpy=True)

# Flat L2 index whose dimensionality matches the embedding width
embedding_dim = question_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(question_embeddings)

print("FAISS index built with", index.ntotal, "questions.")


In [None]:
# The header of faq.csv is corrupted: the answers column carries a pasted path
# fragment in its name (visible in the "Columns:" output of the inspection
# cell). Normalise it to "answers" so downstream code can use the plain name.
# The guard makes the cell safe to re-run: rename_column raises when the source
# column is already gone or when the target name already exists.
_corrupted_answers_column = "answersr\"C:\\Users\\Lenovo\\Ai_farmer_query_based\\data"
if (
    _corrupted_answers_column in dataset.column_names
    and "answers" not in dataset.column_names
):
    dataset = dataset.rename_column(_corrupted_answers_column, "answers")

In [2]:
def get_answer(query, top_k=3):
    """Return the FAQ entries whose questions are closest to ``query``.

    Relies on three notebook-level names: ``model`` (encodes text into
    embeddings), ``index`` (nearest-neighbour index built over the FAQ question
    embeddings) and ``dataset`` (rows with "questions" and "answers" fields).

    Args:
        query: The user question as a string.
        top_k: Maximum number of matches to return.

    Returns:
        A list of at most ``top_k`` dicts, nearest match first, each with the
        keys "matched_question" and "answer". The list is empty when
        ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        # Nothing was requested; searching with k < 1 is not meaningful.
        return []

    query_embedding = model.encode([query], convert_to_numpy=True)
    _distances, indices = index.search(query_embedding, top_k)

    results = []
    for idx in indices[0]:
        row_id = int(idx)
        if row_id < 0:
            # FAISS fills unused result slots with -1 when the index holds
            # fewer than top_k vectors. dataset[-1] would silently return the
            # last row, so those slots must be skipped.
            continue
        row = dataset[row_id]  # fetch the row once and reuse it for both fields
        results.append({
            "matched_question": row["questions"],
            "answer": row["answers"],
        })
    return results


In [13]:
# query = "Best way to grow rice?"
# answer = get_answer(query)
# print("Query:", query)
# print("Answer:", answer)


Query: Best way to grow rice?
Answer: The best time to plant rice depends on your region, but generally it is during the monsoon season between June and July.


In [None]:
# Demo lookup: ask one question and list the three closest FAQ entries
query = "Best fertilizer for wheat?"
answers = get_answer(query, top_k=3)

print("Query:", query)
print("\nTop 3 Retrieved Answers:")
for rank, hit in enumerate(answers, start=1):
    matched_question, reply = hit["matched_question"], hit["answer"]
    print(f"\n{rank}. Q: {matched_question}\n   A: {reply}")