Installing the libraries

In [1]:
!pip install transformers datasets sentencepiece torch torchvision huggingface_hub[hf_xet] accelerate



Importing the libraries

In [2]:
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Trainer, TrainingArguments, Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


Loading the ML Model

In [3]:
import pickle

with open("ml_model.pk1", "rb") as f:
    ml_model = pickle.load(f)

sample_input = pd.DataFrame({
    'Crop': ['Wheat'],
    'Crop_Year': [2023],
    'Season': ['Rabi'],
    'State': ['Punjab'],
    'Area': [250],
    'Production': [5000],
    'Annual_Rainfall': [800],
    'Fertilizer': [50],
    'Pesticide': [10]
})

predicted_yield = ml_model.predict(sample_input)
print("Predicted Yield:", predicted_yield)

Predicted Yield: [7.30157732]


Preparing synthetic dataset for fine tuning the NLP Model

In [4]:
dataset = load_dataset("csv", data_files="optimization.csv")

# Train-test split
dataset = dataset["train"].train_test_split(test_size=0.2)

Using the Huggingface model and Tokenizing the data

In [5]:
model_checkpoint = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

def preprocess(examples):
    inputs = tokenizer(examples["input_text"], max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(examples["output_text"], max_length=128, truncation=True, padding="max_length")
    inputs["labels"] = labels["input_ids"]
    return inputs

tokenized_dataset = dataset.map(preprocess, batched=True)

Map: 100%|██████████| 800/800 [00:00<00:00, 8202.77 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 7491.03 examples/s]


Using data collator for combining individual samples into a batch

In [6]:
# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

Setting up the training configuration

In [7]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,
    logging_dir="./logs"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,No log,0.093395
2,No log,0.000289
3,No log,0.000113
4,No log,7.7e-05
5,0.748100,6.8e-05




TrainOutput(global_step=500, training_loss=0.7481402587890625, metrics={'train_runtime': 1838.5745, 'train_samples_per_second': 2.176, 'train_steps_per_second': 0.272, 'total_flos': 185890504704000.0, 'train_loss': 0.7481402587890625, 'epoch': 5.0})

Saving the Model

In [8]:
model.save_pretrained("./crop_nlp_model")
tokenizer.save_pretrained("./crop_nlp_model")

('./crop_nlp_model\\tokenizer_config.json',
 './crop_nlp_model\\special_tokens_map.json',
 './crop_nlp_model\\spiece.model',
 './crop_nlp_model\\added_tokens.json',
 './crop_nlp_model\\tokenizer.json')

Testing the model with ML input and user input

In [12]:
nlp_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

test_input = "Crop: Sugarcane, Predicted Yield: 2.5 t/ha, Area=20 ha, Budget=200000 INR, Goal: maximize profit"
result = nlp_pipeline(test_input, max_new_tokens=80)

print("Recommendation:", result[0]['generated_text'])

Device set to use cpu


Recommendation: Plant Co varieties, apply fertigation, and intercrop with pulses. This helps to maximize profit effectively.
