In [26]:
!pip install evaluate



In [27]:
!pip install datasets
!pip install rouge_score



In [28]:
pip install transformers datasets torch




In [29]:
import json
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader

# Load your JSON data
with open("/content/foodreportdata", "r") as f:
    data = json.load(f)

# Custom Dataset for T5
class FoodWasteDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        query = self.data[idx]["query"]
        output = self.data[idx]["query_output"]

        # Construct the input text
        input_text = f"Fooditem: {query['fooditem']}, Area of food wasted: {query['area_of_food_wasted']},area_of_plate_total_area:{query['area_of_plate_total_area']}, Percentage of food wasted: {query['percentage_of_food_wasted']}, Time of meal: {query['time_of_meal']}, Age: {query['age_of_person']}, Place: {query['place_where_food_was_eaten']}"

        # Construct the target text
        target_text = f"Consumer solution: {output['consumer_solution']} Management solution: {output['management_solution']}"

        # Tokenize the input and target text
        input_tokens = self.tokenizer(input_text, max_length=self.max_length, padding='max_length', truncation=True, return_tensors="pt")
        target_tokens = self.tokenizer(target_text, max_length=self.max_length, padding='max_length', truncation=True, return_tensors="pt")

        # Prepare the input and target token IDs and attention masks
        source_ids = input_tokens["input_ids"].squeeze()
        target_ids = target_tokens["input_ids"].squeeze()

        source_mask = input_tokens["attention_mask"].squeeze()
        target_mask = target_tokens["attention_mask"].squeeze()

        return {
            "input_ids": source_ids,
            "attention_mask": source_mask,
            "labels": target_ids,
            "decoder_attention_mask": target_mask
        }

# Initialize the T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Create the dataset
dataset = FoodWasteDataset(data, tokenizer)

# DataLoader for batching
train_loader = DataLoader(dataset, batch_size=8, shuffle=True)


In [30]:

from torch.utils.data import random_split

train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [31]:
from transformers import T5ForConditionalGeneration, AdamW

# Load the pre-trained T5 model
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


In [33]:
# Optimizer
optimizer = AdamW(model.parameters(), lr=1e-4)

# Training loop
num_epochs = 20  # You can increase this based on your dataset size
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()

        # Move batch to device (GPU/CPU)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        # Loss calculation
        loss = outputs.loss
        epoch_loss += loss.item()

        # Backward pass and optimization step
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(train_loader)}")


Epoch 1/20, Loss: 0.6578024534078745
Epoch 2/20, Loss: 0.43074966164735645
Epoch 3/20, Loss: 0.39051599227465117
Epoch 4/20, Loss: 0.3458263942828545
Epoch 5/20, Loss: 0.30059765164668745
Epoch 6/20, Loss: 0.27271632047799915
Epoch 7/20, Loss: 0.24958140231095827
Epoch 8/20, Loss: 0.23031981060138115
Epoch 9/20, Loss: 0.21693034584705645
Epoch 10/20, Loss: 0.2021628331679564
Epoch 11/20, Loss: 0.18586775316641882
Epoch 12/20, Loss: 0.1762076627749663
Epoch 13/20, Loss: 0.16846218132055724
Epoch 14/20, Loss: 0.15867222616305718
Epoch 15/20, Loss: 0.15211046888278082
Epoch 16/20, Loss: 0.1488398817869333
Epoch 17/20, Loss: 0.1446871064030207
Epoch 18/20, Loss: 0.13923554638257393
Epoch 19/20, Loss: 0.13475410582927558
Epoch 20/20, Loss: 0.12850595437563384


In [34]:
model.save_pretrained("fine_tuned_t5_food_waste_model")
tokenizer.save_pretrained("fine_tuned_t5_food_waste_model")


('fine_tuned_t5_food_waste_model/tokenizer_config.json',
 'fine_tuned_t5_food_waste_model/special_tokens_map.json',
 'fine_tuned_t5_food_waste_model/spiece.model',
 'fine_tuned_t5_food_waste_model/added_tokens.json')

In [35]:

def generate_solution(query_input):
    model.eval()

    input_tokens = tokenizer(query_input, return_tensors="pt", max_length=512, truncation=True)
    input_ids = input_tokens["input_ids"].to(device)
    attention_mask = input_tokens["attention_mask"].to(device)

    output_tokens = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=100)
    generated_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

    return generated_text

test_query = "Fooditem: Paneer Tikka, Area of food wasted: 135000, area_of_plate_total_area: 500000,Percentage of food wasted: 27%, Time of meal: Dinner, Age: 38, Place: Work Cafeteria"
print(test_query)
print(generate_solution(test_query))


Fooditem: Paneer Tikka, Area of food wasted: 135000, area_of_plate_total_area: 500000,Percentage of food wasted: 27%, Time of meal: Dinner, Age: 38, Place: Work Cafeteria
Consumer solution: Order a small portion to avoid waste. Management solution: Offer small portions to allow for more if appetite is uncertain.


In [36]:
import torch
from sklearn.metrics import f1_score, accuracy_score, recall_score

def validate_model(model, test_loader, device):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            # Extract inputs and labels from the batch
            inputs = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)
            decoder_input_ids = labels[:, :-1].contiguous()

            outputs = model(input_ids=inputs, decoder_input_ids=decoder_input_ids)
            logits = outputs.logits
            _, preds = torch.max(logits, 2)

            all_preds.extend(preds.cpu().numpy().flatten())
            all_labels.extend(labels[:, 1:].contiguous().cpu().numpy().flatten())

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')

    # Print the metrics
    print(f'Accuracy: {accuracy:.4f}')
    print(f'F1 Score: {f1:.4f}')
    print(f'Recall: {recall:.4f}')

validate_model(model, test_loader, device)


Accuracy: 0.9495
F1 Score: 0.9467
Recall: 0.9495


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [37]:
import pickle
with open('food_waste_model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('food_waste_tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

print("Model and tokenizer saved to pickle files.")

Model and tokenizer saved to pickle files.
