# Setup

In [None]:
import ast
import pickle
import re
from io import BytesIO

import dask.dataframe as dd
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from torchvision import transforms

# Sampling the Dataset

In [7]:
# Load the train split from Hugging Face with Dask, keeping only the label and
# image columns (the hf:// filesystem needs the `huggingface_hub` package)
df = dd.read_parquet(
    "hf://datasets/raghavendrad60/vqa_plant-disease-classification-merged-dataset/data/train-*.parquet",
    columns=['class', 'image'],
)

# Plant names to keep, lower-cased so they match the normalised class labels
patterns = ['Tomato', 'Chili', 'Cucumber', 'Strawberry', 'Pepper']
desired_plants = list(map(str.lower, patterns))

# Filter function to include only desired plants
def filter_partition(df, plants=None):
    """Normalise class labels and keep only the rows of the wanted plants.

    The 'class' column is cast to str, stripped and lower-cased; a 'plant'
    column is derived from the text before the first underscore of the label.

    Parameters
    ----------
    df : pandas.DataFrame
        One partition with at least a 'class' column. It is NOT modified;
        a new frame is returned.
    plants : iterable of str, optional
        Lower-case plant names to keep. Defaults to the notebook-level
        `desired_plants` list.

    Returns
    -------
    pandas.DataFrame
        Rows whose derived 'plant' is in `plants`, with the normalised
        'class' column and the additional 'plant' column.
    """
    if plants is None:
        plants = desired_plants

    labels = df['class'].astype(str).str.strip().str.lower()
    # expand=False returns a Series (not a one-column frame) for the single group
    plant = labels.str.extract(r'^([^_]+)', expand=False)

    # assign() builds a new frame, so the partition handed in stays untouched
    normalised = df.assign(**{'class': labels, 'plant': plant})
    return normalised[normalised['plant'].isin(plants)]

# Run the plant filter on every partition, then persist the filtered collection
df = df.map_partitions(filter_partition)
df = df.persist()

# Function to sample a small number of rows from a partition
def sample_partition(partition, n_per_partition=200, seed=42, partition_info=None):
    """Draw a reproducible random sample of rows from one partition.

    Parameters
    ----------
    partition : pandas.DataFrame
        The partition to sample from.
    n_per_partition : int, default 200
        Maximum number of rows to draw; a smaller partition is returned in
        full (in shuffled order).
    seed : int, default 42
        Base seed. Using a fixed seed instead of a fresh random one makes
        the sample identical from run to run.
    partition_info : dict, optional
        Filled in by Dask's `map_partitions` when the function accepts this
        keyword; its 'number' entry offsets the seed so partitions do not all
        pick the same row positions. Left as None when called directly.

    Returns
    -------
    pandas.DataFrame
        The sampled rows.
    """
    n_rows = min(len(partition), n_per_partition)
    partition_number = (partition_info or {}).get('number') or 0
    return partition.sample(n=n_rows, random_state=seed + partition_number)

# Draw at most 200 rows from every partition
sampled = df.map_partitions(sample_partition, n_per_partition=200)

# Collect the per-partition samples into memory, then take a final random
# sample of at most 5000 rows
sampled_df = sampled.compute()
final_sample = sampled_df.sample(
    n=min(len(sampled_df), 5000),
    random_state=42,
)

ImportError: An error occurred while calling the read_parquet method registered to the pandas backend.
Original Message: Install huggingface_hub to access HfFileSystem

In [3]:
# Display the sampled rows to check the columns and the derived plant names
final_sample

Unnamed: 0,class,image,plant
1314,tomato__bacterial_spot,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,tomato
1556,tomato__yellow_leaf_curl_virus,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,tomato
1868,strawberry__healthy,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,strawberry
421,tomato__target_spot,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,tomato
1043,pepper_bell__healthy,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,pepper
...,...,...,...
226,tomato__septoria_leaf_spot,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,tomato
1449,tomato__bacterial_spot,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,tomato
1069,tomato__late_blight,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,tomato
292,tomato__target_spot,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,tomato


# Training

In [4]:
# Count samples per class
class_counts = final_sample['class'].value_counts()
print(class_counts)

# Keep only classes with at least 2 samples, so the stratified train/validation
# split further down can place each class on both sides
valid_classes = class_counts[class_counts >= 2].index
# .copy() yields an independent frame; adding columns to it later does not
# raise pandas' SettingWithCopyWarning
final_sample = final_sample[final_sample['class'].isin(valid_classes)].copy()

class
tomato__late_blight                               382
tomato__yellow_leaf_curl_virus                    375
tomato__target_spot                               355
cucumber__healthy                                 243
tomato__septoria_leaf_spot                        231
tomato__leaf_mold                                 213
strawberry__healthy                               209
strawberry___leaf_scorch                          200
tomato__spider_mites_(two_spotted_spider_mite)    200
tomato__healthy                                   191
pepper_bell__healthy                              170
tomato__bacterial_spot                            169
tomato__mosaic_virus                              134
tomato__early_blight                              125
pepper_bell__bacterial_spot                       105
chili__yellowish                                  100
chili__leaf curl                                  100
chili__healthy                                     49
chili__leaf spot      

In [5]:
def _image_bytes(cell):
    """Return the raw image bytes stored in one value of the 'image' column.

    `cell` is either a dict with a 'bytes' entry or the string form of such
    a dict.
    """
    if isinstance(cell, str):
        # literal_eval parses Python literals only; unlike eval() it cannot
        # execute arbitrary code that might be embedded in downloaded data
        cell = ast.literal_eval(cell)
    return cell['bytes']


# Extract actual byte data from the dictionary in the 'image' column
# (assign() returns a new frame instead of writing into a possible slice)
final_sample = final_sample.assign(image_bytes=final_sample['image'].map(_image_bytes))

# Encode class labels as integer ids; `le` is kept to map ids back to names
le = LabelEncoder()
final_sample = final_sample.assign(label=le.fit_transform(final_sample['class']))

# 80/20 train/validation split, stratified so class proportions are preserved
train_df, val_df = train_test_split(
    final_sample,
    test_size=0.2,
    stratify=final_sample['label'],
    random_state=42,
)

# Preprocessing applied to every image: resize to 224x224, then convert to a tensor
transform = transforms.Compose(
    [transforms.Resize((224, 224)), transforms.ToTensor()]
)

In [6]:
class ByteImageDataset(Dataset):
    """Dataset that decodes images stored as raw bytes in a DataFrame.

    The frame must provide an 'image_bytes' column (encoded image data) and
    a 'label' column. Items are `(image, label)` pairs; the image is
    converted to RGB and, when a transform is given, passed through it.
    """

    def __init__(self, dataframe, transform=None):
        self.df = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Fetch the row once and read both fields from it
        record = self.df.iloc[idx]
        image = Image.open(BytesIO(record['image_bytes'])).convert("RGB")
        if not self.transform:
            return image, record['label']
        return self.transform(image), record['label']

# Wrap both splits so images are decoded and transformed on access
train_dataset = ByteImageDataset(train_df, transform=transform)
val_dataset = ByteImageDataset(val_df, transform=transform)

# Batches of 32; only the training data is shuffled
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
val_loader = DataLoader(val_dataset, batch_size=32)

In [None]:
# Use the GPU when one is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained ResNet-18 whose final layer is replaced by one output per class
# NOTE(review): `pretrained=True` looks like the legacy torchvision spelling —
# confirm whether the installed version prefers the `weights=` argument
model = models.resnet18(pretrained=True)
model.fc = nn.Linear(
    in_features=model.fc.in_features,
    out_features=final_sample['label'].nunique(),
)
model = model.to(device)

# Cross-entropy loss, optimised with Adam over all model parameters
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: five passes over the training data
for epoch in range(5):  # Adjust as needed
    model.train()
    total_loss = 0  # sum of the per-batch losses of this epoch
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Clear old gradients, run forward/backward, then update the weights
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

# Evaluation: accuracy on the validation split, without gradient tracking
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in val_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = model(inputs)
        # Index of the highest score along dim 1 is the predicted class
        preds = torch.max(outputs, 1).indices
        correct += (preds == labels).sum().item()
        total += labels.size(0)

print(f"Validation Accuracy: {100 * correct / total:.2f}%")


# Save the fitted LabelEncoder as a pickle file so predicted class indices
# can be mapped back to class names later
# (`pickle` is imported in the notebook's import cell)
label_encoder_path = 'label_encoder.pkl'  # or any other path / file name
with open(label_encoder_path, 'wb') as f:
    pickle.dump(le, f)

print(f"LabelEncoder gespeichert unter: {label_encoder_path}")

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 96.7MB/s]


# Inference (Prediction)

In [None]:
# Inference uses the same preprocessing as training: 224x224 resize, then tensor
transform = transforms.Compose(
    [transforms.Resize((224, 224)), transforms.ToTensor()]
)

# Inference function
def predict_from_bytes(img_bytes):
    """Predict the class name for one encoded image.

    Uses the notebook-level `model`, `transform`, `device` and `le`.

    Parameters
    ----------
    img_bytes : bytes
        Encoded image data readable by PIL.

    Returns
    -------
    The class label that `le` maps the highest-scoring output index to.
    """
    model.eval()
    pil_image = Image.open(BytesIO(img_bytes)).convert("RGB")
    # unsqueeze(0) adds the batch dimension
    batch = transform(pil_image).unsqueeze(0).to(device)
    with torch.no_grad():
        scores = model(batch)
    class_index = torch.max(scores, 1).indices.item()
    return le.inverse_transform([class_index])[0]

In [None]:
# Read the encoded test image (the bytes could equally come from a database)
image_path = "/content/Tomato_Leaf_Mold_Test3"
with open(image_path, "rb") as image_file:
    img_bytes = image_file.read()

# Classify the image and show the predicted class name
predicted_class = predict_from_bytes(img_bytes)
print("Predicted class:", predicted_class)

# Save the Model Using Pickle

In [None]:
import pickle

# Move every tensor to the CPU before pickling. A state_dict pickled with CUDA
# tensors can only be unpickled on a machine with CUDA, because plain
# pickle.load offers no device remapping.
# The dict returned by state_dict() is updated in place so its type and any
# attached metadata are kept; the model itself stays on its current device.
# NOTE(review): torch.save / torch.load(map_location=...) is the route PyTorch
# documents for this; pickle is kept so existing readers of this file still work.
model_state = model.state_dict()
for name, tensor in list(model_state.items()):
    model_state[name] = tensor.cpu()

with open('advanced_ml_project_model.pkl', 'wb') as f:
    pickle.dump(model_state, f)