In [1]:
import os
# CUDA debugging switches, exported before torch is imported in a later cell.
# CUDA_LAUNCH_BLOCKING=1 makes kernel launches synchronous, so errors surface at
# the offending call. NOTE(review): TORCH_USE_CUDA_DSA is presumably meant to
# enable device-side assertions - confirm it has an effect on this torch build.
os.environ.update({
    'CUDA_LAUNCH_BLOCKING': "1",
    'TORCH_USE_CUDA_DSA': "1",
})

In [2]:
# Sets the same variable as the os.environ assignment in the first cell, this time
# through the IPython %env magic (which also echoes the value as cell output).
%env CUDA_LAUNCH_BLOCKING=1

env: CUDA_LAUNCH_BLOCKING=1


In [3]:
import torch
# Pick the CUDA device when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Using device:', device)
print()

# Extra diagnostics that are only available on a CUDA device (device index 0).
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('CUDA version:', torch.version.cuda)
    print('Memory Usage:')
    # Each entry: printed label and the torch.cuda function returning a byte count,
    # reported in GiB rounded to one decimal place.
    for _label, _query_bytes in (
        ('Allocated:', torch.cuda.memory_allocated),
        ('Cached:   ', torch.cuda.memory_reserved),
    ):
        print(_label, round(_query_bytes(0)/1024**3,1), 'GB')

Using device: cuda

NVIDIA GeForce GTX 960
CUDA version: 11.7
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [4]:
import numpy as np
import PIL
from PIL import Image
import datasets
import evaluate
import torch
import json
import codecs
import os
from os import sys

from transformers import AutoImageProcessor
from transformers import SegformerImageProcessor

from transformers import AutoModelForSemanticSegmentation, TrainingArguments, Trainer, EarlyStoppingCallback

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Put the project's local ./src directory (resolved against the current working
# directory) on the import path, once, so the modules below can be imported.
module_path = os.path.abspath('./src')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from data_prepossessing import create_datasets_for_plants, get_labels
# NOTE(review): the star import presumably supplies `seed` and `weed_plants`, which
# later cells use without defining - confirm, and consider importing them by name.
from constants import *

In [6]:
# Hub identifier of the pretrained checkpoint; used here for the image processor
# and again in train_model_of_type_for_crop to load the model weights.
checkpoint = "nvidia/mit-b0"
# Module-level processor that train_transforms applies to every batch.
image_processor = SegformerImageProcessor.from_pretrained(checkpoint)
# Bare expression so the notebook displays the processor configuration.
image_processor



SegformerImageProcessor {
  "do_normalize": true,
  "do_reduce_labels": false,
  "do_rescale": true,
  "do_resize": true,
  "feature_extractor_type": "SegformerFeatureExtractor",
  "image_mean": [
    0.485,
    0.456,
    0.406
  ],
  "image_processor_type": "SegformerImageProcessor",
  "image_std": [
    0.229,
    0.224,
    0.225
  ],
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 512,
    "width": 512
  }
}

In [7]:
def train_transforms(example_batch):
    """Run one batch through the module-level ``image_processor``.

    Args:
        example_batch: Mapping with an ``"image"`` entry and an ``"annotation"``
            entry, each an iterable with one item per example.

    Returns:
        Whatever ``image_processor`` returns when called with the images, the
        annotations and ``return_tensors="pt"``.
    """
    batch_images = list(example_batch["image"])
    batch_annotations = list(example_batch["annotation"])
    return image_processor(batch_images, batch_annotations, return_tensors="pt")

In [8]:
# Load the "mean_iou" metric once; compute_metrics reads this module-level object.
metric = evaluate.load("mean_iou")

In [9]:
def compute_metrics(num_labels, eval_pred):
    """Compute the ``mean_iou`` metrics for one evaluation pass.

    The logits are upsampled to the spatial size of the labels, reduced to a
    per-pixel class id with ``argmax`` over dimension 1, and scored against the
    labels with the module-level ``metric``.

    Args:
        num_labels: Number of classes, forwarded to ``metric.compute``.
        eval_pred: ``(logits, labels)`` pair. ``logits`` must be a NumPy array
            (it goes through ``torch.from_numpy``); it is assumed to be laid
            out as (batch, classes, height, width) - TODO confirm. ``labels``
            must expose ``.shape`` with height and width as its last two
            dimensions.

    Returns:
        dict: The metric results with every NumPy array or NumPy scalar
        converted to plain Python lists / numbers, so the result can be passed
        to ``json.dump`` unchanged.
    """
    logits, labels = eval_pred

    with torch.no_grad():
        # Bring the logits to the label resolution before taking the arg-max,
        # so predictions and references are compared pixel for pixel.
        upsampled_logits = torch.nn.functional.interpolate(
            torch.from_numpy(logits),
            size=labels.shape[-2:],
            mode="bilinear",
            align_corners=False,
        )
        # The tensor was built from a NumPy array under no_grad, so it is
        # already a CPU tensor without autograd history.
        pred_labels = upsampled_logits.argmax(dim=1).numpy()

    metrics = metric.compute(
        predictions=pred_labels,
        references=labels,
        num_labels=num_labels,
        ignore_index=255,
        reduce_labels=False,
    )

    # isinstance (rather than an exact type check) also catches ndarray
    # subclasses and NumPy scalars; ``tolist`` turns both into builtin types.
    return {
        key: value.tolist() if isinstance(value, (np.ndarray, np.generic)) else value
        for key, value in metrics.items()
    }

In [10]:
# TODO: confirm evaluation runs without eval_accumulation_steps (disabled below).
training_args = TrainingArguments(
    # Checkpoint location; at most three checkpoints are kept on disk.
    output_dir="segformer-b0-scene-parse-150",
    save_total_limit=3,
    # Optimisation settings.
    learning_rate=6e-5,
    num_train_epochs=25,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    seed=seed,
    # Evaluate and save on the same 100-step cadence; log every step.
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    logging_steps=1,
    # eval_accumulation_steps=5,
    # Restore the best checkpoint once training ends.
    load_best_model_at_end=True,
    # Keep the dataset columns as they are instead of letting the Trainer prune them.
    remove_unused_columns=False,
)

In [11]:
def initialize_trainer(model, num_labels, train_ds, test_ds):
    """Build a ``Trainer`` wired to the shared ``training_args``.

    Args:
        model: Model to train.
        num_labels: Number of classes; bound into the metrics callback.
        train_ds: Dataset used for training.
        test_ds: Dataset used as the trainer's evaluation dataset.

    Returns:
        A ``Trainer`` that scores evaluations with ``compute_metrics`` and
        carries an ``EarlyStoppingCallback`` with a patience of 3.
    """
    def _metrics_for_eval(eval_pred):
        # Trainer passes only eval_pred; supply the class count here.
        return compute_metrics(num_labels, eval_pred)

    early_stopping = EarlyStoppingCallback(early_stopping_patience=3)
    return Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=test_ds,
        compute_metrics=_metrics_for_eval,
        callbacks=[early_stopping],
    )

In [12]:
def train_model_of_type_for_crop(model_type, crop):
    """Fine-tune, save and evaluate a segmentation model for one crop.

    Builds the train/validation/test datasets for ``crop`` plus the
    ``weed_plants`` classes, fine-tunes the ``checkpoint`` model (evaluating on
    the validation split during training), then writes the model, the
    trainer's log history and the test-split metrics under
    ``models/<model_type>/<crop>``.

    Args:
        model_type: Model variant name (e.g. ``"multiclass"``); forwarded to
            ``create_datasets_for_plants`` and ``get_labels`` and used as a
            directory name.
        crop: Crop name (e.g. ``"broad_bean"``); used the same way.

    Returns:
        dict: The metrics returned by ``trainer.evaluate`` on the test split
        (the same content that is written to ``test_metric.json``).
    """
    model_plant_names = [crop] + weed_plants
    train_ds, val_ds, test_ds = create_datasets_for_plants(model_plant_names, model_type, crop)

    print("Training subset number of images: " + str(train_ds.num_rows))
    print("Validation subset number of images: " + str(val_ds.num_rows))
    print("Test subset number of images: " + str(test_ds.num_rows))

    # All three splits go through the same on-the-fly preprocessing.
    for split in (train_ds, val_ds, test_ds):
        split.set_transform(train_transforms)

    id2label, label2id = get_labels(crop, model_type)

    print('Number of classes:', len(id2label))
    print('id2label:', id2label)
    print('label2id:', label2id)

    model = AutoModelForSemanticSegmentation.from_pretrained(checkpoint, id2label=id2label, label2id=label2id)
    # The validation split drives evaluation during training; the test split
    # is only touched after training has finished.
    trainer = initialize_trainer(model, len(id2label), train_ds, val_ds)
    trainer.train()

    # Build the output directory once and make sure it exists before writing.
    output_dir = os.path.join('models', model_type, crop)
    os.makedirs(output_dir, exist_ok=True)

    # Save the trained model, so that it can be used for inference later.
    trainer.save_model(output_dir)
    # Save the log history, so that it can be used for plotting later.
    # It is written before the test evaluation, as in the original flow.
    with open(os.path.join(output_dir, 'log_history.json'), 'w') as file:
        json.dump(trainer.state.log_history, file)

    test_metric = trainer.evaluate(test_ds)
    with open(os.path.join(output_dir, 'test_metric.json'), 'w') as file:
        json.dump(test_metric, file)

    # Returned so the calling cell can display or reuse the test metrics.
    return test_metric

In [13]:
# Train the "multiclass" model variant for the broad_bean crop; this also saves
# the model, its log history and the test metrics under models/multiclass/broad_bean.
train_model_of_type_for_crop("multiclass", "broad_bean")

['img_00173.png', 'img_00174.png', 'img_00175.png', 'img_00176.png', 'img_00177.png', 'img_00178.png', 'img_00672.png', 'img_00673.png', 'img_00674.png', 'img_00675.png', 'img_00676.png', 'img_00677.png', 'img_00678.png', 'img_00679.png', 'img_00680.png', 'img_00681.png', 'img_00682.png', 'img_00683.png', 'img_00684.png', 'img_00882.png', 'img_00883.png', 'img_00884.png', 'img_00885.png', 'img_00886.png', 'img_00887.png', 'img_00938.png', 'img_00980.png', 'img_00981.png', 'img_00982.png', 'img_00983.png', 'img_00984.png', 'img_00985.png', 'img_00986.png', 'img_00987.png', 'img_00988.png', 'img_00989.png', 'img_01070.png', 'img_01071.png', 'img_01072.png', 'img_01073.png', 'img_01074.png', 'img_01075.png', 'img_01076.png', 'img_01077.png', 'img_01078.png', 'img_01079.png', 'img_01219.png', 'img_01220.png', 'img_01221.png', 'img_01222.png', 'img_01223.png', 'img_01224.png', 'img_01225.png', 'img_01226.png', 'img_01227.png', 'img_01228.png', 'img_01279.png', 'img_01280.png', 'img_01281.pn

Some weights of SegformerForSemanticSegmentation were not initialized from the model checkpoint at nvidia/mit-b0 and are newly initialized: ['decode_head.batch_norm.running_var', 'decode_head.batch_norm.running_mean', 'decode_head.linear_c.2.proj.bias', 'decode_head.linear_c.3.proj.bias', 'decode_head.linear_c.0.proj.weight', 'decode_head.classifier.bias', 'decode_head.linear_c.0.proj.bias', 'decode_head.batch_norm.num_batches_tracked', 'decode_head.classifier.weight', 'decode_head.linear_c.3.proj.weight', 'decode_head.linear_c.2.proj.weight', 'decode_head.batch_norm.bias', 'decode_head.linear_c.1.proj.weight', 'decode_head.linear_fuse.weight', 'decode_head.batch_norm.weight', 'decode_head.linear_c.1.proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 1/1475 [00:09<3:46:01,  9.20s/it]

{'loss': 2.3566, 'learning_rate': 5.9959322033898307e-05, 'epoch': 0.02}


  0%|          | 2/1475 [00:14<2:51:23,  6.98s/it]

{'loss': 2.3155, 'learning_rate': 5.991864406779661e-05, 'epoch': 0.03}


  0%|          | 3/1475 [00:20<2:33:43,  6.27s/it]

{'loss': 2.2867, 'learning_rate': 5.9877966101694917e-05, 'epoch': 0.05}


  0%|          | 4/1475 [00:25<2:25:02,  5.92s/it]

{'loss': 2.2813, 'learning_rate': 5.983728813559322e-05, 'epoch': 0.07}


  0%|          | 5/1475 [00:30<2:20:21,  5.73s/it]

{'loss': 2.259, 'learning_rate': 5.979661016949153e-05, 'epoch': 0.08}


  0%|          | 6/1475 [00:36<2:17:07,  5.60s/it]

{'loss': 2.2331, 'learning_rate': 5.975593220338983e-05, 'epoch': 0.1}


  0%|          | 7/1475 [00:41<2:15:29,  5.54s/it]

{'loss': 2.2219, 'learning_rate': 5.971525423728814e-05, 'epoch': 0.12}


  1%|          | 8/1475 [00:46<2:14:05,  5.48s/it]

{'loss': 2.2086, 'learning_rate': 5.967457627118644e-05, 'epoch': 0.14}


  1%|          | 9/1475 [00:52<2:13:14,  5.45s/it]

{'loss': 2.2019, 'learning_rate': 5.963389830508475e-05, 'epoch': 0.15}


  1%|          | 10/1475 [00:57<2:12:38,  5.43s/it]

{'loss': 2.192, 'learning_rate': 5.959322033898305e-05, 'epoch': 0.17}


  1%|          | 11/1475 [01:03<2:13:00,  5.45s/it]

{'loss': 2.1605, 'learning_rate': 5.955254237288136e-05, 'epoch': 0.19}


  1%|          | 12/1475 [01:08<2:12:27,  5.43s/it]

{'loss': 2.1491, 'learning_rate': 5.951186440677966e-05, 'epoch': 0.2}


  1%|          | 13/1475 [01:13<2:11:57,  5.42s/it]

{'loss': 2.1372, 'learning_rate': 5.947118644067797e-05, 'epoch': 0.22}


  1%|          | 14/1475 [01:19<2:11:18,  5.39s/it]

{'loss': 2.1343, 'learning_rate': 5.943050847457627e-05, 'epoch': 0.24}


  1%|          | 15/1475 [01:24<2:11:01,  5.38s/it]

{'loss': 2.1009, 'learning_rate': 5.9389830508474584e-05, 'epoch': 0.25}


  1%|          | 16/1475 [01:30<2:10:50,  5.38s/it]

{'loss': 2.0831, 'learning_rate': 5.934915254237288e-05, 'epoch': 0.27}


  1%|          | 17/1475 [01:35<2:10:59,  5.39s/it]

{'loss': 2.0512, 'learning_rate': 5.930847457627119e-05, 'epoch': 0.29}


  1%|          | 18/1475 [01:40<2:10:45,  5.39s/it]

{'loss': 2.0312, 'learning_rate': 5.926779661016949e-05, 'epoch': 0.31}


  1%|▏         | 19/1475 [01:46<2:10:39,  5.38s/it]

{'loss': 2.0488, 'learning_rate': 5.92271186440678e-05, 'epoch': 0.32}


  1%|▏         | 20/1475 [01:51<2:10:30,  5.38s/it]

{'loss': 2.0035, 'learning_rate': 5.91864406779661e-05, 'epoch': 0.34}


  1%|▏         | 21/1475 [01:57<2:10:32,  5.39s/it]

{'loss': 1.9973, 'learning_rate': 5.914576271186441e-05, 'epoch': 0.36}


  1%|▏         | 22/1475 [02:02<2:10:21,  5.38s/it]

{'loss': 1.9994, 'learning_rate': 5.910508474576271e-05, 'epoch': 0.37}


  2%|▏         | 23/1475 [02:07<2:10:05,  5.38s/it]

{'loss': 1.9561, 'learning_rate': 5.906440677966102e-05, 'epoch': 0.39}


  2%|▏         | 24/1475 [02:13<2:09:48,  5.37s/it]

{'loss': 1.9343, 'learning_rate': 5.902372881355933e-05, 'epoch': 0.41}


  2%|▏         | 25/1475 [02:18<2:09:38,  5.36s/it]

{'loss': 1.892, 'learning_rate': 5.8983050847457634e-05, 'epoch': 0.42}


  2%|▏         | 26/1475 [02:23<2:10:13,  5.39s/it]

{'loss': 1.8725, 'learning_rate': 5.894237288135593e-05, 'epoch': 0.44}


  2%|▏         | 27/1475 [02:29<2:10:05,  5.39s/it]

{'loss': 1.8801, 'learning_rate': 5.890169491525424e-05, 'epoch': 0.46}


  2%|▏         | 28/1475 [02:34<2:09:40,  5.38s/it]

{'loss': 1.8483, 'learning_rate': 5.886101694915254e-05, 'epoch': 0.47}


  2%|▏         | 29/1475 [02:40<2:09:47,  5.39s/it]

{'loss': 1.83, 'learning_rate': 5.882033898305085e-05, 'epoch': 0.49}


  2%|▏         | 30/1475 [02:45<2:09:21,  5.37s/it]

{'loss': 1.7652, 'learning_rate': 5.877966101694915e-05, 'epoch': 0.51}


  2%|▏         | 31/1475 [02:50<2:09:51,  5.40s/it]

{'loss': 1.7702, 'learning_rate': 5.873898305084746e-05, 'epoch': 0.53}


  2%|▏         | 32/1475 [02:56<2:09:50,  5.40s/it]

{'loss': 1.7255, 'learning_rate': 5.869830508474576e-05, 'epoch': 0.54}


  2%|▏         | 33/1475 [03:01<2:10:08,  5.41s/it]

{'loss': 1.7311, 'learning_rate': 5.8657627118644074e-05, 'epoch': 0.56}


  2%|▏         | 34/1475 [03:07<2:09:49,  5.41s/it]

{'loss': 1.7232, 'learning_rate': 5.861694915254238e-05, 'epoch': 0.58}


  2%|▏         | 35/1475 [03:12<2:09:33,  5.40s/it]

{'loss': 1.6986, 'learning_rate': 5.8576271186440684e-05, 'epoch': 0.59}


  2%|▏         | 36/1475 [03:17<2:09:04,  5.38s/it]

{'loss': 1.6735, 'learning_rate': 5.853559322033899e-05, 'epoch': 0.61}


  3%|▎         | 37/1475 [03:23<2:09:02,  5.38s/it]

{'loss': 1.6152, 'learning_rate': 5.849491525423729e-05, 'epoch': 0.63}


KeyboardInterrupt: 

In [None]:
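# Optional: shut the Windows machine down once the cells above have finished.
# Currently disabled; uncomment the lines below to enable it.
# import subprocess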
# from typing import NoReturn

# def shutdown_windows() -> NoReturn:
#     subprocess.run(["shutdown", "/s", "/t", "0"])

# shutdown_windows()