In [3]:
!pip install pycocotools

Collecting pycocotools
  Downloading pycocotools-2.0.6.tar.gz (24 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: pycocotools
  Building wheel for pycocotools (pyproject.toml) ... [?25ldone
[?25h  Created wheel for pycocotools: filename=pycocotools-2.0.6-cp39-cp39-linux_x86_64.whl size=400023 sha256=b02e1b122807ff667a354742f079b8571aa35f2c9585a5935c1eac66cda162d4
  Stored in directory: /root/.cache/pip/wheels/29/98/97/6c7dca1f8e4c854e15a2676ac98ae3f46ec83ee031d827a5c8
Successfully built pycocotools
Installing collected packages: pycocotools
Successfully installed pycocotools-2.0.6
[0m

In [1]:
import torch
import torchvision
import LocoFasterRCNN

# Pick the compute device first: the GPU when CUDA is usable, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the detector through the project's LocoFasterRCNN wrapper.
faster_rcnn = LocoFasterRCNN.LocoFasterRCNN()
model = faster_rcnn.get_model()

# Move the model onto the chosen device. No state_dict is loaded here.
# Leaving this call as the cell's last expression makes the notebook echo the
# module structure.
model.to(device)

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
       

In [5]:
import LocoUtils

# Batch sizes. The training batch size is combined with gradient accumulation
# in the training cell; the validation batch size used to be a bare literal in
# the get_val call and is named here so it can be tuned in one place.
batch_size = 4
val_batch_size = 16

# Dataset location on disk.
base_img_folder = '/notebooks/dataset/loco_new/images'
annotations_folder = '/notebooks/dataset/loco_new/annotations'

# Build the train/validation loaders through the project's LocoUtils helper.
loader = LocoUtils.LocoDatasetLoader(base_img_folder, annotations_folder)
train_data_loader = loader.get_train(batch_size=batch_size)
val_data_loader = loader.get_val(batch_size=val_batch_size)

loading annotations into memory...
Done (t=1.81s)
creating index...
index created!
loading annotations into memory...
Done (t=0.12s)
creating index...
index created!
loading annotations into memory...
Done (t=0.93s)
creating index...
index created!
loading annotations into memory...
Done (t=0.13s)
creating index...
index created!
loading annotations into memory...
Done (t=0.18s)
creating index...
index created!
loading annotations into memory...
Done (t=0.08s)
creating index...
index created!


In [6]:
from torch.cuda.amp import GradScaler, autocast
from tqdm import tqdm
import time

import os

# Directory that receives one checkpoint per epoch (written by train()).
model_folder = "/notebooks/models/fasterRCNN5/"
# Create it up front: torch.save does not create missing parent directories,
# so without this a whole epoch could be trained and then lost on save.
os.makedirs(model_folder, exist_ok=True)

# Gradient scaler used for the mixed-precision backward pass / optimizer step.
scaler = GradScaler()
learning_rate = 0.001

# Optimise only the parameters that require gradients.
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=learning_rate, momentum=0.9, weight_decay=0.0005)
# StepLR multiplies the learning rate by gamma every step_size scheduler steps.
# NOTE(review): train() steps the scheduler once per epoch, so with
# step_size=30 a run shorter than 30 epochs never decays the LR - confirm
# this is intended.
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
    
def validate(val_model, val_data):
    """Return the mean per-batch loss of ``val_model`` over ``val_data``.

    The model is called as ``val_model(images, targets)`` and must return a
    dict of loss terms. torchvision detection models only do that in training
    mode (in eval mode they return predictions), so the model is switched to
    training mode for the duration of the call and its previous mode is
    restored afterwards. Gradients are disabled, so no weights are updated.

    Args:
        val_model: The detection model (an ``nn.Module``) to evaluate.
        val_data: Sized iterable yielding ``(images, targets)`` batches, where
            ``targets`` is a sequence of dicts whose values support ``.to()``.

    Returns:
        The sum of all loss terms, averaged over the number of batches, in
        whatever type the model's losses have (presumably a scalar tensor).

    Raises:
        ZeroDivisionError: If ``val_data`` contains no batches.
    """
    was_training = val_model.training
    # NOTE(review): in training mode, layers such as BatchNorm with running
    # statistics still update them from validation batches even under
    # no_grad - confirm this is acceptable for this model.
    val_model.train()
    val_loss = 0
    try:
        with torch.no_grad():
            for images, targets in val_data:
                val_imgs = [img.to(device) for img in images]
                targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
                with autocast():
                    val_loss_dict = val_model(val_imgs, targets)
                    val_losses = sum(loss for loss in val_loss_dict.values())
                val_loss += val_losses
    finally:
        # Leave the model in the mode the caller had it in.
        val_model.train(was_training)
    val_loss /= len(val_data)
    return val_loss


def train(train_model, train_data, epoch, accu_steps):
    """Train for one epoch with mixed precision and gradient accumulation.

    After the epoch the model is validated on the notebook-level
    ``val_data_loader``, a checkpoint of ``train_model`` is written into
    ``model_folder``, and the notebook-level ``lr_scheduler`` is stepped once.
    The notebook-level ``optimizer``, ``scaler`` and ``device`` are used
    throughout.

    Args:
        train_model: The detection model to train; called as
            ``train_model(images, targets)`` and expected to return a dict of
            loss tensors.
        train_data: Sized iterable yielding ``(images, targets)`` batches.
        epoch: Epoch index, used only for logging and the checkpoint name.
        accu_steps: Number of batches whose gradients are accumulated before
            each optimizer step.

    Returns:
        The validation loss returned by ``validate`` for this epoch.
    """
    train_model.train()
    epoch_loss = 0.0
    optimizer.zero_grad()  # start the epoch with clean gradients

    total_batches = len(train_data)
    # Batch indices (roughly 25% / 50% / 75% of the epoch) at which the
    # running average loss is printed.
    progress_marks = {
        total_batches // 4,
        total_batches // 2,
        (3 * total_batches) // 4,
    }

    for batch, (imgs, annotations) in enumerate(train_data):
        # Move the batch to the training device.
        imgs = [img.to(device) for img in imgs]
        annotations = [{k: v.to(device) for k, v in t.items()} for t in annotations]

        # Forward pass under automatic mixed precision; the model returns a
        # dict of loss terms which are summed into a single scalar.
        with autocast():
            loss_dict = train_model(imgs, annotations)
            losses = sum(loss for loss in loss_dict.values())

        # Scale the loss down so the accumulated gradient matches the mean
        # over accu_steps batches, then backpropagate through the scaler.
        normalized_loss = losses / accu_steps
        scaler.scale(normalized_loss).backward()

        # Apply the accumulated gradients every accu_steps batches.
        if (batch + 1) % accu_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        # Detach before accumulating: adding the raw loss would keep every
        # iteration's autograd history alive for the whole epoch.
        epoch_loss += losses.detach()

        if batch in progress_marks:
            avg_loss = epoch_loss / (batch + 1)
            print(f"Epoch: {epoch} Iteration: {batch}/{total_batches}, Avg Train Loss: {avg_loss}")

    # Flush gradients left over when the epoch length is not a multiple of
    # accu_steps.
    # NOTE(review): these leftover gradients were still divided by accu_steps
    # although fewer batches contributed - confirm this is acceptable.
    if (total_batches % accu_steps) != 0:
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

    # Average epoch loss, then validate and checkpoint.
    epoch_loss /= total_batches
    validation_loss = validate(train_model, val_data_loader)
    print(f"End of Epoch {epoch}, Avg Train Loss: {epoch_loss}, Avg Val Loss: {validation_loss}")
    # Save the model that was actually trained (the argument), not whatever
    # the notebook-level `model` happens to be bound to.
    torch.save(train_model.state_dict(), f"{model_folder}fasterRCNN_valloss_{validation_loss:.5f}_t_{round(time.time())}_epoch{epoch}.pth")

    lr_scheduler.step()

    return validation_loss

In [None]:
accumulation_steps = 4
num_epochs = 20

# Gradients from accumulation_steps batches are combined before each optimizer
# step, so the effective batch size is their product.
print(f"Effective Batch Size: {batch_size * accumulation_steps}")

# Start from +inf so that any finite validation loss counts as an improvement
# (a fixed large number would silently be reported as "best" if every epoch's
# loss exceeded it).
best_validation = float("inf")
best_epoch = 0

for epoch in tqdm(range(num_epochs)):
    # train() returns the epoch's validation loss; float() turns a scalar
    # tensor into a plain number so the summary below prints cleanly.
    val = float(train(model, train_data_loader, epoch, accumulation_steps))
    if val < best_validation:
        best_validation = val
        best_epoch = epoch

print(best_epoch)
print(best_validation)

Effectiv Batch Size: 16


  0%|          | 0/20 [00:00<?, ?it/s]

Epoch: 0 Iteration: 176/705, Avg Train Loss: 1.6924943923950195
Epoch: 0 Iteration: 352/705, Avg Train Loss: 1.5123544931411743
Epoch: 0 Iteration: 528/705, Avg Train Loss: 1.4282758235931396
End of Epoch 0, Avg Train Loss: 1.3873661756515503, Avg Val Loss: 1.0961534976959229


  5%|▌         | 1/20 [03:22<1:04:09, 202.61s/it]

Epoch: 1 Iteration: 176/705, Avg Train Loss: 1.1801424026489258
Epoch: 1 Iteration: 352/705, Avg Train Loss: 1.1690423488616943
Epoch: 1 Iteration: 528/705, Avg Train Loss: 1.163226842880249
End of Epoch 1, Avg Train Loss: 1.1575835943222046, Avg Val Loss: 0.9902628660202026


 10%|█         | 2/20 [06:43<1:00:27, 201.51s/it]

Epoch: 2 Iteration: 176/705, Avg Train Loss: 1.0942485332489014
Epoch: 2 Iteration: 352/705, Avg Train Loss: 1.0910834074020386
Epoch: 2 Iteration: 528/705, Avg Train Loss: 1.089572548866272
End of Epoch 2, Avg Train Loss: 1.0835442543029785, Avg Val Loss: 0.9505996704101562


 15%|█▌        | 3/20 [10:03<56:57, 201.03s/it]  

Epoch: 3 Iteration: 176/705, Avg Train Loss: 1.0660642385482788
Epoch: 3 Iteration: 352/705, Avg Train Loss: 1.037660002708435
Epoch: 3 Iteration: 528/705, Avg Train Loss: 1.0393742322921753
End of Epoch 3, Avg Train Loss: 1.030774712562561, Avg Val Loss: 0.9223023653030396


 20%|██        | 4/20 [13:24<53:37, 201.08s/it]

Epoch: 4 Iteration: 176/705, Avg Train Loss: 0.988882839679718
Epoch: 4 Iteration: 352/705, Avg Train Loss: 0.9942277669906616
Epoch: 4 Iteration: 528/705, Avg Train Loss: 0.986353874206543
End of Epoch 4, Avg Train Loss: 0.9894009828567505, Avg Val Loss: 0.9119749069213867


 25%|██▌       | 5/20 [16:46<50:16, 201.08s/it]

Epoch: 5 Iteration: 176/705, Avg Train Loss: 0.9650481343269348
Epoch: 5 Iteration: 352/705, Avg Train Loss: 0.9720538258552551
Epoch: 5 Iteration: 528/705, Avg Train Loss: 0.9621459245681763
End of Epoch 5, Avg Train Loss: 0.9602461457252502, Avg Val Loss: 0.8940663933753967


 30%|███       | 6/20 [20:07<46:54, 201.04s/it]

Epoch: 6 Iteration: 176/705, Avg Train Loss: 0.9606383442878723
Epoch: 6 Iteration: 352/705, Avg Train Loss: 0.9460082650184631
Epoch: 6 Iteration: 528/705, Avg Train Loss: 0.935067355632782
End of Epoch 6, Avg Train Loss: 0.9298205375671387, Avg Val Loss: 0.8953602313995361


 35%|███▌      | 7/20 [23:27<43:32, 200.95s/it]

Epoch: 7 Iteration: 176/705, Avg Train Loss: 0.8937671780586243
Epoch: 7 Iteration: 352/705, Avg Train Loss: 0.9031398892402649
Epoch: 7 Iteration: 528/705, Avg Train Loss: 0.9078530669212341
End of Epoch 7, Avg Train Loss: 0.9029924869537354, Avg Val Loss: 0.8951126337051392


 40%|████      | 8/20 [26:48<40:10, 200.91s/it]

Epoch: 8 Iteration: 176/705, Avg Train Loss: 0.8975557088851929
Epoch: 8 Iteration: 352/705, Avg Train Loss: 0.8819948434829712
Epoch: 8 Iteration: 528/705, Avg Train Loss: 0.8778542876243591
End of Epoch 8, Avg Train Loss: 0.8822071552276611, Avg Val Loss: 0.8915929794311523


 45%|████▌     | 9/20 [30:09<36:50, 200.95s/it]

Epoch: 9 Iteration: 176/705, Avg Train Loss: 0.8790626525878906
Epoch: 9 Iteration: 352/705, Avg Train Loss: 0.8701871037483215
Epoch: 9 Iteration: 528/705, Avg Train Loss: 0.8591680526733398
End of Epoch 9, Avg Train Loss: 0.8616758584976196, Avg Val Loss: 0.8960938453674316


 50%|█████     | 10/20 [33:30<33:30, 201.05s/it]

Epoch: 10 Iteration: 176/705, Avg Train Loss: 0.8309755325317383
Epoch: 10 Iteration: 352/705, Avg Train Loss: 0.846805989742279
Epoch: 10 Iteration: 528/705, Avg Train Loss: 0.8410441279411316
End of Epoch 10, Avg Train Loss: 0.8407310843467712, Avg Val Loss: 0.9018093347549438


 55%|█████▌    | 11/20 [36:52<30:09, 201.08s/it]

Epoch: 11 Iteration: 176/705, Avg Train Loss: 0.8197072148323059
Epoch: 11 Iteration: 352/705, Avg Train Loss: 0.8216326236724854
Epoch: 11 Iteration: 528/705, Avg Train Loss: 0.8198654055595398
End of Epoch 11, Avg Train Loss: 0.8207520246505737, Avg Val Loss: 0.8976460099220276


 60%|██████    | 12/20 [40:13<26:48, 201.07s/it]

Epoch: 12 Iteration: 176/705, Avg Train Loss: 0.7865601181983948
Epoch: 12 Iteration: 352/705, Avg Train Loss: 0.8013361692428589
Epoch: 12 Iteration: 528/705, Avg Train Loss: 0.8081340193748474
End of Epoch 12, Avg Train Loss: 0.8024759292602539, Avg Val Loss: 0.9105001091957092


 65%|██████▌   | 13/20 [43:34<23:27, 201.09s/it]

Epoch: 13 Iteration: 176/705, Avg Train Loss: 0.7864357233047485
Epoch: 13 Iteration: 352/705, Avg Train Loss: 0.7762001752853394
Epoch: 13 Iteration: 528/705, Avg Train Loss: 0.7839877009391785
End of Epoch 13, Avg Train Loss: 0.7862030267715454, Avg Val Loss: 0.9178214073181152


 70%|███████   | 14/20 [46:55<20:06, 201.04s/it]

Epoch: 14 Iteration: 176/705, Avg Train Loss: 0.7552698850631714
Epoch: 14 Iteration: 352/705, Avg Train Loss: 0.7719546556472778
Epoch: 14 Iteration: 528/705, Avg Train Loss: 0.7740982174873352
End of Epoch 14, Avg Train Loss: 0.7704252004623413, Avg Val Loss: 0.9284406900405884


 75%|███████▌  | 15/20 [50:16<16:44, 201.00s/it]

Epoch: 15 Iteration: 176/705, Avg Train Loss: 0.7240247130393982
Epoch: 15 Iteration: 352/705, Avg Train Loss: 0.7463788390159607
Epoch: 15 Iteration: 528/705, Avg Train Loss: 0.7494082450866699
End of Epoch 15, Avg Train Loss: 0.7528107762336731, Avg Val Loss: 0.9444001317024231


 80%|████████  | 16/20 [53:36<13:23, 200.87s/it]

Epoch: 16 Iteration: 176/705, Avg Train Loss: 0.7418927550315857
Epoch: 16 Iteration: 352/705, Avg Train Loss: 0.7361294627189636
Epoch: 16 Iteration: 528/705, Avg Train Loss: 0.739048182964325
End of Epoch 16, Avg Train Loss: 0.7389781475067139, Avg Val Loss: 0.9621924161911011


 85%|████████▌ | 17/20 [56:58<10:03, 201.14s/it]