In [1]:
import os
from pathlib import Path

import pandas as pd
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms,models
import torch.nn.functional as F
import pretrainedmodels
from torch.utils.data import Dataset, DataLoader

import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
from GLC.data_loading.environmental_raster import PatchExtractor


In [3]:
from dataset import GeoLifeCLEF2022Dataset
from transform import get_train_transforms,get_valid_transforms,load_patch

In [4]:

DATA_PATH = DATA_PATH = Path("./datasets")
''' raster data
extractor_bio = PatchExtractor(DATA_PATH / "rasters", size=256)
extractor_bio.add_all_bioclimatic_rasters()
extractor_bio.append('sndppt')
'''

' raster data\nextractor_bio = PatchExtractor(DATA_PATH / "rasters", size=256)\nextractor_bio.add_all_bioclimatic_rasters()\nextractor_bio.append(\'sndppt\')\n'

In [5]:
dataset = GeoLifeCLEF2022Dataset(DATA_PATH,subset = "train", 
                                 region = 'both', 
                                 patch_data = "rgb", \
                                 use_rasters = False,\
                                 transform = get_train_transforms())#,\
                                 #transform = None)#,\
                                 #patch_extractor = extractor_bio )

In [6]:
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

batch_size = 16

In [7]:
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
train_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=32,shuffle = True,drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=32,shuffle = False,drop_last=True)

In [8]:
N_classes = 17036
k=30

In [9]:
from torchvision.models.resnet import ResNet, BasicBlock

import torch.nn as nn

class ResNetGeolife(ResNet):
    def __init__(self):
        super().__init__(BasicBlock, [3, 4, 6, 3], num_classes=N_classes)

        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=1, padding=3, bias=False)

        
# model = ResNetGeolife().to(device)

In [10]:
model = models.resnet50(pretrained = True)
model.fc = nn.Linear(2048,N_classes)
#print(model)

In [11]:
optimizer = torch.optim.Adam(model.parameters(),lr = 0.001)



def loss_fn(preds, labels,**kwargs):
    #print (preds)
    #print(labels)
    loss = nn.CrossEntropyLoss(**kwargs)(preds, labels)
    #loss = nn.BCEWithLogitsLoss()
    return loss
    


In [12]:
from smooth import topk
from smooth.topk.svm import SmoothTopkSVM

loss_function = SmoothTopkSVM(N_classes)
loss_function = loss_function.cuda()

def loss_fn(pred,labels):
    loss = loss_function(pred,labels)
    return loss

Setting tau to 1.0


In [13]:
from tqdm.notebook import tqdm
def train(model,optim,train_loader,val_loader,epochs = 2,device='cpu', patience=5):
    last_acc = None
    trigger_time = 0
    for epoch in range(epochs):
        training_loss = 0.0
        val_loss = 0.0
        model.train()
        
        num_correct = 0
        num_examples = 0
        for idx,(inputs,target) in enumerate(train_loader):
            optimizer.zero_grad()
            inputs = inputs.float()
            
            # normalize?
            inputs_m, inputs_s = inputs.mean(), inputs.std()
            inputs = (inputs - inputs_m) / inputs_s
            
            
            inputs = inputs.to(device)
            target = target.to(device)
            

            
            output = model(inputs)
            
            # output = output.to('cpu')
            

            
            loss = loss_fn(output,target)#,ignore_index = -1)
            
            
            
            loss.backward()
            optimizer.step()
            training_loss += loss.item()
            
            if idx % 50 == 0:
                print(f"Epoch: {epoch}, {idx}/{len(train_loader.dataset)}")
        
        training_loss /= len(train_loader.dataset)
        
        model.eval()
        num_correct = 0
        num_examples = 0
        
        for idx,(inputs,target) in enumerate(val_loader):
            inputs = inputs.float()
            
            # normalize?
            inputs_m, inputs_s = inputs.mean(), inputs.std()
            inputs = (inputs - inputs_m) / inputs_s
            
            inputs = inputs.to(device)
            target = target.to(device)
            
            output = model(inputs)
            
            loss = loss_fn(output,target)#,ignore_index = -1)
            
            val_loss += loss.item()
            
            # sus
            # It should have done in the top k fashion as it is validating.
            # But it looks like it is finding the top element
            # FIXED
            
            _,pred = torch.topk(output,k)
            correct = torch.eq(target[:, None, ...], pred).any(dim=1)
           
            num_correct += correct.sum()
            
            num_examples += correct.shape[0]
        
        val_loss /= len(val_loader.dataset)
        
        
        
        try:
            x =  num_correct/num_examples
            if last_acc != None:
                if last_acc > x:
                    trigger_time += 1
                    print(f"Triggered! {trigger_time}")
                else:
                    trigger_time = 0
            
            last_acc = x
        except ZeroDivisionError:
            x = 0
        print('Epoch: {}, Training Loss: {:.2f}, Validation Loss: {:.2f}, '
              'accuracy = {:.2f}'.format(epoch+1, training_loss, val_loss, x))
        if trigger_time >= patience:
            print("Early stopping.")
            return


In [None]:
train(model.to(device), optimizer, train_loader, val_loader, epochs=100, device=device)

Epoch: 0, 0/78056
Epoch: 0, 50/78056
Epoch: 0, 100/78056
Epoch: 0, 150/78056
Epoch: 0, 200/78056
Epoch: 0, 250/78056
Epoch: 0, 300/78056
Epoch: 0, 350/78056
Epoch: 0, 400/78056
Epoch: 0, 450/78056
Epoch: 0, 500/78056
Epoch: 0, 550/78056
Epoch: 0, 600/78056
Epoch: 0, 650/78056
Epoch: 0, 700/78056
Epoch: 0, 750/78056
Epoch: 0, 800/78056
Epoch: 0, 850/78056
Epoch: 0, 900/78056
Epoch: 0, 950/78056
Epoch: 0, 1000/78056
Epoch: 0, 1050/78056
Epoch: 0, 1100/78056
Epoch: 0, 1150/78056
Epoch: 0, 1200/78056
Epoch: 0, 1250/78056
Epoch: 0, 1300/78056
Epoch: 0, 1350/78056
Epoch: 0, 1400/78056
Epoch: 0, 1450/78056
Epoch: 0, 1500/78056
Epoch: 0, 1550/78056
Epoch: 0, 1600/78056
Epoch: 0, 1650/78056
Epoch: 0, 1700/78056
Epoch: 0, 1750/78056
Epoch: 0, 1800/78056
Epoch: 0, 1850/78056
Epoch: 0, 1900/78056
Epoch: 0, 1950/78056
Epoch: 0, 2000/78056
Epoch: 0, 2050/78056
Epoch: 0, 2100/78056
Epoch: 0, 2150/78056
Epoch: 0, 2200/78056
Epoch: 0, 2250/78056
Epoch: 0, 2300/78056
Epoch: 0, 2350/78056
Epoch: 0, 2400/

In [None]:
PATH = './torchvision_resnet50.bin'
torch.save(model.state_dict(), PATH)

In [10]:
model = ResNetGeolife()
model.load_state_dict(torch.load('./torchvision_resnet50.bin'))

<All keys matched successfully>

In [11]:
def validate(model,val_loader):
    model.eval()
    num_correct = 0
    num_examples = 0
    
    for inputs,target in val_loader:
        inputs = inputs.float()
            
        # normalize?

        inputs = inputs.to(device)
        target = target.to(device)
        
        inputs_m, inputs_s = inputs.mean(), inputs.std()
        inputs = (inputs - inputs_m) / inputs_s

        output = model(inputs)
        # sus
        # It should have done in the top k fashion as it is validating.
        # But it looks like it is finding the top element

        _,pred = torch.topk(output,k)
        # print(pred,target)
        correct = torch.eq(target[:, None, ...], pred).any(dim=1)
        
        num_correct += correct.sum().item()

        num_examples += correct.shape[0]
    print('accuracy: {:.2f}'.format(num_correct/num_examples))

In [12]:
validate(model.to(device),val_loader)

accuracy: 0.10


NameError: name 'gc' is not defined

In [28]:
test_dataset = GeoLifeCLEF2022Dataset(DATA_PATH,subset = "test", 
                                 region = 'both', 
                                 patch_data = 'landcover', \
                                 use_rasters = True,\
                                 transform = None,\
                                 patch_extractor = extractor_bio
                                 )

In [29]:
test_loader = DataLoader(test_dataset, batch_size=1, shuffle = False)

In [None]:
def test_inference (model, dl):
    correct_prediction = 0
    total_prediction = 0
    PREDS = []
    #LABELS = []
    model.eval()
    # Disable gradient updates
    with torch.no_grad():
        for data in tqdm(iter(dl)):
            # Get the input features , and put them on the GPU
            inputs = data[0]
            inputs = inputs.float()
            # Normalize the inputs
            #inputs_m, inputs_s = inputs.mean(), inputs.std()
            #inputs = (inputs - inputs_m) / inputs_s
            #print (inputs.shape)
            # inputs = np.repeat(inputs[..., np.newaxis], 3, -1)
            inputs = inputs.unsqueeze(0)
            #print (inputs.shape)
            #if inputs.size(1) > 3:
            # inputs = inputs.permute(0,3, 1,2)

            inputs = inputs.to(device)
            # Get predictions
            outputs = model(inputs)

            # Get the predicted class with the highest score
            _, prediction = torch.max(outputs,1)
            PREDS.append(prediction.view(-1).cpu().detach().numpy())



    PREDS = np.concatenate(PREDS)
   # LABELS = np.concatenate(LABELS)

#     preds_df = pd.DataFrame({'song_id':LABELS, 'genre_id':PREDS})
    return (PREDS)