In [1]:
!pip install evaluate

import pandas as pd
from collections import Counter
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import (
    CLIPProcessor,
    CLIPModel,
    CLIPConfig,
    CLIPTextConfig,
    CLIPVisionConfig
)
from transformers.modeling_outputs import SequenceClassifierOutput
from PIL import Image, ImageOps
from tqdm.notebook import tqdm
import evaluate
import ast
import sklearn
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [2]:
TRAIN_CSV = "/kaggle/input/final-version-dataset/train_data_1.csv"
VAL_CSV = "/kaggle/input/final-version-dataset/validation_data.csv"
TR_IMAGE_DIR = "/kaggle/input/final-version-dataset/Dataset/Dataset/Train/"
VAL_IMAGE_DIR = "/kaggle/input/final-version-dataset/Dataset/Dataset/Validation/"
TEACHER_PATH = "openai/clip-vit-large-patch14-336"
BATCH_SIZE = 16
NUM_CLASSES = 4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
train_df = pd.read_csv(TRAIN_CSV)
val_df = pd.read_csv(VAL_CSV)

print("Unique values for Train: " + str({**Counter(train_df.text)}))
print("Unique values for Validation: " + str({**Counter(val_df.text)}))

Unique values for Train: {'Other': 299, 'Belkart': 216, 'Mastercard Other': 201, 'Mastercard Belkart Other': 201, 'Mastercard': 213, 'Visa Mastercard Belkart': 200, 'Visa Mastercard Other': 226, 'Visa Other': 201, 'Visa': 213, 'Belkart Other': 201}
Unique values for Validation: {'Visa Mastercard Other': 101, 'Visa Mastercard': 100, 'Visa Mastercard Belkart': 200, 'Other': 202}


In [4]:
""""train_df = train_df[~((train_df.text == "Mastercard Other") | 
                      (train_df.text == "Visa Other") | 
                      (train_df.text == "Other") | 
                      (train_df.text == "Belkart Other"))]

train_df.reset_index(drop=True, inplace=True)
val_df.reset_index(drop=True, inplace=True)""""

In [4]:
train_df.loc[:, "labels"] = train_df.labels.apply(ast.literal_eval)
train_df.loc[:, "labels"] = train_df.labels.apply(np.float32)

val_df.loc[:, "labels"] = val_df.labels.apply(ast.literal_eval)
val_df.loc[:, "labels"] = val_df.labels.apply(np.float32)

In [12]:
class Loader(Dataset):
    def __init__(self, data_dir, df, processor):
        super().__init__()
        self.data_dir = data_dir
        self.df = df
        self.mean = np.append(np.array(processor.image_processor.image_mean),0.45)
        self.std = np.append(np.array(processor.image_processor.image_std),0.26)

    
    def __len__(self):
        return len(self.df)

    def image_preprocessor(self, image):
        image = image.resize((336,336))
        image = np.array(image) / 255
        image = image.transpose(2, 0, 1)
        image = (image - self.mean[:, None, None]) / self.std[:, None, None]
        return torch.tensor(image)

    def __getitem__(self, index):
        file_name = self.df["file_name"][index]
        labels = self.df["labels"][index]
        image = Image.open(self.data_dir + file_name).convert("RGBA")
        pixel_values = self.image_preprocessor(image)
            
        return [pixel_values.squeeze(0),
                torch.tensor(labels)] 

In [13]:
class_list = [
    "Visa",
    "Mastercard",
    "Belkart",
    "Other"
]

prompt_list = []
for item in class_list:
    prompt = "A photo of " + item
    prompt_list.append(prompt)
    
processor = CLIPProcessor.from_pretrained(TEACHER_PATH)

text = processor.tokenizer(prompt_list, padding="max_length",
                           truncation=True, return_tensors="pt").to(device)

train_dataset = Loader(data_dir=TR_IMAGE_DIR, df=train_df,
                       processor=processor)

val_dataset = Loader(data_dir=VAL_IMAGE_DIR, df=val_df,
                     processor=processor)

In [14]:
train_dataloader = DataLoader(train_dataset, 
                              batch_size=BATCH_SIZE,
                              shuffle=True)

eval_dataloader = DataLoader(val_dataset, 
                             batch_size=BATCH_SIZE)

In [100]:
""""for param in CLIP.named_parameters():
    if (param[0] == 'visual_projection.weight' or
        param[0] == 'text_projection.weight' or
        param[0] == "logit_scale"):
        continue
    else:
        param[1].requires_grad=True""""

In [15]:
def normalize(vector, e=1e-08):
    return (vector - vector.mean())/(vector.std() + e)

class CustomModel(nn.Module):
    def __init__(self, model, num_labels):
        super(CustomModel,self).__init__()
        self.num_labels = num_labels
        self.model = model
        self.dense = nn.Linear(4,512)
        self.classifier = nn.Linear(512,4)
        self.normalize = normalize
        self.activation = nn.Sigmoid()
    

    def forward(self, input_ids, pixel_values, attention_mask, labels=None):

        outputs = self.model(pixel_values=pixel_values,
                             input_ids=input_ids,
                             attention_mask=attention_mask)

        logits_per_image = outputs.logits_per_image
        #normalized_logits = self.normalize(logits_per_image)
        flattened_logits = torch.flatten(logits_per_image)
        pred_logits = self.dense(flattened_logits)
        logits = self.activation(self.classifier(pred_logits))
        
        loss=None
        if labels is not None:
            criterion = nn.BCELoss()
            loss = criterion(logits.view(-1, self.num_labels), labels.view(-1, self.num_labels))

    
        return SequenceClassifierOutput(loss=loss, logits=logits)

vision_config = CLIPVisionConfig(num_channels=4, image_size=336, patch_size=14)
configuration = CLIPConfig.from_text_vision_configs(CLIPTextConfig(), vision_config)
CLIP = CLIPModel(configuration)
model = CustomModel(CLIP, NUM_CLASSES).to(device)

In [16]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
#optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.999)
f1 = evaluate.load("f1", trust_remote_code=True)

In [17]:
#iter = []
preds = []

for epoch in range(5):
    total_loss = 0.0
    model.train()
    for images,labels in tqdm(train_dataloader):
        batch_loss = 0.0
        for image, label in zip(images,labels):
            image = image.to(device).unsqueeze(0)
            label = label.to(device).unsqueeze(0)
            
            outputs = model(pixel_values=image,
                           input_ids=text["input_ids"],
                           attention_mask=text["attention_mask"],
                           labels=label)
        
            loss = outputs.loss
            batch_loss+=loss
            
        batch_loss.backward()
        #plot_grad_flow(CLIP.named_parameters())
        #iter.append(ave_grads)
        optimizer.step()
        optimizer.zero_grad()
        total_loss += batch_loss.item()

    model.eval()
    for images,labels in tqdm(eval_dataloader):
        for image,label in zip(images,labels):
            image = image.to(device).unsqueeze(0)
            label = label.to(device)

            outputs = model(pixel_values=image,
                            input_ids=text["input_ids"],
                            attention_mask=text["attention_mask"])

            activation = outputs.logits
            predictions = (activation >= 0.5).float()
            preds.append(predictions)
            f1.add_batch(predictions=predictions,
                         references=label)

        
    print("Training Loss: " + str(total_loss / len(train_dataloader)))
    print(f1.compute(average='binary'))

  0%|          | 0/136 [00:00<?, ?it/s]

  0%|          | 0/38 [00:00<?, ?it/s]

Training Loss: 10.37223939334645
{'f1': 0.7138286526839673}


  0%|          | 0/136 [00:00<?, ?it/s]

  0%|          | 0/38 [00:00<?, ?it/s]

Training Loss: 8.650934892542223
{'f1': 0.5607327757865394}


  0%|          | 0/136 [00:00<?, ?it/s]

  0%|          | 0/38 [00:00<?, ?it/s]

Training Loss: 8.46806132442811
{'f1': 0.5842323651452282}


  0%|          | 0/136 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [11]:
#pred = torch.cat(preds, dim=-1)
pred = torch.stack(preds)
true = torch.tensor(val_df["labels"], dtype=torch.float32)

print(sklearn.metrics.classification_report(
    true.cpu().numpy(),
    pred[-true.shape[0]:].cpu().numpy(),
    target_names=["Mastercard",
                  "Visa",
                  "Белкарт",
                  "Иные"]
))

              precision    recall  f1-score   support

  Mastercard       1.00      0.25      0.40       401
        Visa       0.00      0.00      0.00       401
     Белкарт       0.00      0.00      0.00       200
        Иные       0.40      0.67      0.50       303

   micro avg       0.33      0.23      0.27      1305
   macro avg       0.35      0.23      0.23      1305
weighted avg       0.40      0.23      0.24      1305
 samples avg       0.25      0.39      0.29      1305



  true = torch.tensor(val_df["labels"], dtype=torch.float32)


In [77]:
image = train_dataset[830][0].unsqueeze(0).to(device)
text = processor.tokenizer(text=["a photo of Visa","a photo of Mastercard", "a photo of Belkart",
                                 "a photo of Other"],
                          return_tensors="pt",
                          padding=True).to(device)

outputs = CLIP(pixel_values=image, input_ids=text["input_ids"])
logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=-1)
print(torch.round(probs, decimals=3))

tensor([[0.1450, 0.1830, 0.2330, 0.4390]], device='cuda:0',
       grad_fn=<RoundBackward1>)


In [73]:
train_df[train_df.text == "Other"].index

Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
       ...
       919, 920, 921, 922, 923, 924, 925, 926, 927, 928],
      dtype='int64', length=299)

In [71]:
val_df[val_df.text == "Visa Mastercard Other"].index

Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
       ...
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100],
      dtype='int64', length=101)

In [78]:
image = val_dataset[95][0].unsqueeze(0).to(device)
# for reference: Mastercard, Visa, Belkart, Other

outputs = model(pixel_values=image, 
                input_ids=text["input_ids"],
                attention_mask=text["attention_mask"])
preds = outputs.logits
print(torch.round(preds, decimals=3))

tensor([0.9770, 0.1490, 0.9130, 0.8120], device='cuda:0',
       grad_fn=<RoundBackward1>)


In [102]:
"""def plot_grad_flow(named_parameters):
    global ave_grads
    ave_grads = []
    global layers
    layers = []
    for n, p in named_parameters:
        if(p.requires_grad) and ("bias" not in n):
            layers.append(n)
            ave_grads.append(p.grad.abs().mean().cpu())
    plt.plot(ave_grads, alpha=0.3, color="b")
    plt.hlines(0, 0, len(ave_grads)+1, linewidth=1, color="k" )
    plt.xlim(xmin=0, xmax=len(ave_grads))
    plt.xlabel("Layers")
    plt.ylabel("average gradient")
    plt.title("Gradient flow")
    plt.grid(True)"""