In [1]:
!pip install evaluate

import pandas as pd
from collections import Counter
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import (
    CLIPProcessor,
    CLIPModel
)
from transformers.modeling_outputs import SequenceClassifierOutput
from PIL import Image
from tqdm import tqdm
import evaluate
import ast
import sklearn
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [11]:
TRAIN_CSV = "/kaggle/input/simplified-task/train_data.csv"
VAL_CSV = "/kaggle/input/simplified-task/validation_data.csv"
TR_IMAGE_DIR = "/kaggle/input/simplified-task/Dataset/Dataset/Dataset/Train/"
VAL_IMAGE_DIR = "/kaggle/input/simplified-task/Dataset/Dataset/Dataset/Validation/"
TEACHER_PATH = "openai/clip-vit-base-patch32"
BATCH_SIZE = 1
NUM_CLASSES = 4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
train_df = pd.read_csv(TRAIN_CSV)
val_df = pd.read_csv(VAL_CSV)

print("Unique values for Train: " + str({**Counter(train_df.text)}))
print("Unique values for Validation: " + str({**Counter(val_df.text)}))

Unique values for Train: {'Belkart': 216, 'Mastercard': 213, 'Mir': 202, 'Visa Mastercard Mir': 226, 'Visa': 213, 'Visa Mastercard Belkart': 200, 'Mastercard Belkart Mir': 201, 'Visa Mir': 201, 'Accept': 190, 'Belkart Password': 201, 'ID-Check Belkart Password': 201, 'ID-Check Belkart': 200, 'ID-Check Mastercard': 201, 'ID-Check': 198, 'Mir Accept': 201, 'Password': 198, 'Secure ID-Check': 201, 'Secure': 202, 'Visa Secure': 201, 'Other': 201}
Unique values for Validation: {'Visa Mastercard Mir': 101, 'Visa Mastercard': 100, 'Visa Mastercard Belkart': 200, 'ID-Check Accept Secure Password': 101, 'ID-Check Secure Password': 101}


In [4]:
train_df = train_df[(train_df.text == "Visa Mir") | 
                    (train_df.text == "Visa") | 
                    (train_df.text == "Other") |
                    (train_df.text == "Mir") | 
                    (train_df.text == "Mastercard") |
                    (train_df.text == "Visa Mastercard Mir")]

val_df = val_df[(val_df.text == "Visa Mastercard Mir") |
                (val_df.text == "Visa Mastercard") |
                (val_df.text == "Visa Mastercard Belkart")]

train_df.reset_index(drop=True, inplace=True)
val_df.reset_index(drop=True, inplace=True)

In [5]:
train_df.loc[:, "labels"] = train_df.labels.apply(ast.literal_eval)
train_df.loc[:, "labels"] = train_df.labels.apply(np.float32)

val_df.loc[:, "labels"] = val_df.labels.apply(ast.literal_eval)
val_df.loc[:, "labels"] = val_df.labels.apply(np.float32)

In [6]:
class Loader(Dataset):
    def __init__(self, data_dir, df, processor):
        super().__init__()
        self.data_dir = data_dir
        self.df = df
        self.processor = processor

    
    def __len__(self):
        return len(self.df)


    def __getitem__(self, index):
        file_name = self.df["file_name"][index]
        labels = self.df["labels"][index]
        image = Image.open(self.data_dir + file_name).convert("RGBA")
        pixel_values = self.processor.image_processor(image, return_tensors="pt").pixel_values
        
        return {"pixel_values": pixel_values.squeeze(),
                "labels": torch.tensor(labels)}

In [8]:
class_list = [
    "Visa",
    "Mastercard",
    "Mir",
    "Other"
]

prompt_list = []
for item in class_list:
    prompt = "A photo of " + item
    prompt_list.append(prompt)
    
processor = CLIPProcessor.from_pretrained(TEACHER_PATH)

text = processor.tokenizer(prompt_list, padding="max_length",
                           truncation=True, return_tensors="pt").to(device)

train_dataset = Loader(data_dir=TR_IMAGE_DIR, df=train_df,
                       processor=processor)

val_dataset = Loader(data_dir=VAL_IMAGE_DIR, df=val_df,
                     processor=processor)

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

In [9]:
train_dataloader = DataLoader(train_dataset, 
                              batch_size=BATCH_SIZE,
                              drop_last=True,
                              shuffle=True)

eval_dataloader = DataLoader(val_dataset, 
                             batch_size=BATCH_SIZE,
                             drop_last=True)

In [100]:
""""for param in CLIP.named_parameters():
    if (param[0] == 'visual_projection.weight' or
        param[0] == 'text_projection.weight' or
        param[0] == "logit_scale"):
        continue
    else:
        param[1].requires_grad=True""""

In [48]:
def normalize(vector, e=1e-08):
    return (vector - vector.mean())/(vector.std() + e)

class CustomModel(nn.Module):
    def __init__(self, model, num_labels):
        super(CustomModel,self).__init__()
        self.num_labels = num_labels
        self.model = model
        self.classifier = nn.Linear(4,self.num_labels)
        self.normalize = normalize
        self.activation = nn.Sigmoid()
    

    def forward(self, input_ids, pixel_values, attention_mask, labels=None):

        with torch.no_grad():
            outputs = self.model(pixel_values=pixel_values,
                                 input_ids=input_ids,
                                 attention_mask=attention_mask)

        logits_per_image = outputs.logits_per_image
        flattened_logits = torch.flatten(logits_per_image)
        #normalized_logits = self.normalize(flattened_logits)
        logits = self.activation(self.classifier(flattened_logits))
        
        loss=None
        if labels is not None:
            criterion = nn.BCELoss()
            loss = criterion(logits.view(-1, self.num_labels), labels.view(-1, self.num_labels))

    
        return SequenceClassifierOutput(loss=loss, logits=logits)


stats = torch.load("/kaggle/input/clip_state/pytorch/default/1/state.pt")
CLIP = CLIPModel.from_pretrained(TEACHER_PATH,attn_implementation="sdpa")
CLIP.load_state_dict(stats)
model = CustomModel(CLIP, NUM_CLASSES).to(device)

  stats = torch.load("/kaggle/input/clip_state/pytorch/default/1/state.pt")


In [49]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
f1 = evaluate.load("f1", trust_remote_code=True)

In [53]:
#iter = []
preds = []

for epoch in range(5):
    train_loss = 0.0
    model.train()
    for batch in tqdm(train_dataloader):
        for k,v in batch.items():
            batch[k] = v.to(device)

        outputs = model(pixel_values=batch["pixel_values"],
                       input_ids=text["input_ids"],
                       attention_mask=text["attention_mask"],
                       labels=batch["labels"])
        
        loss = outputs.loss
        loss.backward()
        #plot_grad_flow(CLIP.named_parameters())
        #iter.append(ave_grads)
        optimizer.step()
        optimizer.zero_grad()
        train_loss += loss.item()

    model.eval()
    for batch in tqdm(eval_dataloader):
        for k,v in batch.items():
            batch[k] = v.to(device)

        outputs = model(pixel_values=batch["pixel_values"],
                        input_ids=text["input_ids"],
                        attention_mask=text["attention_mask"])

        logits = outputs.logits
        activation = torch.sigmoid(logits)
        predictions = (activation >= 0.5).float()
        preds.append(predictions)
        f1.add_batch(predictions=predictions,
                     references=batch["labels"][0])

        
    print("Training Loss: " + str(train_loss / len(train_dataloader)))
    print(f1.compute(average='binary'))

100%|██████████| 1256/1256 [00:35<00:00, 35.21it/s]
100%|██████████| 401/401 [00:11<00:00, 33.74it/s]


Training Loss: 2.321382303682162
{'f1': 0.814924270410048}


100%|██████████| 1256/1256 [00:35<00:00, 35.69it/s]
100%|██████████| 401/401 [00:11<00:00, 33.63it/s]


Training Loss: 0.9732458575266847
{'f1': 0.814924270410048}


100%|██████████| 1256/1256 [00:35<00:00, 34.95it/s]
100%|██████████| 401/401 [00:12<00:00, 33.37it/s]


Training Loss: 0.499494560246777
{'f1': 0.814924270410048}


100%|██████████| 1256/1256 [00:35<00:00, 35.60it/s]
100%|██████████| 401/401 [00:12<00:00, 32.82it/s]


Training Loss: 0.42920639040245184
{'f1': 0.814924270410048}


100%|██████████| 1256/1256 [00:35<00:00, 34.95it/s]
100%|██████████| 401/401 [00:12<00:00, 33.28it/s]

Training Loss: 0.39423197793187037
{'f1': 0.814924270410048}





In [54]:
#pred = torch.cat(preds, dim=-1)
pred = torch.stack(preds)
true = torch.tensor(val_df["labels"], dtype=torch.float32)

print(sklearn.metrics.classification_report(
    true.cpu().numpy(),
    pred[-true.shape[0]:].cpu().numpy(),
    target_names=["Mastercard",
                  "Visa",
                  "Мир",
                  "Иные"]
))

              precision    recall  f1-score   support

  Mastercard       1.00      1.00      1.00       401
        Visa       1.00      1.00      1.00       401
         Мир       0.25      1.00      0.40       101
        Иные       0.50      1.00      0.67       200

   micro avg       0.69      1.00      0.81      1103
   macro avg       0.69      1.00      0.77      1103
weighted avg       0.84      1.00      0.88      1103
 samples avg       0.69      1.00      0.81      1103



In [52]:
image = train_dataset[414]["pixel_values"].unsqueeze(0).to(device)
text = processor.tokenizer(text=["a photo of Visa","a photo of Mastercard", "a photo of Mir",
                                 "a photo of Other"],
                          return_tensors="pt",
                          padding=True).to(device)

outputs = CLIP(pixel_values=image, input_ids=text["input_ids"])
logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=-1)
print(torch.round(probs, decimals=3))

tensor([[0.0000, 0.0000, 0.9860, 0.0140]], device='cuda:0',
       grad_fn=<RoundBackward1>)


In [50]:
train_df[train_df.text == "Mir"].index

Index([213, 214, 215, 216, 217, 218, 219, 220, 221, 222,
       ...
       405, 406, 407, 408, 409, 410, 411, 412, 413, 414],
      dtype='int64', length=202)

In [51]:
image = train_dataset[414]["pixel_values"].unsqueeze(0).to(device)
text = processor.tokenizer(text=["A photo of Visa","A photo of Mastercard", "A photo of Mir",
                                 "A photo of Other"],
                          return_tensors="pt",
                          padding=True).to(device)

outputs = model(pixel_values=image, 
                input_ids=text["input_ids"],
                attention_mask=text["attention_mask"])
logits_per_image = outputs.logits
probs = logits_per_image.sigmoid()
print(torch.round(probs, decimals=3))

tensor([0.5000, 0.5000, 0.5000, 0.5090], device='cuda:0',
       grad_fn=<RoundBackward1>)


In [102]:
"""def plot_grad_flow(named_parameters):
    global ave_grads
    ave_grads = []
    global layers
    layers = []
    for n, p in named_parameters:
        if(p.requires_grad) and ("bias" not in n):
            layers.append(n)
            ave_grads.append(p.grad.abs().mean().cpu())
    plt.plot(ave_grads, alpha=0.3, color="b")
    plt.hlines(0, 0, len(ave_grads)+1, linewidth=1, color="k" )
    plt.xlim(xmin=0, xmax=len(ave_grads))
    plt.xlabel("Layers")
    plt.ylabel("average gradient")
    plt.title("Gradient flow")
    plt.grid(True)"""