In [433]:
import torch
import torch.nn as nn
from torchvision import transforms, models
from PIL import Image
import pandas as pd
import numpy as np
import os

In [434]:
df = pd.DataFrame({"image_path": [os.getcwd() + '/images/' + image_path for image_path in os.listdir('images')]})

In [435]:
with open("text.txt") as f:
    content = f.readlines()

content = [x.strip("\n") for x in content]
content = list(filter(len, content))

In [436]:
df = df[:106]
df["text"] = list(filter(None, pd.Series(content).str.split(".").explode()))

In [437]:
df["label"] = np.random.randint(low=0, high=2, size=106)
df["label"] = np.repeat([1, 0], [90, 16])
df["label"] = 0
# df["label"][df["text"].apply(len) > 90] = 1
df["label"][df["text"].str.contains("filosofi")] = 1

In [438]:
transform = transforms.Compose([
    transforms.Resize(size=(256, 256), 
                      interpolation=3),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                        std=[0.229, 0.224, 0.225])
])

In [439]:
class AdvertisementDataset(torch.utils.data.Dataset):
    """Dataset with cropped images and text from OCR performed on newspapers."""

    def __init__(self, df, transform=None):
        """
        Args:
            df (DataFrame): DataFrame with text, image path and label annotations.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        
        self.df = df
        self.transform = transform
        self.tokenizer = torch.hub.load('huggingface/pytorch-transformers', 
                                        'tokenizer', 
                                        'KB/bert-base-swedish-cased')  # Download vocabulary from S3 and cache.

    def __len__(self):
        return len(self.df)

    def load_image(self, index):
        image_path = self.df.iloc[index].image_path
        image = Image.open(image_path)
        return(image)

    def __getitem__(self, index):
        df_row = self.df.iloc[index]
        label = df_row["label"]

        # Text
        ocr_text = df_row["text"]
        token_info = self.tokenizer.encode_plus(ocr_text, 
                                                max_length=64, 
                                                truncation=True, 
                                                pad_to_max_length=True,
                                                return_tensors="pt")

        # token_output = {"input_ids": token_info["input_ids"].to("cuda"),
        #                 "token_type_ids": token_info["token_type_ids"].to("cuda"),
        #                 "attention_mask": token_info["attention_mask"].to("cuda")}

        # Image
        image_path = df_row["image_path"]
        image = self.load_image(index)
        if self.transform:
            image = transform(image) # Resize to 256x256 and imagenet normalization

        return image, token_info, label

In [440]:
dataset = AdvertisementDataset(df=df, transform=transforms)

Using cache found in /home/faton/.cache/torch/hub/huggingface_pytorch-transformers_master


In [441]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=True, num_workers=4)

In [442]:
testloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=False, num_workers=4)

In [443]:
# a = iter(dataloader)
# next(a)

In [444]:
class BertResnetClassifier(nn.Module):
    def __init__(self, pretrained=True):
        super().__init__() # Initialize superclass nn.Module
        if pretrained:
            self.bert = torch.hub.load('huggingface/pytorch-transformers', 'model', 'KB/bert-base-swedish-cased', output_hidden_states=True)
            self.resnet50 = models.resnet50(pretrained=pretrained)
            self.resnet50.fc = nn.Identity() # Remove fc layer (classification head)
        else:
            # Load saved models from disk
            pass
        self.linear1 = nn.Linear(2816, 512)
        self.linear2 = nn.Linear(512, 1) # 1 output class.

    def forward(self, image, token_ids, type_ids, mask):
        # image.unsqueeze_(0) # Add batch dimension to image tensor
        image_embedding = self.resnet50(image)
        hidden_states = self.bert(token_ids, token_type_ids=type_ids, attention_mask=mask)
        
        output_embedding = torch.cat([image_embedding, hidden_states[0][:,0,:]], dim=1) # (1, 2048) (image) + (1, 768) (text) = (1, 2816)
        linear1_output = self.linear1(output_embedding)
        linear2_output = self.linear2(linear1_output)

        return linear2_output

    def __str__(self):
        print(self.temp)

In [445]:
model = BertResnetClassifier()

Using cache found in /home/faton/.cache/torch/hub/huggingface_pytorch-transformers_master


In [446]:
model.to("cuda")
torch.cuda.get_device_name(0)

'GeForce RTX 3090'

In [447]:
# a = model(image=dataset[0][0],
#       token_ids=dataset[1][1]["input_ids"],
#       type_ids=dataset[1][1]["token_type_ids"],
#       mask=dataset[1][1]["attention_mask"])

In [448]:
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))

In [528]:
for i, batch in enumerate(dataloader):
    images = batch[0]
    texts = batch[1]
    labels = batch[2].to("cuda")
    optimizer.zero_grad()
    output = model(image=images.to("cuda"),
                   token_ids=texts["input_ids"].squeeze(dim=1).to("cuda"),
                   type_ids=texts["token_type_ids"].squeeze(dim=1).to("cuda"),
                   mask=texts["attention_mask"].squeeze(dim=1).to("cuda"))

    labels = labels.unsqueeze(1).type_as(output) # (8) -> (8, 1) and long to float
    loss = loss_fn(output, labels)
    loss.backward()
    optimizer.step()

In [507]:
df[0:8]

Unnamed: 0,image_path,text,label,probs
0,/home/faton/projects/ad_classification/images/...,Simone de Beauvoir måste räknas som en av de m...,0,0.001757294
1,/home/faton/projects/ad_classification/images/...,Senast gjorde hon sig påmind genom ett Goncou...,0,0.006978888
2,/home/faton/projects/ad_classification/images/...,Men Goncourtakademien krönte denna gång en re...,1,0.2129315
3,/home/faton/projects/ad_classification/images/...,Det mest berömda liksom det mest omfattande a...,0,9.290784e-05
4,/home/faton/projects/ad_classification/images/...,Det är en undersökning på mycket bred bas som ...,0,0.0001352533
5,/home/faton/projects/ad_classification/images/...,"Historia, filosofi och etnologi har fått släp...",1,0.1430624
6,/home/faton/projects/ad_classification/images/...,I förbigående bör nämnas att Simone de Beauvo...,0,4.863443e-08
7,/home/faton/projects/ad_classification/images/...,(Det är säkert ingen händelse att den kvinnli...,0,2.598913e-07


In [529]:
probs_list = []

for i, batch in enumerate(testloader):
    images = batch[0]
    texts = batch[1]
    labels = batch[2].to("cuda")
    output = model(image=images.to("cuda"),
                   token_ids=texts["input_ids"].squeeze(dim=1).to("cuda"),
                   type_ids=texts["token_type_ids"].squeeze(dim=1).to("cuda"),
                   mask=texts["attention_mask"].squeeze(dim=1).to("cuda"))

    probs_list += torch.nn.functional.sigmoid(output).tolist()
    print(torch.nn.functional.sigmoid(output))
    print((torch.nn.functional.sigmoid(output) > 0.5).float()) # predict


tensor([[3.5975e-08],
        [7.1089e-04],
        [4.5710e-01],
        [2.5725e-05],
        [2.1728e-08],
        [3.9464e-01],
        [5.7320e-07],
        [1.9266e-06]], device='cuda:0', grad_fn=<SigmoidBackward>)
tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]], device='cuda:0')
tensor([[1.2702e-10],
        [3.3323e-06],
        [2.2379e-05],
        [4.8090e-06],
        [4.4815e-03],
        [6.9364e-04],
        [1.4362e-05],
        [3.4696e-04]], device='cuda:0', grad_fn=<SigmoidBackward>)
tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]], device='cuda:0')
tensor([[1.1649e-04],
        [1.9955e-05],
        [2.4999e-05],
        [1.0151e-04],
        [3.3446e-07],
        [8.1839e-02],
        [7.2932e-09],
        [1.1029e-04]], device='cuda:0', grad_fn=<SigmoidBackward>)
tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
       

In [530]:
df["probs"] = sum(probs_list, [])

In [531]:
df.sort_values(by = "probs", ascending=False)

Unnamed: 0,image_path,text,label,probs
2,/home/faton/projects/ad_classification/images/...,Men Goncourtakademien krönte denna gång en re...,1,4.570983e-01
5,/home/faton/projects/ad_classification/images/...,"Historia, filosofi och etnologi har fått släp...",1,3.946420e-01
21,/home/faton/projects/ad_classification/images/...,I och med att kvinnan såsom hemgifts- eller a...,0,8.183917e-02
37,/home/faton/projects/ad_classification/images/...,"Själva framtidsperspektivet ter sig, i synner...",0,2.267263e-02
12,/home/faton/projects/ad_classification/images/...,Sedan behövs det bara ett enkelt trick för at...,0,4.481472e-03
...,...,...,...,...
71,/home/faton/projects/ad_classification/images/...,Å andra sidan är det säkert att mannens psykof...,0,1.714375e-09
40,/home/faton/projects/ad_classification/images/...,Förändringen är i varje fall inte av tillräck...,0,1.238455e-09
62,/home/faton/projects/ad_classification/images/...,"Den ""kvinnliga uppfostran” syftar, eller har ...",0,7.030374e-10
8,/home/faton/projects/ad_classification/images/...,Är kvinnan mannen underlägsen och i så fall va...,0,1.270227e-10


In [27]:
dataset[0][0].unsqueeze(0).shape

torch.Size([1, 3, 256, 256])

In [7]:
text_1 = "Vem var Jim Hansson?"
text_2 = "Jim Hansson var en skådespelare"
text_3 = "Det var jag."

In [10]:
token_info = tokenizer.encode_plus(text_1, max_length=32, truncation=True, pad_to_max_length=True)

In [11]:
with torch.no_grad():
    encoded_layers = model(torch.tensor([token_info["input_ids"]]), 
                            token_type_ids=torch.tensor([token_info["token_type_ids"]]), 
                            attention_mask=torch.tensor([token_info["attention_mask"]])) # last_hidden_state, pooled output (CLS token through some activation), hidden states

In [115]:
tokenizer.mask_token_id

4

In [36]:
params = list(model.named_parameters())

In [37]:
print(f"The model has {len(params)} named parameters.")

The model has 201 named parameters.


In [40]:
for p in params[0:5]:
    print(f"{p[0]:55} {tuple(p[1].size())}")

for p in params[5:21]:
    print(f"{p[0]:55} {tuple(p[1].size())}")

for p in params[-4:]:
    print(f"{p[0]:55} {tuple(p[1].size())}")

bert.embeddings.word_embeddings.weight                  (50325, 768)
bert.embeddings.position_embeddings.weight              (512, 768)
bert.embeddings.token_type_embeddings.weight            (2, 768)
bert.embeddings.LayerNorm.weight                        (768,)
bert.embeddings.LayerNorm.bias                          (768,)
bert.encoder.layer.0.attention.self.query.weight        (768, 768)
bert.encoder.layer.0.attention.self.query.bias          (768,)
bert.encoder.layer.0.attention.self.key.weight          (768, 768)
bert.encoder.layer.0.attention.self.key.bias            (768,)
bert.encoder.layer.0.attention.self.value.weight        (768, 768)
bert.encoder.layer.0.attention.self.value.bias          (768,)
bert.encoder.layer.0.attention.output.dense.weight      (768, 768)
bert.encoder.layer.0.attention.output.dense.bias        (768,)
bert.encoder.layer.0.attention.output.LayerNorm.weight  (768,)
bert.encoder.layer.0.attention.output.LayerNorm.bias    (768,)
bert.encoder.layer.0.interm