In [None]:
# Mount Google Drive inside the Colab runtime; the project folder used by the
# cells below lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers datasets

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import (Dataset, DatasetDict)
from transformers import (AutoTokenizer,
        AutoModel,
        DefaultDataCollator,
        Trainer,
        TrainingArguments,
        AutoConfig)
from sklearn.model_selection import train_test_split
import torchvision.models as models
from PIL import Image

In [None]:
# Project folder on the mounted Drive. The working directory is switched to it,
# so relative paths used later (e.g. the CSV file) are resolved from here.
my_PATH = "/content/drive/MyDrive/ybigta"
os.chdir(my_PATH)
# show file list in the current directory
!dir

AllLabeledData.csv  DATA.zip		   FinalDataset.csv  비글부부.zip
DATAA.zip	    FinalDataset\ (1).csv  FULLDATA.csv


In [None]:
# Base name (without ".csv") of the dataset file read by the next cell.
ch = "FULLDATA"
# Share of all rows held out from training (first split inside load_data).
t_size = 0.2
# Share of the held-out rows that load_data assigns to the test split; the
# remaining held-out rows become the validation split.
v_size = 0.5

In [None]:
# Load the title / label / image-file columns and drop incomplete rows.
# The title column is renamed to "Title" and used as the index: the tokenizer
# cell reads batch['Title'], and Dataset.from_pandas exposes a named index as
# a column of that name. dropna() runs *before* set_index so rows whose title
# is missing are removed as well (dropna does not look at index values).
df = (
    pd.read_csv(f"{ch}.csv", usecols=['VideoTitle', 'Label', 'Image'])
    .dropna()
    .rename(columns={'VideoTitle': 'Title'})
    .set_index('Title')
)

In [None]:
# Quick look at the label column (the frame is indexed by the video title).
df['Label']

VideoTitle
【히오스/하이라이트】 새해맞이 행복시공 (feat. Rich, KyoCha)     0.0
【히오스】 2019년에도 무서운 침아블로 (feat. Rich)            0.0
【다크 세라핌】 1화 - 추락천사 알카드                         0.0
【히오스】 단 8분 46초만에 끝내는 필승전략                      0.0
【메탈슬러그 - 침착맨X주호민】 인해전술 공략법                     1.0
                                              ... 
피지컬 노트북🏋️파나소닉 터프북 FZ-40 써봤습니다                  0.0
갤럭시 S23 시리즈 영혼까지 뜯어 봤습니다                       1.0
하루도 안 돼서 완판됐다는 갤럭시 S23 울트라 BMW M 에디션 열어봤습니다    1.0
갤럭시 S23 울트라 정품 케이스 결함까지 몰아보기                   0.0
5년만에 새로 나온 애플 홈팟(2세대) 사왔습니다                    0.0
Name: Label, Length: 25023, dtype: float64

In [None]:
#For loading data
def load_data(df, t_size, v_size, seed=42):
  """Split ``df`` into train / validation / test and wrap them in a DatasetDict.

  Args:
    df: pandas DataFrame to split.
    t_size: ``test_size`` of the first split, i.e. the share of rows that is
      held out from training.
    v_size: ``test_size`` of the second split, i.e. the share of the held-out
      rows that ends up in ``"test"``; the remainder becomes ``"validation"``.
    seed: ``random_state`` used for both splits so that re-running the
      notebook reproduces the same partition. Pass ``None`` to get a fresh
      random split on every call.

  Returns:
    DatasetDict with the keys ``"train"``, ``"validation"`` and ``"test"``.
  """
  train_df, holdout_df = train_test_split(df, test_size=t_size, random_state=seed)
  val_df, test_df = train_test_split(holdout_df, test_size=v_size, random_state=seed)

  raw_ds = DatasetDict({"train": Dataset.from_pandas(train_df),
       "validation": Dataset.from_pandas(val_df),
       "test": Dataset.from_pandas(test_df)})

  return raw_ds

In [None]:
# Build the train / validation / test DatasetDict and display its summary.
raw_ds = load_data(df, t_size, v_size)
raw_ds

DatasetDict({
    train: Dataset({
        features: ['Label', 'Image', 'Title'],
        num_rows: 20018
    })
    validation: Dataset({
        features: ['Label', 'Image', 'Title'],
        num_rows: 2502
    })
    test: Dataset({
        features: ['Label', 'Image', 'Title'],
        num_rows: 2503
    })
})

In [None]:
#getting the tokenizer
# The same checkpoint name is reused by ConcatModel to load the text encoder,
# so tokenizer and encoder always come from the same pretrained model.
checkpoint = "klue/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Tokenize the titles of every split.
def tokenize_function(batch):
  """Encode a batch of titles, padded / truncated to exactly 100 tokens."""
  titles = batch['Title']
  encoded = tokenizer(
      titles,
      add_special_tokens=True,
      padding='max_length',
      truncation=True,
      max_length=100,
  )
  return encoded


# batched=True hands the function many rows at once instead of one row per call.
tokenized = raw_ds.map(tokenize_function, batched=True)
tokenized

  0%|          | 0/21 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['Label', 'Image', 'Title', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 20018
    })
    validation: Dataset({
        features: ['Label', 'Image', 'Title', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2502
    })
    test: Dataset({
        features: ['Label', 'Image', 'Title', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2503
    })
})

In [None]:
# Return the token ids, attention mask and label as torch tensors when the
# dataset is indexed, and create the collator that later stacks examples
# into batches.
tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "Label"])
data_collator = DefaultDataCollator(return_tensors="pt")

In [None]:
class LanguageModel(nn.Module):
    """Text branch: pretrained encoder -> BiLSTM -> max-pool over tokens -> 50 features.

    Args:
        checkpoint: name or path of the pretrained model handed to
            ``AutoModel.from_pretrained``.
    """

    def __init__(self, checkpoint):
        super(LanguageModel, self).__init__()

        # Load model with given checkpoint and get the body
        # NOTE(review): attentions and all hidden states are requested here but
        # only `last_hidden_state` is consumed in forward(); they could
        # probably be switched off to save memory.
        self.transformer = AutoModel.from_pretrained(checkpoint,
                                                   config=AutoConfig.from_pretrained(
                                                       checkpoint,
                                                       output_attentions=True,
                                                       output_hidden_states=True))
        # batch_first=True is required: the encoder output is laid out as
        # (batch, seq, hidden). With the default layout the LSTM would treat
        # the batch axis as time and leak information between samples.
        self.bidir_LSTM = nn.LSTM(768, 50, bidirectional=True, batch_first=True)
        # Not used in forward(); kept so the attribute stays available.
        self.flatten = nn.Flatten()
        # 100 = 2 directions x 50 hidden units of the LSTM.
        self.dense_50 = nn.Linear(100, 50)

    def forward(self, input_ids, attention_mask):
        """Return a (batch, 50) feature tensor for the tokenized titles.

        Args:
            input_ids: token ids, one row per title.
            attention_mask: mask matching ``input_ids``.
        """
        # Extract outputs from the body
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)

        # Add custom layers
        LSTM_out, _ = self.bidir_LSTM(outputs.last_hidden_state)  # (batch, seq, 100)
        # Max over the sequence axis -> one 100-d vector per title.
        max_pool_out, _ = torch.max(LSTM_out, 1)
        output = F.relu(self.dense_50(max_pool_out))
        return output
  

In [None]:
class ImgModel(nn.Module):
    """Image branch: frozen DenseNet-121 features -> small MLP head -> 50 features.

    Args:
        num_classes: accepted for interface compatibility; it is not used,
            the head always emits 50 features.
    """

    def __init__(self, num_classes):
        super(ImgModel, self).__init__()

        self.base_model = models.densenet121(pretrained=True)
        # Freeze the backbone: only the head defined below receives gradients.
        # NOTE(review): the backbone still follows the module's train/eval
        # mode, so its BatchNorm statistics presumably keep updating during
        # training - confirm whether that is intended.
        for param in self.base_model.parameters():
            param.requires_grad = False
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        # BatchNorm1d, not 2d: it is applied after the feature map has been
        # flattened to (batch, 1024).
        self.batch_norm1 = nn.BatchNorm1d(1024)
        self.dropout1 = nn.Dropout(p=0.5)
        self.fc1 = nn.Linear(1024, 256)
        self.batch_norm2 = nn.BatchNorm1d(256)
        self.dropout2 = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(256, 50)

    def forward(self, img_input):
        """Return a (batch, 50) feature tensor.

        Args:
            img_input: image batch laid out as (batch, channels, height, width).
                No extra axis is inserted: the convolutional backbone needs a
                4-D input.
        """
        x = self.base_model.features(img_input)
        x = self.avg_pool(x)
        x = torch.flatten(x, 1)  # (batch, 1024, 1, 1) -> (batch, 1024)
        x = self.batch_norm1(x)
        x = F.relu(self.fc1(x))
        x = self.dropout1(x)
        x = self.batch_norm2(x)
        x = F.relu(self.fc2(x))
        x = self.dropout2(x)
        return x

In [None]:
class ConcatModel(nn.Module):
    """Fuse the text and image branches and classify into 2 classes.

    The two 50-d branch outputs are concatenated, passed through a sigmoid
    context gate, and fed to a small dense head.

    ``forward`` returns raw logits, which is what ``nn.CrossEntropyLoss``
    expects (it applies log-softmax itself). Use ``predict_proba`` or
    ``softmax_layer`` to obtain class probabilities.
    """

    # Widths of the feature vectors produced by the two branches
    # (LanguageModel.dense_50 and ImgModel.fc2 both emit 50 values).
    TEXT_FEATURES = 50
    IMAGE_FEATURES = 50

    def __init__(self):
        super(ConcatModel, self).__init__()
        self.nlp_model = LanguageModel(checkpoint=checkpoint)
        self.img_model = ImgModel(num_classes=2)

        # The fused vector is the concatenation of both branches, so every
        # layer that consumes it must be sized to their combined width.
        fused_dim = self.TEXT_FEATURES + self.IMAGE_FEATURES

        # Define context gating layer
        self.context_gate = nn.Sequential(
            nn.Linear(fused_dim, fused_dim),
            nn.Sigmoid()
        )

        # Define dense layer with 20 units
        self.dense_layer = nn.Linear(fused_dim, 20)

        # Define dropout layer with 0.2 dropout rate
        self.dropout_layer = nn.Dropout(0.2)

        self.dense_layer2 = nn.Linear(20, 2)

        # Softmax over the class axis; only used to turn logits into
        # probabilities, never before the loss.
        self.softmax_layer = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask, img_input, label=None):
        """Return (batch, 2) class logits.

        Args:
            input_ids: token ids for the text branch.
            attention_mask: attention mask for the text branch.
            img_input: image batch for the image branch.
            label: ignored; accepted so a whole batch dict can be splatted
                into the call.
        """
        nlp_output = self.nlp_model(input_ids, attention_mask)
        img_output = self.img_model(img_input)

        # Concatenate the outputs of the two models
        concat_output = torch.cat((nlp_output, img_output), dim=1)

        # Apply context gating
        gated_output = self.context_gate(concat_output) * concat_output

        # Apply dense layer, dropout layer and the classification layer
        dense_output = self.dense_layer(gated_output)
        dropout_output = self.dropout_layer(dense_output)
        logits = self.dense_layer2(dropout_output)

        return logits

    def predict_proba(self, input_ids, attention_mask, img_input):
        """Return (batch, 2) class probabilities (softmax of the logits)."""
        return self.softmax_layer(self(input_ids, attention_mask, img_input))

In [None]:
# Instantiate the combined text + image classifier; this loads the pretrained
# text encoder and the pretrained DenseNet weights.
model = ConcatModel()

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for

In [None]:
!unzip -q /content/drive/MyDrive/ybigta/DATA.zip -d /content/Dataset

replace /content/Dataset/0_0.png? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
!unzip -q /content/drive/MyDrive/ybigta/DATAA.zip -d /content/Dataset

replace /content/Dataset/7_0.png? [y]es, [n]o, [A]ll, [N]one, [r]ename: A


In [None]:
from torch.utils.data import DataLoader
from torchvision import transforms
import torch.utils.data as data
import os
from google.colab import files

class CustomDataset(data.Dataset):
    """Pairs tokenized text with the image file and label of the same row.

    Args:
        inputid_data: indexable collection of token-id rows.
        attention_data: indexable collection of attention-mask rows.
        image_data: indexable collection of image file names, relative to
            ``img_dir``.
        label_data: indexable collection of labels; its length defines the
            dataset length.
        img_transform: optional callable applied to the RGB PIL image.
        img_dir: directory that contains the image files.
    """

    def __init__(self, inputid_data, attention_data, image_data, label_data,
                 img_transform=None, img_dir='/content/Dataset'):
        self.inputid_data = inputid_data
        self.attention_data = attention_data
        self.image_data = image_data
        self.label_data = label_data
        self.img_transform = img_transform
        self.img_dir = img_dir

    def __getitem__(self, index):
        """Return a dict with 'input_ids', 'attention_mask', 'img_input' and 'labels'."""
        # Load text data
        input_ids = self.inputid_data[index]
        attention_mask = self.attention_data[index]

        # Load image data
        # The image path is <image directory>/<file name stored for this row>.
        img_path = f'{self.img_dir}/{self.image_data[index]}'
        # The context manager closes the file even if decoding fails;
        # convert() returns a fully loaded copy, so it stays usable afterwards.
        with Image.open(img_path) as raw_img:
            img = raw_img.convert('RGB')
        if self.img_transform is not None:
            img = self.img_transform(img)
        img_input = img

        # Load label data
        label = self.label_data[index]

        return {
            'input_ids': input_ids,
            'attention_mask' : attention_mask,
            'img_input': img_input,
            'labels': label,
        }

    def __len__(self):
        return len(self.label_data)

# Build one CustomDataset and one DataLoader per split. The three splits are
# assembled by the same helpers so they cannot drift apart.
BATCH_SIZE = 32


def _image_transforms():
    """Resize every image to 180x320 and convert it to a tensor."""
    return transforms.Compose([transforms.Resize((180, 320)),
                               transforms.ToTensor()])


def _split_columns(split_name):
    """Return the (input_ids, attention_mask, Image, Label) columns of one tokenized split."""
    split = tokenized[split_name]
    return split["input_ids"], split["attention_mask"], split["Image"], split["Label"]


def _make_loader(inputs_ids, attention_masks, images, labels, img_transform, shuffle):
    """Wrap the given columns in a CustomDataset and a DataLoader; return both."""
    dataset = CustomDataset(
        inputid_data=inputs_ids,
        attention_data=attention_masks,
        image_data=images,
        label_data=labels,
        img_transform=img_transform
    )
    loader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE,
                                         shuffle=shuffle, collate_fn=data_collator)
    return dataset, loader


train_inputs_ids, train_attention_masks, train_images, train_labels = _split_columns("train")
train_transforms = _image_transforms()
train_dataset, train_dataloader = _make_loader(
    train_inputs_ids, train_attention_masks, train_images, train_labels,
    train_transforms, shuffle=True)

# Evaluation splits are not shuffled: the order does not affect the metrics
# and a fixed order keeps runs comparable.
valid_inputs_ids, valid_attention_masks, valid_images, valid_labels = _split_columns("validation")
valid_transforms = _image_transforms()
valid_dataset, valid_dataloader = _make_loader(
    valid_inputs_ids, valid_attention_masks, valid_images, valid_labels,
    valid_transforms, shuffle=False)

test_inputs_ids, test_attention_masks, test_images, test_labels = _split_columns("test")
test_transforms = _image_transforms()
test_dataset, test_dataloader = _make_loader(
    test_inputs_ids, test_attention_masks, test_images, test_labels,
    test_transforms, shuffle=False)



In [None]:
# Sanity check: print the first training batch, then the size of each field.
for batch in train_dataloader:
    print(batch)
    for field in ("input_ids", "attention_mask", "labels", "img_input"):
        print(batch[field].size())
    break

In [None]:
from transformers import get_scheduler
from transformers.optimization import get_linear_schedule_with_warmup

# torch.optim.AdamW replaces the deprecated transformers.AdamW.
# weight_decay=0.0 keeps the behaviour of the previous optimizer, whose
# default was 0.0 (torch's own default is 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-8, weight_decay=0.0)

num_epochs = 3
# One scheduler step per batch, over all epochs.
num_training_steps = num_epochs * len(train_dataloader)
# Linear decay of the learning rate to zero, without warm-up.
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

print(num_training_steps)

1878




In [None]:
from datasets import load_metric

# Metric objects that validate() fills batch by batch.
# NOTE(review): `load_metric` is presumably deprecated in newer `datasets`
# releases - confirm against the installed version.
accuracy_metric, f1_metric = (load_metric(metric_name) for metric_name in ("accuracy", "f1"))

In [None]:
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print('device:', device)

device: cuda


In [None]:

def batch_items(batch, device):
    """Move the model inputs of a collated batch to ``device``.

    Args:
        batch: mapping with the keys "input_ids", "attention_mask",
            "img_input" and "labels".
        device: target passed to each value's ``.to``.

    Returns:
        dict with the keys "input_ids", "attention_mask", "img_input" and
        "label" (note the singular: it matches the model's argument name).
    """
    # (key in the returned dict, key in the collated batch)
    key_pairs = (
        ("input_ids", "input_ids"),
        ("attention_mask", "attention_mask"),
        ("img_input", "img_input"),
        ("label", "labels"),
    )
    return {out_key: batch[in_key].to(device) for out_key, in_key in key_pairs}


In [None]:
from tqdm.auto import tqdm

# One tick per optimizer step over all epochs; advanced inside train().
progress_bar_train = tqdm(range(num_training_steps))
# One tick per evaluation batch over all epochs (sized from test_dataloader);
# advanced inside validate().
progress_bar_eval = tqdm(range(num_epochs*len(test_dataloader)))
def train(loader, model, criterion, optimizer, epoch):
    '''
    Run one training epoch over ``loader``.

    Relies on the notebook-level ``device``, ``lr_scheduler``,
    ``progress_bar_train`` and ``batch_items``.

    Args:
        loader: iterable of collated batches (as accepted by ``batch_items``).
        model: module called with the batch dict as keyword arguments; it
            must return class scores of shape (batch, num_classes).
        criterion: loss called as ``criterion(scores, class_indices)``.
        optimizer: optimizer stepped once per batch.
        epoch: index of the current epoch (not used by the computation).

    Returns:
        Mean loss over the batches of this epoch as a float (NaN if the
        loader yields no batch).
    '''
    model = model.train()
    total_loss = 0.0
    num_batches = 0

    for batch in loader:
        batch = batch_items(batch, device)
        outputs = model(**batch)
        # Class-index targets must be integer typed; the labels arrive as floats.
        loss = criterion(outputs, batch["label"].long())
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar_train.update(1)

        total_loss += loss.item()
        num_batches += 1

    return total_loss / num_batches if num_batches else float("nan")


def validate(loader, model, criterion, epoch):
    '''
    Evaluate ``model`` on ``loader`` and print accuracy, F1 and mean loss.

    Relies on the notebook-level ``device``, ``accuracy_metric``,
    ``f1_metric``, ``progress_bar_eval`` and ``batch_items``.

    Args:
        loader: iterable of collated batches (as accepted by ``batch_items``).
        model: module called with the batch dict as keyword arguments. It may
            return the class scores directly or an object with a ``logits``
            attribute.
        criterion: loss called as ``criterion(scores, class_indices)``.
        epoch: index of the current epoch (not used by the computation).

    Returns:
        Mean loss over all batches as a float (NaN if the loader yields no
        batch, in which case no metric is computed).
    '''
    model = model.eval()
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for batch in loader:
            batch = batch_items(batch, device)
            outputs = model(**batch)
            # Accept both a plain score tensor and an output object with .logits.
            logits = getattr(outputs, "logits", outputs)
            # Class-index targets must be integer typed; the labels arrive as floats.
            labels = batch["label"].long()

            total_loss += criterion(logits, labels).item()
            num_batches += 1

            predictions = torch.argmax(logits, dim=-1)
            # Hand plain Python lists to the metrics so they never see device tensors.
            predicted = predictions.cpu().tolist()
            expected = labels.cpu().tolist()
            accuracy_metric.add_batch(predictions=predicted, references=expected)
            f1_metric.add_batch(predictions=predicted, references=expected)
            progress_bar_eval.update(1)

    if num_batches == 0:
        return float("nan")

    mean_loss = total_loss / num_batches
    print("Accuracy:", accuracy_metric.compute()["accuracy"],f1_metric.compute(), "Loss:", mean_loss)
    return mean_loss
 

In [None]:
# Train for `num_epochs` epochs and evaluate on the validation split after each one.

criterion = nn.CrossEntropyLoss()
# Batches are moved to `device` one by one, so the model has to live there too.
model.to(device)
for epoch in range(num_epochs):
    trn_loss = train(train_dataloader, model, criterion, optimizer, epoch)
    val_loss = validate(valid_dataloader, model, criterion, epoch)

