In [None]:
!pip install open_clip_torch

In [None]:
!pip install git+https://github.com/BloodAxe/pytorch-toolbelt.git


In [None]:
# Show which GPUs the runtime exposes before anything is installed or trained.
!nvidia-smi

In [None]:
!pip install torch==2.4.0 torchvision==0.19.0 --index-url https://download.pytorch.org/whl/cu124

In [None]:
import torch
# Sanity check: confirm the installed PyTorch build can see a CUDA device.
print(torch.cuda.is_available())

# Output recorded from an earlier run: True

In [None]:
import json
import os
import torch
import re
import torch.nn as nn
import torchvision.transforms as transforms
from sklearn.metrics import accuracy_score, classification_report
from transformers import AutoModel, AutoTokenizer, get_scheduler
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import AdamW,Adam
from tqdm.notebook import tqdm, trange
from time import perf_counter
from PIL import Image
import pandas as pd

In [None]:
import os

# `!export ...` runs in a throw-away subshell, so the variable never reached this kernel.
# Set it on the kernel process itself instead.
# NOTE(review): this presumably must run before CUDA is first initialised to have any
# effect — consider moving it above the first torch.cuda call.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

In [None]:
import os

# `!export ...` runs in a throw-away subshell, so the variable never reached this kernel.
# Set it on the kernel process itself instead.
# NOTE(review): presumably has to be set before the first CUDA allocation — confirm placement.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [None]:
# set random seeds for repeatability
import numpy as np
import random

def set_seed(seed_val):
    """Seed Python's `random`, NumPy and PyTorch (CPU and every CUDA device) with `seed_val`."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed_val)


# Seed used throughout the notebook; applied once here.
seed_val = 0
set_seed(seed_val)

In [None]:
import pandas as pd
# Training annotations; later cells read its 'caption', 'label' and 'image' columns.
df=pd.read_csv('/kaggle/input/dsc2024/train_cluster.csv')

In [None]:
import json
# Read the private-test annotations (a JSON object keyed by sample id).
test_json_path = '/kaggle/input/dsc2024/vimmsd-private-test.json'
with open(test_json_path, mode='r') as test_file:
    data = json.load(test_file)

# One row per top-level key; the keys become the DataFrame index.
df_test = pd.DataFrame.from_dict(data, orient='index')

In [None]:
# Emoticon -> emoji table.  Replacements are applied in insertion order, so a longer
# emoticon must be listed before any shorter one it contains (":((" before ":(").
_EMOTICON_TO_EMOJI = {
    ":))": "😊",
    "=)))": "😊",  # Happy
    ":((": "😞",   # Sad
    ":'(": "😢",   # Crying
    ":D": "😃",    # Big smile
    ":(": "☹️",    # Disappointed
    ":|": "😐",    # Neutral
}

# Matches every character preprocess_text drops: anything that is not a word
# character, whitespace, a comma, or one of the emoji produced by text_to_emoji.
_STRIP_PATTERN = re.compile(
    r'[^\w\s,' + re.escape(''.join(_EMOTICON_TO_EMOJI.values())) + r']'
)


def text_to_emoji(text):
    """Replace every known text emoticon in `text` with its emoji (case-sensitive).

    Args:
        text: Input string.

    Returns:
        The string with emoticons such as ":))" or ":D" replaced by emoji.
    """
    for text_emoji, real_emoji in _EMOTICON_TO_EMOJI.items():
        text = text.replace(text_emoji, real_emoji)
    return text


# Preprocessing function
def preprocess_text(text):
    """Normalise a caption: emoticons -> emoji, lower-case, strip punctuation and digits.

    Emoticons are converted before lower-casing (so ":D" is still recognised) and the
    resulting emoji are excluded from punctuation stripping (so they survive).

    Args:
        text: Raw caption string.

    Returns:
        The cleaned caption with whitespace collapsed to single spaces.
    """
    text = text_to_emoji(text)
    text = text.lower()
    text = _STRIP_PATTERN.sub('', text)
    text = re.sub(r'\d+', '', text)
    return ' '.join(text.split())

# preprocess_text already performs the emoticon conversion, and its output contains none of
# the characters emoticons are made of, so a second text_to_emoji pass would be a no-op.
df['caption'] = df['caption'].apply(preprocess_text)

In [None]:
from sklearn.model_selection import train_test_split
# split train and dev
# Hold out 10% of the rows as a dev set; random_state fixes the shuffle.
# NOTE(review): the split is not stratified by 'label' — consider stratify=df['label']
# if the classes are imbalanced.
df_train, df_dev= train_test_split(df, test_size=0.1, random_state=42)

In [None]:
# Image directories (with trailing slash); MultimodalDataset picks one based on its `train` flag.
IMAGE_TRAIN_FOLDER='/kaggle/input/dsc2024/training-images/train-images/'
IMAGE_TEST_FOLDER='/kaggle/input/dsc2024/private-test-images/test-images/'

In [None]:
import matplotlib.pyplot as plt

def show_sample(row_num):
    """Print the label and caption of one training row and display its image.

    Args:
        row_num: Positional (iloc) index into `df_train`.
    """
    sample_row = df_train.iloc[row_num]
    print('Index:', row_num)
    print('Label:', sample_row['label'])
    print('Text:', sample_row['caption'])
    image_path = os.path.join(IMAGE_TRAIN_FOLDER, sample_row['image'])
    # Use a context manager so the image file handle is released after drawing.
    with Image.open(image_path) as im:
        plt.imshow(im)
    plt.axis('off')
    plt.show()

In [None]:
from random import randint
# Display one randomly chosen training example (randint is inclusive on both ends).
show_sample(randint(0, len(df_train)-1))


In [None]:
# Assign consecutive integer ids to the distinct training labels in sorted order,
# and build the inverse mapping for turning predictions back into label names.
label_to_id = {lab:i for i, lab in enumerate(df_train['label'].sort_values().unique())}
id_to_label = {v:k for k,v in label_to_id.items()}
label_to_id

In [None]:
# Number of distinct classes found in the training split.
num_out_labels = len(label_to_id)

In [None]:
from torch.utils.data import Dataset
import pandas as pd
from PIL import Image

class MultimodalDataset(Dataset):
    """Image + caption dataset backed by a DataFrame.

    With ``train=True`` rows come from ``train_dataframe``, images from
    ``IMAGE_TRAIN_FOLDER`` and each item is ``(image, text, label_id)``.
    With ``train=False`` rows come from ``eval_dataframe``, images from
    ``IMAGE_TEST_FOLDER`` and each item is ``(image, text)``.

    Args:
        train_dataframe: Frame used when ``train`` is true.
        eval_dataframe: Frame used when ``train`` is false.
        label_to_id: Mapping from label value to integer class id.
        model: Object exposing ``preprocessor`` and ``tokenizer`` (optionally wrapped
            in ``nn.DataParallel``).
        train: Selects the frame / image folder and whether labels are returned.
        caption_col, label_col, img_col: Column names in the frame.
    """

    def __init__(self, train_dataframe, eval_dataframe, label_to_id, model, train=True, caption_col='caption', label_col='label', img_col='image'):
        self.train = train
        if self.train:
            self.data = train_dataframe
            self.img_folder = IMAGE_TRAIN_FOLDER
        else:
            self.data = eval_dataframe
            self.img_folder = IMAGE_TEST_FOLDER

        self.label_to_id = label_to_id
        self.model = model
        self.caption_col = caption_col
        self.label_col = label_col
        self.img_col = img_col

        # A DataParallel wrapper hides the classifier's attributes behind `.module`;
        # unwrap it once here so item loading works either way.
        core_model = model.module if isinstance(model, nn.DataParallel) else model
        self.preprocessor = core_model.preprocessor
        self.tokenizer = core_model.tokenizer

    def __len__(self):
        """Number of rows in the selected frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the preprocessed image, tokenized caption and (training only) label id."""
        row = self.data.iloc[index]
        img_path = os.path.join(self.img_folder, row[self.img_col])

        # Preprocess inside the context manager so the file handle is closed afterwards.
        with Image.open(img_path) as img:
            image = self.preprocessor(img)
        text = self.tokenizer(row[self.caption_col])

        if self.train:
            return image, text, self.label_to_id[row[self.label_col]]
        return image, text

In [None]:
import torch
import numpy as np
from torch.utils.data import DataLoader, WeightedRandomSampler

def create_oversampled_dataloader(dataset, label_col, batch_size, num_workers=4):
    """Build a DataLoader that oversamples rare classes, plus post-sampling class frequencies.

    Every sample is weighted by the inverse of its class count and drawn with
    replacement through a ``WeightedRandomSampler``.

    Args:
        dataset: Dataset exposing ``data`` (a DataFrame holding ``label_col``). When it
            also exposes ``label_to_id``, class ids are read from the frame directly;
            otherwise they are read from ``dataset[i][2]``.
        label_col: Name of the label column in ``dataset.data``.
        batch_size: Batch size of the returned loader.
        num_workers: Worker processes of the returned loader.

    Returns:
        ``(dataloader, alpha_for_focal)`` where ``alpha_for_focal`` is a float32 tensor
        (on CUDA when available) holding, per class id, the fraction of one sampler
        draw that belonged to that class.
    """
    labels = dataset.data[label_col]

    # Per-class sample counts in the original data
    class_sample_counts = labels.value_counts().to_dict()

    # Per-sample weight = 1 / (size of the sample's class)
    weights = 1. / np.array([class_sample_counts[label] for label in labels])

    print("Số lượng mẫu của mỗi class trước khi oversampling:", class_sample_counts)

    sampler = WeightedRandomSampler(weights, num_samples=len(dataset), replacement=True)
    dataloader = DataLoader(dataset, batch_size=batch_size, sampler=sampler, num_workers=num_workers)

    # Class frequencies before oversampling
    total_count_before = sum(class_sample_counts.values())
    alpha_before_oversampling = {class_label: count / total_count_before for class_label, count in class_sample_counts.items()}
    print("Alpha trước khi oversampling:", alpha_before_oversampling)

    # Class counts of one draw from the sampler.  Labels are looked up in the
    # DataFrame rather than via dataset[index], which would decode and preprocess
    # every sampled image just to read its label.
    sampled_indices = list(sampler)
    label_to_id = getattr(dataset, 'label_to_id', None)
    if label_to_id is not None:
        all_label_ids = [label_to_id[label] for label in labels]
        sampled_label_ids = [all_label_ids[index] for index in sampled_indices]
    else:
        # Fallback for datasets without a label map: item position 2 is the label id.
        sampled_label_ids = [dataset[index][2] for index in sampled_indices]

    class_counts_after_sampling = {i: 0 for i in range(len(class_sample_counts))}
    for label_id in sampled_label_ids:
        class_counts_after_sampling[label_id] += 1

    print("Số lượng mẫu của mỗi class sau khi oversampling:", class_counts_after_sampling)

    # Class frequencies after oversampling
    total_count_after = sum(class_counts_after_sampling.values())
    alpha_after_oversampling = {class_label: count / total_count_after for class_label, count in class_counts_after_sampling.items()}
    print("Alpha sau khi oversampling:", alpha_after_oversampling)

    # Tensor form, ordered by class id, for use as focal-loss alpha
    alpha_for_focal = torch.tensor([alpha_after_oversampling[i] for i in range(len(alpha_after_oversampling))], dtype=torch.float32).to('cuda' if torch.cuda.is_available() else 'cpu')

    return dataloader, alpha_for_focal


In [None]:
# blank = test_dataset

In [None]:
import torch
import torch.nn as nn
import open_clip

class MultimodalClassifier(nn.Module):
    """Frozen OpenCLIP backbone with a trainable MLP + linear classification head.

    Image and text embeddings from the backbone are concatenated, passed through a
    one-hidden-layer MLP and classified. Only the MLP and classifier receive gradients.

    Args:
        model_name: OpenCLIP model name.
        pretrained: OpenCLIP pretrained tag.
        num_labels: Number of output classes.
        device: Device the module is moved to on construction.
        mlp_hidden_size: Width of the MLP hidden layer.
        dropout_prob: Dropout probability after the hidden layer.
    """

    def __init__(self, model_name, pretrained, num_labels, device, mlp_hidden_size=512, dropout_prob=0.3):
        super(MultimodalClassifier, self).__init__()

        # Load the pre-trained model, its image transform and its tokenizer
        self.model, _, self.preprocessor = open_clip.create_model_and_transforms(model_name=model_name, pretrained=pretrained)
        self.tokenizer = open_clip.get_tokenizer(model_name)

        # Freeze the backbone: it is used purely as a feature extractor
        for param in self.model.parameters():
            param.requires_grad = False
        self.model.eval()

        # MLP with one hidden layer.  2304 is the hard-coded size of the concatenated
        # image + text embedding — assumes 2 x 1152 for this backbone, TODO confirm
        # when changing model_name.
        self.mlp = nn.Sequential(
            nn.Linear(2304, mlp_hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_prob)
        )

        # Classifier
        self.classifier = nn.Linear(mlp_hidden_size, num_labels)
        self.device = device
        self.to(self.device)

    def train(self, mode=True):
        """Switch the head between train/eval while keeping the frozen backbone in eval mode.

        Keeps any stochastic layers in the backbone inactive, so the features seen
        during training match those seen at inference.
        """
        super().train(mode)
        self.model.eval()
        return self

    def forward(self, images, texts):
        """Return class logits for a batch of preprocessed images and tokenized texts."""
        # Ensure compatibility with DataParallel, if used
        model = self.model.module if isinstance(self.model, nn.DataParallel) else self.model

        # Follow the head's actual device rather than the value recorded at construction,
        # which goes stale if the module is moved afterwards.
        target_device = self.classifier.weight.device

        # The backbone is frozen, so no graph is needed for the feature extraction
        with torch.no_grad():
            image_features = model.encode_image(images.to(target_device))
            text_features = model.encode_text(texts.to(target_device))

        # Concatenate features along the feature dimension
        combined_features = torch.cat((image_features, text_features), 1)

        # MLP followed by the linear classifier
        return self.classifier(self.mlp(combined_features))


In [None]:
## training parameters to be used for all models ##
num_train_epochs = 30   # passes over the training dataloader
batch_size = 256        # samples per batch (training and inference loaders)
learning_rate = 1e-2    # AdamW learning rate handed to the scheduler
weight_decay = 0.001    # AdamW weight decay
# Scheduler warm-up length, in optimizer steps.
# NOTE(review): if len(train_dataloader) * num_train_epochs is below this value the
# schedule never leaves warm-up — verify against the dataset size.
warmup_steps = 2000

In [None]:
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import DataLoader, WeightedRandomSampler, RandomSampler
from transformers import get_scheduler
from time import perf_counter
from tqdm import tqdm, trange
import open_clip
from pytorch_toolbelt import losses as L

# Seed all RNGs for reproducibility
set_seed(seed_val)

# Build the OpenCLIP backbone + classification head (the constructor moves it to `device`)
model_name = "nllb-clip-large-siglip"
pretrained = "mrl"
num_labels = len(label_to_id)  # one output per class id
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
openclip_model = MultimodalClassifier(model_name, pretrained, num_labels, device)

# Training dataset and class-balanced (oversampling) dataloader
train_dataset = MultimodalDataset(train_dataframe=df_train, eval_dataframe=df_test, label_to_id=label_to_id, model=openclip_model, train=True, caption_col='caption', label_col='label', img_col='image')
train_dataloader, class_weights = create_oversampled_dataloader(train_dataset, label_col='label', batch_size=batch_size)

# Total number of optimizer steps
t_total = len(train_dataloader) * num_train_epochs
if warmup_steps >= t_total:
    print(f'WARNING: warmup_steps ({warmup_steps}) >= total training steps ({t_total}); '
          'the learning rate will never leave the warm-up phase.')

# Optimizer and scheduler — only the parameters that actually receive gradients
trainable_params = [p for p in openclip_model.parameters() if p.requires_grad]
optimizer = AdamW(trainable_params, lr=learning_rate, weight_decay=weight_decay)
scheduler = get_scheduler(name="cosine", optimizer=optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)

# Inverse-frequency class weights, indexed by class id so that entry i always belongs
# to id_to_label[i].
# NOTE(review): the previous hard-coded list was strictly increasing, which looks like
# descending-frequency order rather than label_to_id's sorted-label order — confirm.
label_counts = df_train['label'].value_counts()
alpha = torch.tensor(
    [len(df_train) / label_counts[id_to_label[class_id]] for class_id in range(num_labels)],
    dtype=torch.float32,
    device=device,
)

# Loss function
criterion = L.CrossEntropyFocalLoss(gamma=1.0, reduction='mean', class_weights=alpha)
# Gradient scaler for mixed-precision training
scaler = torch.amp.GradScaler()

# Start training
openclip_model.train()

start = perf_counter()
for epoch_num in trange(num_train_epochs, desc='Epochs'):
    epoch_total_loss = 0

    for step, batch in tqdm(enumerate(train_dataloader), total=len(train_dataloader), desc='Batch'):
        b_imgs, b_text, b_labels = batch

        # Tokens arrive as (batch, 1, seq) after collation; drop the singleton dim
        b_inputs = b_text.squeeze(1).to(device)
        b_labels = b_labels.to(device)
        b_imgs = b_imgs.to(device)

        optimizer.zero_grad()

        # Mixed-precision forward pass
        with torch.amp.autocast('cuda'):
            b_logits = openclip_model(b_imgs, b_inputs)
            loss = criterion(b_logits, b_labels)

        scaler.scale(loss).backward()

        # Gradients must be unscaled before clipping; otherwise max_norm is compared
        # against gradients multiplied by the current loss scale.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(trainable_params, max_norm=1.0)

        # Optimizer step (skipped internally on inf/NaN gradients), then update the scale
        scaler.step(optimizer)
        scaler.update()

        scheduler.step()

        epoch_total_loss += loss.item()

    # Mean loss over the epoch's batches
    avg_loss = epoch_total_loss / len(train_dataloader)

    print(f'epoch = {epoch_num+1}')
    print(f'    epoch_loss = {epoch_total_loss}')
    print(f'    avg_epoch_loss = {avg_loss}')
    print(f'    learning rate = {optimizer.param_groups[0]["lr"]}')

    torch.cuda.empty_cache()

# Total training time
end = perf_counter()
training_time = end - start
print(f'Training completed in {training_time} seconds')


In [None]:
# Persist the model's state_dict to 'openclip.pth' in the working directory.
torch.save(openclip_model.state_dict(), 'openclip.pth')


In [None]:
import torch
import json
from torch.utils.data import DataLoader, SequentialSampler
from tqdm import tqdm

# Put the trained model on the evaluation device in inference mode
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
openclip_model = openclip_model.to(device)
openclip_model.eval()

# The dev split carries labels and lives in the training image folder, so it is
# passed through the dataset's train=True path.
test_dataset = MultimodalDataset(
    train_dataframe=df_dev,
    eval_dataframe=df_test,
    label_to_id=label_to_id,
    model=openclip_model,
    train=True,
    caption_col='caption',
    label_col='label',
    img_col='image',
)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, sampler=test_sampler, num_workers=4)

# Collect predicted class ids batch by batch, without tracking gradients
autocast_device = 'cuda' if device.type == 'cuda' else 'cpu'
predicted_ids = []
with torch.no_grad():
    for dev_images, dev_tokens, dev_labels in tqdm(test_dataloader, total=len(test_dataloader), desc='Inference'):
        with torch.amp.autocast(device_type=autocast_device):
            dev_logits = openclip_model(dev_images.to(device), dev_tokens.squeeze(1).to(device))
        predicted_ids.extend(dev_logits.argmax(dim=-1).tolist())

# Translate class ids back into label names
prediction_results = [id_to_label[class_id] for class_id in predicted_ids]


In [None]:
class_report = classification_report(df_dev['label'], prediction_results)
# print() renders the report's line breaks; a bare expression would display the
# escaped repr of the string instead.
print(class_report)

In [None]:
#make prediciton
import torch
import json
from torch.utils.data import DataLoader, SequentialSampler
from tqdm import tqdm

# Uses the in-memory `openclip_model`.  To predict from a saved checkpoint instead,
# rebuild MultimodalClassifier(model_name, pretrained, num_labels, device) and call
# load_state_dict(torch.load(<checkpoint path>)) on it before running this cell.
num_labels = 4  # multi-sarcasm, not-sarcasm, image-sarcasm, text-sarcasm

# Inference mode
openclip_model.eval()

# Give the test captions the same cleaning the training captions received, so the
# model sees text in the form it was trained on.
df_test_clean = df_test.assign(caption=df_test['caption'].apply(preprocess_text))

# Test dataset and sequential dataloader
test_dataset = MultimodalDataset(train_dataframe=df_train, eval_dataframe=df_test_clean, label_to_id=label_to_id, model=openclip_model, train=False, caption_col='caption', label_col='label', img_col='image')
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, sampler=test_sampler, num_workers=4)

# Sample ids are the DataFrame index; the sequential sampler preserves row order
sample_ids = test_dataset.data.index

results = {}
with torch.no_grad():
    for step, batch in tqdm(enumerate(test_dataloader), total=len(test_dataloader), desc='Inference'):
        b_imgs, b_text = batch

        # Drop the singleton token dimension and move inputs to the device
        b_inputs = b_text.squeeze(1).to(device)
        b_imgs = b_imgs.to(device)

        # Forward pass
        with torch.amp.autocast('cuda'):
            logits = openclip_model(b_imgs, b_inputs)

        # Most likely class id per sample
        predictions = torch.argmax(logits, dim=1).tolist()

        # Row position of the batch's first sample; the last batch may be shorter
        batch_start = step * batch_size
        for offset, pred in enumerate(predictions):
            results[sample_ids[batch_start + offset]] = id_to_label[pred]

# Write the predictions to results.json
# NOTE(review): "phase" is "dev" although the private-test file is being predicted —
# confirm the value the submission format expects.
output = {
    "results": results,
    "phase": "dev"
}

with open('results.json', 'w') as f:
    json.dump(output, f, indent=4)

print("Inference completed. Results saved to results.json.")

In [None]:
# Destination of the plain-text classification report
file_path = "/kaggle/working/training_report.txt"

# Write the dev-set report produced earlier
with open(file_path, "w") as report_file:
    report_file.write(class_report)

print(f"Report saved to {file_path}")


In [None]:
# for step, batch in tqdm(enumerate(train_dataloader), total=len(train_dataloader), desc='Batch'):
#         b_imgs, b_text, b_labels = batch
#         print(b_imgs.shape)
#         print(b_text.squeeze(1).shape)
#         print(b_labels.shape)

In [None]:
# # Giả sử bạn đã load mô hình
# import open_clip

# # Load model từ open_clip
# model_name = "nllb-clip-large-siglip"  # hoặc model bạn đang dùng
# pretrained = "mrl"  # Tên của mô hình pretrained (nếu có)

# # Load model và preprocess
# model, preprocess_train, preprocess_val = open_clip.create_model_and_transforms(model_name, pretrained=pretrained)