In [1]:
from PIL import Image
import cv2
import json
import gc
from time import sleep
import nltk

import pandas as pd
from datasets import Dataset, Image as HuggingFaceImage
import torch
import torch.nn as nn
from transformers import (
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
    MBart50TokenizerFast,
    MBartForConditionalGeneration,
    DataCollatorWithPadding,
    CLIPModel,
    CLIPProcessor,
    PreTrainedModel,
    PretrainedConfig,
)

In [2]:
import os
device = "cuda" if torch.cuda.is_available() else "cpu"
os.environ["WANDB_API_KEY"] = "34170ca0db8872600d39c46e69d4e52d93e27f85"
device

'cuda'

In [3]:
def load_dataset(dataset_root, data_file, image_folder):
    with open(f"{dataset_root}/{data_file}", encoding="utf8") as f:
        data = json.load(f)

    image_metadata = data["images"]
    annotations = data["annotations"]
    image_index = {image["id"]: image for image in image_metadata}

    image_paths = []
    captions = []
    for x in annotations:
        image_name = image_index[x["image_id"]]["filename"]
        image_paths.append(f"{dataset_root}/{image_folder}/{image_name}")
        captions.append(x["caption"])

    return Dataset.from_dict({"images": image_paths, "captions": captions}).cast_column("images", HuggingFaceImage())

In [4]:
def freeze_model_layers(model):
    """
    Completely prevent any layer from being updated
    """
    for param in model.parameters():
        param.requires_grad = False

In [5]:
class CLIPMBartImageCaptioningConfig(PretrainedConfig):
    model_type = "image_captioning"
    
    def __init__(self, clip_model_name="openai/clip-vit-base-patch32", mbart_model_name="facebook/mbart-large-50-many-to-many-mmt", max_caption_length=140, **kwargs):
        super().__init__(**kwargs)
        self.clip_model_name = clip_model_name
        self.mbart_model_name = mbart_model_name
        self.max_caption_length = max_caption_length
        
class CLIPMBartImageCaptioningModel(PreTrainedModel):
    config_class = CLIPMBartImageCaptioningConfig
    def __init__(self, config):
        super().__init__(config)
        self.clip = CLIPModel.from_pretrained(config.clip_model_name)
        #self.clip_preprocess = CLIPProcessor.from_pretrained(config.clip_model_name)
        self.mbart = MBartForConditionalGeneration.from_pretrained(config.mbart_model_name)
        self.tokenizer = MBart50TokenizerFast.from_pretrained(config.mt5_model_name)
        self.tokenizer.src_lang = "vi_VN"
        self.tokenizer.tgt_lang = "vi_VN"
        clip_output_dim = self.clip.config.projection_dim
        mbart_input_dim = self.mbart.config.d_model
        self.projection = nn.Linear(clip_output_dim, mbart_input_dim)

        # Freeze CLIP
        freeze_model_layers(self.clip)

    def forward(self, images, captions):
        # Encode images using CLIP
        image_features = self.clip.get_image_features(images)
        image_embeddings = self.projection(image_features)
        
        # Prepare inputs for MT5
        # labels = self.tokenizer(captions, return_tensors="pt", padding=True, truncation=True, max_length=self.config.max_caption_length)
        outputs = self.mbart(
            inputs_embeds=image_embeddings.unsqueeze(1),
            labels=captions,
        )

        return {
            "loss": outputs.loss,
            # "logits": outputs.logits,
            #outputs
        }
    def generate(self, images,max_length = 140):
        with torch.no_grad():
            image_features = self.clip.get_image_features(images)
            image_embeddings = self.projection(image_features)
            outputs = self.mbart.generate(inputs_embeds=image_embeddings.unsqueeze(1), max_length = max_length)
        return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)


In [6]:
config = CLIPMBartImageCaptioningConfig(
    clip_model_name="openai/clip-vit-base-patch32",
    mt5_model_name="facebook/mbart-large-50-many-to-many-mmt",
    max_caption_length=200,
)

In [7]:
processor = CLIPProcessor.from_pretrained(config.clip_model_name)
tokenizer = MBart50TokenizerFast.from_pretrained(config.mbart_model_name)
tokenizer.src_lang = "vi_VN"
tokenizer.tgt_lang = "vi_VN"
def transforms(example_batch):
    preprocess_image = processor(images=example_batch["images"], padding=True, return_tensors="pt")
    labels = tokenizer(example_batch["captions"], return_tensors="pt", padding="max_length", truncation=True, max_length=config.max_caption_length)
    return {
        "images": preprocess_image.pixel_values,
        "captions": labels.input_ids,
    }

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

In [8]:
train_dataset = load_dataset("/kaggle/input/ktvic-dataset/ktvic_dataset", "train_data.json", "train-images")
test_dataset = load_dataset("/kaggle/input/ktvic-dataset/ktvic_dataset", "test_data.json", "public-test-images")

In [9]:
#train_preprocess_dataset = train_dataset.select(range(100)).map(transforms, batched=True, batch_size=16)
train_preprocess_dataset = train_dataset.map(transforms, batched=True, batch_size=16)

train_preprocess_dataset.set_format("torch")
train_preprocess_dataset

Map:   0%|          | 0/18845 [00:00<?, ? examples/s]

Dataset({
    features: ['images', 'captions'],
    num_rows: 18845
})

In [10]:
#test_preprocess_dataset = test_dataset.select(range(100)).map(transforms, batched=True, batch_size=16)
test_preprocess_dataset = test_dataset.map(transforms, batched=True, batch_size=16)

test_preprocess_dataset.set_format("torch")
test_preprocess_dataset

Map:   0%|          | 0/2790 [00:00<?, ? examples/s]

Dataset({
    features: ['images', 'captions'],
    num_rows: 2790
})

In [11]:
train_preprocess_dataset[0]

{'images': tensor([[[ 0.5435,  0.5727,  0.6311,  ...,  0.7479,  0.8063,  0.8355],
          [ 0.4559,  0.4997,  0.5435,  ...,  0.7771,  0.7771,  0.7917],
          [ 0.4267,  0.4559,  0.5289,  ...,  0.7625,  0.7333,  0.7333],
          ...,
          [ 1.0252,  1.0398,  1.0398,  ...,  0.6165,  0.6311,  0.6019],
          [ 1.0398,  1.0398,  1.0398,  ...,  0.6165,  0.5873,  0.6019],
          [ 1.0690,  1.0982,  1.0836,  ...,  0.7333,  0.7625,  0.6603]],
 
         [[ 1.4746,  1.5046,  1.5796,  ...,  1.6547,  1.6547,  1.6547],
          [ 1.4145,  1.4596,  1.5046,  ...,  1.6547,  1.6397,  1.6247],
          [ 1.4295,  1.4596,  1.5346,  ...,  1.6247,  1.6096,  1.5946],
          ...,
          [ 1.0844,  1.0994,  1.0844,  ...,  0.3340,  0.3490,  0.3190],
          [ 1.0994,  1.0994,  1.0994,  ...,  0.3340,  0.3040,  0.3190],
          [ 1.1294,  1.1594,  1.1444,  ...,  0.4540,  0.4841,  0.3790]],
 
         [[ 2.0179,  2.0321,  2.0464,  ...,  2.0321,  1.9895,  1.9753],
          [ 1.9326

In [12]:
model = CLIPMBartImageCaptioningModel(config)

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

In [13]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [14]:
import torch
# torch.backends.cuda.matmul.allow_tf32 = True
# torch.backends.cudnn.allow_tf32 = True

training_args = TrainingArguments(
    output_dir="./results",
        eval_strategy="steps",

    num_train_epochs=3, #from 5
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    dataloader_num_workers=1,
    # gradient_accumulation_steps=4,
    # gradient_checkpointing=True,
    bf16=True,
    # tf32=True,
    save_steps=500,
    save_total_limit=1,
    eval_steps=500,
    report_to=None,
    #load_best_model_at_end=True,       # Load the best model based on eval metric
    #metric_for_best_model="ROUGE-L",  # Use default eval_loss
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_preprocess_dataset,
    eval_dataset=test_preprocess_dataset,
    #processing_class=tokenizer, 
    #compute_metrics=compute_metrics,
    #callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Start fine-tuning
trainer.train()
path_model = './clip_mbart_model'

trainer.save_model("./clip_mbart_model")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  self.pid = os.fork()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss
500,0.5909,No log
1000,0.1218,No log
1500,0.1072,No log
2000,0.1,No log
2500,0.0937,No log
3000,0.0839,No log
3500,0.0828,No log


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  self.pid = os.fork()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  self.pid = os.fork()
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  self.pid = os.fork()
  self.pid = os.fork()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	-

In [15]:
import shutil
path_model = './clip_mbart_model'

shutil.make_archive("clip_mbart_model", 'zip', path_model)

'/kaggle/working/clip_mbart_model.zip'

In [16]:
del trainer
torch.cuda.empty_cache()
gc.collect()

sleep(5)
torch.cuda.empty_cache()
gc.collect()

0

# **Get model**

In [17]:
model = CLIPMBartImageCaptioningModel.from_pretrained("./clip_mbart_model").to(device)

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

In [18]:
def test_transforms(example_batch):
        preprocess_image = processor(
            images=example_batch["images"], padding=True, return_tensors="pt"
        )
        return {
            "images": preprocess_image.pixel_values,
            "captions": example_batch["captions"],
        }

In [19]:
test_dataset = load_dataset("/kaggle/input/ktvic-dataset/ktvic_dataset", "test_data.json", "public-test-images")
#test_preprocess_dataset = test_dataset.select(range(100)).map(test_transforms, batched=True, batch_size=16)

test_preprocess_dataset = test_dataset.map(test_transforms, batched=True, batch_size=16)

test_preprocess_dataset.set_format("torch")
test_preprocess_dataset
test_captions = test_preprocess_dataset["captions"]


Map:   0%|          | 0/2790 [00:00<?, ? examples/s]

In [20]:
#test_images = test_preprocess_dataset["images"].to(device)
from torch.utils.data import DataLoader
from torch.nn import DataParallel
#model = DataParallel(model)
test_captions = test_preprocess_dataset["captions"]
test_loader = DataLoader(test_preprocess_dataset, batch_size=8, shuffle=False)

predicted_output = []

model.eval()  # Ensure the model is in evaluation mode
i = 0
with torch.no_grad():
    for batch in test_loader:
        i+=1

        batch_images = batch["images"].to(device)  # Move images to GPU
        batch_captions = batch["captions"]  # Captions are optional (for reference)

        # Generate predictions for the batch
        batch_outputs = model.generate(batch_images,max_length = 200)
        if i<3:
            print(batch_outputs)
        # Decode predictions and store them
        predicted_output.extend(batch_outputs)

['có một ngôi nhà cao tầng xuất hiện ở trong bức ảnh', 'có một ngôi nhà cao tầng xuất hiện ở trong bức ảnh', 'có một ngôi nhà cao tầng xuất hiện ở trong bức ảnh', 'có một ngôi nhà cao tầng xuất hiện ở trong bức ảnh', 'có một ngôi nhà cao tầng xuất hiện ở trong bức ảnh', 'có nhiều người đang biểu diễn với trống ở trên đường', 'có nhiều người đang biểu diễn với trống ở trên đường', 'có nhiều người đang biểu diễn với trống ở trên đường']
['có nhiều người đang biểu diễn với trống ở trên đường', 'có nhiều người đang biểu diễn với trống ở trên đường', 'có một con tàu đang di chuyển ở trên biển', 'có một con tàu đang di chuyển ở trên biển', 'có một con tàu đang di chuyển ở trên biển', 'có một con tàu đang di chuyển ở trên biển', 'có một con tàu đang di chuyển ở trên biển', 'có một người phụ nữ mặc áo vàng đang đứng bên một kệ hàng']


In [21]:
print(predicted_output[:100])

['có một ngôi nhà cao tầng xuất hiện ở trong bức ảnh', 'có một ngôi nhà cao tầng xuất hiện ở trong bức ảnh', 'có một ngôi nhà cao tầng xuất hiện ở trong bức ảnh', 'có một ngôi nhà cao tầng xuất hiện ở trong bức ảnh', 'có một ngôi nhà cao tầng xuất hiện ở trong bức ảnh', 'có nhiều người đang biểu diễn với trống ở trên đường', 'có nhiều người đang biểu diễn với trống ở trên đường', 'có nhiều người đang biểu diễn với trống ở trên đường', 'có nhiều người đang biểu diễn với trống ở trên đường', 'có nhiều người đang biểu diễn với trống ở trên đường', 'có một con tàu đang di chuyển ở trên biển', 'có một con tàu đang di chuyển ở trên biển', 'có một con tàu đang di chuyển ở trên biển', 'có một con tàu đang di chuyển ở trên biển', 'có một con tàu đang di chuyển ở trên biển', 'có một người phụ nữ mặc áo vàng đang đứng bên một kệ hàng', 'có một người phụ nữ mặc áo vàng đang đứng bên một kệ hàng', 'có một người phụ nữ mặc áo vàng đang đứng bên một kệ hàng', 'có một người phụ nữ mặc áo vàng đang đứn

In [22]:
!pip install evaluate
!pip install pycocoevalcap
!pip install nltk==3.6.7
!pip install rouge_score

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting pycocoevalcap
  Downloading pycocoevalcap-1.2-py3-none-any.whl.metadata (3.2 kB)
Collecting pycocotools>=2.0.2 (from pycocoevalcap)
  Downloading pycocotools-2.0.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Downloading pycocoevalcap-1.2-py3-none-any.whl (104.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.3/104.3 MB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pycocotools-2.0.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (427 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m427.8/427.8 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pycocotools, pycocoevalcap
Successfully installed pycocoevalcap-1.2 pycocotools-2.0.8


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting nltk==3.6.7
  Downloading nltk-3.6.7-py3-none-any.whl.metadata (2.8 kB)
Downloading nltk-3.6.7-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nltk
  Attempting uninstall: nltk
    Found existing installation: nltk 3.2.4
    Uninstalling nltk-3.2.4:
      Successfully uninstalled nltk-3.2.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
preprocessing 0.1.13 requires nltk==3.2.4, but you have nltk 3.6.7 which is incompatible.
textblob 0.18.0.post0 requires nltk>=3.8, but you have nltk 3.6.7 which is incompatible.[0m[31m
[0mSuccessfully installed nltk-3.6.7


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l- \ done
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=c2c16555b6415a68e9e44722cd2632a85e93a4b090bbb606835e3fdc0c049fac
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [23]:
test_captions = [[caption] for caption in test_captions]


In [24]:
import evaluate
import nltk


In [25]:
bleu = evaluate.load("bleu")
results_1 = bleu.compute(predictions=predicted_output, references=test_captions, max_order=1)
print(results_1)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

{'bleu': 0.3751866743251005, 'precisions': [0.3751866743251005], 'brevity_penalty': 1.0, 'length_ratio': 1.0292943923852316, 'translation_length': 34820, 'reference_length': 33829}


In [26]:
results_4 = bleu.compute(predictions=predicted_output, references=test_captions)
print(results_4)

{'bleu': 0.13648246178882584, 'precisions': [0.3751866743251005, 0.17867624102403995, 0.09500683994528043, 0.05448015122873346], 'brevity_penalty': 1.0, 'length_ratio': 1.0292943923852316, 'translation_length': 34820, 'reference_length': 33829}


In [27]:
cider = evaluate.load("Kamichanw/CIDEr")
results_c = cider.compute(predictions=predicted_output, references=test_captions)

print(results_c)

Downloading builder script:   0%|          | 0.00/4.93k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.81M [00:00<?, ?B/s]

PTBTokenizer tokenized 74228 tokens at 276433.58 tokens per second.


{'CIDEr': 1.0186763048008243}


In [28]:
import nltk

nltk.download('omw-1.4')
nltk.download('wordnet')
nltk.download('wordnet2022')
! cp -rf /usr/share/nltk_data/corpora/wordnet2022 /usr/share/nltk_data/corpora/wordnet # temp fix for lookup error.
meteor = evaluate.load("meteor")
results_m = meteor.compute(predictions=predicted_output, references=test_captions)
print(results_m)

[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet2022 to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/wordnet2022.zip.


  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
{'meteor': 0.32372637396467974}


In [29]:
rouge = evaluate.load("rouge")
results_r = rouge.compute(predictions=predicted_output, references=test_captions)
print(results_r)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

{'rouge1': 0.5403304570401544, 'rouge2': 0.2634118260231637, 'rougeL': 0.41904260287821704, 'rougeLsum': 0.4189197880854494}


In [30]:
'''
from kaggle_secrets import UserSecretsClient
import os

# Disable tokenizers parallelism warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

user_secrets = UserSecretsClient()
os.environ["KAGGLE_USERNAME"] =user_secrets.get_secret("KAGGLE_USERNAME")
os.environ["KAGGLE_KEY"] = user_secrets.get_secret("key")


'''



In [31]:
#!kaggle kernels output bonguyn2/batch-10 -p /kaggle/working
#!kaggle kernels output bonguyn2/clip-mbart -p /kaggle/working