In [4]:
!rm -rf /kaggle/working/vocab.pkl

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import LambdaLR
from PIL import Image
import pandas as pd
import numpy as np
import os
import re
import math
from tqdm import tqdm
import gc

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

# ƒê∆∞·ªùng d·∫´n Kaggle
IMG_DIR = "/kaggle/input/flickr30k/flickr30k_images"
CAPTION_FILE = "/kaggle/input/flickr30k/captions.txt"
FEATURE_DIR = "/kaggle/working/features"

# Tham s·ªë Model - TRANSFORMER
EMBED_SIZE = 512          # Embedding dimension
HIDDEN_SIZE = 512         # Hidden dimension (d_model trong Transformer)
NUM_LAYERS = 3            # S·ªë Transformer decoder layers
NUM_HEADS = 8             # S·ªë attention heads
DROPOUT = 0.3             # Dropout rate
FFN_DIM = 2048            # Feed-forward network dimension (4x hidden)

# Tham s·ªë Training
BATCH_SIZE = 32           # Batch size
NUM_WORKERS = 2           # DataLoader workers
NUM_EPOCHS = 15           # S·ªë epochs
WARMUP_STEPS = 4000       # Warmup steps cho learning rate
MAX_LR = 1e-4             # Max learning rate

# T·∫°o th∆∞ m·ª•c
os.makedirs(FEATURE_DIR, exist_ok=True)

print(f"Configuration loaded")
print(f"   - HIDDEN_SIZE: {HIDDEN_SIZE}")
print(f"   - NUM_LAYERS: {NUM_LAYERS}")
print(f"   - NUM_HEADS: {NUM_HEADS}")
print(f"   - NUM_EPOCHS: {NUM_EPOCHS}")

Device: cpu
Configuration loaded
   - HIDDEN_SIZE: 512
   - NUM_LAYERS: 3
   - NUM_HEADS: 8
   - NUM_EPOCHS: 15


In [7]:
class Vocabulary:
    """
    X√¢y d·ª±ng t·ª´ ƒëi·ªÉn t·ª´ captions
    """
    def __init__(self, freq_threshold):
        self.itos = {0: "<pad>", 1: "<start>", 2: "<end>", 3: "<unk>"}
        self.stoi = {"<pad>": 0, "<start>": 1, "<end>": 2, "<unk>": 3}
        self.freq_threshold = freq_threshold

    def __len__(self):
        return len(self.itos)

    @staticmethod
    def tokenizer_eng(text):
        """
        Tokenize English text
        FIX: S·ª≠ d·ª•ng regex ƒë·ªÉ lo·∫°i b·ªè k√Ω t·ª± ƒë·∫∑c bi·ªát
        """
        text = str(text).lower()
        text = re.sub(r'[^a-z ]', '', text)  # Ch·ªâ gi·ªØ ch·ªØ c√°i v√† kho·∫£ng tr·∫Øng
        return text.split()

    def build_vocabulary(self, sentence_list):
        """X√¢y d·ª±ng vocabulary t·ª´ list captions"""
        frequencies = {}
        idx = 4

        for sentence in sentence_list:
            for word in self.tokenizer_eng(sentence):
                frequencies[word] = frequencies.get(word, 0) + 1
                if frequencies[word] == self.freq_threshold:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1

    def numericalize(self, text):
        """Chuy·ªÉn text th√†nh list of indices"""
        tokenized_text = self.tokenizer_eng(text)
        return [self.stoi.get(token, self.stoi["<unk>"]) for token in tokenized_text]

print("Class Vocabulary ƒë√£ xong")

Class Vocabulary ƒë√£ xong


In [8]:
# Load v√† x·ª≠ l√Ω d·ªØ li·ªáu
print("LOADING & PROCESSING DATA")
print("="*80)

df = pd.read_csv(CAPTION_FILE)
print(f"Loaded {len(df)} captions")
print(f"Columns: {df.columns.tolist()}")
print(f"\nSample data:")
print(df.head())

# X√¢y d·ª±ng vocabulary
vocab = Vocabulary(freq_threshold=3)
vocab.build_vocabulary(df['comment'].tolist())
print(f"\nBuilt vocabulary with {len(vocab)} tokens")
print(f"Special tokens: <pad>={vocab.stoi['<pad>']}, "
      f"<start>={vocab.stoi['<start>']}, "
      f"<end>={vocab.stoi['<end>']}, "
      f"<unk>={vocab.stoi['<unk>']}")

LOADING & PROCESSING DATA
Loaded 158915 captions
Columns: ['image_name', 'comment_number', 'comment']

Sample data:
       image_name  comment_number  \
0  1000092795.jpg               0   
1  1000092795.jpg               1   
2  1000092795.jpg               2   
3  1000092795.jpg               3   
4  1000092795.jpg               4   

                                             comment  
0  Two young guys with shaggy hair look at their ...  
1  Two young  White males are outside near many b...  
2   Two men in green shirts are standing in a yard .  
3       A man in a blue shirt standing in a garden .  
4            Two friends enjoy time spent together .  

Built vocabulary with 9964 tokens
Special tokens: <pad>=0, <start>=1, <end>=2, <unk>=3


In [9]:
# ===================================================================
# SAVE VOCAB - CH·∫†Y NGAY TRONG NOTEBOOK HI·ªÜN T·∫†I (KH√îNG C·∫¶N TRAIN L·∫†I)
# ===================================================================

import pickle
from IPython.display import FileLink

# Ki·ªÉm tra vocab c√≥ t·ªìn t·∫°i kh√¥ng
print(f"‚úÖ Vocab ƒë√£ t·ªìn t·∫°i v·ªõi {len(vocab)} tokens")

# Save vocab
with open('/kaggle/working/vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)

print("‚úÖ ƒê√£ save vocab.pkl")

# Download link
print("\nüì• Click ƒë·ªÉ t·∫£i xu·ªëng:")
display(FileLink('/kaggle/working/vocab.pkl'))

‚úÖ Vocab ƒë√£ t·ªìn t·∫°i v·ªõi 9964 tokens
‚úÖ ƒê√£ save vocab.pkl

üì• Click ƒë·ªÉ t·∫£i xu·ªëng:


In [8]:
class EncoderCNN(nn.Module):
    """
    Encoder: EfficientNet-B0 ƒë·ªÉ extract visual features
    Output: (Batch, 49, 1280) - 49 spatial locations v·ªõi 1280-dim features
    """
    def __init__(self):
        super(EncoderCNN, self).__init__()
        backbone = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.DEFAULT)
        self.features = backbone.features
        
        # Freeze encoder ƒë·ªÉ ti·∫øt ki·ªám memory v√† tr√°nh overfitting
        for param in self.features.parameters():
            param.requires_grad = False
        
        self.features.eval()
            
    def forward(self, images):
        """
        Args:
            images: (B, 3, 224, 224)
        Returns:
            features: (B, 49, 1280)
        """
        with torch.no_grad():
            features = self.features(images)  # (B, 1280, 7, 7)
        
        # Reshape to (B, 49, 1280)
        features = features.permute(0, 2, 3, 1)  # (B, 7, 7, 1280)
        features = features.view(features.size(0), -1, features.size(3))  # (B, 49, 1280)
        return features

print("Class Encoder ƒë√£ xong")

Class Encoder ƒë√£ xong


In [9]:
encoder = EncoderCNN().to(device)
print("\n" + "="*80)
print("ENCODER ARCHITECTURE")
print("="*80)
print("EfficientNet-B0 Encoder initialized")
print("Output shape: (Batch, 49, 1280)")
print("Parameters: Frozen (no training)")

Downloading: "https://download.pytorch.org/models/efficientnet_b0_rwightman-7f5810bc.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b0_rwightman-7f5810bc.pth


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20.5M/20.5M [00:00<00:00, 190MB/s]



ENCODER ARCHITECTURE
EfficientNet-B0 Encoder initialized
Output shape: (Batch, 49, 1280)
Parameters: Frozen (no training)


In [10]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def extract_features_smart():
    """
    Extract features t·ª´ images v√† l∆∞u v√†o disk
    Ch·ªâ ch·∫°y n·∫øu ch∆∞a c√≥ features
    """
    # Ki·ªÉm tra xem ƒë√£ extract ch∆∞a
    if os.path.exists(FEATURE_DIR):
        num_files = len([f for f in os.listdir(FEATURE_DIR) if f.endswith('.npy')])
        if num_files > 30000:  # Flickr30k c√≥ ~31,783 ·∫£nh
            print(f"\n‚úÖ Found {num_files} features in {FEATURE_DIR}")
            print("   Skipping feature extraction...")
            return
    
    print(f"\nFeature directory empty or incomplete")
    print("Starting feature extraction...")
    
    unique_images = df['image_name'].unique()
    encoder.eval()
    
    batch_size = 32
    total_batches = (len(unique_images) + batch_size - 1) // batch_size
    
    with torch.no_grad():
        for i in tqdm(range(0, len(unique_images), batch_size), 
                     desc="Extracting features", total=total_batches):
            batch_imgs_paths = unique_images[i : i + batch_size]
            img_tensors = []
            valid_paths = []
            
            for img_name in batch_imgs_paths:
                img_path = os.path.join(IMG_DIR, img_name)
                try:
                    image = Image.open(img_path).convert("RGB")
                    image = transform(image)
                    img_tensors.append(image)
                    valid_paths.append(img_name)
                except Exception as e:
                    continue
            
            if not img_tensors:
                continue
            
            batch_input = torch.stack(img_tensors).to(device)
            features = encoder(batch_input)
            features_np = features.cpu().numpy()
            
            for img_name, feature in zip(valid_paths, features_np):
                save_path = os.path.join(FEATURE_DIR, img_name + ".npy")
                np.save(save_path, feature)
            
            # Clear memory
            del batch_input, features, features_np
            
        gc.collect()
        torch.cuda.empty_cache()
    
    num_saved = len([f for f in os.listdir(FEATURE_DIR) if f.endswith('.npy')])
    print(f"Saved {num_saved} feature files to {FEATURE_DIR}")


# Extract features
extract_features_smart()


‚úÖ Found 31783 features in /kaggle/working/features
   Skipping feature extraction...


In [14]:
class FlickrDataset(Dataset):
    """Dataset class ƒë·ªÉ load features v√† captions"""
    def __init__(self, df, feature_dir, vocab):
        self.df = df
        self.feature_dir = feature_dir
        self.vocab = vocab
        self.imgs = df["image_name"]
        self.captions = df["comment"]

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        caption = self.captions.iloc[index]
        img_id = self.imgs.iloc[index]

        # Load feature
        feature_path = os.path.join(self.feature_dir, img_id + ".npy")
        features = np.load(feature_path)  # (49, 1280)
        
        # Numericalize caption
        numericalized_caption = [self.vocab.stoi["<start>"]]
        numericalized_caption += self.vocab.numericalize(caption)
        numericalized_caption.append(self.vocab.stoi["<end>"])

        return torch.tensor(features, dtype=torch.float32), torch.tensor(numericalized_caption)


class MyCollate:
    """Collate function ƒë·ªÉ padding captions trong batch"""
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        features = torch.stack([item[0] for item in batch], dim=0)  # (B, 49, 1280)
        captions = [item[1] for item in batch]
        
        # Padding captions
        targets = torch.nn.utils.rnn.pad_sequence(
            captions, batch_first=True, padding_value=self.pad_idx
        )

        return features, targets


# Chia train/val
train_size = int(0.9 * len(df))
train_df = df.iloc[:train_size].reset_index(drop=True)
val_df = df.iloc[train_size:].reset_index(drop=True)

print("\n" + "="*80)
print("CREATING DATALOADERS")
print("="*80)
print(f"Train samples: {len(train_df)}")
print(f"Val samples: {len(val_df)}")

pad_idx = vocab.stoi["<pad>"]

train_dataset = FlickrDataset(train_df, FEATURE_DIR, vocab)
val_dataset = FlickrDataset(val_df, FEATURE_DIR, vocab)

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=MyCollate(pad_idx),
    num_workers=NUM_WORKERS,
    pin_memory=True
)

val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=MyCollate(pad_idx),
    num_workers=NUM_WORKERS,
    pin_memory=True
)

print(f"Train batches: {len(train_loader)}")
print(f"Val batches: {len(val_loader)}")


CREATING DATALOADERS
Train samples: 143023
Val samples: 15892
Train batches: 4470
Val batches: 497


In [11]:
class PositionalEncoding(nn.Module):
    """Positional encoding cho Transformer"""
    def __init__(self, d_model, max_len=100):
        super(PositionalEncoding, self).__init__()
        
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
        
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1), :]

In [15]:
class TransformerDecoder(nn.Module):
    """
    Transformer Decoder cho Image Captioning
    
    Architecture:
    1. Embedding layer cho captions
    2. Positional encoding
    3. Multi-layer Transformer decoder
    4. Output projection
    """
    
    def __init__(self, vocab_size, embed_size=512, hidden_size=512, 
                 num_layers=3, num_heads=8, dropout=0.3):
        super(TransformerDecoder, self).__init__()
        
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        
        # 1. Embedding
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.pos_encoder = PositionalEncoding(embed_size)
        self.embed_dropout = nn.Dropout(dropout)
        
        # 2. Project image features (1280 -> hidden_size)
        self.feature_proj = nn.Linear(1280, hidden_size)
        
        # 3. Transformer Decoder
        decoder_layer = nn.TransformerDecoderLayer(
            d_model=hidden_size,
            nhead=num_heads,
            dim_feedforward=hidden_size * 4,
            dropout=dropout,
            activation='gelu',
            batch_first=True,
            norm_first=True  # Pre-norm (better for deep models)
        )
        
        self.transformer_decoder = nn.TransformerDecoder(
            decoder_layer,
            num_layers=num_layers
        )
        
        # 4. Output
        self.fc_out = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)
        
        # Initialize weights
        self._init_weights()
    
    def _init_weights(self):
        """Xavier uniform initialization"""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
    
    def generate_square_subsequent_mask(self, sz):
        """
        Causal mask ƒë·ªÉ decoder kh√¥ng nh√¨n th·∫•y future tokens
        """
        mask = torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)
        return mask
    
    def forward(self, features, captions):
        """
        Args:
            features: (B, 49, 1280) - Image features
            captions: (B, seq_len) - Caption tokens
        
        Returns:
            logits: (B, seq_len, vocab_size)
        """
        batch_size = features.size(0)
        seq_len = captions.size(1)
        
        # 1. Project image features
        memory = self.feature_proj(features)  # (B, 49, hidden)
        
        # 2. Embed captions
        tgt = self.embed(captions) * math.sqrt(self.embed_size)
        tgt = self.pos_encoder(tgt)
        tgt = self.embed_dropout(tgt)  # (B, seq_len, embed)
        
        # 3. Create causal mask
        tgt_mask = self.generate_square_subsequent_mask(seq_len).to(captions.device)
        
        # 4. Transformer decoder
        output = self.transformer_decoder(
            tgt=tgt,
            memory=memory,
            tgt_mask=tgt_mask
        )  # (B, seq_len, hidden)
        
        # 5. Output projection
        output = self.dropout(output)
        logits = self.fc_out(output)  # (B, seq_len, vocab_size)
        
        return logits


# Kh·ªüi t·∫°o model
model = TransformerDecoder(
    vocab_size=len(vocab),
    embed_size=EMBED_SIZE,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    num_heads=NUM_HEADS,
    dropout=DROPOUT
).to(device)

print("\n" + "="*80)
print("TRANSFORMER DECODER ARCHITECTURE")
print("="*80)
print(f"Vocab size: {len(vocab)}")
print(f"Embed size: {EMBED_SIZE}")
print(f"Hidden size: {HIDDEN_SIZE}")
print(f"Num layers: {NUM_LAYERS}")
print(f"Num heads: {NUM_HEADS}")
print(f"Dropout: {DROPOUT}")

total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")


TRANSFORMER DECODER ARCHITECTURE
Vocab size: 9964
Embed size: 512
Hidden size: 512
Num layers: 3
Num heads: 8
Dropout: 0.3
Total parameters: 23,481,068
Trainable parameters: 23,481,068


In [16]:
# Loss function
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

# Optimizer
optimizer = optim.AdamW(
    model.parameters(),
    lr=MAX_LR,
    betas=(0.9, 0.98),
    eps=1e-9,
    weight_decay=0.01
)

# Learning rate scheduler v·ªõi warmup
def lr_lambda(step):
    """Warmup + inverse sqrt decay"""
    if step == 0:
        return 0
    if step < WARMUP_STEPS:
        return step / WARMUP_STEPS
    return (WARMUP_STEPS ** 0.5) * (step ** -0.5)

scheduler = LambdaLR(optimizer, lr_lambda)

print("\n" + "="*80)
print("TRAINING CONFIGURATION")
print("="*80)
print(f"Optimizer: AdamW")
print(f"Learning rate: {MAX_LR} (with warmup)")
print(f"Warmup steps: {WARMUP_STEPS}")
print(f"Batch size: {BATCH_SIZE}")
print(f"Num epochs: {NUM_EPOCHS}")


TRAINING CONFIGURATION
Optimizer: AdamW
Learning rate: 0.0001 (with warmup)
Warmup steps: 4000
Batch size: 32
Num epochs: 15


In [17]:
def train_one_epoch(model, train_loader, criterion, optimizer, scheduler, epoch):
    """Train 1 epoch"""
    model.train()
    total_loss = 0
    
    loop = tqdm(train_loader, desc=f"Epoch {epoch}/{NUM_EPOCHS} [TRAIN]", leave=True)
    
    for features, captions in loop:
        features = features.to(device)  # (B, 49, 1280)
        captions = captions.to(device)  # (B, seq_len)
        
        # Forward
        outputs = model(features, captions[:, :-1])  # (B, seq_len-1, vocab)
        
        # Reshape for loss
        outputs = outputs.reshape(-1, outputs.size(-1))
        targets = captions[:, 1:].reshape(-1)
        
        # Loss
        loss = criterion(outputs, targets)
        
        # Backward
        optimizer.zero_grad()
        loss.backward()
        
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        
        optimizer.step()
        scheduler.step()
        
        total_loss += loss.item()
        
        # Update progress bar
        current_lr = optimizer.param_groups[0]['lr']
        loop.set_postfix(loss=loss.item(), lr=current_lr)
    
    avg_loss = total_loss / len(train_loader)
    return avg_loss

In [18]:
def validate(model, val_loader, criterion, epoch):
    """Validation"""
    model.eval()
    total_loss = 0
    
    with torch.no_grad():
        loop = tqdm(val_loader, desc=f"Epoch {epoch}/{NUM_EPOCHS} [VAL]", leave=True)
        
        for features, captions in loop:
            features = features.to(device)
            captions = captions.to(device)
            
            # Forward
            outputs = model(features, captions[:, :-1])
            
            # Reshape
            outputs = outputs.reshape(-1, outputs.size(-1))
            targets = captions[:, 1:].reshape(-1)
            
            # Loss
            loss = criterion(outputs, targets)
            total_loss += loss.item()
            
            loop.set_postfix(loss=loss.item())
    
    avg_loss = total_loss / len(val_loader)
    return avg_loss

In [2]:
import torch
import wandb

# --- C·∫§U H√åNH WANDB ---
# 1. B·∫°n v√†o Kaggle -> Add-ons -> Secrets -> Add new secret
# Label: wandb_key, Value: (L·∫•y API Key t·ª´ https://wandb.ai/authorize)
from kaggle_secrets import UserSecretsClient

try:
    user_secrets = UserSecretsClient()
    wandb_api_key = user_secrets.get_secret("wandb_key")
    wandb.login(key=wandb_api_key)
except:
    # N·∫øu ch∆∞a set secret th√¨ nh·∫≠p tay khi ch·∫°y
    wandb.login() 

# Kh·ªüi t·∫°o project (N√≥ s·∫Ω t·ª± t·∫°o project m·ªõi tr√™n web/app cho b·∫°n)
run = wandb.init(
    project="transformer-project", 
    name="kaggle_run_mobile_tracking",
    config={
        "learning_rate": optimizer.param_groups[0]['lr'],
        "epochs": NUM_EPOCHS,
        "batch_size": 32, # V√≠ d·ª•, b·∫°n c√≥ th·ªÉ thay s·ªë th·ª±c t·∫ø
        "architecture": "Transformer"
    }
)

print("\n" + "="*80)
print("STARTING TRAINING")
print("="*80)

best_val_loss = float('inf')
patience_counter = 0

for epoch in range(1, NUM_EPOCHS + 1):
    print(f"\n{'='*80}")
    print(f"EPOCH {epoch}/{NUM_EPOCHS}")
    print('='*80)
    
    # Train
    train_loss = train_one_epoch(model, train_loader, criterion, optimizer, scheduler, epoch)
    
    # Validate
    val_loss = validate(model, val_loader, criterion, epoch)
    
    current_lr = optimizer.param_groups[0]['lr']

    print(f"\nEpoch {epoch} Results:")
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Val Loss:   {val_loss:.4f}")
    
    # --- LOG L√äN WANDB (Quan tr·ªçng nh·∫•t) ---
    # D√≤ng n√†y s·∫Ω ƒë·∫©y d·ªØ li·ªáu l√™n app ƒëi·ªán tho·∫°i ngay l·∫≠p t·ª©c
    wandb.log({
        "epoch": epoch,
        "train_loss": train_loss,
        "val_loss": val_loss,
        "learning_rate": current_lr
    })

    # Save best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        torch.save(model.state_dict(), "best_model.pth")
        print(f"Saved best model (Val Loss: {best_val_loss:.4f})")
    else:
        patience_counter += 1
        print(f"No improvement ({patience_counter}/{EARLY_STOP_PATIENCE})")
    
    # Early stopping
    if patience_counter >= EARLY_STOP_PATIENCE:
        print(f"\nEarly stopping at epoch {epoch}")
        break

# K·∫øt th√∫c tracking
wandb.finish()

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Currently logged in as: [33mleducminh583[0m ([33mleducminh583-national-economics-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin



STARTING TRAINING

EPOCH 1/15


Epoch 1/15 [TRAIN]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4470/4470 [02:28<00:00, 30.12it/s, loss=3.64, lr=7.05e-5]
Epoch 1/15 [VAL]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 497/497 [00:07<00:00, 67.37it/s, loss=3.3] 



Epoch 1 Results:
Train Loss: 3.7352
Val Loss:   3.7137
Saved best model (Val Loss: 3.7137)

EPOCH 2/15


Epoch 2/15 [TRAIN]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4470/4470 [02:27<00:00, 30.33it/s, loss=3.58, lr=5.65e-5]
Epoch 2/15 [VAL]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 497/497 [00:07<00:00, 65.10it/s, loss=3.21]



Epoch 2 Results:
Train Loss: 3.5145
Val Loss:   3.5965
Saved best model (Val Loss: 3.5965)

EPOCH 3/15


Epoch 3/15 [TRAIN]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4470/4470 [02:28<00:00, 30.19it/s, loss=3.04, lr=4.85e-5]
Epoch 3/15 [VAL]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 497/497 [00:07<00:00, 67.54it/s, loss=3.23]



Epoch 3 Results:
Train Loss: 3.4171
Val Loss:   3.5583
Saved best model (Val Loss: 3.5583)

EPOCH 4/15


Epoch 4/15 [TRAIN]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4470/4470 [02:27<00:00, 30.27it/s, loss=3.02, lr=4.32e-5]
Epoch 4/15 [VAL]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 497/497 [00:07<00:00, 66.90it/s, loss=3.18]



Epoch 4 Results:
Train Loss: 3.3541
Val Loss:   3.5143
Saved best model (Val Loss: 3.5143)

EPOCH 5/15


Epoch 5/15 [TRAIN]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4470/4470 [02:27<00:00, 30.32it/s, loss=3.08, lr=3.93e-5]
Epoch 5/15 [VAL]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 497/497 [00:07<00:00, 67.14it/s, loss=3.21]



Epoch 5 Results:
Train Loss: 3.3060
Val Loss:   3.4815
Saved best model (Val Loss: 3.4815)

EPOCH 6/15


Epoch 6/15 [TRAIN]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4470/4470 [02:27<00:00, 30.37it/s, loss=3.31, lr=3.63e-5]
Epoch 6/15 [VAL]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 497/497 [00:07<00:00, 65.41it/s, loss=3.18]



Epoch 6 Results:
Train Loss: 3.2675
Val Loss:   3.4673
Saved best model (Val Loss: 3.4673)

EPOCH 7/15


Epoch 7/15 [TRAIN]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4470/4470 [02:27<00:00, 30.39it/s, loss=3.39, lr=3.39e-5]
Epoch 7/15 [VAL]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 497/497 [00:07<00:00, 68.09it/s, loss=3.2] 



Epoch 7 Results:
Train Loss: 3.2338
Val Loss:   3.4393
Saved best model (Val Loss: 3.4393)

EPOCH 8/15


Epoch 8/15 [TRAIN]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4470/4470 [02:28<00:00, 30.19it/s, loss=3.36, lr=3.19e-5]
Epoch 8/15 [VAL]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 497/497 [00:07<00:00, 67.14it/s, loss=3.2] 



Epoch 8 Results:
Train Loss: 3.2027
Val Loss:   3.4376
Saved best model (Val Loss: 3.4376)

EPOCH 9/15


Epoch 9/15 [TRAIN]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4470/4470 [02:27<00:00, 30.30it/s, loss=2.72, lr=3.02e-5]
Epoch 9/15 [VAL]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 497/497 [00:07<00:00, 67.67it/s, loss=3.19]



Epoch 9 Results:
Train Loss: 3.1775
Val Loss:   3.4303
Saved best model (Val Loss: 3.4303)

EPOCH 10/15


Epoch 10/15 [TRAIN]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4470/4470 [02:27<00:00, 30.40it/s, loss=3.1, lr=2.88e-5] 
Epoch 10/15 [VAL]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 497/497 [00:07<00:00, 66.83it/s, loss=3.18]



Epoch 10 Results:
Train Loss: 3.1541
Val Loss:   3.4143
Saved best model (Val Loss: 3.4143)

EPOCH 11/15


Epoch 11/15 [TRAIN]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4470/4470 [02:27<00:00, 30.35it/s, loss=2.58, lr=2.75e-5]
Epoch 11/15 [VAL]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 497/497 [00:07<00:00, 67.19it/s, loss=3.15]



Epoch 11 Results:
Train Loss: 3.1308
Val Loss:   3.4044
Saved best model (Val Loss: 3.4044)

EPOCH 12/15


Epoch 12/15 [TRAIN]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4470/4470 [02:27<00:00, 30.38it/s, loss=3.29, lr=2.64e-5]
Epoch 12/15 [VAL]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 497/497 [00:07<00:00, 66.63it/s, loss=3.17]



Epoch 12 Results:
Train Loss: 3.1121
Val Loss:   3.3969
Saved best model (Val Loss: 3.3969)

EPOCH 13/15


Epoch 13/15 [TRAIN]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4470/4470 [02:27<00:00, 30.31it/s, loss=2.8, lr=2.55e-5] 
Epoch 13/15 [VAL]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 497/497 [00:07<00:00, 68.17it/s, loss=3.2] 



Epoch 13 Results:
Train Loss: 3.0932
Val Loss:   3.3869
Saved best model (Val Loss: 3.3869)

EPOCH 14/15


Epoch 14/15 [TRAIN]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4470/4470 [02:26<00:00, 30.43it/s, loss=3.5, lr=2.46e-5] 
Epoch 14/15 [VAL]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 497/497 [00:07<00:00, 67.26it/s, loss=3.16]



Epoch 14 Results:
Train Loss: 3.0761
Val Loss:   3.3909
No improvement (1/5)

EPOCH 15/15


Epoch 15/15 [TRAIN]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4470/4470 [02:27<00:00, 30.29it/s, loss=3.1, lr=2.38e-5] 
Epoch 15/15 [VAL]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 497/497 [00:07<00:00, 66.70it/s, loss=3.21]



Epoch 15 Results:
Train Loss: 3.0592
Val Loss:   3.3865
Saved best model (Val Loss: 3.3865)


0,1
epoch,‚ñÅ‚ñÅ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñá‚ñá‚ñá‚ñà
learning_rate,‚ñà‚ñÜ‚ñÖ‚ñÑ‚ñÉ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ
train_loss,‚ñà‚ñÜ‚ñÖ‚ñÑ‚ñÑ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ
val_loss,‚ñà‚ñÖ‚ñÖ‚ñÑ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ

0,1
epoch,15.0
learning_rate,2e-05
train_loss,3.05923
val_loss,3.38654


In [19]:
def generate_caption_beam_search(model, img_path, vocab, encoder, transform,
                                 max_len=20, beam_width=3):
    """
    Generate caption v·ªõi beam search
    
    Args:
        model: Trained Transformer model
        img_path: Path to image
        vocab: Vocabulary object
        encoder: CNN encoder
        transform: Image transform
        max_len: Max caption length
        beam_width: Beam width
    
    Returns:
        caption: Generated caption string
    """
    model.eval()
    device = next(model.parameters()).device
    
    # Load v√† preprocess image
    img = Image.open(img_path).convert("RGB")
    img_tensor = transform(img).unsqueeze(0).to(device)
    
    # Extract features
    with torch.no_grad():
        features = encoder(img_tensor)  # (1, 49, 1280)
    
    # Initialize beam
    sequences = [[vocab.stoi["<start>"]]]
    scores = [0.0]
    
    for step in range(max_len):
        all_candidates = []
        
        for i, seq in enumerate(sequences):
            # N·∫øu sequence ƒë√£ k·∫øt th√∫c
            if seq[-1] == vocab.stoi["<end>"]:
                all_candidates.append((seq, scores[i]))
                continue
            
            # Prepare input
            tgt = torch.LongTensor([seq]).to(device)  # (1, len(seq))
            
            # Forward pass
            with torch.no_grad():
                logits = model(features, tgt)  # (1, len(seq), vocab_size)
                log_probs = torch.log_softmax(logits[:, -1, :], dim=-1)  # (1, vocab_size)
            
            # Get top-k
            topk_log_probs, topk_indices = torch.topk(log_probs, beam_width)
            
            for j in range(beam_width):
                candidate_seq = seq + [topk_indices[0][j].item()]
                candidate_score = scores[i] + topk_log_probs[0][j].item()
                all_candidates.append((candidate_seq, candidate_score))
        
        # Keep top beam_width sequences
        ordered = sorted(all_candidates, key=lambda x: x[1], reverse=True)
        sequences = [seq for seq, _ in ordered[:beam_width]]
        scores = [score for _, score in ordered[:beam_width]]
        
        # Early stop
        if all(seq[-1] == vocab.stoi["<end>"] for seq in sequences):
            break
    
    # Return best sequence
    best_seq = sequences[0]
    caption = [vocab.itos[idx] for idx in best_seq[1:] 
               if idx not in [vocab.stoi["<end>"], vocab.stoi["<pad>"]]]
    
    return ' '.join(caption)

In [20]:
def evaluate_bleu(model, df_eval, vocab, encoder, transform, num_samples=500):
    """
    ƒê√°nh gi√° BLEU score
    """
    from nltk.translate.bleu_score import corpus_bleu
    
    print("\n" + "="*80)
    print(f"üìä EVALUATING BLEU SCORE (on {num_samples} samples)")
    print("="*80)
    
    # Group captions by image
    grouped_df = df_eval.groupby('image_name')['comment'].apply(list).reset_index()
    
    if num_samples < len(grouped_df):
        sample_df = grouped_df.sample(n=num_samples, random_state=42)
    else:
        sample_df = grouped_df
    
    references = []
    hypotheses = []
    
    for idx, row in tqdm(sample_df.iterrows(), total=len(sample_df), desc="Generating captions"):
        img_name = row['image_name']
        true_captions = row['comment']
        
        img_path = os.path.join(IMG_DIR, img_name)
        
        try:
            # Generate caption
            pred_caption = generate_caption_beam_search(
                model, img_path, vocab, encoder, transform, 
                max_len=20, beam_width=3
            )
            
            pred_tokens = pred_caption.split()
            hypotheses.append(pred_tokens)
            
            # Tokenize ground truth captions
            refs_tokens = [vocab.tokenizer_eng(c) for c in true_captions]
            references.append(refs_tokens)
            
        except Exception as e:
            print(f"Error with {img_name}: {e}")
            continue
    
    # Calculate BLEU
    bleu1 = corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0))
    bleu2 = corpus_bleu(references, hypotheses, weights=(0.5, 0.5, 0, 0))
    bleu3 = corpus_bleu(references, hypotheses, weights=(0.33, 0.33, 0.33, 0))
    bleu4 = corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25))
    
    print("\n" + "="*80)
    print("üìä BLEU SCORE RESULTS")
    print("="*80)
    print(f"BLEU-1: {bleu1:.4f}")
    print(f"BLEU-2: {bleu2:.4f}")
    print(f"BLEU-3: {bleu3:.4f}")
    print(f"BLEU-4: {bleu4:.4f}")
    print("="*80)
    
    return bleu1, bleu2, bleu3, bleu4


# Evaluate model
print("\nüîÑ Loading best model for evaluation...")
model.load_state_dict(torch.load("/kaggle/working/best_model.pth"))
model.eval()

evaluate_bleu(model, df, vocab, encoder, transform, num_samples=500)


üîÑ Loading best model for evaluation...

üìä EVALUATING BLEU SCORE (on 500 samples)


Generating captions: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [01:14<00:00,  6.70it/s]



üìä BLEU SCORE RESULTS
BLEU-1: 0.6852
BLEU-2: 0.5220
BLEU-3: 0.3975
BLEU-4: 0.2942


(0.6852167329169033,
 0.5219982485048974,
 0.3974973270137489,
 0.29419529466015876)

In [21]:
print("\n" + "="*80)
print("TESTING WITH SAMPLE IMAGES")
print("="*80)

# L·∫•y 5 ·∫£nh ng·∫´u nhi√™n
sample_images = df['image_name'].unique()[:5]

for img_name in sample_images:
    img_path = os.path.join(IMG_DIR, img_name)
    
    # Ground truth captions
    true_captions = df[df['image_name'] == img_name]['comment'].values
    
    # Generated caption
    try:
        pred_caption = generate_caption_beam_search(
            model, img_path, vocab, encoder, transform,
            max_len=20, beam_width=3
        )
        
        print(f"\nImage: {img_name}")
        print(f"Generated: {pred_caption}")
        print(f"Ground Truth:")
        for i, cap in enumerate(true_captions[:3]):  # Show first 3
            print(f"   {i+1}. {cap}")
        print("-" * 80)
        
    except Exception as e:
        print(f"Error with {img_name}: {e}")


TESTING WITH SAMPLE IMAGES

Image: 1000092795.jpg
Generated: a man in a green shirt and jeans is standing in front of a tree
Ground Truth:
   1. Two young guys with shaggy hair look at their hands while hanging out in the yard .
   2. Two young  White males are outside near many bushes .
   3. Two men in green shirts are standing in a yard .
--------------------------------------------------------------------------------

Image: 10002456.jpg
Generated: two men are working on a machine
Ground Truth:
   1. Several men in hard hats are operating a giant pulley system .
   2. Workers look down from up above on a piece of equipment .
   3. Two men working on a machine wearing hard hats .
--------------------------------------------------------------------------------

Image: 1000268201.jpg
Generated: a little girl in a pink dress is sitting on a wooden bench
Ground Truth:
   1. A child in a pink dress is climbing up a set of stairs in an entry way .
   2. A little girl in a pink dress goin

In [1]:
import pickle
import re
from IPython.display import FileLink

# ============================================================================
# ƒê·ªäNH NGHƒ®A L·∫†I CLASS (B·∫ÆT BU·ªòC)
# ============================================================================

class Vocabulary:
    """
    X√¢y d·ª±ng t·ª´ ƒëi·ªÉn t·ª´ captions
    """
    def __init__(self, freq_threshold):
        self.itos = {0: "<pad>", 1: "<start>", 2: "<end>", 3: "<unk>"}
        self.stoi = {"<pad>": 0, "<start>": 1, "<end>": 2, "<unk>": 3}
        self.freq_threshold = freq_threshold
    
    def __len__(self):
        return len(self.itos)
    
    @staticmethod
    def tokenizer_eng(text):
        """
        Tokenize English text
        FIX: S·ª≠ d·ª•ng regex ƒë·ªÉ lo·∫°i b·ªè k√Ω t·ª± ƒë·∫∑c bi·ªát
        """
        text = str(text).lower()
        text = re.sub(r'[^a-z ]', '', text)  # Ch·ªâ gi·ªØ ch·ªØ c√°i v√† kho·∫£ng tr·∫Øng
        return text.split()
    
    def build_vocabulary(self, sentence_list):
        """X√¢y d·ª±ng vocabulary t·ª´ list captions"""
        frequencies = {}
        idx = 4
        for sentence in sentence_list:
            for word in self.tokenizer_eng(sentence):
                frequencies[word] = frequencies.get(word, 0) + 1
                if frequencies[word] == self.freq_threshold:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1
    
    def numericalize(self, text):
        """Chuy·ªÉn text th√†nh list of indices"""
        tokenized_text = self.tokenizer_eng(text)
        return [self.stoi.get(token, self.stoi["<unk>"]) for token in tokenized_text]


# ============================================================================
# SAVE VOCABULARY
# ============================================================================

print("="*80)
print("üíæ SAVING VOCABULARY FOR DEPLOYMENT")
print("="*80)

# Ki·ªÉm tra vocab ƒë√£ t·ªìn t·∫°i ch∆∞a
try:
    print(f"‚úÖ Found existing vocab with {len(vocab)} tokens")
except:
    print("‚ùå Vocab ch∆∞a t·ªìn t·∫°i! C·∫ßn train model tr∆∞·ªõc.")
    raise

# Save vocabulary object
with open('/kaggle/working/vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)

print(f"‚úÖ Vocabulary saved successfully!")
print(f"   File: /kaggle/working/vocab.pkl")
print(f"   Size: {len(vocab)} tokens")
print(f"   Freq threshold: {vocab.freq_threshold}")
print(f"\n   Special tokens:")
print(f"      <pad>:   {vocab.stoi['<pad>']}")
print(f"      <start>: {vocab.stoi['<start>']}")
print(f"      <end>:   {vocab.stoi['<end>']}")
print(f"      <unk>:   {vocab.stoi['<unk>']}")

# Test load l·∫°i ƒë·ªÉ ƒë·∫£m b·∫£o kh√¥ng l·ªói
print("\nüîÑ Testing load...")
with open('/kaggle/working/vocab.pkl', 'rb') as f:
    test_vocab = pickle.load(f)

print(f"‚úÖ Load test successful! Vocab size: {len(test_vocab)}")

# Test tokenizer
test_sentence = "A dog is running in the park."
tokens = test_vocab.tokenizer_eng(test_sentence)
indices = test_vocab.numericalize(test_sentence)
print(f"\nüß™ Tokenizer test:")
print(f"   Input:   '{test_sentence}'")
print(f"   Tokens:  {tokens}")
print(f"   Indices: {indices[:10]}...")  # Show first 10

print("\n" + "="*80)
print("üì• DOWNLOAD FILES")
print("="*80)
print("Click v√†o c√°c link ƒë·ªÉ t·∫£i xu·ªëng:")
print()

# Model weights
print("1Ô∏è‚É£ Model weights (~100MB):")
display(FileLink('/kaggle/working/best_transformer_model.pth'))

# Vocabulary
print("\n2Ô∏è‚É£ Vocabulary (~1MB):")
display(FileLink('/kaggle/working/vocab.pkl'))

print("\n" + "="*80)
print("‚úÖ FILES READY FOR DEPLOYMENT!")
print("="*80)
print("\nüìã Next steps:")
print("1. ‚úÖ Download c·∫£ 2 files (click links tr√™n)")
print("2. üåê Go to https://huggingface.co/spaces")
print("3. ‚ûï Create new Space (SDK: Gradio)")
print("4. üì§ Upload 6 files:")
print("      - app.py")
print("      - model.py")
print("      - requirements.txt")
print("      - README.md")
print("      - best_transformer_model.pth (t·ª´ Kaggle)")
print("      - vocab.pkl (t·ª´ Kaggle)")
print("5. ‚è≥ Wait for build (~5 minutes)")
print("6. üéâ Your app is LIVE!")

üíæ SAVING VOCABULARY FOR DEPLOYMENT
‚ùå Vocab ch∆∞a t·ªìn t·∫°i! C·∫ßn train model tr∆∞·ªõc.


NameError: name 'vocab' is not defined

In [None]:
import os

# ƒê∆∞·ªùng d·∫´n c·∫ßn ki·ªÉm tra
WORK_DIR = "/kaggle/working"

print(f"üìÇ ƒêang li·ªát k√™ n·ªôi dung trong: {WORK_DIR}\n")

# Duy·ªát qua c√¢y th∆∞ m·ª•c
for root, dirs, files in os.walk(WORK_DIR):
    # T√≠nh c·∫•p ƒë·ªô s√¢u ƒë·ªÉ in th·ª•t l·ªÅ cho ƒë·∫πp
    level = root.replace(WORK_DIR, '').count(os.sep)
    indent = ' ' * 4 * (level)
    print(f"{indent}üìÅ {os.path.basename(root)}/")
    subindent = ' ' * 4 * (level + 1)
    
    # In ra t·ªëi ƒëa 5 file ƒë·∫ßu ti√™n trong m·ªói th∆∞ m·ª•c ƒë·ªÉ tr√°nh b·ªã spam m√†n h√¨nh
    for f in files[:5]:
        print(f"{subindent}üìÑ {f}")
    
    if len(files) > 5:
        print(f"{subindent}... (c√≤n {len(files) - 5} file n·ªØa)")

print("\n‚úÖ Ho√†n t·∫•t li·ªát k√™.")

In [2]:
import pickle
import re
from IPython.display import FileLink

# ============================================================================
# ƒê·ªäNH NGHƒ®A CLASS VOCABULARY (B·∫ÆT BU·ªòC)
# ============================================================================

class Vocabulary:
    def __init__(self, freq_threshold):
        self.itos = {0: "<pad>", 1: "<start>", 2: "<end>", 3: "<unk>"}
        self.stoi = {"<pad>": 0, "<start>": 1, "<end>": 2, "<unk>": 3}
        self.freq_threshold = freq_threshold
    
    def __len__(self):
        return len(self.itos)
    
    @staticmethod
    def tokenizer_eng(text):
        text = str(text).lower()
        text = re.sub(r'[^a-z ]', '', text)
        return text.split()
    
    def build_vocabulary(self, sentence_list):
        frequencies = {}
        idx = 4
        for sentence in sentence_list:
            for word in self.tokenizer_eng(sentence):
                frequencies[word] = frequencies.get(word, 0) + 1
                if frequencies[word] == self.freq_threshold:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1
    
    def numericalize(self, text):
        tokenized_text = self.tokenizer_eng(text)
        return [self.stoi.get(token, self.stoi["<unk>"]) for token in tokenized_text]


# ============================================================================
# KI·ªÇM TRA VOCAB T·ªíN T·∫†I
# ============================================================================



# ============================================================================
# SAVE VOCABULARY
# ============================================================================

print("\n" + "="*80)
print("üíæ SAVING VOCABULARY")
print("="*80)

with open('/kaggle/working/vocab.pkl', 'wb') as f:
    pickle.dump(existing_vocab, f)

print("‚úÖ Saved vocab.pkl")

# Test load
with open('/kaggle/working/vocab.pkl', 'rb') as f:
    test_vocab = pickle.load(f)

print(f"‚úÖ Test load successful: {len(test_vocab)} tokens")

# Test tokenizer
test_text = "A dog is running."
tokens = test_vocab.tokenizer_eng(test_text)
print(f"‚úÖ Test tokenize: '{test_text}' ‚Üí {tokens}")

print("\n" + "="*80)
print("üì• DOWNLOAD VOCAB")
print("="*80)
display(FileLink('/kaggle/working/vocab.pkl'))

‚úÖ Found existing vocab with 9964 tokens

üíæ SAVING VOCABULARY


PicklingError: Can't pickle <class '__main__.Vocabulary'>: it's not the same object as __main__.Vocabulary

In [3]:

try:
    # N·∫øu vocab ƒë√£ c√≥ s·∫µn trong notebook
    print(f"‚úÖ Found existing vocab with {len(vocab)} tokens")
    existing_vocab = vocab
except NameError:
    # N·∫øu ch∆∞a c√≥ vocab ‚Üí t·∫°o m·ªõi
    print("‚ö†Ô∏è  Vocab ch∆∞a t·ªìn t·∫°i, ƒëang t·∫°o m·ªõi...")
    
    import pandas as pd
    df = pd.read_csv("/kaggle/input/flickr30k/captions.txt")
    
    existing_vocab = Vocabulary(freq_threshold=3)
    existing_vocab.build_vocabulary(df['comment'].tolist())
    
    print(f"‚úÖ Created new vocab with {len(existing_vocab)} tokens")

‚úÖ Found existing vocab with 9964 tokens


In [4]:
print("\n" + "="*80)
print("üíæ SAVING VOCABULARY")
print("="*80)

with open('/kaggle/working/vocab.pkl', 'wb') as f:
    pickle.dump(existing_vocab, f)

print("‚úÖ Saved vocab.pkl")

# Test load
with open('/kaggle/working/vocab.pkl', 'rb') as f:
    test_vocab = pickle.load(f)

print(f"‚úÖ Test load successful: {len(test_vocab)} tokens")

# Test tokenizer
test_text = "A dog is running."
tokens = test_vocab.tokenizer_eng(test_text)
print(f"‚úÖ Test tokenize: '{test_text}' ‚Üí {tokens}")

print("\n" + "="*80)
print("üì• DOWNLOAD VOCAB")
print("="*80)
display(FileLink('/kaggle/working/vocab.pkl'))


üíæ SAVING VOCABULARY


PicklingError: Can't pickle <class '__main__.Vocabulary'>: it's not the same object as __main__.Vocabulary