## Import

In [1]:
import torch
import torch.nn as nn
import pandas as pd
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm
import os
import numpy as np
import random
import warnings
import dataset as d
warnings.filterwarnings(action='ignore')

%load_ext autoreload
%autoreload 2
import gc
def torch_empty():
    gc.collect()
    torch.cuda.empty_cache()

## Hyperparameter Settings

In [2]:
CFG = {
    'IMG_SIZE':224,
    'EPOCHS':10, #Your Epochs,
    'LR':1e-5, #Your Learning Rate,
    'BATCH_SIZE': 32, #Your Batch Size,
    'SEED':41
}

## Fixed Random-Seed

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

## Custom Dataset

In [4]:
# dataset.py 의 CustomDataset 클래스 사용
import dataset as d
from util.preprocessing import  *


In [5]:
train_mean = (0.4194325, 0.3830166, 0.3490198)
train_Std = (0.23905228, 0.2253936, 0.22334467)

valid_mean = ( 0.4170096, 0.38036022, 0.34702352)
valid_Std = (0.23896241, 0.22566794, 0.22329141)

In [9]:
train_data = pd.read_csv('./data/open/train_data.csv')
valid_data = pd.read_csv('./data/open/valid_data.csv')
test_data = pd.read_csv('./data/open/test_data.csv')


train_transform = d.ImageTransForm(CFG['IMG_SIZE'], train_mean, train_Std)
valid_transform = d.ImageTransForm(CFG['IMG_SIZE'], valid_mean, valid_Std)

In [10]:
train_data

Unnamed: 0.1,Unnamed: 0,img_name,img_path,mos,comments
0,60735,gubp008c6g,./train/gubp008c6g.jpg,6.748634,so much of your work has a wonderful warm ligh...
1,36247,uo7nyjk0q8,./train/uo7nyjk0q8.jpg,4.947712,i can see two or three arguments for this repr...
2,32522,9253eioziz,./train/9253eioziz.jpg,6.247335,one of the best judi the perfect light!
3,31185,o94xwnigvh,./train/o94xwnigvh.jpg,5.671795,"a fantastic use of hdr, and an out of this wor..."
4,74469,leyknxz72e,./train/leyknxz72e.jpg,5.676364,"well, yeah, hes a little obvious , but this is..."
...,...,...,...,...,...
44735,37194,69ozfzpls0,./train/69ozfzpls0.jpg,5.075000,"very good, the the hard light makes the micros..."
44736,6265,0mlwgzv380,./train/0mlwgzv380.jpg,5.835341,the textures seem very flat like too much nois...
44737,54886,yp8upku9tg,./train/yp8upku9tg.jpg,5.987342,interesting approach.nice mood created by your...
44738,860,xpzv31nrrz,./train/xpzv31nrrz.jpg,5.214984,im not sure the sepia tone is helping this pic...


In [11]:
train_dataset = d.CustomDataset(train_data, 'train', transform=train_transform)
valid_dataset = d.CustomDataset(test_data, 'valid', transform=valid_transform)

In [12]:
train_loader = DataLoader(train_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=True)
test_loader = DataLoader(valid_data, batch_size=CFG['BATCH_SIZE'], shuffle=True)

dataloader_dict = {'train' : train_loader, 'valid' : test_loader}


## Define Model

In [14]:
# # 데이터 로드
from train.models.encoder_resnet import EncoderResnet
encoder = EncoderResnet(512)
# out, mos = encoder(dataset[0])
# print(mos.shape)
# print(out.shape)

# Sample data-set train model

In [30]:
#torch.cuda.get_device_name(0)
device = torch.device('cuda:0' if torch.cuda.is_available() else "cpu")
torch.backends.mps.is_available()

True

In [31]:
from torch import optim

criterion = nn.MSELoss()
optimizer = optim.Adam(encoder.parameters(), lr=1e-5)
criterion.to(device)
encoder.to(device)


EncoderResnet(
  (cnn_backbone): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Sequential(

In [32]:
torch_empty()

In [33]:
from train.trainer import trainer
train_history, valid_history = trainer(encoder, dataloader_dict=dataloader_dict, criterion=criterion, num_epoch=1, optimizer=optimizer, device=device, early_stop=5)

  1%|          | 7/1399 [00:50<2:46:05,  7.16s/it]


KeyboardInterrupt: 

In [12]:
# train_history = [1,2,3,4,5,6,7,8,9,10]
# valid_history = [1,2,3,4,5,6,7,8,9,10]

In [13]:
import pandas as pd
#[(a, b) for a, b in zip(train_history, valid_history)]
pd = pd.DataFrame(columns=['train_loss', 'test_loss'], data=[(train, valid) for train, valid in zip(train_history, valid_history)])

In [17]:
pd.to_csv('loss.csv')

In [23]:
import gc
import torch

gc.collect()
torch.cuda.empty_cache()
optimizer.zero_grad()

In [29]:
print(torch.cuda.memory_summary(device=None, abbreviated=False))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      | 327733 KiB |   4988 MiB |  88722 MiB |  88402 MiB |
|       from large pool | 274688 KiB |   4970 MiB |  88434 MiB |  88165 MiB |
|       from small pool |  53045 KiB |    103 MiB |    288 MiB |    236 MiB |
|---------------------------------------------------------------------------|
| Active memory         | 327733 KiB |   4988 MiB |  88722 MiB |  88402 MiB |
|       from large pool | 274688 KiB |   4970 MiB |  88434 MiB |  88165 MiB |
|       from small pool |  53045 KiB |    103 MiB |    288 MiB |    236 MiB |
|---------------------------------------------------------------

In [17]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:<enter-size-here>"

In [17]:

# 단어 사전 생성
all_comments = ' '.join(train_data['comments']).split()
vocab = set(all_comments)
vocab = ['<PAD>', '<SOS>', '<EOS>'] + list(vocab)
word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for word, idx in word2idx.items()}


# 모델, 손실함수, 옵티마이저
model = encoder.EncoderCNN(len(vocab))
criterion = nn.CrossEntropyLoss(ignore_index=word2idx['<PAD>'])
optimizer = torch.optim.Adam(model.parameters(), lr=CFG['LR'])


SyntaxError: invalid syntax (401381389.py, line 2)

## Train

In [10]:



# 학습
model.train()
for epoch in range(CFG['EPOCHS']):
    total_loss = 0
    loop = tqdm(train_loader, leave=True)
    for imgs, comments in loop:
        imgs = imgs.float()
        
        # Batch Preprocessing
        comments_tensor = torch.zeros((len(comments), len(max(comments, key=len)))).long()
        for i, comment in enumerate(comments):
            tokenized = ['<SOS>'] + comment.split() + ['<EOS>']
            comments_tensor[i, :len(tokenized)] = torch.tensor([word2idx[word] for word in tokenized])
            print(comments_tensor.size())
            
        # Forward & Loss
        predicted_comments = model(imgs, comments_tensor)
        loss = criterion(predicted_comments.view(-1, len(vocab)), comments_tensor.view(-1))

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        loop.set_description(f"Epoch {epoch + 1}")
        loop.set_postfix(loss=loss.item())

    print(f"Epoch {epoch + 1} finished with average loss: {total_loss / len(train_loader):.4f}")

  0%|          | 1/583 [00:00<07:21,  1.32it/s]

torch.Size([128, 290])


  0%|          | 2/583 [00:01<06:59,  1.39it/s]

torch.Size([128, 225])


  1%|          | 3/583 [00:02<06:50,  1.41it/s]

torch.Size([128, 223])


  1%|          | 4/583 [00:02<06:46,  1.43it/s]

torch.Size([128, 245])


  1%|          | 5/583 [00:03<06:56,  1.39it/s]

torch.Size([128, 174])


  1%|          | 6/583 [00:04<06:51,  1.40it/s]

torch.Size([128, 185])


  1%|          | 6/583 [00:04<07:23,  1.30it/s]


KeyboardInterrupt: 

## Inference & Submit

In [None]:
test_data = pd.read_csv('./data/open/test.csv')
test_dataset = d.CustomDataset(test_data, transform)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

model.eval()
predicted_mos_list = []
predicted_comments_list = []

def greedy_decode(model, image, max_length=50):
    image = image.unsqueeze(0)
    mos, _ = model(image)
    output_sentence = []
    
    # 시작 토큰 설정
    current_token = torch.tensor([word2idx['<SOS>']])
    hidden = None
    features = model.cnn(image).view(image.size(0), -1)

    for _ in range(max_length):
        embeddings = model.embedding(current_token).unsqueeze(0)
        combined = torch.cat([features.unsqueeze(1), embeddings], dim=2)
        out, hidden = model.lstm(combined, hidden)
        
        output = model.fc(out.squeeze(0))
        _, current_token = torch.max(output, dim=1)

        # <EOS> 토큰에 도달하면 멈춤
        if current_token.item() == word2idx['<EOS>']:
            break

        # <SOS> 또는 <PAD> 토큰은 생성한 캡션에 추가하지 않음
        if current_token.item() not in [word2idx['<SOS>'], word2idx['<PAD>']]:
            output_sentence.append(idx2word[current_token.item()])
     
    return mos.item(), ' '.join(output_sentence)

# 추론 과정
with torch.no_grad():
    for imgs, _, _ in tqdm(test_loader):
        for img in imgs:
            img = img.float()
            mos, caption = greedy_decode(model, img)
            predicted_mos_list.append(mos)
            predicted_comments_list.append(caption)

# 결과 저장
result_df = pd.DataFrame({
    'img_name': test_data['img_name'],
    'mos': predicted_mos_list,
    'comments': predicted_comments_list  # 캡션 부분은 위에서 생성한 것을 사용
})

# 예측 결과에 NaN이 있다면, 제출 시 오류가 발생하므로 후처리 진행 (sample_submission.csv과 동일하게)
result_df['comments'] = result_df['comments'].fillna('Nice Image.')
result_df.to_csv('submit.csv', index=False)

print("Inference completed and results saved to submit.csv.")