<a href="https://colab.research.google.com/github/hanbuck30/multi-modal_about_dacon/blob/main/multi_modal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import

In [None]:
# Mount Google Drive at /content/drive so files stored there are readable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Extract the data archive from the mounted Drive (-qq suppresses unzip's output).
# Later cells read ./train.csv, ./test.csv and ./sample_submission.csv from the working directory.
!unzip -qq "/content/drive/MyDrive/open/open.zip"

In [None]:
import random
import pandas as pd
import numpy as np
import os
import cv2

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from tqdm.auto import tqdm

import albumentations as A # fast image agumentation library
from albumentations.pytorch.transforms import ToTensorV2 # 이미지 형 변환
import torchvision.models as models

from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Install Hugging Face transformers in the runtime, then run a small forward pass through CANINE.
!pip install transformers
from transformers import CanineTokenizer, CanineModel

# NOTE(review): `model` and `tokenizer` are rebound by later cells (the KoGPT2 tokenizer and
# CustomModel), so nothing defined here is used downstream — TODO confirm this demo is still needed.
model = CanineModel.from_pretrained('google/canine-s')
tokenizer = CanineTokenizer.from_pretrained('google/canine-s')

# Two example sentences, padded to the longer one and returned as PyTorch tensors.
inputs = ["Life is like a box of chocolates.", "You never know what you gonna get."]
encoding = tokenizer(inputs, padding="longest", truncation=True, return_tensors="pt")

outputs = model(**encoding) # forward pass
pooled_output = outputs.pooler_output
sequence_output = outputs.last_hidden_state

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.2-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 5.1 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.10.0-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 67.0 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 46.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.0 tokenizers-0.12.1 transformers-4.22.2


Downloading:   0%|          | 0.00/670 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/529M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/657 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/854 [00:00<?, ?B/s]

Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.


## Hyperparameter Setting

In [None]:
# Global hyperparameters: image size, number of epochs, learning rate, batch size and random seed.
CFG = dict(
    IMG_SIZE=128,
    EPOCHS=20,
    LEARNING_RATE=0.001,
    BATCH_SIZE=512,
    SEED=41,
)

## Fixed RandomSeed

In [None]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run to run,
    # which defeats the determinism requested on the line above.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # Fix the seed

## NLP-pre-processing

In [None]:
# Load the training table; later cells use its `overview`, `cat3` and `img_path` columns.
all_df = pd.read_csv('./train.csv')

In [None]:
# Show one raw overview text before cleaning.
all_df.loc[1, 'overview']

'경기도 이천시 모가면에 있는 골프장으로 대중제 18홀이다. 회원제로 개장을 했다가 2016년 대중제로 전환하여 재개장했다. 총 부지 약 980,,000㎡에 전장 6,607m에 18홀 파 72이다. Lake 코스와 Mountain 코스가 있다. 미국 100대 골프 코스 설계자인 짐 파지오가 아마추어에게는 쉽고 프로골퍼에게는 어렵게 설계했다고 한다. 가까이에 뉴스프링빌CC, 써닝포인트CC, 비에이비스타CC, 덕평CC 등의 골프장이 있다.'

In [None]:
import re

def remove_white_space(text):
    """Return ``str(text)`` with each tab, CR, LF, form feed or vertical tab replaced by one space."""
    return re.sub(r'[\t\r\n\f\v]', ' ', str(text))

def remove_special_char(text):
    """Replace every run of characters that are not Hangul, a digit or a space with one space.

    The input is converted with ``str`` first, so non-string values are accepted.
    Latin letters and punctuation are removed as well, because they are outside the kept set.
    """
    cleaned = re.sub('[^ ㄱ-ㅣ가-힣 0-9]+', ' ', str(text))
    return cleaned



# Clean every overview: whitespace control characters first, then everything that is
# not Hangul, a digit or a space.
all_df.overview = all_df.overview.apply(remove_white_space).apply(remove_special_char)

In [None]:
# Show the same overview after cleaning.
all_df.loc[1, 'overview']

'경기도 이천시 모가면에 있는 골프장으로 대중제 18홀이다  회원제로 개장을 했다가 2016년 대중제로 전환하여 재개장했다  총 부지 약 980 000 에 전장 6 607 에 18홀 파 72이다    코스와   코스가 있다  미국 100대 골프 코스 설계자인 짐 파지오가 아마추어에게는 쉽고 프로골퍼에게는 어렵게 설계했다고 한다  가까이에 뉴스프링빌  써닝포인트  비에이비스타  덕평  등의 골프장이 있다 '

## 토크나이징

In [None]:
# NOTE(review): repeats the `!pip install transformers` already run in the CANINE cell near the
# top of the notebook; likely redundant — TODO confirm before removing.
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.2-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 29.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 59.4 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.10.0-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 69.4 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.0 tokenizers-0.12.1 transformers-4.22.2


## Data Load & Train/Validation Split

In [None]:
# Hold out 20% of the rows for validation, with a fixed seed for reproducibility.
# Only the frame is split: the extra `all_df['cat3']` argument produced two throwaway
# outputs bound to `_`, which shadows IPython's last-output variable. The row
# assignment depends only on the number of rows and the seed, so the split is unchanged.
train_df, val_df = train_test_split(all_df, test_size=0.2, random_state=CFG['SEED'])
# Later cells assign to train_df['cat3'] / val_df['cat3']; take explicit copies so those
# writes never target a slice of `all_df`.
train_df, val_df = train_df.copy(), val_df.copy()

In [None]:
# Load the KoGPT2 tokenizer and declare its special tokens explicitly.
# This rebinds `tokenizer`, replacing the CANINE tokenizer created earlier in the notebook.
from transformers import PreTrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2",
  bos_token='</s>', eos_token='</s>', unk_token='<unk>',
  pad_token='<pad>', mask_token='<mask>')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [None]:
# Tokenize the training overviews into padded PyTorch tensors.
# NOTE(review): `train_vectors` is rebound to CountVectorizer features in the "Vectorizer"
# section below, so these encodings are overwritten before the dataset is built.
train_vectors = tokenizer(
    list(train_df['overview']),
    
    return_tensors="pt",
    max_length=256, # sequences longer than 256 tokens are truncated (truncation=True)
    padding=True,
    truncation=True,
    add_special_tokens=True
)

In [None]:
# Tokenize the validation overviews with the same settings as the training split.
# Bug fix: this previously tokenized train_df['overview'], so the "validation" encodings
# were a copy of the training encodings.
val_vectors = tokenizer(
    list(val_df['overview']),
    return_tensors="pt",
    max_length=256,
    padding=True,
    truncation=True,
    add_special_tokens=True
)

# Sanity check: show the token ids of the first training sample and decode them back to text.
print(train_vectors['input_ids'][0])
print(tokenizer.decode(train_vectors['input_ids'][0]))

tensor([ 9197, 12276,  7788,  8615,  8404,  7461, 26402,  6900, 10116, 23947,
         9197, 35208,  9084,  6908, 10053, 10300, 15073, 33400,   739, 20651,
        25029, 41763, 11110, 12120, 28759,  7461,   739, 41011, 13409,  9276,
        39869, 40955,  8705,  9025,  9080, 26344, 36815,  8204, 33400,   739,
         9426, 50189, 22564, 11594,  9073,  7405,  9989, 12120,  9339,  7540,
         7756,   739,  9338,  6841, 29958,  9276, 28831,  9025, 33400,   739,
        14387, 50189, 10575,  7613,   739,  9040,  6951, 50189,  9868,  8006,
        31505,  9225, 20503, 45704, 15045, 10013,   739, 35736,   739,  9018,
         6919, 33813, 11015,  9154,   739, 41011, 13409, 31369, 14627,  9284,
        13230, 17920,  7235, 10488,  7249, 33400,   739,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3, 

In [None]:
# Display the tokenizer output (input_ids, token_type_ids and attention_mask tensors).
train_vectors

{'input_ids': tensor([[ 9197, 12276,  7788,  ...,     3,     3,     3],
        [ 9767,  7847,  7643,  ...,     3,     3,     3],
        [23412, 21278,  6921,  ...,     3,     3,     3],
        ...,
        [37281,  7478,  8711,  ...,     3,     3,     3],
        [ 9563, 14816,  8022,  ...,     3,     3,     3],
        [12445, 13676,  7627,  ...,     3,     3,     3]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

## Label-Encoding

In [None]:
# LabelEncoder converts the categorical `cat3` target into integer class ids.
le = preprocessing.LabelEncoder()
# Fit on the full table rather than only the training split: the split is not stratified,
# so a rare category can land only in the validation split, and le.transform() raises
# ValueError for labels it has not seen during fit.
le.fit(all_df['cat3'].values)

LabelEncoder()

In [None]:
# Replace the string categories in both splits with the integer ids from the fitted encoder.
# NOTE: this overwrites the column in place, so re-running the cell transforms already-encoded ids.
train_df['cat3'] = le.transform(train_df['cat3'].values)
val_df['cat3'] = le.transform(val_df['cat3'].values)

## Vectorizer

In [None]:
# Bag-of-words vectorizer for the overview text, limited to 4096 features.
# The text branch of CustomModel expects 4096 inputs (nn.Linear(4096, 2048)).
vectorizer = CountVectorizer(max_features=4096)

In [None]:
# Learn the vocabulary on the training overviews only, then reuse it for validation.
# Both results are converted from sparse to dense matrices.
train_vectors = vectorizer.fit_transform(train_df['overview']).todense()
val_vectors = vectorizer.transform(val_df['overview']).todense()

In [None]:
# (number of training rows, number of vectorizer features)
train_vectors.shape

(13588, 4096)

In [None]:
# (number of validation rows, number of vectorizer features)
val_vectors.shape

(3398, 4096)

## CustomDataset

In [None]:
# Dataset definition
class CustomDataset(Dataset):
    """Dataset pairing each image with its text features and, when training, its label.

    Args:
        img_path_list: Indexable collection of image file paths.
        pair_dataset: Mapping of tokenizer outputs (name -> tensor indexed by sample),
            or an empty mapping / None when no tokenized inputs are used.
        text_vector: Indexable collection with one bag-of-words row per sample.
        label_list: Indexable collection of labels; not read when ``infer`` is True.
        transforms: Callable invoked as ``transforms(image=img)['image']``, or None.
        infer: When True, ``__getitem__`` returns ``(image, text)`` only.
    """

    def __init__(self, img_path_list, pair_dataset, text_vector, label_list, transforms, infer=False):
        self.img_path_list = img_path_list
        self.pair_dataset = pair_dataset
        self.label_list = label_list
        self.transforms = transforms
        self.infer = infer
        self.text_vector = text_vector

    def __getitem__(self, index):
        """Return ``(image, text)`` in infer mode, else ``(image, label, item, text)``."""
        # Tokenizer outputs for this sample; empty when none were supplied.
        if self.pair_dataset is None:
            item = {}
        else:
            item = {key: val[index].clone().detach() for key, val in self.pair_dataset.items()}

        # Read the image. cv2.imread returns None instead of raising when the file cannot be
        # read, so fail here with the offending path rather than deep inside the transform.
        # NOTE(review): cv2 loads channels in BGR order while the Normalize statistics used
        # in this notebook are the usual RGB ImageNet values — TODO confirm this is intended.
        img_path = self.img_path_list[index]
        image = cv2.imread(img_path)
        if image is None:
            raise FileNotFoundError(f"Could not read image: {img_path}")

        if self.transforms is not None:
            image = self.transforms(image=image)['image']  # apply the augmentation pipeline

        # Bug fix: the original read an undefined global `text_vector`; use this sample's own
        # row and flatten it to a 1-D float tensor.
        text = torch.as_tensor(np.asarray(self.text_vector[index]), dtype=torch.float32).reshape(-1)

        if self.infer:
            # Inference on test data: no label is available.
            return image, text
        label = self.label_list[index]
        return image, label, item, text

    def __len__(self):
        return len(self.img_path_list)

In [None]:
def _build_transform():
    """Resize to IMG_SIZE x IMG_SIZE, normalise with the mean/std below and convert to a tensor."""
    steps = [
        A.Resize(CFG['IMG_SIZE'], CFG['IMG_SIZE']),
        A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), max_pixel_value=255.0, always_apply=False, p=1.0),
        ToTensorV2(),
    ]
    return A.Compose(steps)


# Training and evaluation use the same pipeline (no random augmentation is applied).
train_transform = _build_transform()
test_transform = _build_transform()

- albumentations -> fast image augmentation library

- albumentations.Compose -> transform = A.Compose([])을 이용하여 이미지와 라벨 각각에 Augmentation을 적용하기 위한 객체를 생성

- albumentations.Resize(128, 128) -> 128*128 size로 resize
- albumentations.Normalize() -> 픽셀 값을 채널별 평균과 표준편차로 정규화하는 역할. 위에서는 평균값(mean), 표준편차(std), 최대 픽셀값(max_pixel_value)으로 img = (img - mean * max_pixel_value) / (std * max_pixel_value)을 계산하며, 결과는 0을 중심으로 한 값이 되지만 정확히 (-1, 1) 범위로 제한되지는 않음.
- ToTensorV2 -> tensor형 변환

In [None]:
# CustomDataset.__init__(self, img_path_list, pair_dataset, text_vector, label_list, transforms, infer=False)
# Bug fix: the datasets were built with the old 4-argument call, which raised TypeError
# against the signature above.
# The tokenizer encodings computed in the tokenizing cells were stored in `train_vectors` /
# `val_vectors`, which the CountVectorizer cell then rebound to bag-of-words matrices, so
# the encodings are rebuilt here under their own names.
def _tokenize_overviews(texts):
    """Tokenize overview strings with the same settings as the tokenizing cells above."""
    return tokenizer(
        list(texts),
        return_tensors="pt",
        max_length=256,
        padding=True,
        truncation=True,
        add_special_tokens=True,
    )


train_tokens = _tokenize_overviews(train_df['overview'])
val_tokens = _tokenize_overviews(val_df['overview'])

train_dataset = CustomDataset(train_df['img_path'].values, train_tokens, train_vectors, train_df['cat3'].values, train_transform)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0) # num_workers=6 failed with "[Errno 32] Broken pipe"

val_dataset = CustomDataset(val_df['img_path'].values, val_tokens, val_vectors, val_df['cat3'].values, test_transform)
val_loader = DataLoader(val_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=0) # num_workers=6 failed with "[Errno 32] Broken pipe"

TypeError: ignored

- DataLoader: Dataset와 Sampler를 결합하고 지정된 데이터 세트에 대해 반복 가능한 기능을 제공.    
    - dataset (Dataset): 데이터를 로드할 데이터 집합.   
    - batch_size (int, optional): **how many samples** per batch to load (default: ``1``).   
    - num_workers (int, optional): **how many subprocesses** to use for data loading. ``0`` means that the data will be    loaded in the main process. (default: ``0``) -> 6으로 설정 시 안돌아감([Errno 32] Broken pipe). 0으로 변경해야 됨

## Model Define

In [None]:
class CustomModel(nn.Module):
    """Image + text classifier with a small CNN trained from scratch.

    The image branch stacks 3x3 conv / batch-norm / ReLU blocks with four 2x2
    max-poolings; the text branch is a two-layer MLP over a 4096-dim vector.
    Both feature vectors are concatenated and passed to one linear classifier.

    NOTE(review): the next cell defines another class with the same name, which
    replaces this one when the notebook is run top to bottom.
    """

    def __init__(self, num_classes=len(le.classes_)):
        super(CustomModel, self).__init__()

        def conv_bn_relu(in_ch, out_ch):
            # 3x3 convolution that keeps the spatial size, then batch norm and ReLU.
            return [
                nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=1, padding=1),
                nn.BatchNorm2d(out_ch),
                nn.ReLU(),
            ]

        def pool_dropout():
            # Halve the spatial size, then drop 25% of the activations.
            return [nn.MaxPool2d(kernel_size=2, stride=2), nn.Dropout(p=0.25)]

        # Image branch: layers are appended in the order they are applied.
        layers = []
        layers += conv_bn_relu(3, 8)
        layers += conv_bn_relu(8, 16)
        layers += conv_bn_relu(16, 32)
        layers += pool_dropout()
        layers += conv_bn_relu(32, 64)
        layers += conv_bn_relu(64, 128)
        layers += pool_dropout()
        layers += conv_bn_relu(128, 256)
        layers += pool_dropout()
        layers += conv_bn_relu(256, 64)
        layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
        self.model = nn.Sequential(*layers)

        # Text branch: 4096 -> 2048 -> 1024.
        self.nlp_extract = nn.Sequential(
            nn.Linear(4096, 2048),
            nn.ReLU(),
            nn.Linear(2048, 1024),
        )

        # Classifier over the concatenated features. 5120 = 4096 image features
        # (64 channels x 8 x 8 for the 128x128 inputs used in this notebook) + 1024 text features.
        self.classifier = nn.Sequential(
            nn.Linear(5120, num_classes)
        )

    def forward(self, img, text):
        """Return class logits for a batch of images and text vectors."""
        img_feature = torch.flatten(self.model(img), start_dim=1)
        text_feature = self.nlp_extract(text)
        feature = torch.cat([img_feature, text_feature], axis=1)
        return self.classifier(feature)

In [None]:
class CustomModel(nn.Module):
    """Image + text classifier with a pretrained ResNet-18 image backbone.

    The backbone's final layer is replaced so that it emits a 4096-dim image
    feature; the text branch maps a 4096-dim vector to 1024 dims; both are
    concatenated (5120 dims) and passed to one linear classifier.

    Args:
        in_channels: Number of input image channels (3 for the colour images used here).
        num_classes: Number of output classes; defaults to the number of classes
            known to the label encoder when this cell is executed.
    """

    def __init__(self, in_channels=3, num_classes=len(le.classes_)):
        super(CustomModel, self).__init__()

        # Pretrained ResNet-18 from torchvision. Device placement is left to the caller
        # (train()/inference() call model.to(device)); moving only the backbone here
        # used to leave the layers created below on a different device.
        self.model = models.resnet18(pretrained=True)

        # Keep the pretrained first convolution for 3-channel input; only build a new,
        # randomly initialised one when a different channel count is requested.
        if in_channels != 3:
            self.model.conv1 = nn.Conv2d(in_channels, 64, kernel_size=7, stride=2, padding=3, bias=False)

        # Replace the 1000-class ImageNet head with a 4096-dim feature projection.
        num_ftrs = self.model.fc.in_features
        self.model.fc = nn.Linear(num_ftrs, 4096)

        # Text branch: 4096 -> 2048 -> 1024.
        self.nlp_extract = nn.Sequential(
            nn.Linear(4096, 2048),
            nn.ReLU(),
            nn.Linear(2048, 1024),
            )
        # Classifier over the concatenated 4096 image + 1024 text features.
        self.classifier = nn.Sequential(
            nn.Linear(5120, num_classes)
            )

    def forward(self, img, text):
        """Return class logits for a batch of images and text vectors."""
        img_feature = self.model(img)
        img_feature = torch.flatten(img_feature, start_dim=1)
        text_feature = self.nlp_extract(text)
        feature = torch.cat([img_feature, text_feature], axis=1)
        output = self.classifier(feature)
        return output
  

결론:
- Image: (첫 번째 셀의 CNN) conv -> batchnorm -> ReLU 블록을 쌓고 그 사이에 MaxPooling과 Dropout을 적용. 단, 바로 다음 셀에서 `CustomModel`을 다시 정의하므로 실제 학습에는 ResNet18 백본 + linear(4096) 구조가 사용됨.

- Text: linear -> relu -> linear

- classifier : linear

## Train

In [None]:
def train(model, optimizer, train_loader, val_loader, scheduler, device):
    """Train ``model`` for CFG["EPOCHS"] epochs and return it with its best weights.

    Args:
        model: Network called as ``model(img, text)`` and returning class logits.
        optimizer: Optimizer over ``model``'s parameters.
        train_loader: Iterable of batches, either ``(img, label, tokens, text)`` as
            produced by CustomDataset or the older ``(img, text, label)`` layout.
        val_loader: Loader passed through to ``validation``.
        scheduler: Learning-rate scheduler stepped once per epoch, or None.
        device: Device the model and each batch are moved to.

    Returns:
        The same ``model`` object, loaded with the weights of the epoch that had the
        highest validation score (the final weights if the score never rose above 0).
    """
    import copy  # local import: only needed to snapshot the best weights

    model.to(device)

    criterion = nn.CrossEntropyLoss().to(device)  # loss for multi-class classification
    best_score = 0
    # Snapshot of the best weights. Keeping `best_model = model` only stored a reference
    # to the object that keeps training, so the "best" model was always the last epoch.
    best_state = None

    for epoch in range(1, CFG["EPOCHS"] + 1):
        model.train()
        train_loss = []
        for batch in tqdm(iter(train_loader)):
            # CustomDataset yields four items in training mode; the tokenizer outputs are
            # not consumed by the model, which takes (img, text) only.
            if len(batch) == 4:
                img, label, _tokens, text = batch
            else:
                img, text, label = batch
            img = img.float().to(device)
            text = text.to(device)
            label = label.type(torch.LongTensor).to(device)  # CrossEntropyLoss needs int64 targets

            optimizer.zero_grad()  # clear gradients accumulated by the previous step

            model_pred = model(img, text)
            loss = criterion(model_pred, label)

            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        # Mean training loss over the epoch.
        tr_loss = np.mean(train_loss)

        val_loss, val_score = validation(model, criterion, val_loader, device)

        print(f'Epoch [{epoch}], Train Loss : [{tr_loss:.5f}] Val Loss : [{val_loss:.5f}] Val Score : [{val_score:.5f}]')

        if scheduler is not None:
            scheduler.step()

        if best_score < val_score:
            best_score = val_score
            best_state = copy.deepcopy(model.state_dict())

    # Restore the best epoch. If no epoch improved on the initial score, keep the final
    # weights instead of returning None, which would crash the inference step.
    if best_state is not None:
        model.load_state_dict(best_state)
    return model

In [None]:
def score_function(real, pred):
    """Return the weighted F1 score of predictions ``pred`` against true labels ``real``."""
    weighted_f1 = f1_score(real, pred, average="weighted")
    return weighted_f1

def validation(model, criterion, val_loader, device):
    """Evaluate ``model`` on ``val_loader`` without updating its weights.

    Args:
        model: Network called as ``model(img, text)`` and returning class logits.
        criterion: Loss called as ``criterion(logits, labels)``.
        val_loader: Iterable of batches, either ``(img, label, tokens, text)`` as
            produced by CustomDataset or the older ``(img, text, label)`` layout.
        device: Device each batch is moved to.

    Returns:
        ``(mean validation loss, score)`` where the score comes from ``score_function``.
    """
    model.eval()  # switch layers such as dropout and batch norm to evaluation behaviour

    model_preds = []  # predicted class ids
    true_labels = []  # ground-truth class ids
    val_loss = []

    with torch.no_grad():
        for batch in tqdm(iter(val_loader)):
            # CustomDataset yields four items when labels are present; the tokenizer
            # outputs are not consumed by the model, which takes (img, text) only.
            if len(batch) == 4:
                img, label, _tokens, text = batch
            else:
                img, text, label = batch
            img = img.float().to(device)
            text = text.to(device)
            label = label.type(torch.LongTensor).to(device)  # int64 targets for the loss

            model_pred = model(img, text)

            loss = criterion(model_pred, label)
            val_loss.append(loss.item())

            model_preds += model_pred.argmax(1).detach().cpu().numpy().tolist()
            true_labels += label.detach().cpu().numpy().tolist()

    test_weighted_f1 = score_function(true_labels, model_preds)
    return np.mean(val_loss), test_weighted_f1

## Run!!

In [None]:
# Build the network and an Adam optimizer; no learning-rate scheduler is used.
model = CustomModel()
model.eval()  # train() switches the model back to training mode at the start of each epoch
optimizer = torch.optim.Adam(model.parameters(), lr=CFG["LEARNING_RATE"])
scheduler = None

# The model returned by train() is the one used for inference below.
infer_model = train(
    model=model,
    optimizer=optimizer,
    train_loader=train_loader,
    val_loader=val_loader,
    scheduler=scheduler,
    device=device,
)

  0%|          | 0/27 [00:00<?, ?it/s]

TypeError: ignored

## Inference

In [None]:
test_df = pd.read_csv('./test.csv')
# Apply the same cleaning that was applied to the training overviews before the vectorizer
# was fitted; otherwise test text is tokenized differently from the text the vocabulary
# was learned on.
test_df['overview'] = test_df['overview'].apply(remove_white_space).apply(remove_special_char)
test_vectors = vectorizer.transform(test_df['overview'])
test_vectors = test_vectors.todense()

In [None]:
# CustomDataset(img_path_list, pair_dataset, text_vector, label_list, transforms, infer)
# Bug fix: the old 5-argument call put the bag-of-words matrix in `pair_dataset`, the
# transform in `label_list` and True in `transforms`, leaving `infer` at False.
# In infer mode the dataset returns only (image, text), so an empty mapping is passed
# for the tokenizer outputs and no labels are given.
test_dataset = CustomDataset(test_df['img_path'].values, {}, test_vectors, None, test_transform, infer=True)
test_loader = DataLoader(test_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

In [None]:
def inference(model, test_loader, deivce):
    """Predict a class id for every sample in ``test_loader``.

    Args:
        model: Network called as ``model(img, text)`` and returning class logits.
        test_loader: Iterable of ``(img, text)`` batches.
        deivce: Device to run on. The misspelled name is kept so that existing
            keyword callers keep working.

    Returns:
        List of predicted class ids, in loader order.
    """
    # Bug fix: the body used to read the global `device` and silently ignore this argument.
    device = deivce

    model.to(device)
    model.eval()

    model_preds = []

    with torch.no_grad():
        for img, text in tqdm(iter(test_loader)):
            img = img.float().to(device)
            text = text.to(device)

            model_pred = model(img, text)
            model_preds += model_pred.argmax(1).detach().cpu().numpy().tolist()
    # Predictions for every (img, text) pair, collected in order.
    return model_preds

In [None]:
# Predict a class id for every test row with the model returned by train().
preds = inference(infer_model, test_loader, device)

  0%|          | 0/114 [00:00<?, ?it/s]

## Submission

In [None]:
# Load the sample submission file; its `cat3` column is overwritten with the predictions below.
submit = pd.read_csv('./sample_submission.csv')

In [None]:
# Map the predicted integer ids back to the original cat3 labels with the fitted encoder.
submit['cat3'] = le.inverse_transform(preds)

In [None]:
submit.to_csv('./submit_jgw.csv', index=False)
# Save as the submission file