In [1]:
from logging import raiseExceptions
from multiprocessing.sharedctypes import Value
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

'''
adam_epsilon=1e-08, 
batch_size=128, 
gpu_id=0, 
lr=5e-05, 
max_length=30, 
modality='text', 
model_fn='./saved/text/text.pth', 
n_epochs=6, 
num_b=57, num_d=404, num_m=552, num_s=3190, pretrained_model_name='kykim/bert-kor-base', train_fn='./data/train_df_negOne', use_albert=False, use_radam=True, valid_ratio=0.2, verbose=2, warmup_ratio=0.2)
'''

import argparse
import random
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from transformers import BertModel, BertTokenizerFast
from transformers import BertForSequenceClassification, AlbertForSequenceClassification
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

import torch_optimizer as custom_optim

# from process.bert_trainer import BertTrainer as Trainer
from process.bert_dataset1 import ClassificationDataset, ClassificationCollator
from process.utils import read_text

from model.multimodel import MultiModalClassifier, BertClassifier
from trainer_valid import ValidationForBert


def _str2bool(value):
    """Convert a command-line token such as ``true``/``False``/``1``/``0`` to bool.

    ``argparse`` with ``type=bool`` calls ``bool("False")``, which is ``True``
    because the string is non-empty; this converter reads the text instead.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('boolean value expected, got %r' % (value,))


def define_argparser(argv=None):
    """Declare every command-line option of the inference script and parse them.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) lets
            argparse read ``sys.argv[1:]``, exactly as before.

    Returns:
        argparse.Namespace with one attribute per option declared below.
    """
    p = argparse.ArgumentParser()

    # Training file name. // train_df1 is the variant with every label decremented by 1.
    p.add_argument('--train_fn', default='./data/train_df_negOne')
    # Recommended model list:
    # - kykim/bert-kor-base         # bs : 80
    # - kykim/albert-kor-base       # bs : 80
    # - beomi/kcbert-base           # bs : 80
    # - beomi/kcbert-large          # bs : 30
    # Name of the pretrained model to download (hub identifier), e.g. kykim/albert-kor-base.
    p.add_argument('--pretrained_model_name', type=str, default='kykim/bert-kor-base')
    p.add_argument('--use_albert', action='store_true', default=False)

    p.add_argument('--gpu_id', type=int, default=0)
    p.add_argument('--verbose', type=int, default=2)

    p.add_argument('--batch_size', type=int, default=128)

    # Transformers are tricky to train, hence the warm-up.
    p.add_argument('--warmup_ratio', type=float, default=.2)
    p.add_argument('--adam_epsilon', type=float, default=1e-8)
    # If you want to use RAdam, I recommend to use LR=1e-4.
    # Also, you can set warmup_ratio=0.
    # Parsed with _str2bool so that "--use_radam False" really disables it.
    p.add_argument('--use_radam', type=_str2bool, default=True)
    p.add_argument('--valid_ratio', type=float, default=.2)
    # get_optimizer() reads config.lr, so the option has to exist.
    p.add_argument('--lr', type=float, default=5e-5)  # 5e-5 / 0.001

    p.add_argument('--max_length', type=int, default=30)
    p.add_argument("--num_b", type=int, default=57)
    p.add_argument("--num_m", type=int, default=552)
    p.add_argument("--num_s", type=int, default=3190)
    p.add_argument("--num_d", type=int, default=404)
    p.add_argument("--scheduler", type=_str2bool, default=True)
    # Was 10 for everything except text; with the base model even 2 epochs perform decently.
    p.add_argument('--n_epochs', type=int, default=1)
    # A checkpoint has to be loaded in order to use a pre-trained model.
    p.add_argument("--load_model_path_text", default='./saved/text/text.06.-0.93-0.88-.0.68-0.09-.pth')
    p.add_argument("--load_model_path_img", default = './saved/img/img_ffn.09.-0.77-0.68-.0.66-0.88-.pth')

    ########     validating       ############
    p.add_argument("--test_fn", default='./data/test_df')                              ## test_df, dev_df
    p.add_argument("--dev_fn", default='./data/dev_df')                                ## test_df, dev_df
    p.add_argument("--test_img_path", type=str, default='./data/test_img_feat.h5')

    # This path has to be changed as well.
    p.add_argument("--img_path", type=str, default='./data/train_img_feat.h5')

    #########################################################    settings to change    #####################################################
    # True: evaluate on the validation part of the train/valid split. False: test.
    p.add_argument('--validation_mode', type=_str2bool, default=True)
    # When dev_mode is true, dev_df is used (instead of test_df).
    p.add_argument('--dev_mode', type=_str2bool, default=True)

    #########    output locations    ##############
    p.add_argument("--modality", type=str, default='both')                             # modality : text, img, both
    p.add_argument("--predict_fn", default = './predict/dev/multi/multi_simple_6th_predict.csv')
    p.add_argument("--logging_fn", default = './predict/dev/multi/multi_simple_6th_predict.log')

    p.add_argument("--load_model_path_both", default = './saved/multi/multi_simple_concat.06.-0.93-0.89-.0.90-0.97-.pth')

    p.add_argument("--n_block", default = 1, type=int)
    p.add_argument("--multiModal_type", default = 'cross')
    p.add_argument("--num_head", default = 1, type=int)
    p.add_argument("--config_fn", type=str, default='./predict/dev/multi/multi_simple_config')

    # Example test-set runs (chained with && in the original note). Template:
    #   python inference.py \
    #       --predict_fn ./predict/test/multi/test_multi_<tag>_6th_predict.csv \
    #       --logging_fn ./predict/test/multi/test_multi_<tag>_6th_predict.log \
    #       --n_block <n> [--num_head <h>] --multiModal_type <type> \
    #       --config_fn ./predict/test/multi/test_multi_<tag>_config \
    #       --load_model_path_both ./saved/multi/<checkpoint>
    #
    #   tag                 type    n_block  num_head  checkpoint
    #   simple              simple  2        -         multi_simple_concat.06.-0.93-0.89-.0.90-0.97-.pth
    #   cross_2block        cross   2        -         multi_cross.06.-0.94-0.90-.0.90-0.98-.pth
    #   cross_6block        cross   6        -         multi_cross_6block.06.-0.94-0.90-.0.90-0.98-.pth
    #   cross_12block       cross   12       -         multi_cross_12block.06.-0.94-0.90-.0.91-0.98-.pth
    #   cross_6block_2head  cross   6        2         multi_cross_6block_2head.06.-0.94-0.90-.0.91-0.98-.pth
    #   cross_6block_4head  cross   6        4         multi_cross_6block_4head.06.-0.94-0.90-.0.90-0.98-.pth
    #   cross_6block_6head  cross   6        6         multi_cross_6block_6head.06.-0.94-0.90-.0.91-0.97-.pth

    # TODO: check that the trainer's ">= 0" label test has been changed to ">= -1".

    config = p.parse_args(argv)

    return config


def get_loaders(fn, tokenizer, config, valid_ratio=.2 ):
    '''
        Read one CSV, shuffle its rows and split them into a training and a
        validation DataLoader.

        fn : path of the CSV; the columns 'product', 'pid', 'bcateid',
             'mcateid', 'scateid' and 'dcateid' are read
        tokenizer : tokenizer handed to ClassificationCollator
        config : object providing img_path, batch_size and max_length
        valid_ratio : fraction of the shuffled rows placed in the validation loader

        returns : (train_loader, valid_loader)
    '''
    frame = pd.read_csv(fn)
    # The row position is used as the index of the sample's image feature.
    frame['img_idx'] = frame.index

    products = frame['product']
    categories = [frame[col] for col in ('bcateid', 'mcateid', 'scateid', 'dcateid')]
    label_tuples = list(zip(*categories))
    image_ids = frame['img_idx']
    product_ids = frame['pid']

    # Bundle the fields per row before shuffling so they stay aligned.
    records = list(zip(products, label_tuples, image_ids, product_ids))
    random.shuffle(records)

    texts, labels, imgs, pids = [], [], [], []
    for text, label, img, pid in records:
        texts.append(text)
        labels.append(label)
        imgs.append(img)
        pids.append(pid)

    # Everything before `split` is training data, the rest is validation data.
    split = int(len(texts) * (1 - valid_ratio))

    train_set = ClassificationDataset(
        texts[:split], labels[:split], imgs[:split], config.img_path, pids[:split]
    )
    train_loader = DataLoader(
        train_set,
        batch_size=config.batch_size,
        shuffle=True,
        collate_fn=ClassificationCollator(tokenizer, config.max_length),
    )

    # train_df.iloc[idx:].to_csv("validation.csv", index=False)
    valid_set = ClassificationDataset(
        texts[split:], labels[split:], imgs[split:], config.img_path, pids[split:]
    )
    valid_loader = DataLoader(
        valid_set,
        batch_size=config.batch_size,
        collate_fn=ClassificationCollator(tokenizer, config.max_length),
    )

    return train_loader, valid_loader


def get_loaders_test(fn, tokenizer, config):
    '''
        Build a single DataLoader over every row of a CSV, in file order.

        fn : path of the CSV; the columns 'product', 'bcateid', 'mcateid',
             'scateid' and 'dcateid' are read
        tokenizer : tokenizer handed to ClassificationCollator
        config : object providing img_path, batch_size and max_length

        returns : the DataLoader
    '''
    frame = pd.read_csv(fn)
    print(frame.head())
    # The row position is used as the index of the sample's image feature.
    frame['img_idx'] = frame.index

    products = frame['product']
    categories = [frame[col] for col in ('bcateid', 'mcateid', 'scateid', 'dcateid')]
    label_tuples = list(zip(*categories))
    image_ids = frame['img_idx']

    # No shuffling and no split: the whole file becomes one loader.
    dataset = ClassificationDataset(products, label_tuples, image_ids, config.img_path)
    loader = DataLoader(
        dataset,
        batch_size=config.batch_size,
        collate_fn=ClassificationCollator(tokenizer, config.max_length),
    )
    return loader




def get_optimizer(model, config):
    """Create the optimizer selected by ``config`` for ``model``.

    With ``config.use_radam`` truthy an RAdam optimizer over all parameters
    is returned. Otherwise AdamW is returned with two parameter groups:
    weight decay 0.01 for ordinary parameters and 0.0 for parameters whose
    name contains 'bias' or 'LayerNorm.weight'.

    Reads ``config.lr`` in both cases and ``config.adam_epsilon`` for AdamW.
    """
    if config.use_radam:
        return custom_optim.RAdam(model.parameters(), lr=config.lr)

    # Prepare optimizer and schedule (linear warmup and decay)
    exempt_markers = ('bias', 'LayerNorm.weight')  # these get no weight decay
    decayed, exempt = [], []
    for name, param in model.named_parameters():
        if any(marker in name for marker in exempt_markers):
            exempt.append(param)
        else:
            decayed.append(param)

    grouped_parameters = [
        {'params': decayed, 'weight_decay': 0.01},
        {'params': exempt, 'weight_decay': 0.0},
    ]
    return optim.AdamW(grouped_parameters, lr=config.lr, eps=config.adam_epsilon)



# if __name__ == '__main__':
from easydict import EasyDict as edict

# The saved file is indexed with 'config'; that entry is turned into a plain
# dict with vars() and wrapped in an EasyDict so options read as attributes.
config = edict(vars(torch.load("./predict/valid/multi/multi_simple_config")['config']))
print(config)

print('modality : ', config.modality)
print(config.pretrained_model_name)
# Get pretrained tokenizer (it is fed suitably pre-processed text).
tokenizer = BertTokenizerFast.from_pretrained(config.pretrained_model_name)

# Get the dataloader to evaluate on, using the tokenizer on the untokenized corpus.
if not config.validation_mode:
    # Evaluate a held-out file: dev_df when dev_mode is set, otherwise test_df.
    if config.dev_mode:
        test_fn = config.dev_fn
    else:
        test_fn = config.test_fn
    valid_loader = get_loaders_test(test_fn, tokenizer, config=config)
else:
    # Evaluate the validation part of the train/valid split; the train loader is discarded.
    _, valid_loader = get_loaders(
        config.train_fn,
        tokenizer,
        config=config,
        valid_ratio=config.valid_ratio,
    )

# Keep the first batch around for inspection in the following cells.
mini = next(iter(valid_loader))



{'train_fn': './data/train_df_negOne', 'pretrained_model_name': 'kykim/bert-kor-base', 'use_albert': False, 'gpu_id': 0, 'verbose': 2, 'batch_size': 128, 'warmup_ratio': 0.2, 'adam_epsilon': 1e-08, 'use_radam': True, 'valid_ratio': 0.2, 'max_length': 30, 'num_b': 57, 'num_m': 552, 'num_s': 3190, 'num_d': 404, 'scheduler': True, 'n_epochs': 1, 'load_model_path_text': './saved/text/text.06.-0.93-0.88-.0.68-0.09-.pth', 'load_model_path_img': './saved/img/img_ffn.09.-0.77-0.68-.0.66-0.88-.pth', 'test_fn': './data/test_df', 'test_img_path': './data/test_img_feat.h5', 'img_path': './data/train_img_feat.h5', 'validation_mode': True, 'modality': 'both', 'predict_fn': './predict/valid/multi/multi_simple_6th_predict.csv', 'logging_fn': './predict/valid/multi/multi_simple_6th_predict.log', 'load_model_path_both': './saved/multi/multi_simple_concat.06.-0.93-0.89-.0.90-0.97-.pth', 'n_block': 6, 'multiModal_type': 'simple', 'num_head': 6, 'config_fn': './predict/valid/multi/multi_simple_config'}
mod

In [8]:
# Load the saved validation split and display the row(s) whose pid is 'P2412454373'.
aaa = pd.read_csv("validation.csv")
aaa[aaa.pid=='P2412454373']

Unnamed: 0,pid,product,bcateid,mcateid,scateid,dcateid,img_idx
430046,P2412454373,Volkswagen 폭스바겐 VW1419V-SVBK 본사정품 여성용,19,131,-2,-2,6937900


In [5]:
mini['pid']

['Q4564270519',
 'P2412454373',
 'X3283266744',
 'G4186461142',
 'V1262201399',
 'P4539388728',
 'N3746304406',
 'J3957102515',
 'Q3766398650',
 'K4154460947',
 'M2989848478',
 'H4681234207',
 'H4687095127',
 'Y4404398396',
 'W4249018378',
 'X4619873316',
 'N2482738071',
 'I4730373980',
 'R3752461526',
 'H4718653737',
 'T4252545707',
 'R2644558638',
 'L4315594982',
 'V2473580237',
 'P657149189',
 'T4042032222',
 'K3897213687',
 'W911937221',
 'F4532260967',
 'R3067581534',
 'N3315095863',
 'J4628193831',
 'J3782856182',
 'L4397981454',
 'U4602724536',
 'W3089018568',
 'O3922406370',
 'N2447670376',
 'V3416325644',
 'V3914769804',
 'J3727715433',
 'G4472201065',
 'L4283912820',
 'H3818151326',
 'K2644276145',
 'Y3762875550',
 'M3136315592',
 'H2942240660',
 'O3943822845',
 'P2940342524',
 'Q4667491460',
 'O3124112715',
 'T3005228340',
 'P3664407638',
 'J3564674393',
 'N3380241599',
 'G4731198379',
 'W4527296474',
 'M3877683604',
 'U4546833879',
 'U4391192401',
 'M3429278469',
 'S3880280

In [4]:
mini['labels']

tensor([[  23,  205,  323,   -2],
        [  19,  131,   -2,   -2],
        [  10,  222,   -2,   -2],
        [  16,  278,  719,   -2],
        [  31,  207,  327,   29],
        [  10,  219,  826,   -2],
        [  22,  410,   -2,   -2],
        [   5,    5,   32,   18],
        [  43,  136,  935,   -2],
        [   5,    5,    6,   13],
        [  10,  222,  902,   -2],
        [  39,  234,  574,   -2],
        [  15,  221,  792,   -2],
        [  39,  182,   -2,   -2],
        [  40,  127,  607,   -2],
        [  12,   11,  212,   -2],
        [  42,  108,  657,   -2],
        [   9,    8, 2427,   -2],
        [  33,  331,  836,   -2],
        [  40,  127, 1322,   -2],
        [   2,    2,   37,    9],
        [  27,  106,  250,   -2],
        [   6,    6,    9,   -2],
        [   8,    6,  171,   -2],
        [  14,  396, 1528,   -2],
        [  15,   37,  504,   -2],
        [  10,  222,   -2,   -2],
        [  17,   20,  400,   -2],
        [   6,  375,  837,   -2],
        [  22,

In [2]:

# Shapes of the four tensors in the batch fetched earlier.
print(*(mini[key].shape for key in ('input_ids', 'attention_mask', 'labels', 'imgs')))

# Number of batches times the batch size.
n_batches = len(valid_loader)
print('|valid| =', n_batches * config.batch_size)

n_total_iterations = n_batches * config.n_epochs
n_warmup_steps = int(n_total_iterations * config.warmup_ratio)
if config.scheduler:
    print('#total_iters =', n_total_iterations, '#warmup_iters =', n_warmup_steps)


#     #######################################################################################################################################
print(config.modality)
if config.modality not in ('text', 'img', 'both'):
    raise ValueError("check config.modality")

# Each modality has its own checkpoint option: load_model_path_text / _img / _both.
model = MultiModalClassifier(config=config)
package = torch.load(getattr(config, 'load_model_path_' + config.modality))['model']
# Only the 'both' checkpoint is loaded non-strictly, so that a partial match is accepted.
model.load_state_dict(package, strict=(config.modality != 'both'))
########################       What if not only Bert but also b_head were loaded here?    #########################
# text_backBone = model.BertModel
# config.modality = 'both'
# model = MultiModalClassifier(config=config, backbone = text_backBone).cuda()

# print(model)

######### testing 4,23 ##############
minibatch = next(iter(valid_loader))
input_ids, attention_mask, imgs, y = (
    minibatch[key] for key in ('input_ids', 'attention_mask', 'imgs', 'labels')
)

# b,m,s,d = model(input_ids, attention_mask, imgs)
# accuracy1 = (torch.argmax(b, dim=-1) == y[:,0]).sum() / (y[:,0]>=-1).sum().item()
# accuracy2 = (torch.argmax(m, dim=-1) == y[:,1]).sum() / (y[:,1]>=-1).sum().item()
# accuracy3 = (torch.argmax(s, dim=-1) == y[:,2]).sum() / ((y[:,2]>=-1).sum().item()+1e-06)
# accuracy4 = (torch.argmax(d, dim=-1) == y[:,3]).sum() / ((y[:,3]>=-1).sum().item()+1e-06)
# print(accuracy1,accuracy2,accuracy3,accuracy4)
# print(y)

#     # ############# # the trainer also has to be changed to handle four outputs.


#     # model_loader = AlbertForSequenceClassification
#     # model = model_loader.from_pretrained(
#     #     config.pretrained_model_name,
#     #     num_labels=config.num_b             # attach one extra layer at the <cls> token position.
#     # )
#     '''
#      (pooler): Linear(in_features=768, out_features=768, bias=True)
#     (pooler_activation): Tanh()
#         )
#         (dropout): Dropout(p=0.1, inplace=False)
#         (classifier): Linear(in_features=768, out_features=57, bias=True)
#         )

#     '''
#     ##############################################################


# optimizer = get_optimizer(model, config)

# By default, model returns a hidden representation before softmax func.
# Thus, we need to use CrossEntropyLoss, which combines LogSoftmax and NLLLoss.
# Targets equal to -2 are ignored by the loss.
crit = nn.CrossEntropyLoss(ignore_index=-2)
crit = crit.cuda()


if config.gpu_id >= 0:
    for module in (model, crit):
        module.cuda()




torch.Size([128, 30]) torch.Size([128, 30]) torch.Size([128, 4]) torch.Size([128, 2048])
|valid| = 1627008
#total_iters = 12711 #warmup_iters = 2542
both
model :  both


Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# Forward pass on one batch with every input moved to the GPU; the model's
# result is unpacked into four values (presumably one per category level
# b/m/s/d — confirm against MultiModalClassifier).
b,m,s,d = model(input_ids.to('cuda'), attention_mask.to("cuda"), imgs.to("cuda"))


In [11]:
config['validation_mode']

True

In [3]:
# Step-by-step version of the train/valid loader construction, kept at top
# level so the intermediate lists (texts, labels, imgs, idx) stay inspectable.
# Unlike get_loaders(), no pids are passed to the datasets here.

# No trailing comma: with one, this name was bound to the 1-tuple (ratio,)
# and the split below had to fall back to a hard-coded .2.
valid_ratio = config.valid_ratio

# Get list of labels and list of texts.
train_df = pd.read_csv("./data/train_df_negOne")
# The row position is used as the index of the sample's image feature.
train_df['img_idx'] = train_df.index

texts = train_df['product']
bcateid = train_df['bcateid']
mcateid = train_df['mcateid']
scateid = train_df['scateid']
dcateid = train_df['dcateid']
labels = list(zip(bcateid, mcateid, scateid, dcateid))
imgs = train_df['img_idx']

# Bundle the fields per row, then shuffle, so they stay aligned.
shuffled = list(zip(texts, labels, imgs))
random.shuffle(shuffled)
texts = [e[0] for e in shuffled]
labels = [e[1] for e in shuffled]
imgs = [e[2] for e in shuffled]
# Everything before idx is training data, the rest is validation data.
idx = int(len(texts) * (1 - valid_ratio))

# Get dataloaders using given tokenizer as collate_fn.
train_loader = DataLoader(
    ClassificationDataset(texts[:idx], labels[:idx], imgs[:idx], config.img_path),
    batch_size=config.batch_size,
    shuffle=True,
    collate_fn=ClassificationCollator(tokenizer, config.max_length),
)


# train_df.iloc[idx:].to_csv("validation.csv", index=False)
valid_loader = DataLoader(
    ClassificationDataset(texts[idx:], labels[idx:], imgs[idx:], config.img_path),
    batch_size=config.batch_size,
    shuffle=False,
    collate_fn=ClassificationCollator(tokenizer, config.max_length),
)

train_loader, valid_loader




(<torch.utils.data.dataloader.DataLoader at 0x7fedfc84f370>,
 <torch.utils.data.dataloader.DataLoader at 0x7fee16bc5190>)

In [2]:
train_df

NameError: name 'train_df' is not defined

In [13]:
vd = pd.read_csv("validation.csv")

In [33]:
import numpy as np

# For every row of the first validation batch, count how many of its label
# values equal [13, 12, -2, -2] position by position; the 1x4 list is
# broadcast against the batch's label array and the matches are summed per row.
np.sum(next(iter(valid_loader))['labels'].numpy() == [[13,12,-2,-2]], axis = 1)

array([1, 2, 1, 0, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 2, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 2, 3, 1, 2, 1, 2, 2, 0, 2, 0, 2, 1,
       1, 1, 0, 3, 2, 1, 2, 1, 1, 1, 1, 2, 1, 2, 1, 2, 0, 1, 0, 2, 1, 2,
       2, 2, 1, 1, 2, 1, 1, 0, 2, 2, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 2, 1,
       1, 1, 1, 1, 1, 2, 2, 1, 2, 2, 0, 1, 1, 2, 2, 1, 1, 1, 2, 1, 1, 1,
       1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 2, 1, 2, 1, 1, 1, 1])

In [24]:
labels[idx:]

[(19, 71, 94, -2),
 (35, 237, 395, 68),
 (19, 131, 194, -2),
 (13, 12, 181, -2),
 (22, 270, 496, -2),
 (6, 84, 118, -2),
 (3, 3, 524, -2),
 (16, 454, 2141, -2),
 (4, 144, 516, -2),
 (27, 93, 134, -2),
 (48, 192, 1561, -2),
 (17, 314, -2, -2),
 (2, 2, -2, -2),
 (13, 12, -2, -2),
 (34, 343, 1983, -2),
 (1, 65, 684, 102),
 (19, 131, 194, -2),
 (15, 217, 505, -2),
 (25, 157, 310, 27),
 (39, 117, 1377, -2),
 (7, 244, 417, -2),
 (10, 219, 2816, -2),
 (8, 76, -2, -2),
 (36, 166, 1328, -2),
 (31, 56, 66, -2),
 (18, 21, -2, -2),
 (24, 35, 153, -2),
 (34, 172, 1837, -2),
 (8, 88, 216, -2),
 (32, 111, 2136, -2),
 (10, 180, 2705, -2),
 (20, 138, 203, -2),
 (39, 190, 847, -2),
 (8, 60, 230, -2),
 (1, 198, 424, -2),
 (5, 5, 32, 26),
 (3, 3, 524, -2),
 (19, 59, 69, -2),
 (8, 110, -2, -2),
 (11, 169, 2912, -2),
 (1, 198, 958, -2),
 (2, 2, 37, 5),
 (23, 30, 33, -2),
 (38, 288, 580, -2),
 (5, 13, 148, -2),
 (13, 48, 1381, -2),
 (43, 136, -2, -2),
 (26, 210, 1795, -2),
 (34, 453, -2, -2),
 (8, 76, 840, -

In [19]:
vd

Unnamed: 0,pid,product,bcateid,mcateid,scateid,dcateid,img_idx
0,X3992887813,덱케 O史：O DE1H9ABG052WSMT,13,12,-2,-2,6507854
1,H4214973612,화미 황엿 ( 소 ) 3kg x 5,26,44,1323,-2,6507855
2,I4580515344,[글로벌샵]독일 Nutreov Capileov Double Action Anti-H...,8,50,59,-2,6507856
3,O4211452195,아디다스 CE9026 CE9036 CV3997 CV3988 트레이닝 세트,5,5,49,-2,6507857
4,O4669409313,삐까 펄 중지갑 여성지갑 지갑 여성중지갑 손지갑 여,13,16,103,41,6507858
...,...,...,...,...,...,...,...
1626959,T4336486181,스킨다임 핸드크림 피치향(60g)핸드로션 면세점 입점,36,348,948,-2,8134813
1626960,X4551630682,옥상트랩(하수구용 옥상 트랩)사이즈_100파이,39,99,1929,-2,8134814
1626961,P4506131881,[KB 5% 청구할인]소야 아름 사각종지 (5컬러) 사각종지 종지 여주도자,15,37,1419,-2,8134815
1626962,G4716723552,모서리안전보호대 대 브라운 2M 유아안전 아이,22,424,2193,-2,8134816


In [18]:
next(iter(v_l[1]))['labels']

tensor([[  10,   15,   17,   -2],
        [  13,  141,  210,   -2],
        [  17,   20,   22,   -2],
        [  12,  151, 1104,   -2],
        [   8,    6,    9,   -2],
        [  16,  454, 2162,   -2],
        [   7,   79,  394,   -2],
        [  16,  178,  751,   -2],
        [  16,   55,  182,   -2],
        [  27,   93,  757,   -2],
        [  34,  326,  784,   -2],
        [  13,  250,  438,   -2],
        [   8,  110,   -2,   -2],
        [  13,   48,   58,   -2],
        [  52,  463,   -2,   -2],
        [   5,    5,   49,   -2],
        [  15,   68,   89,   -2],
        [  15,  221,  360,   -2],
        [  10,  219,  357,   -2],
        [  13,   12,  234,   -2],
        [  12,   11,  251,   -2],
        [  27,   64,   93,   -2],
        [  10,  180,   -2,   -2],
        [  52,  463, 2113,  191],
        [  27,   57,  649,   -2],
        [  17,   82,  249,   -2],
        [  50,  239,  604,   79],
        [  38,   97,  639,   -2],
        [  49,  255,   -2,   -2],
        [   5,

In [15]:
next(iter(v_l))['labels']

TypeError: 'DataLoader' object is not subscriptable

In [None]:

def get_loaders_test(fn, tokenizer, config):
    '''
        Build a single DataLoader over every row of a CSV, in file order.

        fn : path of the CSV; the columns 'product', 'bcateid', 'mcateid',
             'scateid' and 'dcateid' are read
        tokenizer : tokenizer handed to ClassificationCollator
        config : object providing img_path, batch_size and max_length

        returns : the DataLoader
    '''
    frame = pd.read_csv(fn)
    print(frame.head())
    # The row position is used as the index of the sample's image feature.
    frame['img_idx'] = frame.index

    products = frame['product']
    categories = [frame[col] for col in ('bcateid', 'mcateid', 'scateid', 'dcateid')]
    label_tuples = list(zip(*categories))
    image_ids = frame['img_idx']

    # No shuffling and no split: the whole file becomes one loader.
    dataset = ClassificationDataset(products, label_tuples, image_ids, config.img_path)
    loader = DataLoader(
        dataset,
        batch_size=config.batch_size,
        collate_fn=ClassificationCollator(tokenizer, config.max_length),
    )
    return loader



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/media/deep/kakao/multi


In [20]:
from distutils import text_file
import torch
from torch.utils.data import Dataset
import h5py

class ClassificationDataset(Dataset):
    '''
        Dataset that tokenizes one text per access and pairs it with its
        label and its image feature read from an HDF5 file.
    '''
    def __init__(self, texts, labels, img, img_path, tokenizer, max_len=30):
        '''
            texts : indexable collection of texts; its length is the dataset length
            labels : indexable collection of labels, addressed with the same index
            img : indexable collection of positions inside the HDF5 'img_feat' dataset
            img_path : path of the HDF5 file holding 'img_feat'
            tokenizer : callable applied to a single text in __getitem__
            max_len : max_length passed to the tokenizer
        '''
        self.texts, self.labels = texts, labels
        self.imgs, self.img_h5_path = img, img_path
        self.tokenizer, self.max_length = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        '''
            Return a dict with 'input_ids', 'attention_mask', 'labels' and 'imgs'
            for the sample at position `item`.
        '''
        sample_text = str(self.texts[item])
        sample_label = self.labels[item]

        # The HDF5 file is opened and closed on every access.
        with h5py.File(self.img_h5_path, 'r') as h5_file:
            raw_feature = h5_file['img_feat'][self.imgs[item]]

        # Flatten the feature into a single row: shape (1, n_values).
        feature_row = torch.FloatTensor(raw_feature).reshape(1,-1)

        # One text per call, so the leading dimension of the encoded tensors is 1.
        encoded = self.tokenizer(
            sample_text,
            padding=True,
            truncation=True,      # cut at max_length
            return_tensors="pt",  # return PyTorch tensors
            max_length=self.max_length
        )

        return {
            'input_ids': encoded['input_ids'],
            'attention_mask': encoded['attention_mask'],
            'labels': torch.tensor(sample_label, dtype=torch.long),
            'imgs': feature_row,
        }

# v_load = DataLoader(ClassificationDataset(texts[idx:], labels[idx:], imgs[idx:], config.img_path, tokenizer),
#                     batch_size = 128)
# bb = next(iter(v_load))

# bb['input_ids'].shape

# Tokenize a single text with the same arguments the dataset above uses and
# show the attention-mask shape. With only one input, padding=True has nothing
# longer to pad against, so the length is the text's own token count (18 in
# the recorded output) rather than max_length.
tokenizer(texts[0],
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=30)['attention_mask'].shape

torch.Size([1, 18])

In [8]:
bb

{'input_ids': tensor([[    2, 17796, 28671, 15419,     3]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1]]),
 'labels': tensor([  3, 274, 514,  -2]),
 'imgs': tensor([[0.5816, 0.0561, 0.6432,  ..., 0.4215, 0.0869, 0.0255]])}