In [57]:
!pip --trusted-host pypi.org --trusted-host files.pythonhosted.org install -r requirements.txt -qq

import os
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from transformers import *
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score
from ast import literal_eval

In [3]:
# Environment setup
## GPU selection
# Must be set BEFORE the first CUDA query (torch.cuda.is_available() below):
# the CUDA runtime reads CUDA_VISIBLE_DEVICES when it is first initialised,
# so assigning it afterwards may have no effect on device visibility.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

## Constants
VER = 26
# Alternative locations for previously exported artefacts (kept for reference):
# LOAD_TOKENS_FROM = 'input/py-bigbird-v26'
# LOAD_MODEL_FROM = 'input/py-bigbird-v26'
# DOWNLOADED_MODEL_PATH = 'input/py-bigbird-v26'
DOWNLOADED_MODEL_PATH = 'model'  # directory the tokenizer/config/weights are saved to
MODEL_NAME = 'google/bigbird-roberta-base'
CONFIG = {
    'model_name': MODEL_NAME,
    'max_length': 1024,
    'train_batch_size': 4,
    'valid_batch_size': 4,
    'epochs': 5,
    # Five entries for five epochs — presumably one learning rate per epoch;
    # confirm against the training loop.
    'learning_rates': [2.5e-5, 2.5e-5, 2.5e-6, 2.5e-6, 2.5e-7],
    'max_grad_norm': 10,
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
}
# True when data/test holds at most 5 entries.
# NOTE(review): presumably this detects the small public sample test set, in
# which case a validation score is computed locally — confirm intent.
COMPUTE_VAL_SCORE = len(os.listdir('data/test')) <= 5

## Output directories
os.makedirs('result', exist_ok=True)
os.makedirs(DOWNLOADED_MODEL_PATH, exist_ok=True)

In [20]:
## Model setup
# Download the pretrained tokenizer, config and token-classification backbone
# for MODEL_NAME and store a local copy of each under DOWNLOADED_MODEL_PATH.
NUM_LABELS = 15  # number of output labels for the token-classification head

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, add_prefix_space=True)
tokenizer.save_pretrained(DOWNLOADED_MODEL_PATH)

# Keyword arguments matching config attributes override the loaded values,
# so the label count is fixed at construction time instead of patched afterwards.
config_model = AutoConfig.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)
config_model.save_pretrained(DOWNLOADED_MODEL_PATH)

# The classification head is newly initialised (the checkpoint's `cls.*`
# pre-training weights are reported as unused in the cell output), so the
# model has to be fine-tuned before its predictions are meaningful.
backbone = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, config=config_model)
backbone.save_pretrained(DOWNLOADED_MODEL_PATH)

Downloading: 100%|██████████| 489M/489M [00:43<00:00, 11.7MB/s]    
Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForTokenClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BigBirdForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassifi

In [55]:
# Load data
## Download (using Kaggle API)
# !kaggle competitions download -q -c feedback-prize-2021
# !unzip feedback-prize-2021.zip -d data


def _read_text_dir(directory):
    """Read every file in `directory` as UTF-8 text.

    Args:
        directory: Path of a folder containing one text file per document.

    Returns:
        A `(ids, texts)` pair of equally long lists. Each id is the file name
        with a trailing '.txt' removed; each text is the full file content.
        Files are visited in sorted name order so the resulting row order is
        reproducible (`os.listdir` itself returns entries in arbitrary order).
    """
    ids, texts = [], []
    for file_name in tqdm(sorted(os.listdir(directory))):
        # Strip only the extension, not every occurrence of '.txt' in the name.
        ids.append(file_name[:-len('.txt')] if file_name.endswith('.txt') else file_name)
        # Context manager closes each handle promptly instead of leaking it.
        with open(os.path.join(directory, file_name), 'r', encoding='utf8') as handle:
            texts.append(handle.read())
    return ids, texts


## train.csv
train_df = pd.read_csv('data/train.csv')

## train
train_ids, train_texts = _read_text_dir('data/train')
train_texts_df = pd.DataFrame({'id': train_ids, 'text': train_texts})

## test
test_ids, test_texts = _read_text_dir('data/test')
test_texts_df = pd.DataFrame({'id': test_ids, 'text': test_texts})

100%|██████████| 15594/15594 [00:00<00:00, 17578.89it/s]
100%|██████████| 5/5 [00:00<00:00, 5017.11it/s]
