In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM so the dataset files become readable.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
# Directory (with trailing slash) that holds the contest CSV files on the
# mounted Drive; later cells build file paths by string concatenation.
drive_root = '/content/drive/MyDrive'
data_path = drive_root + '/KUBIG24_1/NLP/CONTEST/'

In [None]:
!pip install transformers

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import transformers
from transformers import AutoTokenizer, AdamW, RobertaForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification

import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset

from tqdm.notebook import tqdm

In [None]:
# Load the cleaned training data.
train = pd.read_csv(data_path + 'train_cleaned.csv')

# Stack the two question columns into a single text column and repeat the
# category column so every stacked question keeps its label.
# ignore_index=True gives the stacked frame a unique 0..2n-1 index.
questions = pd.concat([train['질문_1'], train['질문_2']], ignore_index=True)
categories = pd.concat([train['category'], train['category']], ignore_index=True)
train = pd.concat([questions, categories], axis=1)
train.columns = ['질문', 'category']

# Category name -> integer class id used as the classification target.
CATEGORY_TO_ID = {
    '마감재': 0,
    '인테리어': 1,
    '시공': 2,
    '마감하자': 3,
    '건축구조': 4,
    '기타': 5,
    '타 마감하자': 6,
}

# Fail early on labels the mapping does not know; otherwise they would stay
# as strings (or NaN) and only blow up later inside the DataLoader / loss.
unknown_categories = set(train['category'].unique()) - set(CATEGORY_TO_ID)
if unknown_categories:
    raise ValueError(
        f"Unmapped category values in train_cleaned.csv: {unknown_categories}"
    )

# A single vectorised map yields a proper int64 label column.
train['category'] = train['category'].map(CATEGORY_TO_ID).astype('int64')

train.head()

In [None]:
# Read the test set and discard its 'id' column, leaving only the columns
# used for inference; the frame is displayed as the cell output.
test = pd.read_csv(data_path + 'test.csv').drop(columns=['id'])
test

In [None]:
# Hold out 20% of the stacked training rows for validation. The fixed seed
# keeps the split reproducible. NOTE: this rebinds `train`, so re-running
# this cell alone would split the already reduced training set again.
VAL_FRACTION = 0.2
SPLIT_SEED = 2021
train, val = train_test_split(
    train,
    test_size=VAL_FRACTION,
    random_state=SPLIT_SEED,
)

In [None]:
class NTDataset(Dataset):
    """Labelled text dataset that tokenizes one DataFrame row per item.

    Column 0 of the frame is read as the text and the column at
    ``label_idx`` (the last column by default) as the label.

    Args:
        csv_file: DataFrame with the text in its first column and the label
            in its last column.
        tokenizer: Optional, already loaded tokenizer to reuse. When omitted,
            the ``klue/roberta-large`` tokenizer is loaded via AutoTokenizer.
        max_length: Fixed token length every sample is truncated/padded to.
    """

    def __init__(self, csv_file, tokenizer=None, max_length=14):
        self.dataset = csv_file
        # Reuse a caller-supplied tokenizer so it is not reloaded for every
        # dataset instance; fall back to the original default otherwise.
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained("klue/roberta-large")
        self.tokenizer = tokenizer
        self.max_length = max_length
        print(self.dataset.describe())

        # The last column of the DataFrame is used as the label.
        # Change this index to use a different column as the label.
        self.label_idx = -1

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` for row ``idx``."""
        row = self.dataset.iloc[idx].values
        # str() guards against non-string cells (e.g. NaN), which the
        # tokenizer would otherwise reject.
        text = str(row[0])
        y = row[self.label_idx]  # label taken from the configured column
        inputs = self.tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=self.max_length,
            # `padding='max_length'` replaces the deprecated
            # `pad_to_max_length=True` and pads every sample to max_length.
            padding='max_length',
            add_special_tokens=True
        )

        # return_tensors='pt' adds a leading batch dimension of 1; drop it.
        input_ids = inputs['input_ids'][0]
        attention_mask = inputs['attention_mask'][0]

        return input_ids, attention_mask, y


In [None]:
class NTDataset_test(Dataset):
    """Unlabelled text dataset used for inference.

    Reads the text from the ``'질문'`` column of the given DataFrame and
    returns tokenized tensors without a label.

    Args:
        csv_file: DataFrame containing a ``'질문'`` column.
        tokenizer: Optional, already loaded tokenizer to reuse. When omitted,
            the ``klue/roberta-large`` tokenizer is loaded via AutoTokenizer.
        max_length: Fixed token length every sample is truncated/padded to.
    """

    def __init__(self, csv_file, tokenizer=None, max_length=14):
        self.dataset = csv_file
        # Accepting a tokenizer keeps both call styles used in this notebook
        # working: NTDataset_test(test) and NTDataset_test(test, tokenizer).
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained("klue/roberta-large")
        self.tokenizer = tokenizer
        self.max_length = max_length

        print(self.dataset.describe())

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` for row ``idx``."""
        row = self.dataset.iloc[idx]
        # Read the text from the question column; str() guards against
        # non-string cells (e.g. NaN) that the tokenizer would reject.
        text = str(row['질문'])
        inputs = self.tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=self.max_length,
            # `padding='max_length'` replaces the deprecated
            # `pad_to_max_length=True` and pads every sample to max_length.
            padding='max_length',
            add_special_tokens=True
        )

        # return_tensors='pt' adds a leading batch dimension of 1; drop it.
        input_ids = inputs['input_ids'][0]
        attention_mask = inputs['attention_mask'][0]

        return input_ids, attention_mask


In [None]:
# Only the inference dataset is built in this cell; the train/validation
# dataset construction is left disabled here.
#train_dataset = NTDataset(train)
#val_dataset = NTDataset(val)

test_dataset = NTDataset_test(csv_file=test)

In [None]:
# klue/roberta-large is published with a BERT-style WordPiece vocabulary, so
# RobertaTokenizer (which expects BPE vocab/merges files) cannot load it.
# AutoTokenizer resolves the correct tokenizer class for the checkpoint.
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-large")

# Create the test dataset object and make it use the shared tokenizer.
# The single-argument constructor is used so this cell works with the
# dataset class signature `NTDataset_test(csv_file)`.
test_dataset = NTDataset_test(test)
test_dataset.tokenizer = tokenizer

In [None]:
# Prefer the GPU when one is visible (relatively fast); otherwise fall back
# to the CPU, which is very slow for training and evaluating this model.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

In [None]:
# Load the pretrained klue/roberta-large checkpoint with a 7-label
# sequence-classification head, then move it to the selected device.
model = RobertaForSequenceClassification.from_pretrained(
    "klue/roberta-large",
    num_labels=7,
)
model = model.to(device)

In [None]:
# Training hyperparameters: number of passes over the training set and the
# mini-batch size shared by all DataLoaders.
epochs, batch_size = 30, 32

In [None]:
# The training and validation cells below use `optimizer`, `train_loader`
# and `val_loader`, so they must be defined here for the notebook to survive
# Restart & Run All (they were previously commented out -> NameError).
optimizer = AdamW(model.parameters(), lr=1e-5)

# The datasets are built inline so this cell does not depend on dataset
# variables that may not exist. Only the training data is shuffled;
# validation and test keep their row order so results are deterministic
# and test predictions line up with the rows of `test`.
train_loader = DataLoader(NTDataset(train), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(NTDataset(val), batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# train
# Per-epoch history: mean loss per sample and accuracy, as plain floats.
losses = []
accuracies = []

for epoch in range(epochs):

    model.train()

    # Reset the running statistics every epoch; otherwise the reported loss
    # and accuracy are averages over all epochs seen so far.
    total_loss = 0.0
    correct = 0
    total = 0

    for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
        optimizer.zero_grad()
        y_batch = y_batch.to(device)
        # Index 0 of the model output holds the classification logits.
        y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
        loss = F.cross_entropy(y_pred, y_batch)
        loss.backward()
        optimizer.step()

        batch_len = len(y_batch)
        # cross_entropy returns the batch mean; weight it by the batch size
        # so dividing by the sample count below gives a true per-sample mean.
        total_loss += loss.item() * batch_len

        _, predicted = torch.max(y_pred, 1)
        # .item() keeps the counter a Python int instead of a device tensor.
        correct += (predicted == y_batch).sum().item()
        total += batch_len

    # max(total, 1) avoids a ZeroDivisionError when the loader is empty.
    epoch_loss = total_loss / max(total, 1)
    epoch_accuracy = correct / max(total, 1)
    losses.append(epoch_loss)
    accuracies.append(epoch_accuracy)
    print("Train Loss:", epoch_loss, "Accuracy:", epoch_accuracy)

In [None]:
# validation
model.eval()

pred = []
correct = 0
total = 0

# Gradients are not needed for evaluation; no_grad avoids building the
# autograd graph and reduces memory use.
with torch.no_grad():
    for input_ids_batch, attention_masks_batch, y_batch in tqdm(val_loader):
        y_batch = y_batch.to(device)
        # Index 0 of the model output holds the classification logits.
        y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
        _, predicted = torch.max(y_pred, 1)
        # Store plain ints (as the test cell does) rather than device tensors.
        pred.extend(predicted.tolist())
        correct += (predicted == y_batch).sum().item()
        total += len(y_batch)

# Guard against an empty validation loader.
val_accuracy = correct / total if total else float('nan')
print("val accuracy:", val_accuracy)

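Validation results by configuration:

- epoch 10, batch 16 -> val accuracy 0.9845
- epoch 30, batch 32 -> val accuracy 0. (TODO: the remaining digits were not recorded; fill in the measured value)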

In [None]:
# Add a placeholder 'category' column (all zeros) to the test frame in place;
# a later cell overwrites it with category names derived from 'category_idx'.
test['category'] = 0

In [None]:
# Cast the question column to str in place so every cell handed to the
# tokenizer is a string. The column is assigned on the existing `test`
# object, which is the same frame the test dataset was constructed from.
test['질문'] = test['질문'].astype(str)

In [None]:
# Display the test frame to inspect the columns prepared above.
test

In [None]:
# test
# Run the model over the test loader without gradients and collect the
# predicted class index of every row, in loader order, as plain ints.
model.eval()

pred = []

with torch.no_grad():
    for batch_ids, batch_mask in tqdm(test_loader):
        # Index 0 of the model output holds the classification logits.
        logits = model(batch_ids.to(device), attention_mask=batch_mask.to(device))[0]
        predicted = torch.max(logits, 1).indices
        pred += predicted.tolist()

In [None]:
# Attach the predicted class indices to the test frame; `pred` holds one
# entry per row appended in test_loader order.
test['category_idx'] = pred

In [None]:
# Class id -> category name (inverse of the encoding applied to the
# training labels).
ID_TO_CATEGORY = {
    0: '마감재',
    1: '인테리어',
    2: '시공',
    3: '마감하자',
    4: '건축구조',
    5: '기타',
    6: '타 마감하자',
}

# Replace the whole 'category' column in one vectorised step. Writing strings
# row-by-row with .loc into the integer placeholder column is an
# incompatible-dtype assignment that newer pandas versions warn about.
test['category'] = test['category_idx'].map(ID_TO_CATEGORY)

In [None]:
# Write the test frame, including the predicted category columns, to a CSV
# in the current working directory; the DataFrame index is not written.
output_csv = 'test_w_category2.csv'
test.to_csv(output_csv, index=False)