In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from tqdm import tqdm
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig



In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

def set_seed(seed=42):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
set_seed()

class CFG:
    max_len = 512
    batch_size = 8  # Effective batch size will be batch_size * accumulation_steps
    learning_rate = 2e-5  # Adjusted learning rate
    epochs = 12  # You can adjust this based on your needs
    device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
    accumulation_steps = 4  # Gradient accumulation steps

# 데이터 로드 및 전처리
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

train_df.dropna(inplace=True)
train_df.drop_duplicates(subset=['제목', '키워드'], keep='first', inplace=True)

def normalize_title(text):
    text = re.sub(r'\s+', ' ', text).strip()
    return text.strip()

def normalize_keywords(text):
    text = re.sub(r'[^가-힣\s,]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text.strip()

def clean_keywords(keywords):
    keywords = keywords.split(',')
    keywords = [keyword.strip() for keyword in keywords if keyword.strip()]
    return ' '.join(keywords)

train_df['title'] = train_df['제목'].apply(normalize_title)
train_df['keywords'] = train_df['키워드'].apply(lambda x: clean_keywords(normalize_keywords(x)))
train_df['text'] = train_df['title'] + ' [SEP] ' + train_df['keywords']

test_df['title'] = test_df['제목'].apply(normalize_title)
test_df['keywords'] = test_df['키워드'].apply(lambda x: clean_keywords(normalize_keywords(x)))
test_df['text'] = test_df['title'] + ' [SEP] ' + test_df['keywords']

# 레이블 인코딩
label_encoder = {label: i for i, label in enumerate(train_df['분류'].unique())}
train_df['label'] = train_df['분류'].map(label_encoder)

# 학습 및 검증 데이터 분할
train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['분류'], random_state=42)

# TF-IDF 벡터라이저 초기화 및 적용
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # 상위 1000개 특징만 사용
train_tfidf = tfidf_vectorizer.fit_transform(train_df['text'])
val_tfidf = tfidf_vectorizer.transform(val_df['text'])
test_tfidf = tfidf_vectorizer.transform(test_df['text'])

class TfidfEnhancedDataset(Dataset):
    def __init__(self, texts, tfidf_features, labels, tokenizer, max_len=512):
        self.texts = texts
        self.tfidf_features = tfidf_features
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, item):
        text = str(self.texts[item])
        tfidf_feature = torch.FloatTensor(self.tfidf_features[item].toarray().squeeze())
        label = self.labels[item] if self.labels is not None else -1
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'tfidf_features': tfidf_feature,
            'labels': torch.tensor(label, dtype=torch.long)
        }

class TfidfEnhancedClassifier(nn.Module):
    def __init__(self, base_model_name, num_labels, tfidf_dim):
        super(TfidfEnhancedClassifier, self).__init__()
        self.base_model = AutoModel.from_pretrained(base_model_name, output_hidden_states=True)
        self.dropout = nn.Dropout(0.1)
        self.attention = nn.Linear(self.base_model.config.hidden_size, 1)
        self.tfidf_layer = nn.Linear(tfidf_dim, self.base_model.config.hidden_size)
        self.classifier = nn.Linear(self.base_model.config.hidden_size * 2, num_labels)
    
    def forward(self, input_ids, attention_mask, tfidf_features):
        outputs = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        hidden_states = outputs.last_hidden_state
        
        attn_scores = self.attention(hidden_states).squeeze(-1)
        attn_weights = torch.softmax(attn_scores, dim=-1)
        context_vector = torch.sum(attn_weights.unsqueeze(-1) * hidden_states, dim=1)
        
        tfidf_vector = self.tfidf_layer(tfidf_features)
        
        combined_vector = torch.cat([context_vector, tfidf_vector], dim=1)
        pooled_output = self.dropout(combined_vector)
        logits = self.classifier(pooled_output)
        
        return logits

model_path = "klue-roberta-large"  

# 설정 로드 및 수정
config = AutoConfig.from_pretrained(model_path)
config.num_labels = len(label_encoder)  

# 토크나이저 로드
tokenizer = AutoTokenizer.from_pretrained(model_path)

# 모델 생성
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    config=config
)

# 저장된 가중치 로드
state_dict = torch.load(f"{model_path}/pytorch_model.bin")

# 불필요한 키 제거
for key in list(state_dict.keys()):
    if key.startswith('lm_head') or key == 'roberta.embeddings.position_ids':
        del state_dict[key]

# 모델에 가중치 로드
model.load_state_dict(state_dict, strict=False)

# GPU로 모델 이동 (필요한 경우)
model.to(CFG.device)

print("Model loaded successfully!")

# 데이터셋 및 데이터로더 생성
train_dataset = TfidfEnhancedDataset(train_df.text.tolist(), train_tfidf, train_df.label.tolist(), tokenizer, CFG.max_len)
val_dataset = TfidfEnhancedDataset(val_df.text.tolist(), val_tfidf, val_df.label.tolist(), tokenizer, CFG.max_len)
test_dataset = TfidfEnhancedDataset(test_df.text.tolist(), test_tfidf, None, tokenizer, CFG.max_len)

train_loader = DataLoader(train_dataset, batch_size=CFG.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=CFG.batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False)

# 모델 생성
num_labels = len(label_encoder)
tfidf_dim = train_tfidf.shape[1]
model = TfidfEnhancedClassifier(model_path, num_labels, tfidf_dim).to(CFG.device)

# 클래스 가중치 계산
class_weights = compute_class_weight('balanced', classes=np.unique(train_df['label']), y=train_df['label'])
class_weights = torch.tensor(class_weights, dtype=torch.float).to(CFG.device)

# 손실 함수 및 옵티마이저 설정
loss_fn = nn.CrossEntropyLoss(weight=class_weights)
optimizer = AdamW(model.parameters(), lr=CFG.learning_rate)

# 학습률 스케줄러 설정
total_steps = len(train_loader) * CFG.epochs // CFG.accumulation_steps
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=int(0.1 * total_steps),
                                            num_training_steps=total_steps)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  state_dict = torch.load(f"{model_path}/pytorch_model.bin")


Model loaded successfully!


Some weights of RobertaModel were not initialized from the model checkpoint at klue-roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# 학습 루프
for epoch in range(CFG.epochs):
    model.train()
    train_loss = 0
    optimizer.zero_grad()
    for idx, batch in enumerate(tqdm(train_loader, desc=f'Epoch {epoch + 1}/{CFG.epochs}')):
        input_ids = batch['input_ids'].to(CFG.device)
        attention_mask = batch['attention_mask'].to(CFG.device)
        tfidf_features = batch['tfidf_features'].to(CFG.device)
        labels = batch['labels'].to(CFG.device)
        
        outputs = model(input_ids, attention_mask=attention_mask, tfidf_features=tfidf_features)
        loss = loss_fn(outputs, labels)
        loss = loss / CFG.accumulation_steps
        train_loss += loss.item()
        
        loss.backward()
        
        if (idx + 1) % CFG.accumulation_steps == 0 or (idx + 1) == len(train_loader):
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
    
    avg_train_loss = train_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{CFG.epochs}, Train Loss: {avg_train_loss:.4f}")
    # 검증
    model.eval()
    val_predictions = []
    val_true_labels = []
    val_loss = 0
    
    with torch.no_grad():
        for batch in tqdm(val_loader, desc='Validating'):
            input_ids = batch['input_ids'].to(CFG.device)
            attention_mask = batch['attention_mask'].to(CFG.device)
            tfidf_features = batch['tfidf_features'].to(CFG.device)
            labels = batch['labels'].to(CFG.device)
            
            outputs = model(input_ids, attention_mask=attention_mask, tfidf_features=tfidf_features)
            loss = loss_fn(outputs, labels)
            val_loss += loss.item()
            
            _, preds = torch.max(outputs, dim=1)
            val_predictions.extend(preds.cpu().tolist())
            val_true_labels.extend(labels.cpu().tolist())
    
    val_f1 = f1_score(val_true_labels, val_predictions, average='macro')
    val_accuracy = accuracy_score(val_true_labels, val_predictions)
    avg_val_loss = val_loss / len(val_loader)
    print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}, Validation F1 Score: {val_f1:.4f}")

Epoch 1/12: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5432/5432 [50:58<00:00,  1.78it/s]


Epoch 1/12, Train Loss: 0.5865


Validating: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1358/1358 [04:27<00:00,  5.08it/s]


Validation Loss: 1.3702, Validation Accuracy: 0.5781, Validation F1 Score: 0.4330


Epoch 2/12: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5432/5432 [50:59<00:00,  1.78it/s]


Epoch 2/12, Train Loss: 0.3124


Validating: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1358/1358 [04:27<00:00,  5.08it/s]


Validation Loss: 1.1426, Validation Accuracy: 0.5846, Validation F1 Score: 0.4998


Epoch 3/12: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5432/5432 [50:59<00:00,  1.78it/s]


Epoch 3/12, Train Loss: 0.2316


Validating: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1358/1358 [04:27<00:00,  5.08it/s]


Validation Loss: 1.0139, Validation Accuracy: 0.6491, Validation F1 Score: 0.5721


Epoch 4/12: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5432/5432 [50:59<00:00,  1.78it/s]


Epoch 4/12, Train Loss: 0.1776


Validating: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1358/1358 [04:27<00:00,  5.08it/s]


Validation Loss: 0.9845, Validation Accuracy: 0.6207, Validation F1 Score: 0.5595


Epoch 5/12: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5432/5432 [51:00<00:00,  1.78it/s]


Epoch 5/12, Train Loss: 0.1294


Validating: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1358/1358 [04:27<00:00,  5.08it/s]


Validation Loss: 0.9702, Validation Accuracy: 0.7009, Validation F1 Score: 0.6174


Epoch 6/12: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5432/5432 [50:59<00:00,  1.78it/s]


Epoch 6/12, Train Loss: 0.0979


Validating: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1358/1358 [04:27<00:00,  5.08it/s]


Validation Loss: 0.9915, Validation Accuracy: 0.6673, Validation F1 Score: 0.6153


Epoch 7/12: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5432/5432 [51:00<00:00,  1.78it/s]


Epoch 7/12, Train Loss: 0.0676


Validating: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1358/1358 [04:27<00:00,  5.08it/s]


Validation Loss: 1.0753, Validation Accuracy: 0.7351, Validation F1 Score: 0.6388


Epoch 8/12: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5432/5432 [51:00<00:00,  1.77it/s]


Epoch 8/12, Train Loss: 0.0477


Validating: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1358/1358 [04:27<00:00,  5.08it/s]


Validation Loss: 1.0911, Validation Accuracy: 0.7359, Validation F1 Score: 0.6430


Epoch 9/12: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5432/5432 [50:59<00:00,  1.78it/s]


Epoch 9/12, Train Loss: 0.0344


Validating: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1358/1358 [04:27<00:00,  5.08it/s]


Validation Loss: 1.1614, Validation Accuracy: 0.7725, Validation F1 Score: 0.6643


Epoch 10/12:  59%|█████████████████████████████████████████████████████████████████████▎                                                | 3189/5432 [29:56<19:48,  1.89it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Epoch 12/12: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5432/5432 [50:59<00:00,  1.78it/s]


Epoch 12/12, Train Loss: 0.0119


Validating: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1358/1358 [04:27<00:00,  5.08it/s]

Validation Loss: 1.3102, Validation Accuracy: 0.8004, Validation F1 Score: 0.6796





In [7]:
# 테스트 예측
model.eval()
test_predictions = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc='Predicting'):
        input_ids = batch['input_ids'].to(CFG.device)
        attention_mask = batch['attention_mask'].to(CFG.device)
        tfidf_features = batch['tfidf_features'].to(CFG.device)
        
        outputs = model(input_ids, attention_mask=attention_mask, tfidf_features=tfidf_features)
        _, preds = torch.max(outputs, dim=1)
        test_predictions.extend(preds.cpu().tolist())

# 예측 결과 디코딩
label_decoder = {i: label for label, i in label_encoder.items()}
decoded_predictions = [label_decoder[pred] for pred in test_predictions]

# 제출 파일 생성
sample_submission = pd.read_csv("data/sample_submission.csv")
sample_submission["분류"] = decoded_predictions
sample_submission.to_csv("submission_klue_roberta_large.csv", encoding='UTF-8-sig', index=False)

print("Prediction completed and submission file created.")

Predicting: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2926/2926 [09:35<00:00,  5.08it/s]

Prediction completed and submission file created.





In [8]:
# 모델 저장
torch.save(model.state_dict(), 'klue_roberta_large_model.pth')
print("Model saved successfully.")

# 옵티마이저 상태 저장 (선택사항)
torch.save(optimizer.state_dict(), 'optimizer.pth')
print("Optimizer state saved successfully.")

Model saved successfully.
Optimizer state saved successfully.
