In [1]:
#필수 라이브러리 임포트
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertModel, DistilBertTokenizer 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
import numpy as np
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read train.csv into df_train and test.csv into df_real.
df_train = pd.read_csv(r'C:\Users\kingc\Desktop\ai\nlp-getting-started\train.csv')
# df_real additionally gets a constant `target` column filled with 0.
df_real = pd.read_csv(
    r'C:\Users\kingc\Desktop\ai\nlp-getting-started\test.csv'
).assign(target=0)

# Text preprocessing
def clean_text(text):
    """Normalise one raw text value before it is tokenised.

    Steps, in order: strip URLs (anything starting with ``http`` up to the
    next whitespace), strip ``@mentions``, drop every character that is not
    an ASCII letter, digit or whitespace, then lowercase and trim both ends.
    Interior runs of whitespace are left as they are.

    Args:
        text: The value to clean. Normally a ``str``; ``None`` and float NaN
            (how pandas represents an empty CSV cell) are treated as empty
            text, and any other non-string value is converted with ``str``.

    Returns:
        The cleaned, lowercased string (possibly empty).
    """
    # `text != text` is only true for NaN; avoids a TypeError from re.sub
    # when a missing cell reaches this function through Series.apply.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = re.sub(r'@\w+', '', text)    # remove mentions
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # remove special characters
    return text.lower().strip()  # lowercase and trim surrounding whitespace

# Replace the `text` column of both frames with its cleaned version.
for frame in (df_train, df_real):
    frame['text'] = frame['text'].apply(clean_text)



In [3]:
# Split df_train into a training part and a held-out evaluation part.
# test_size=0.2 holds out 20% of the rows; random_state=42 fixes the shuffle.
train_input, test_input, train_target, test_target = train_test_split(
    df_train['text'],
    df_train['target'],
    test_size=0.2,
    random_state=42,
)


In [4]:
# Renumber the training split's index as consecutive integers; the previous
# index is discarded (drop=True) rather than kept as a column.
train_input = train_input.reset_index(drop=True)
train_target = train_target.reset_index(drop=True)

In [5]:
# Dataset class definition
class TextDataset(Dataset):
    """Map-style dataset that tokenises one text per item.

    Args:
        texts: Texts to encode. A pandas Series or any sequence supporting
            ``len`` and integer indexing (list, tuple, ndarray).
        targets: Integer labels, same length and order as ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding='max_length',
            truncation=True, return_tensors="pt")`` and returning a mapping
            with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every encoded sequence is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``targets`` differ in length.
    """

    def __init__(self, texts, targets, tokenizer, max_len):
        # Fail fast: a mismatch would otherwise surface later as an
        # IndexError mid-epoch, or as silently ignored extra targets.
        if len(texts) != len(targets):
            raise ValueError(
                f"texts and targets must have the same length, "
                f"got {len(texts)} and {len(targets)}"
            )
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    @staticmethod
    def _at(values, idx):
        """Return the element at position ``idx``.

        pandas objects are indexed through ``.iloc`` so that a shuffled or
        non-contiguous index is irrelevant; anything else uses ``values[idx]``.
        """
        positional = getattr(values, 'iloc', None)
        return values[idx] if positional is None else positional[idx]

    def __len__(self):
        """Number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Encode sample ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (tokenizer
            outputs with their leading dimension squeezed away) and
            ``'target'`` (scalar ``torch.long`` tensor).
        """
        text = self._at(self.texts, idx)
        target = self._at(self.targets, idx)

        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        return {
            # squeeze(0) drops the leading axis of the tokenizer output
            # (presumably a batch axis of size 1) so the DataLoader can
            # stack items into a batch itself.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'target': torch.tensor(target, dtype=torch.long)
        }


In [6]:
class DistilbertBinaryClassifier(nn.Module):
    """DistilBERT encoder followed by dropout and a single-logit linear head.

    Args:
        pretrained_model_name: Checkpoint name handed to
            ``DistilBertModel.from_pretrained``.
        dropout: Dropout probability applied to the pooled representation
            before the linear head. Defaults to 0.5, the value that was
            previously hard-coded (chosen to curb overfitting).
    """

    def __init__(self, pretrained_model_name='distilbert-base-uncased', dropout=0.5):
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained(pretrained_model_name)
        self.dropout = nn.Dropout(dropout)
        # One output unit: the head emits a raw logit per sample.
        self.fc = nn.Linear(self.distilbert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return one raw logit per sample (no sigmoid applied).

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Attention mask tensor forwarded to the encoder.

        Returns:
            Output of the linear head, with a trailing dimension of size 1.
        """
        outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at sequence position 0 (the CLS position) is used as
        # the pooled representation of the whole input.
        pooled_output = outputs.last_hidden_state[:, 0]
        return self.fc(self.dropout(pooled_output))

In [7]:
# Hyperparameters
PRETRAINED_MODEL_NAME = 'distilbert-base-uncased'
MAX_LEN = 160
BATCH_SIZE = 16
EPOCHS = 2
LEARNING_RATE = 2e-5
SEED = 42

# Seed PyTorch before anything random happens (classifier-head init, the
# shuffling done by train_loader, dropout masks) so that a fresh
# "Restart & Run All" starts from the same random state. GPU kernels may
# still introduce small run-to-run differences.
torch.manual_seed(SEED)

# Initialise the DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

# Build the datasets
train_dataset = TextDataset(train_input, train_target, tokenizer, MAX_LEN)
test_dataset = TextDataset(test_input, test_target, tokenizer, MAX_LEN)
real_dataset = TextDataset(df_real['text'], df_real['target'], tokenizer, MAX_LEN)

# Build the data loaders. Only the training loader shuffles; the other two
# keep row order so predictions line up with their source rows.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
# Inference is batched as well: with shuffle=False the prediction order is
# unchanged, and it avoids one forward pass per row.
real_loader = DataLoader(real_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Initialise the model on GPU when available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DistilbertBinaryClassifier(pretrained_model_name=PRETRAINED_MODEL_NAME)
model.to(device)

# Loss function and optimizer
criterion = nn.BCEWithLogitsLoss()  # binary classification loss on raw logits
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)


In [8]:
best_f1 = 0  # best F1 score seen so far on the held-out split
best_accuracy = 0  # accuracy measured at the epoch that produced best_f1
best_epoch = 0  # 1-based epoch number of best_f1 (0 = nothing saved yet)

for epoch in range(EPOCHS):
    # ====== Training phase ======
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['target'].to(device).float()  # BCEWithLogitsLoss needs float targets

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        # squeeze(-1) drops the trailing size-1 logit axis to match `targets`.
        loss = criterion(outputs.squeeze(-1), targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

    # ====== Evaluation phase ======
    model.eval()
    all_targets = []
    all_predictions = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Threshold the sigmoid probability at 0.5 and store the result
            # as 0/1 integers so predictions and targets share one label type.
            outputs = model(input_ids, attention_mask)
            predictions = (torch.sigmoid(outputs).squeeze(-1) > 0.5).long()

            # Targets are only needed on the host, so they never visit the device.
            all_targets.extend(batch['target'].tolist())
            all_predictions.extend(predictions.cpu().tolist())

    # F1 score, confusion matrix and accuracy
    f1 = f1_score(all_targets, all_predictions)
    # labels=[0, 1] forces a 2x2 matrix even when one class is absent from
    # both targets and predictions; otherwise the 4-way unpack would fail.
    tn, fp, fn, tp = confusion_matrix(all_targets, all_predictions, labels=[0, 1]).ravel()
    print(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")
    accuracy = accuracy_score(all_targets, all_predictions)
    print(f"Epoch {epoch + 1}, Test F1 Score: {f1:.4f}, Accuracy: {accuracy:.4f}")

    # Keep the weights of the epoch with the highest F1 so far
    if f1 > best_f1:
        best_f1 = f1
        best_accuracy = accuracy
        best_epoch = epoch + 1
        # Checkpoint the current weights to disk
        torch.save(model.state_dict(), "best_model.pth")
        print(f"New best F1 Score: {best_f1:.4f}, Accuracy: {best_accuracy:.4f}, Model saved.")

# After training, report the best F1 score and the epoch it came from
print(f"Best F1 Score: {best_f1:.4f} at Epoch {best_epoch}")
print(f"Best Accuracy: {best_accuracy:.4f} at Epoch {best_epoch}")

Epoch 1, Loss: 0.4517371957813661
TP: 495, TN: 778, FP: 96, FN: 154
Epoch 1, Test F1 Score: 0.7984, Accuracy: 0.8359
New best F1 Score: 0.7984, Accuracy: 0.8359, Model saved.
Epoch 2, Loss: 0.3282721198684587
TP: 483, TN: 800, FP: 74, FN: 166
Epoch 2, Test F1 Score: 0.8010, Accuracy: 0.8424
New best F1 Score: 0.8010, Accuracy: 0.8424, Model saved.
Best F1 Score: 0.8010 at Epoch 2
Best Accuracy: 0.8424 at Epoch 2


In [9]:
# Run inference on real_loader

# The training loop writes "best_model.pth" whenever an epoch improves the
# F1 score and records that epoch in best_epoch. Restore those weights so the
# submission comes from the best epoch rather than whichever ran last.
# best_epoch stays 0 when no checkpoint was written; the in-memory weights
# are then used as they are.
if best_epoch > 0:
    model.load_state_dict(torch.load("best_model.pth", map_location=device))

model.eval()
all_preds = []

with torch.no_grad():  # no gradients are needed for prediction
    for batch in real_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Forward pass: one raw logit per row in the batch
        outputs = model(input_ids, attention_mask)

        # Turn logits into 0.0 / 1.0 by rounding the sigmoid probability
        preds = torch.round(torch.sigmoid(outputs)).squeeze(-1).cpu().numpy()
        all_preds.extend(preds)




In [10]:
# Cast every collected prediction to a plain int, place the values in the
# `target` column of the sample submission, and write the result without the
# DataFrame index.
all_preds = [int(pred) for pred in all_preds]
sample_submission = pd.read_csv(
    r'C:\Users\kingc\Desktop\ai\nlp-getting-started\sample_submission.csv'
).assign(target=all_preds)
sample_submission.to_csv("submission.csv", index=False)