In [1]:
#필수 라이브러리 임포트
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertModel, DistilBertTokenizer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix
import numpy as np
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [2]:
# Load the labelled training data and the unlabelled competition test data.
df_train = pd.read_csv(r'C:\Users\kingc\Desktop\ai\nlp-getting-started\train.csv')
# The competition rows get a placeholder 'target' column filled with 0 so they
# expose the same columns as the labelled data.
df_real = pd.read_csv(
    r'C:\Users\kingc\Desktop\ai\nlp-getting-started\test.csv'
).assign(target=0)

# Hold out 20% of the labelled rows for evaluation; random_state fixes the split.
train_input, test_input, train_target, test_target = train_test_split(
    df_train['text'],
    df_train['target'],
    test_size=0.2,
    random_state=42,
)

# Renumber the training split 0..n-1. The held-out split keeps its original
# index labels.
train_input = train_input.reset_index(drop=True)
train_target = train_target.reset_index(drop=True)

In [3]:
# Dataset definition
class TextDataset(Dataset):
    """Pairs each text with its label and tokenizes it on access.

    Args:
        texts: Text samples. Indexed positionally through ``.iloc``, so the
            object must provide that accessor (e.g. a pandas Series).
        targets: Labels aligned position-by-position with ``texts``; also
            indexed through ``.iloc``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors="pt")``
            whose result can be indexed by ``'input_ids'`` and
            ``'attention_mask'``.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, targets, tokenizer, max_len):
        self.texts, self.targets = texts, targets
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of samples, taken from the length of ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized sample at position ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            outputs with dimension 0 squeezed) and ``'target'`` (the label as
            a ``torch.long`` tensor).
        """
        # Positional lookup, so index labels on the inputs are irrelevant.
        sample_text = self.texts.iloc[idx]
        label = self.targets.iloc[idx]

        encoded = self.tokenizer(
            sample_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

        # squeeze(0) removes dimension 0 when it has size 1, so one sample
        # is returned without a batch axis.
        item = {
            field: encoded[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        item['target'] = torch.tensor(label, dtype=torch.long)
        return item


In [4]:
class DistilBertBinaryClassifier(nn.Module):
    """DistilBERT encoder followed by dropout and a one-output linear head.

    Args:
        pretrained_model_name: Identifier passed to
            ``DistilBertModel.from_pretrained``.
    """

    def __init__(self, pretrained_model_name='distilbert-base-uncased'):
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained(pretrained_model_name)
        # Dropout rate raised to limit overfitting (original author's note).
        self.dropout = nn.Dropout(p=0.3)
        hidden_size = self.distilbert.config.hidden_size
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, input_ids, attention_mask):
        """Return the raw logit for each sequence; no sigmoid is applied here."""
        encoder_out = self.distilbert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # Hidden state at sequence position 0 (labelled "CLS" by the original
        # author) is used as the sequence summary.
        first_token = encoder_out.last_hidden_state[:, 0]
        return self.fc(self.dropout(first_token))

In [5]:
# Hyperparameters
PRETRAINED_MODEL_NAME = 'distilbert-base-uncased'
MAX_LEN = 160
BATCH_SIZE = 16
EPOCHS = 2
LEARNING_RATE = 3e-5
SEED = 42  # same value the train/test split uses for random_state

# Seed PyTorch before any random state is consumed. Without this the
# classifier-head initialisation, dropout masks and shuffle order change on
# every run, so the reported loss/F1 cannot be reproduced.
torch.manual_seed(SEED)

# Tokenizer matching the pretrained checkpoint
tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

# Datasets: training split, held-out split, and the unlabelled competition set
train_dataset = TextDataset(train_input, train_target, tokenizer, MAX_LEN)
test_dataset = TextDataset(test_input, test_target, tokenizer, MAX_LEN)
real_dataset = TextDataset(df_real['text'], df_real['target'], tokenizer, MAX_LEN)

# Data loaders. Only the training loader shuffles; evaluation and inference
# keep row order so predictions line up with the source rows.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
# Inference is batched like evaluation. The previous batch_size=1 ran one
# forward pass per row although every sample is padded to MAX_LEN.
real_loader = DataLoader(real_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Model: use the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DistilBertBinaryClassifier(pretrained_model_name=PRETRAINED_MODEL_NAME)
model.to(device)

# Loss and optimizer. BCEWithLogitsLoss takes raw logits, which is what the
# model's forward() returns.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)


In [6]:
# Training loop: one pass over train_loader per epoch, reporting the mean batch loss.
for epoch in range(EPOCHS):
    model.train()  # training mode (dropout active)
    total_loss = 0

    for batch in train_loader:
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        # BCEWithLogitsLoss expects floating-point targets.
        labels = batch['target'].to(device).float()

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        logits = model(ids, mask)
        # Drop the trailing size-1 dimension so logits match the label shape.
        batch_loss = criterion(logits.squeeze(-1), labels)

        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()

    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")


Epoch 1, Loss: 0.43611774931898895
Epoch 2, Loss: 0.3066454601725881


In [13]:
# Evaluation on the held-out split
model.eval()  # inference mode (dropout disabled)
all_targets = []
all_predictions = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Targets are only collected for the metrics, so they are not moved
        # to the model's device.
        targets = batch['target']

        outputs = model(input_ids, attention_mask)
        # Threshold the sigmoid probability at 0.5 and cast to integer so the
        # predictions use the same 0/1 encoding as the targets, not bools.
        predictions = (torch.sigmoid(outputs).squeeze(-1) > 0.5).long()

        all_targets.extend(targets.cpu().numpy())
        all_predictions.extend(predictions.cpu().numpy())

# F1 score
f1 = f1_score(all_targets, all_predictions)
print(f"Test F1 Score: {f1:.4f}")

# Confusion matrix. labels=[0, 1] pins the result to 2x2 even if one class is
# missing from both targets and predictions; without it ravel() can yield
# fewer than four values and the unpacking below fails.
tn, fp, fn, tp = confusion_matrix(all_targets, all_predictions, labels=[0, 1]).ravel()
print(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")

# Accuracy from the confusion-matrix counts
print("Accuracy:", (tn+tp)/(fn+fp+tn+tp))


Test F1 Score: 0.8010
TP: 499, TN: 776, FP: 98, FN: 150
Accuracy: 0.8371634931057124


In [8]:
# Run inference on the competition set
model.eval()  # inference mode (dropout disabled)
all_preds = []

with torch.no_grad():  # gradients are not needed for prediction
    for batch in real_loader:
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)

        logits = model(ids, mask)

        # Sigmoid probability rounded to 0.0 / 1.0, trailing size-1 dimension
        # dropped, then copied to host memory as a NumPy array.
        hard_labels = torch.sigmoid(logits).round().squeeze(-1)
        all_preds.extend(hard_labels.cpu().numpy())




In [9]:
# Convert every prediction to a built-in int before writing it out.
all_preds = [int(pred) for pred in all_preds]

# Fill the 'target' column of the sample submission with the predictions and
# save it without the DataFrame index.
sample_submission = pd.read_csv(
    r'C:\Users\kingc\Desktop\ai\nlp-getting-started\sample_submission.csv'
)
sample_submission["target"] = all_preds
sample_submission.to_csv("submission.csv", index=False)