In [None]:
# Install libraries
# NOTE(review): none of the packages below carry a version pin, so a fresh run
# may resolve to different releases than the ones this notebook was written for.
%pip install transformers[torch] datasets accelerate -q
%pip install scikit-learn xgboost lightgbm -q
# PyTorch packages are requested from the CUDA 11.8 (cu118) wheel index.
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 -q

# Remove wandb entirely
%pip uninstall wandb -y -q

print("라이브러리 설치 완료")


In [None]:
# 필수 라이브러리 임포트
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, AutoModel, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, get_linear_schedule_with_warmup,
    EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import StandardScaler
import json
import os
import pickle
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Disable wandb entirely through its environment-variable switches
os.environ['WANDB_DISABLED'] = 'true'
os.environ['WANDB_MODE'] = 'disabled'

# Fix the random seeds
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    all CUDA devices), and exports ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Imported locally so the function stays self-contained.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # PYTHONHASHSEED is read at interpreter start-up, so setting it here only
    # reaches child processes launched afterwards, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)

set_seed(42)
print("환경 설정 완료")


In [None]:
# Check for an (A100) GPU and configure the run
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f"GPU device: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
else:
    # No GPU visible: fall back to the CPU
    device = torch.device('cpu')

# Settings chosen for an A100
# -- model / input
MODEL_NAME = "klue/roberta-large"
MAX_LENGTH = 512

# -- batching
BATCH_SIZE = 32
GRADIENT_ACCUMULATION = 2

# -- optimisation
LEARNING_RATE = 2e-5
WARMUP_STEPS = 1000
WEIGHT_DECAY = 0.01
MAX_GRAD_NORM = 1.0

# -- regularisation / stopping
LABEL_SMOOTHING = 0.1
EARLY_STOPPING_PATIENCE = 3

print(f"모델: {MODEL_NAME}")
print(f"배치 크기: {BATCH_SIZE} x {GRADIENT_ACCUMULATION} = {BATCH_SIZE * GRADIENT_ACCUMULATION} (effective)")
print(f"학습률: {LEARNING_RATE}")


In [None]:
# Upload the data files through google.colab's files.upload();
# its return value is kept in `uploaded`.
from google.colab import files
uploaded = files.upload()


In [None]:
# Unpack the archive and load the data
import zipfile

# Extract train.zip into the current working directory
with zipfile.ZipFile('train.zip', mode='r') as archive:
    archive.extractall()

# Both CSVs are read with the BOM-aware utf-8-sig codec
train, test = (
    pd.read_csv(csv_name, encoding='utf-8-sig')
    for csv_name in ('train.csv', 'test.csv')
)

print(f"Training data: {train.shape}")
print(f"Test data: {test.shape}")
print(f"Generated distribution: {train['generated'].value_counts()}")
print(f"Generated ratio: {train['generated'].mean():.3f}")


In [None]:
# Paragraph-level splitting
def split_text_to_paragraphs(text, min_length=50, chunk_length=200):
    """Split a full document into paragraphs.

    The text is first split on blank lines; paragraphs shorter than
    ``min_length`` characters are dropped. If nothing survives, the text is
    split on ``'. '`` and consecutive sentences are glued back together into
    chunks of at least ``chunk_length`` characters (the final chunk may be
    shorter).

    Args:
        text: Document text. Anything that is not a ``str`` (e.g. ``None`` or
            a NaN from a missing CSV cell) yields an empty list.
        min_length: Minimum length of a blank-line-delimited paragraph.
        chunk_length: Minimum length of a chunk built by the sentence fallback.

    Returns:
        List of stripped, non-empty paragraph strings; empty if there is no
        usable text.
    """
    if not isinstance(text, str):
        return []

    stripped = (p.strip() for p in text.split('\n\n'))
    paragraphs = [p for p in stripped if p and len(p) >= min_length]
    if paragraphs:
        return paragraphs

    # Fallback: rebuild chunks sentence by sentence. The '. ' separator is
    # restored only *between* sentences, so the last one keeps its own
    # punctuation instead of gaining a spurious extra period.
    sentences = text.split('. ')
    last_index = len(sentences) - 1
    chunks = []
    current = ""
    for index, sentence in enumerate(sentences):
        current += sentence if index == last_index else sentence + ". "
        if len(current) >= chunk_length:
            flushed = current.strip()
            if flushed:
                chunks.append(flushed)
            current = ""

    remainder = current.strip()
    if remainder:
        chunks.append(remainder)
    return chunks

# Expand every article into one record per paragraph
train_paragraphs = [
    {
        'title': title,
        'paragraph_index': para_idx,
        'paragraph_text': para_text,
        'generated': label,
    }
    for title, full_text, label in zip(train['title'], train['full_text'], train['generated'])
    for para_idx, para_text in enumerate(split_text_to_paragraphs(full_text))
]

# Each paragraph inherits the label of the article it came from
train_para_df = pd.DataFrame(train_paragraphs)
print(f"원본 훈련 데이터: {len(train)}개 글")
print(f"변환된 훈련 데이터: {len(train_para_df)}개 문단")
print(f"평균 문단 수: {len(train_para_df) / len(train):.1f}개/글")


In [None]:
# Compute class weights (the dataset class follows)
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(train_para_df['generated']),
    y=train_para_df['generated'],
)
# Position in the weight array -> weight
class_weight_dict = dict(enumerate(class_weights))
print(f"클래스 가중치: {class_weight_dict}")

class AIDetectionDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its label.

    ``texts`` and ``labels`` are read positionally through ``.iloc``, so both
    must support pandas-style positional indexing.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for row ``idx``."""
        target = self.labels.iloc[idx]
        encoded = self.tokenizer(
            str(self.texts.iloc[idx]),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # Collapse each tokenizer output to one dimension
        item = {
            field: encoded[field].reshape(-1)
            for field in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(target, dtype=torch.long)
        return item


In [None]:
# Split by title so paragraphs of one article never straddle train and validation
# (prevents data leakage)
title_labels = train.set_index('title')['generated'].to_dict()
unique_titles = list(title_labels)
title_y = list(title_labels.values())

train_titles, val_titles = train_test_split(
    unique_titles,
    test_size=0.2,
    stratify=title_y,
    random_state=42,
)

# Select paragraph rows by the split their title landed in
train_mask = train_para_df['title'].isin(train_titles)
val_mask = train_para_df['title'].isin(val_titles)

X_train, y_train = (
    train_para_df.loc[train_mask, column]
    for column in ('paragraph_text', 'generated')
)
X_val, y_val = (
    train_para_df.loc[val_mask, column]
    for column in ('paragraph_text', 'generated')
)

print(f"훈련 제목: {len(train_titles)}개")
print(f"검증 제목: {len(val_titles)}개")
print(f"훈련 문단: {len(X_train)}개")
print(f"검증 문단: {len(X_val)}개")


In [None]:
# Load the tokenizer and the two-label classification model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

# Build the datasets: same tokenizer and max length for both splits
train_dataset, val_dataset = (
    AIDetectionDataset(texts, labels, tokenizer, MAX_LENGTH)
    for texts, labels in ((X_train, y_train), (X_val, y_val))
)

print(f"데이터셋 생성 완료")
print(f"훈련 샘플: {len(train_dataset)}")
print(f"검증 샘플: {len(val_dataset)}")


In [None]:
# Class-weighted trainer
class WeightedTrainer(Trainer):
    """Trainer whose loss is class-weighted, label-smoothed cross-entropy.

    Reads the module-level ``class_weight_dict`` (entries 0 and 1) and
    ``LABEL_SMOOTHING``.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch mapping; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted for signature compatibility; unused.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.get("labels")
        # Forward without the labels: when they are passed through, the model
        # also computes its own unweighted loss, which would just be discarded.
        # A filtered copy is used so the caller's batch is left untouched.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get('logits')

        class_weights_tensor = torch.tensor(
            [class_weight_dict[0], class_weight_dict[1]],
            dtype=torch.float,
            device=logits.device,
        )

        loss_fct = nn.CrossEntropyLoss(
            weight=class_weights_tensor,
            label_smoothing=LABEL_SMOOTHING
        )
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

def compute_metrics(eval_pred):
    """Compute accuracy and ROC-AUC from evaluation logits.

    Args:
        eval_pred: A ``(logits, labels)`` pair; any object that unpacks into
            those two arrays works. ``logits`` is expected to have one column
            per class, with column 1 being the positive class.

    Returns:
        ``{'accuracy': float, 'auc': float}``. ``auc`` falls back to 0.5 when
        it cannot be computed (e.g. only one class among the labels).
    """
    logits, labels = eval_pred
    logits = np.asarray(logits)
    labels = np.asarray(labels)

    predictions = logits.argmax(axis=-1)
    # Softmax in float32 so half-precision logits are handled as well
    probs = torch.softmax(
        torch.as_tensor(logits, dtype=torch.float32), dim=-1
    )[:, 1].numpy()

    if np.unique(labels).size < 2:
        # ROC-AUC is undefined with a single class; keep the neutral value
        auc = 0.5
    else:
        try:
            auc = float(roc_auc_score(labels, probs))
        except ValueError as err:
            # Only invalid-input errors are tolerated; other exceptions are
            # real bugs and must surface instead of masquerading as AUC=0.5.
            print(f"AUC could not be computed ({err}); falling back to 0.5")
            auc = 0.5

    accuracy = float((predictions == labels).mean())
    return {'accuracy': accuracy, 'auc': auc}

print("트레이너 클래스 정의 완료")


In [None]:
# Training configuration and trainer initialisation
training_args = TrainingArguments(
    output_dir='./results_step1',
    num_train_epochs=3,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_steps=WARMUP_STEPS,
    max_grad_norm=MAX_GRAD_NORM,
    logging_dir='./logs_step1',
    logging_steps=100,
    # Evaluate and checkpoint on the same step schedule so the best checkpoint
    # (by eval_auc) can be restored at the end of training.
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_auc",
    greater_is_better=True,
    dataloader_pin_memory=True,
    # Mixed precision needs a CUDA device; disable it on the CPU fallback so
    # building the arguments does not fail when no GPU is visible.
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=4,
    # No experiment-tracking integrations
    report_to=[],
    disable_tqdm=False,
    # The dataset already returns exactly the tensors the model consumes
    remove_unused_columns=False,
)

# Stop once eval_auc has failed to improve by at least 0.001 for
# EARLY_STOPPING_PATIENCE consecutive evaluations
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=EARLY_STOPPING_PATIENCE,
    early_stopping_threshold=0.001
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

print("훈련 준비 완료")


In [None]:
# Run model training
print("기본 모델 훈련 시작...")

try:
    trainer.train()

    # Final evaluation on the validation set
    eval_results = trainer.evaluate()
    print(f"최종 검증 성능: {eval_results}")

    # Save the model together with its tokenizer
    model.save_pretrained('./base_model')
    tokenizer.save_pretrained('./base_model')

    # Save metadata describing this run
    metadata = {
        'model_name': MODEL_NAME,
        'eval_results': eval_results,
        'class_weights': class_weight_dict,
        'train_titles': train_titles,
        'val_titles': val_titles
    }

    with open('step1_metadata.json', 'w') as f:
        json.dump(metadata, f, indent=2)

    print("기본 모델 훈련 및 저장 완료")
    print(f"검증 AUC: {eval_results.get('eval_auc', 0):.4f}")

except Exception as e:
    print(f"훈련 중 오류: {e}")
    # Even on failure, keep the current model state as a backup
    model.save_pretrained('./base_model_backup')
    tokenizer.save_pretrained('./base_model_backup')
    # Re-raise so the failure (with its traceback) stops the run instead of
    # letting later cells report the step as completed.
    raise


In [None]:
# Persist the data for the next stage
train_para_df.to_pickle('train_para_df.pkl')
test.to_pickle('test_df.pkl')

# Extra information: the title split and the class weights
data_info = {
    'train_titles': train_titles,
    'val_titles': val_titles,
    'class_weight_dict': class_weight_dict,
}
with open('data_info.pkl', 'wb') as info_file:
    pickle.dump(data_info, info_file)

print("1단계 완료 - 기본 모델 훈련 및 데이터 준비")
print("다음 단계: 02_hierarchical_modeling.ipynb 실행")
