<a href="https://colab.research.google.com/github/listentothecity/insta/blob/main/Untitled42.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# BERT를 이용한 가짜뉴스 감지기 - 완전 수정 버전
import random
import numpy as np
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
)

# 0) Fix the random seeds for reproducibility
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed handed to every generator (default 42).

    Returns:
        None. When CUDA is available the CUDA generators are seeded as well
        through ``torch.cuda.manual_seed_all``.
    """
    # The three CPU-side seeders all take the seed as their only argument.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Nothing more to do on CPU-only machines.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Seed every random number generator before any data handling or training.
set_seed(42)

# 1) Load the data
# The path points at a file uploaded into the Colab session storage.
file_path = '/content/250612_final_EN_SOSValencia_valenciadana_2283.xlsx'
df = pd.read_excel(file_path)
# Quick overview: row count, column names, and the raw label distribution.
print("데이터셋 정보:")
print(f"전체 행 수: {len(df)}")
print("컬럼명:", df.columns.tolist())
print("\n레이블 분포:")
print(df['Label'].value_counts())

# 2) Preprocessing (0 = real, 1 = fake)
# Drop rows whose text is empty/missing or whose label is not 0 or 1.
# Missing cells become the string 'nan' after astype(str), so they are
# filtered out together with other placeholder strings below.
texts = df['text'].astype(str).tolist()
labels = df['Label'].tolist()

# Data cleaning: keep (stripped text, int label) pairs for valid rows only.
clean_data = []
for text, label in zip(texts, labels):
    # Strip once and test the stripped value, so that whitespace-padded
    # placeholders such as " nan " are rejected as well (the stored text is
    # the stripped one, so the check must look at the same value).
    text = text.strip()
    if not text or text.lower() in ('nan', 'none'):
        continue
    # Rejects missing labels (NaN) and any value other than 0 / 1.
    if label not in (0, 1):
        continue
    clean_data.append((text, int(label)))

# Fail loudly instead of training on an empty dataset.
if not clean_data:
    raise ValueError("정제된 데이터가 없습니다. 데이터를 확인해주세요.")

# Unzip back into two parallel tuples: texts and labels.
texts, labels = zip(*clean_data)
print(f"\n정제 후 데이터 수: {len(texts)}")
print("정제 후 레이블 분포:", {i: labels.count(i) for i in set(labels)})

# 3) Train/validation split
# 10% of the cleaned rows are held out for validation; stratify=labels asks
# for the split to follow the label distribution, and random_state makes it
# repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels,
    test_size=0.1,
    random_state=42,
    stratify=labels
)

print(f"\n훈련 데이터: {len(train_texts)}개")
print(f"검증 데이터: {len(val_texts)}개")

# 4) Tokenizer and tokenization
# Loads the tokenizer matching the 'bert-base-uncased' checkpoint used for
# the model below.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

def tokenize_data(texts, max_length=128):
    """Tokenize a collection of texts with the module-level ``tokenizer``.

    Args:
        texts: Iterable of strings; it is materialized into a list before
            being handed to the tokenizer.
        max_length: Maximum sequence length passed to the tokenizer
            (truncation is enabled).

    Returns:
        The tokenizer's output for the whole batch, requested as PyTorch
        tensors (``return_tensors='pt'``) with padding enabled.
    """
    batch = list(texts)
    tokenizer_options = {
        'truncation': True,
        'padding': True,
        'max_length': max_length,
        'return_tensors': 'pt',
    }
    return tokenizer(batch, **tokenizer_options)

# Tokenize each split separately, using tokenize_data's default max_length.
train_encodings = tokenize_data(train_texts)
val_encodings = tokenize_data(val_texts)

# 5) Dataset class definition
class FakeNewsDataset(torch.utils.data.Dataset):
    """Dataset pairing pre-tokenized encodings with integer class labels.

    Args:
        encodings: Mapping from field name to an indexable container; each
            value is indexed with the sample index in ``__getitem__``.
        labels: Indexable sequence of labels, parallel to the encodings.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample: every encoding field at ``idx`` plus 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        # The label is stored as an int64 tensor under the 'labels' key.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Build the datasets from the tokenized splits and their labels
train_dataset = FakeNewsDataset(train_encodings, train_labels)
val_dataset = FakeNewsDataset(val_encodings, val_labels)

# 6) Load the model
# num_labels=2 gives a two-class classification head (0 = real, 1 = fake).
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2
)

# Move the model to the GPU when one is available, otherwise stay on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(f"\n사용 디바이스: {device}")

# 7) Evaluation metric function
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for an evaluation pass.

    Args:
        eval_pred: Object that unpacks into ``(predictions, labels)``; the
            predictions are reduced to class indices with an argmax over
            axis 1.

    Returns:
        Dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
        Precision, recall and F1 use ``average='binary'`` and
        ``zero_division=0``.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=1)

    metrics = {'accuracy': accuracy_score(references, predicted)}

    # The fourth element (support) of the sklearn result is not reported.
    prf = precision_recall_fscore_support(
        references,
        predicted,
        average='binary',
        zero_division=0,
    )
    metrics.update(zip(('precision', 'recall', 'f1'), prf[:3]))
    return metrics

# 8) Training arguments
# Evaluation and checkpointing both happen once per epoch, and the best
# checkpoint (highest 'f1' as reported by compute_metrics) is reloaded when
# training ends.
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    eval_strategy='epoch',  # use eval_strategy instead of evaluation_strategy
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    seed=42,
    report_to=[],  # disable logging integrations such as wandb
    dataloader_pin_memory=False,  # author's note: memory optimization
)

# 9) Create the Trainer
# Wires together the model, the arguments above, both datasets and the
# metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# 10) Run training
print("\n=== 모델 훈련 시작 ===")
try:
    trainer.train()
    print("\n=== 훈련 완료 ===")

    # Final evaluation on the validation dataset given to the Trainer
    print("\n=== 최종 평가 결과 ===")
    eval_results = trainer.evaluate()
    for key, value in eval_results.items():
        # Every reported value is formatted with four decimals.
        print(f"{key}: {value:.4f}")

    # Save the model and the tokenizer into the same folder
    trainer.save_model('./best_model')
    tokenizer.save_pretrained('./best_model')
    print("\n모델이 './best_model' 폴더에 저장되었습니다.")

except Exception as e:
    # NOTE(review): the error is printed with its traceback but not
    # re-raised, so any code after this block still runs after a failure.
    print(f"훈련 중 오류 발생: {str(e)}")
    print("오류 상세 정보:")
    import traceback
    traceback.print_exc()

# 11) Prediction function (optional)
def predict_fake_news(text, model, tokenizer, device, max_length=128):
    """Predict whether a single text is fake news.

    Args:
        text: The text to classify. The function is written for one text at
            a time: the predicted class is extracted with ``.item()``, which
            requires exactly one prediction.
        model: Classification model; it is switched to eval mode and called
            with the tokenized inputs as keyword arguments. Its output must
            expose a ``logits`` attribute.
        tokenizer: Callable that turns ``text`` into a mapping of tensors.
        device: Device the tokenized tensors are moved to before the forward
            pass; it should be the device the model lives on.
        max_length: Maximum sequence length used for truncation. Defaults to
            128; pass the training value here if the model was trained with
            a different one.

    Returns:
        Tuple ``(result, confidence)`` where ``result`` is "가짜뉴스" when the
        predicted class is 1 and "진짜뉴스" otherwise, and ``confidence`` is
        the softmax probability of the predicted class as a float.
    """
    # Switch to eval mode; note that the model is left in eval mode afterwards.
    model.eval()

    # Tokenize the text
    inputs = tokenizer(
        text,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt'
    )

    # Move every input tensor to the target device
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Forward pass without gradient tracking
    with torch.no_grad():
        outputs = model(**inputs)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(probabilities, dim=-1).item()
        confidence = probabilities[0][predicted_class].item()

    result = "가짜뉴스" if predicted_class == 1 else "진짜뉴스"
    return result, confidence

# Usage example: classify one sample sentence with the model, tokenizer and
# device defined earlier in the script.
print("\n=== 예측 테스트 ===")
test_text = "This is a sample news text for testing."
try:
    result, confidence = predict_fake_news(test_text, model, tokenizer, device)
    print(f"텍스트: {test_text}")
    print(f"예측 결과: {result} (신뢰도: {confidence:.4f})")
except Exception as e:
    # Report the failure without aborting the cell.
    print(f"예측 테스트 중 오류: {str(e)}")

데이터셋 정보:
전체 행 수: 2282
컬럼명: ['Hahstag', 'text', 'Date', 'Likes', 'Comments', 'Type', 'Label', 'Sentiments']

레이블 분포:
Label
0    2217
1      65
Name: count, dtype: int64

정제 후 데이터 수: 2280
정제 후 레이블 분포: {0: 2216, 1: 64}

훈련 데이터: 2052개
검증 데이터: 228개


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



사용 디바이스: cuda

=== 모델 훈련 시작 ===


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0599,0.07297,0.97807,0.666667,0.333333,0.444444
2,0.0535,0.119516,0.973684,0.5,0.166667,0.25
3,0.0579,0.087025,0.973684,0.5,0.166667,0.25



=== 훈련 완료 ===

=== 최종 평가 결과 ===


eval_loss: 0.0730
eval_accuracy: 0.9781
eval_precision: 0.6667
eval_recall: 0.3333
eval_f1: 0.4444
eval_runtime: 1.5349
eval_samples_per_second: 148.5420
eval_steps_per_second: 2.6060
epoch: 3.0000

모델이 './best_model' 폴더에 저장되었습니다.

=== 예측 테스트 ===
텍스트: This is a sample news text for testing.
예측 결과: 진짜뉴스 (신뢰도: 0.9335)
