In [None]:
!pip install evaluate
!pip install wandb


import torch
import numpy as np
import tensorflow as tf

import pandas as pd


from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

import torch.nn as nn
import evaluate
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer, XLMRobertaModel

from datasets import load_metric
from sklearn.metrics import accuracy_score, f1_score
from transformers import Trainer, EarlyStoppingCallback



In [None]:
# GPU environment setup
gpus = tf.config.experimental.list_physical_devices("GPU")

# `strategy` stays None until GPU configuration succeeds, so that a failure in
# the try-block below can no longer leave the name undefined.
strategy = None
if gpus:
    try:
        # Allocate GPU memory on demand instead of reserving it all up front.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        # Restrict TensorFlow to the first GPU and pin the strategy to it.
        tf.config.experimental.set_visible_devices(gpus[0], "GPU")
        strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
    except RuntimeError as e:
        print(e)

# No GPU available, or GPU configuration failed: use the default strategy.
if strategy is None:
    strategy = tf.distribute.get_strategy()

# Report the replica count for whichever strategy was selected.
print("복제 수 : ", strategy.num_replicas_in_sync)

In [None]:
# Environment variables: set WANDB_API_KEY to the literal string "0".
import os
os.environ.update({"WANDB_API_KEY": "0"})

In [None]:
# 버전 에러 방지
import warnings
warnings.filterwarnings("ignore")

! pip install -U accelerate
! pip install -U transformers



In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import torch

# Load the data
file_path = "/content/fake_or_real_news.csv"
data = pd.read_csv(file_path)

# Drop the unneeded column
data = data.drop(columns=['Unnamed: 0'])

# Split into training and test data
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Encode the label as 0 ("REAL") or 1 (anything else) in a new `fake` column.
# `assign` builds a new frame instead of writing a column into the slice
# returned by train_test_split (chained assignment, which can trigger pandas'
# SettingWithCopyWarning), and the comparison is vectorised.
train_data = train_data.assign(
    fake=(train_data["label"] != "REAL").astype(int)
).drop(columns=["label"])

# Remove the label column from the test data (the test split stays unlabeled)
test_data = test_data.drop(columns=["label"])

# Split the training data again into training and validation data
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

# Load the XLM-RoBERTa model and tokenizer
model_name = "symanto/xlm-roberta-base-snli-mnli-anli-xnli"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The task here is binary (REAL=0 / FAKE=1), so request a two-class head.
# NOTE(review): this checkpoint is an NLI model and presumably ships a
# three-class head — confirm. `ignore_mismatched_sizes=True` lets the
# classifier layer be re-initialised when the shapes differ and is harmless
# when they already match.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    ignore_mismatched_sizes=True,
)

# Tokenization and dataset class definition
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one news article per item.

    Each item concatenates the row's ``title`` and ``text`` (separated by a
    single space), tokenizes the result padded/truncated to ``max_length``,
    and returns 1-D ``input_ids`` and ``attention_mask`` tensors.

    Args:
        data: Frame accessed through ``.iloc``; rows must provide ``title``
            and ``text``. The label column is optional.
        tokenizer: Callable tokenizer accepting ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` keyword arguments.
        max_length: Sequence length every item is padded/truncated to.
        label_column: Name of the integer label column. When the frame does
            not contain it (e.g. an unlabeled test split), items are returned
            without a ``label`` entry instead of raising ``KeyError``.
    """

    def __init__(self, data, tokenizer, max_length=512, label_column="fake"):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label_column = label_column

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized article at position ``idx`` as a dict of tensors."""
        # Fetch the row once instead of re-indexing the frame for every field.
        row = self.data.iloc[idx]
        article = str(row['title']) + " " + str(row['text'])

        # Call the tokenizer directly on the single string; this replaces the
        # one-element batch previously sent through `batch_encode_plus`.
        encoded_text = self.tokenizer(
            article,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # squeeze(0) drops only the batch axis; a bare squeeze() would also
        # collapse the sequence axis if it ever had length 1.
        inputs = {
            'input_ids': encoded_text['input_ids'].squeeze(0),
            'attention_mask': encoded_text['attention_mask'].squeeze(0),
        }

        # Attach the label only when the frame actually carries one.
        if self.label_column in row.index:
            inputs['label'] = torch.tensor(int(row[self.label_column]), dtype=torch.long)

        return inputs

# Wrap each split in a NewsDataset (shared tokenizer, default max_length).
train_dataset, val_dataset, test_dataset = (
    NewsDataset(split, tokenizer) for split in (train_data, val_data, test_data)
)

# TrainingArguments configuration for the Trainer
training_args = TrainingArguments(
    **{
        "output_dir": "./output",
        "per_device_train_batch_size": 8,
        "per_device_eval_batch_size": 8,
        "num_train_epochs": 3,
        "logging_dir": "./logs",
        "logging_steps": 100,  # logging interval, in steps
        "evaluation_strategy": "steps",  # evaluate on a step schedule
        "eval_steps": 100,  # validation interval, in steps
    }
)

# Metric computation for the Trainer
def compute_metrics(p):
    """Compute accuracy and binary F1 from an evaluation prediction object.

    Args:
        p: Object exposing ``predictions`` (2-D array of per-class scores,
            reduced with ``argmax(axis=1)``) and ``label_ids`` (array of
            integer labels).

    Returns:
        dict with ``'accuracy'`` (fraction of matching predictions) and
        ``'f1'`` (F1 score of class 1; 0.0 when there are no true positives,
        false positives or false negatives to score).
    """
    preds = p.predictions.argmax(axis=1)
    labels = p.label_ids
    accuracy = float((preds == labels).mean())

    # F1 for the positive class (label 1). Previously this entry just
    # repeated the accuracy value.
    predicted_positive = preds == 1
    actual_positive = labels == 1
    true_pos = int((predicted_positive & actual_positive).sum())
    false_pos = int((predicted_positive & ~actual_positive).sum())
    false_neg = int((~predicted_positive & actual_positive).sum())

    # F1 = 2TP / (2TP + FP + FN); guard the empty-denominator case.
    denominator = 2 * true_pos + false_pos + false_neg
    f1 = 2 * true_pos / denominator if denominator else 0.0

    return {'accuracy': accuracy, 'f1': f1}

# Build the Trainer: model and hyper-parameters first, then the two data
# splits, then the metric callback.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,  # validation split, used by evaluate()
    train_dataset=train_dataset,
    compute_metrics=compute_metrics,
)

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

In [None]:
# Environment variables: set WANDB_DISABLED to "true".
os.environ.update(WANDB_DISABLED="true")

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

# After training finishes, get the final evaluation results
# (evaluate() scores the eval_dataset passed to the Trainer, i.e. the validation split).
final_eval_results = trainer.evaluate()

# Print the final accuracy
print(f"Final Accuracy: {final_eval_results['eval_accuracy']}")

Step,Training Loss,Validation Loss,Accuracy,F1
100,0.4181,0.61434,0.87574,0.87574
200,0.2693,0.236814,0.946746,0.946746
300,0.2467,0.276625,0.93787,0.93787
400,0.1956,0.284817,0.952663,0.952663
500,0.1271,0.229811,0.959566,0.959566
600,0.0844,0.235577,0.965483,0.965483
700,0.1411,0.201328,0.954635,0.954635
800,0.1082,0.26152,0.957594,0.957594
900,0.098,0.150945,0.972387,0.972387
1000,0.0945,0.130463,0.976331,0.976331


Final Accuracy: 0.9763313609467456


In [None]:
def validate_text_file(file_path, model, tokenizer, max_length=512):
    """Classify the contents of a UTF-8 text file and print the verdict.

    The whole file is read as one string, tokenized (padded/truncated to
    ``max_length``), moved to the model's device and run through the model
    without gradient tracking. Class index 0 prints "예측: REAL"; any other
    index prints "예측: FAKE". Nothing is returned.

    Args:
        file_path: Path of the text file to classify.
        model: Model exposing ``device``, ``eval()`` and a call that returns
            an object with ``logits``.
        tokenizer: Callable tokenizer whose result supports ``.to(device)``
            and lookup of ``input_ids`` / ``attention_mask``.
        max_length: Token length used for padding and truncation.
    """
    # Read the entire file as text.
    with open(file_path, 'r', encoding='utf-8') as handle:
        article = handle.read()

    # Keep the inputs on the same device as the model.
    target_device = model.device
    batch = tokenizer(
        article,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    ).to(target_device)

    # Inference only: gradients off, model switched to eval mode.
    with torch.no_grad():
        model.eval()
        output = model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
        )

    # Highest-scoring class for the single input.
    class_index = torch.argmax(output.logits, dim=1).item()

    # Report the result.
    verdict = "예측: REAL" if class_index == 0 else "예측: FAKE"
    print(verdict)

# Classify a sample text file with the model and tokenizer defined earlier in the notebook.
file_to_validate = "sample_input.txt"
validate_text_file(file_to_validate, model, tokenizer)

예측: REAL
