In [29]:
from google.colab import drive

# Google Drive 연결
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
# 필요한 라이브러리 설치
!pip install transformers datasets gensim nltk -q

In [19]:
# 필요한 라이브러리 임포트
import pandas as pd
import numpy as np
import re
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu
import torch
from gensim.models import KeyedVectors

In [20]:
# 데이터 전처리 함수 정의
def preprocess_text(text):
    text = re.sub(r"http\S+", "", text)  # URL 제거
    text = re.sub(r"@\w+", "", text)    # 사용자 태그 제거
    text = re.sub(r"[^a-zA-Z]", " ", text)  # 특수 문자 제거
    text = text.lower()  # 소문자로 변환
    return text.strip()

# 데이터 로드
url = "https://raw.githubusercontent.com/t-davidson/hate-speech-and-offensive-language/master/data/labeled_data.csv"
data = pd.read_csv(url)

# 데이터 전처리
data['clean_text'] = data['tweet'].apply(preprocess_text)

# 데이터셋 분리
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['clean_text'], data['class'], test_size=0.2, random_state=42
)

# 토크나이저 및 데이터 토큰화
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=128)

# 데이터셋 클래스 정의
class HateSpeechDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels.iloc[idx], dtype=torch.long)
        return item

train_dataset = HateSpeechDataset(train_encodings, train_labels)
val_dataset = HateSpeechDataset(val_encodings, val_labels)

In [21]:
# 모델 로드
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=3)

# Trainer 설정
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,  # 에포크를 20으로 설정
    weight_decay=0.01,
    report_to="none"
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
import matplotlib.pyplot as plt
from transformers import TrainerCallback

# 에폭별 결과를 저장하는 콜백 클래스 정의
class MetricsCallback(TrainerCallback):
    def __init__(self):
        self.metrics = {"epoch": [], "accuracy": [], "loss": [], "f1": []}

    def on_epoch_end(self, args, state, control, **kwargs):
        # 에폭별 측정값 저장
        logs = state.log_history[-1]  # 가장 최근 로그
        self.metrics["epoch"].append(state.epoch)
        self.metrics["accuracy"].append(logs.get("eval_accuracy", None))
        self.metrics["loss"].append(logs.get("eval_loss", None))
        self.metrics["f1"].append(logs.get("eval_f1", None))

# 콜백 객체 생성
metrics_callback = MetricsCallback()

In [23]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="weighted")
    return {"accuracy": acc, "f1": f1}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[metrics_callback],
)

In [24]:
# 모델 학습
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2801,0.262963,0.910026,0.906977
2,0.2377,0.272307,0.912851,0.903976
3,0.1904,0.308223,0.907605,0.90586


TrainOutput(global_step=3720, training_loss=0.25019106095837007, metrics={'train_runtime': 766.5834, 'train_samples_per_second': 77.588, 'train_steps_per_second': 4.853, 'total_flos': 3912364964906496.0, 'train_loss': 0.25019106095837007, 'epoch': 3.0})

In [25]:
# GloVe 모델 다운로드
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip -d glove

--2024-12-09 09:04:27--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-12-09 09:04:27--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-12-09 09:04:27--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.1’


2

In [26]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
import torch

# GloVe 파일 경로
glove_file = "glove/glove.6B.100d.txt"
word2vec_output_file = "glove.6B.100d.word2vec.txt"

# GloVe 형식을 Word2Vec 형식으로 변환 및 로드
try:
    glove2word2vec(glove_file, word2vec_output_file)
    glove_model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)
    print("GloVe 모델 로드 성공!")
except Exception as e:
    print(f"GloVe 모델 로드 실패: {e}")
    raise

# 반복 검증을 위한 함수 정의
def verify_and_neutralize(sentence, model, tokenizer, glove_model, max_attempts=5, epsilon=1.0, noise_scale=0.1):
    device = next(model.parameters()).device
    harmful_words = {"stupid", "idiot", "hate", "terrible", "dumb"}  # 확장된 유해 단어 리스트
    for attempt in range(max_attempts):
        inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True).to(device)
        model.eval()
        with torch.no_grad():
            logits = model(**inputs).logits
        predicted_label = logits.argmax(axis=-1).item()

        if predicted_label == 0:
            print(f"Attempt {attempt + 1}: Sentence is classified as neutral.")
            return sentence

        tokens = tokenizer.tokenize(sentence)
        neutralized_tokens = []
        for token in tokens:
            if token in harmful_words and token in glove_model:
                similar_words = glove_model.most_similar(positive=[token], topn=1)
                replacement = similar_words[0][0]
                print(f"Detected token: {token}, Similar words: {similar_words}")
                neutralized_tokens.append(replacement)
            else:
                neutralized_tokens.append(token)

        for i, token in enumerate(neutralized_tokens):
            if token in glove_model:
                vector = glove_model[token].copy()
                noise = torch.distributions.Laplace(0, noise_scale).sample(torch.tensor(vector).size())
                print(f"Original vector: {glove_model[token]}, Noised vector: {vector}")
                vector += noise.cpu().numpy()
                neutralized_tokens[i] = glove_model.most_similar(positive=[vector], topn=1)[0][0]

        sentence = tokenizer.convert_tokens_to_string(neutralized_tokens)
        print(f"Attempt {attempt + 1}: Neutralized Sentence: {sentence}")

    print("Max attempts reached. Sentence remains harmful. Applying censorship.")
    return "[CENSORED]"


  glove2word2vec(glove_file, word2vec_output_file)


GloVe 모델 로드 성공!


In [27]:
def evaluate_model(test_sentences, model, tokenizer, glove_model, max_attempts=5):
    """
    모델을 평가하고 결과를 출력합니다.
    """
    device = next(model.parameters()).device  # 모델이 위치한 디바이스
    original_labels, predicted_labels = [], []
    success_count = 0
    cosine_scores, bleu_scores = [], []

    for sentence in test_sentences:
        print(f"Evaluating sentence: {sentence}")  # 현재 문장 디버깅

        # 원본 문장의 유해성 분류
        inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True).to(device)  # 데이터 이동
        with torch.no_grad():
            logits = model(**inputs).logits
        original_label = logits.argmax(axis=-1).item()
        original_labels.append(original_label)

        # 중립화된 문장 생성
        neutralized_sentence = verify_and_neutralize(sentence, model, tokenizer, glove_model, max_attempts)
        print(f"Neutralized sentence: {neutralized_sentence}")  # 중립화 결과 디버깅

        # 중립화된 문장의 유해성 분류
        inputs = tokenizer(neutralized_sentence, return_tensors="pt", truncation=True, padding=True).to(device)  # 데이터 이동
        with torch.no_grad():
            logits = model(**inputs).logits
        predicted_label = logits.argmax(axis=-1).item()
        predicted_labels.append(predicted_label)

        # 중립화 성공 여부
        if predicted_label == 0:
            success_count += 1

        # 유사도 계산
        original_vectors = [glove_model[token] for token in tokenizer.tokenize(sentence) if token in glove_model]
        neutralized_vectors = [glove_model[token] for token in tokenizer.tokenize(neutralized_sentence) if token in glove_model]
        if original_vectors and neutralized_vectors:  # GloVe 벡터가 존재하는 경우에만 계산
            cosine_sim = cosine_similarity(
                np.mean(original_vectors, axis=0).reshape(1, -1),
                np.mean(neutralized_vectors, axis=0).reshape(1, -1)
            )[0, 0]
            cosine_scores.append(cosine_sim)

        # BLEU Score 계산
        bleu_scores.append(sentence_bleu([sentence.split()], neutralized_sentence.split(), weights=(0.5, 0.5)))

    # 평가 지표 계산
    precision = precision_score(original_labels, predicted_labels, average="binary")
    recall = recall_score(original_labels, predicted_labels, average="binary")
    f1 = f1_score(original_labels, predicted_labels, average="binary")
    neutralization_rate = (success_count / len(test_sentences)) * 100
    avg_cosine_similarity = np.mean(cosine_scores) if cosine_scores else 0
    avg_bleu_score = np.mean(bleu_scores) if bleu_scores else 0

    # 결과 출력
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Neutralization Rate: {neutralization_rate:.2f}%")
    print(f"Average Cosine Similarity: {avg_cosine_similarity:.4f}")
    print(f"Average BLEU Score: {avg_bleu_score:.4f}")


In [30]:
import pandas as pd
import re
import torch
from transformers import RobertaTokenizer

# 데이터 전처리 함수 정의
def preprocess_text(text):
    text = re.sub(r"http\S+", "", text)  # URL 제거
    text = re.sub(r"@\w+", "", text)    # 사용자 태그 제거
    text = re.sub(r"[^a-zA-Z]", " ", text)  # 특수 문자 제거
    text = text.lower()  # 소문자로 변환
    return text.strip()

# 데이터 로드
file_path = "/content/drive/MyDrive/Colab Notebooks/조현수/팀플실험/Dataset/test.csv"
data = pd.read_csv(file_path)

# `comment_text` 컬럼의 값 전처리
data['clean_text'] = data['comment_text'].apply(preprocess_text)

# 전처리된 텍스트를 리스트로 저장
test_sentences = data['clean_text'].tolist()
print(f"First 5 processed sentences: {test_sentences[:5]}")

# 토크나이저 로드
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# 테스트 데이터 토큰화
test_encodings = tokenizer(test_sentences, truncation=True, padding=True, max_length=128)

# 데이터셋 클래스 정의
class TestDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

# 테스트 데이터셋 생성
test_dataset = TestDataset(test_encodings)

# 테스트 데이터셋 확인
print(f"Test dataset length: {len(test_dataset)}")


First 5 processed sentences: ['yo bitch ja rule is more succesful then you ll ever be whats up with you and hating you sad mofuckas   i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me  ja rule is about pride in da music man  dont diss that shit on him  and nothin is wrong bein like tupac he was a brother too   fuckin white boys get things right next time', 'from rfc       the title is fine as it is  imo', 'sources         zawe ashton on lapland', 'if you have a look back at the source  the information i updated was the correct form  i can only guess the source hadn t updated  i shall update the information once again but thank you for your message', 'i don t anonymously edit articles at all']
Test dataset length: 153164


In [None]:

# 함수 호출
evaluate_model(test_sentences, model, tokenizer, glove_model)


[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
  0.30801    0.59964   -0.26991   -0.76035    0.94222   -0.46919
 -0.18278    0.90652    0.79671    0.24825    0.25713    0.6232
 -0.44768    0.65357    0.76902   -0.51229   -0.44333   -0.21867
  0.3837    -1.1483    -0.94398   -0.15062    0.30012   -0.57806
  0.20175   -1.6591    -0.079195   0.026423   0.22051    0.99714
 -0.57539   -2.7266     0.31448    0.70522    1.4381     0.99126
  0.13976    1.3474    -1.1753     0.0039503  1.0298     0.064637
  0.90887    0.82872   -0.47003   -0.10575    0.5916    -0.4221
  0.57331   -0.54114    0.10768    0.39784   -0.048744   0.064596
 -0.61437   -0.286      0.5067    -0.49758   -0.8157     0.16408
 -1.963     -0.26693   -0.37593   -0.95847   -0.8584    -0.71577
 -0.32343   -0.43121    0.41392    0.28374   -0.70931    0.15003
 -0.2154    -0.37616   -0.032502   0.8062   ], Noised vector: [-0.046539   0.61966    0.56647   -0.46584   -1.189      0.44599
  0.066035   0.3191     0.14679   -0.22119 