In [None]:
import pandas as pd
import re
from konlpy.tag import Komoran
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from torch.utils.data import Dataset
from sklearn.metrics import precision_recall_fscore_support

# Articles that get labelled and then used for fine-tuning; the text lives
# in the 'content' column.
df = pd.read_csv('news.csv')

# Tab-separated sentiment lexicon without a header row: one word and its
# polarity value per line.
senti_df = pd.read_csv(
    'SentiWord_Dict.txt',
    sep='\t',
    header=None,
    names=['word', 'polarity'],
)

# sheet_name=None loads every sheet of the workbook into a dict keyed by
# sheet name.
dict_data = pd.read_excel('dict.xlsx', sheet_name=None)

# Stopword sheet -> plain list of words that preprocessing removes.
stopwords = list(dict_data['불용어']['stopwords'])

# Hanja sheet -> {hanja: replacement} lookup applied during preprocessing.
hanja_changes = dict(zip(dict_data['한자']['hanja'], dict_data['한자']['change']))

def preprocess_text(text):
    """Clean one article body before it is tokenized.

    The steps run in this order:

    1. Every key of ``hanja_changes`` found in the text is replaced by its
       mapped value.
    2. Any remaining characters in the U+4E00-U+9FFF range are deleted.
    3. Characters that are neither word characters nor whitespace become
       a space.
    4. Lowercase ASCII letters and digits become a space (uppercase ASCII
       letters are left in place).
    5. Whitespace runs are collapsed and the ends are trimmed.
    6. Whitespace-separated words listed in ``stopwords`` are dropped.

    Args:
        text: The raw text as a ``str``.

    Returns:
        The cleaned text, with the kept words joined by single spaces.
    """
    for source, replacement in hanja_changes.items():
        text = text.replace(source, replacement)

    # (pattern, replacement) pairs; the order matters because later
    # patterns operate on the output of earlier ones.
    substitutions = (
        (r'[\u4e00-\u9fff]+', ''),
        (r'[^\w\s]', ' '),
        (r'[a-z0-9]', ' '),
        (r'\s+', ' '),
    )
    for pattern, filler in substitutions:
        text = re.sub(pattern, filler, text)

    kept_words = [word for word in text.strip().split() if word not in stopwords]
    return ' '.join(kept_words)

# Cleaned copy of every article body; the raw 'content' column is kept.
df['processed_content'] = df['content'].map(preprocess_text)

# From here on senti_df is a plain {word: polarity} dict rather than a
# DataFrame, which is what analyze_sentiment looks tokens up in.
senti_df = senti_df.set_index('word')['polarity'].to_dict()

# Single Komoran analyzer instance reused for every text.
komoran = Komoran()

def analyze_sentiment(text):
    """Label *text* from the summed lexicon polarity of its tokens.

    The text is split with ``komoran.morphs``; each token that is a key of
    ``senti_df`` contributes its value to the total, all other tokens
    contribute nothing.

    Args:
        text: The text to analyze.

    Returns:
        ``'Positive'`` when the total is above zero, ``'Negative'`` when
        it is below zero, and ``'Neutral'`` otherwise.
    """
    total = sum(senti_df.get(morph, 0) for morph in komoran.morphs(text))

    if total > 0:
        return 'Positive'
    if total < 0:
        return 'Negative'
    return 'Neutral'

# Lexicon labels are computed from the raw 'content' text, not from the
# cleaned 'processed_content' column.
df['sentiment'] = df['content'].map(analyze_sentiment)

In [None]:
# Integer class id for each sentiment string: Negative=0, Neutral=1,
# Positive=2.
sentiment_mapping = {
    label: class_id
    for class_id, label in enumerate(('Negative', 'Neutral', 'Positive'))
}
df['sentiment_label'] = df['sentiment'].map(sentiment_mapping)

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# repeatable.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.1)

class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Indexable sequence of input strings.
        labels: Indexable sequence of integer class ids, aligned with
            ``texts`` by position.
        tokenizer: Callable tokenizer; it is invoked with a single text
            plus padding/truncation keyword arguments and must return a
            mapping with ``input_ids`` and ``attention_mask``.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at *idx* and return it with its label.

        Returns:
            A dict with ``input_ids``, ``attention_mask`` and ``labels``,
            each a ``torch.long`` tensor.
        """
        # Call the tokenizer directly instead of the deprecated
        # ``encode_plus``; this is also how the inference code in this
        # file tokenizes its input.
        encoded = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
        )
        return {
            'input_ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }

# Tokenizer and classifier are loaded from the same multilingual
# checkpoint; the classification head has three outputs, one per
# sentiment class.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=3,
)

# .tolist() hands the dataset plain Python lists, so items are looked up
# by position rather than by DataFrame index label.
train_dataset, test_dataset = (
    SentimentDataset(
        split['processed_content'].tolist(),
        split['sentiment_label'].tolist(),
        tokenizer,
    )
    for split in (train_df, test_df)
)

def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; the argmax over the last
            axis is taken as the predicted class).

    Returns:
        A dict with the keys ``accuracy``, ``f1``, ``precision`` and
        ``recall``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The fourth return value (support) is not reported.
    weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted'
    )
    accuracy = accuracy_score(y_true, y_pred)

    return {
        'accuracy': accuracy,
        'f1': weighted_f1,
        'precision': weighted_precision,
        'recall': weighted_recall,
    }

from transformers import TrainingArguments

# Fine-tuning configuration. Checkpointing is switched off
# (save_strategy="no"); the final weights are written explicitly after
# training instead.
# NOTE(review): newer transformers releases rename `evaluation_strategy`
# to `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    # Output locations.
    output_dir='./results',
    logging_dir='./logs',
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=16,
    # Log every 10 steps, evaluate every 500 steps, never checkpoint.
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="no",
    do_train=True,
    do_eval=True,
)

trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
trainer.train()

# Store model and tokenizer in the same directory so both can be reloaded
# from one path.
model_path = "./model_save"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

In [16]:
from transformers import BertForSequenceClassification, BertTokenizer, Trainer
from transformers import TrainingArguments
import torch
import pandas as pd

# Reload the fine-tuned classifier and its tokenizer from the directory
# they were saved to.
model_path = "./model_save"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

class SentimentDataset(torch.utils.data.Dataset):
    """Dataset over pre-tokenized encodings.

    Args:
        encodings: Mapping from field name to a per-example sequence of
            values (indexed by example position).
        labels: Sequence with one label per example; its length defines
            the dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return every encoding field and the label at *idx* as tensors."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Unlabelled articles to classify.
df = pd.read_csv('new_news.csv')
new_sentences = df['content'].tolist()

# NOTE(review): the raw 'content' text is tokenized here with
# max_length=512, whereas the training code in this file feeds
# 'processed_content' with a 128-token limit -- confirm whether the same
# preprocessing and length should be applied at inference time.
encodings = tokenizer(
    new_sentences,
    max_length=512,
    truncation=True,
    padding=True,
)

# SentimentDataset needs one label per row; the zeros are placeholders and
# are not used when the predicted classes are derived below.
new_data = SentimentDataset(encodings, [0 for _ in new_sentences])

training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    per_device_eval_batch_size=64,
    logging_steps=10,
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    do_predict=True,
)

trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=None,
)

predictions = trainer.predict(new_data)

# Highest-scoring class id per row, translated back to its sentiment name.
sentiment_mapping_reverse = {0: "Negative", 1: "Neutral", 2: "Positive"}
pred_labels = [
    sentiment_mapping_reverse[class_id]
    for class_id in predictions.predictions.argmax(-1)
]