In [1]:
import pandas as pd
import numpy as np

import re
import emoji
from soynlp.normalizer import repeat_normalize

import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset, TensorDataset

from transformers import T5ForConditionalGeneration, T5Tokenizer, pipeline
from transformers import ElectraTokenizerFast, ElectraModel, TFElectraModel
from transformers import AdamW

from sklearn.model_selection import train_test_split




In [2]:
df = pd.read_csv('Dataset2.csv', nrows=10000)
X = df['문법교정']
y = df['malicious']

In [3]:
emojis = ''.join(emoji.EMOJI_DATA.keys())
pattern = re.compile(f'[^ .,?!/@$%~％·∼()\x00-\x7Fㄱ-ㅣ가-힣{emojis}]+')
url_pattern = re.compile(
    r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')

def clean(x): 
    x = pattern.sub(' ', x)
    x = emoji.replace_emoji(x, replace='')
    x = url_pattern.sub('', x)
    x = x.strip()
    x = repeat_normalize(x, num_repeats=2)
    return x

X = X.apply(lambda x: clean(x))

In [4]:
# model = T5ForConditionalGeneration.from_pretrained('j5ng/et5-typos-corrector')
# tokenizer = T5Tokenizer.from_pretrained('j5ng/et5-typos-corrector')

# typos_corrector = pipeline(
#     "text2text-generation",
#     model=model,
#     tokenizer=tokenizer,
#     device=0 if torch.cuda.is_available() else -1,
#     framework="pt",
# )

# X = X.apply(lambda x: typos_corrector(x,
#             max_length=128,
#             num_beams=5,
#             early_stopping=True)[0]['generated_text'])

In [5]:
from transformers import AutoTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base")

X_list = X.values.tolist()
y = y.values
sequences = tokenizer(X_list, padding=True, truncation=True, return_tensors="pt")

X_train, X_test, y_train, y_test = train_test_split(sequences['input_ids'], y, test_size=0.2, random_state=42)
X_train_mask, X_test_mask, _, _ = train_test_split(sequences['attention_mask'], y, test_size=0.2, random_state=42)

train_dataset = TensorDataset(X_train, X_train_mask, torch.tensor(y_train, dtype=torch.long))
test_dataset = TensorDataset(X_test, X_test_mask, torch.tensor(y_test, dtype=torch.long))

train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes, dropout=0.2):
        super(LSTMClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True, dropout=self.dropout, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)
        self.dropout_layer = nn.Dropout(p=self.dropout)
    
    def forward(self, x):
        embedded = self.dropout_layer(self.embedding(x))
        out, _ = self.lstm(embedded)
        out = self.fc(out[:, -1, :])
        return out
    
input_size = len(tokenizer)
hidden_size = 128
num_layers = 2
num_classes = 2

model = LSTMClassifier(input_size, hidden_size, num_layers, num_classes).to(device)

optimizer = Adam(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()

EPOCHS = 10
for epoch in range(EPOCHS):
    model.train()
    for input_ids, attention_mask, labels in train_loader:
        input_ids = input_ids.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    model.eval()
    val_losses = []
    val_accs = []
    for input_ids, attention_mask, labels in test_loader:
        input_ids = input_ids.to(device)
        labels = labels.to(device)
        
        with torch.no_grad():
            outputs = model(input_ids)
            val_losses.append(criterion(outputs, labels).item())
            preds = torch.argmax(outputs, dim=1)
            acc = (preds == labels).float().mean().item()
            val_accs.append(acc)

    val_loss = np.mean(val_losses)
    val_acc = np.mean(val_accs)
    print(f"Epoch {epoch + 1}/{EPOCHS}, Validation Loss: {val_loss}, Validation Accuracy: {val_acc}")

Epoch 1/10, Validation Loss: 0.6920426301956176, Validation Accuracy: 0.5235
Epoch 2/10, Validation Loss: 0.6921260976791381, Validation Accuracy: 0.5235
Epoch 3/10, Validation Loss: 0.6920457892417907, Validation Accuracy: 0.5235
Epoch 4/10, Validation Loss: 0.6920511875152587, Validation Accuracy: 0.5235
Epoch 5/10, Validation Loss: 0.6925025310516357, Validation Accuracy: 0.5235
Epoch 6/10, Validation Loss: 0.6926042494773865, Validation Accuracy: 0.5235
Epoch 7/10, Validation Loss: 0.6922600531578064, Validation Accuracy: 0.5235
Epoch 8/10, Validation Loss: 0.6920434393882752, Validation Accuracy: 0.5235
Epoch 9/10, Validation Loss: 0.6920547103881836, Validation Accuracy: 0.5235
Epoch 10/10, Validation Loss: 0.6924946007728576, Validation Accuracy: 0.5235
