In [None]:
from transformers import AlbertForSequenceClassification, AlbertTokenizer
from torch.optim import Adam, AdamW, SGD
from pytorch_lightning import trainer
import pytorch_lightning as pl
import torch
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from torch.optim.lr_scheduler import ExponentialLR
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import os

In [None]:
# Ask CUDA to run kernel launches synchronously so that a device-side failure
# is reported at the call that triggered it. Presumably a debugging aid for
# out-of-range embedding indices; it slows GPU execution, so drop it once stable.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
class ClassificationModel(pl.LightningModule):
    """Binary phishing-URL classifier: an ALBERT encoder fed character-level token ids.

    The module owns its data pipeline: ``prepare_data`` reads the CSV at
    ``hyper_parameter["data_path"]``, builds the train/val/test splits and fits
    a Keras character tokenizer; the ``*_dataloader`` hooks serve those splits.

    Training objective: ``BCEWithLogitsLoss`` applied to the logit at index 1 of
    the two-unit classifier head. The matching decision rule is therefore
    ``logit[:, 1] > 0`` (i.e. sigmoid > 0.5); index 0 receives no training signal.

    Args:
        hyper_parameter: dict with a mandatory ``"data_path"`` and optional
            ``"max_length"`` (150), ``"lr"`` (5e-6), ``"epochs"`` (5),
            ``"optimizer"`` ("adamw"), ``"gamma"`` (0.5), ``"batch_size"`` (32).
    """

    # Seed for the train/validation split. A fixed seed keeps the split (and the
    # character vocabulary fitted on it) identical across runs.
    VAL_SPLIT_SEED = 32

    def __init__(self, hyper_parameter: dict):
        super().__init__()

        self.DATA_SET = hyper_parameter["data_path"]

        ### Hyperparameters ###
        self.MAX_LENGTH = hyper_parameter.get("max_length", 150)
        self.LEARNING_RATE = hyper_parameter.get("lr", 5e-6)
        # NOTE(review): EPOCHS is stored but not read anywhere in this class.
        self.EPOCHS = hyper_parameter.get("epochs", 5)
        self.OPTIMIZER = hyper_parameter.get("optimizer", "adamw")
        self.GAMMA = hyper_parameter.get("gamma", 0.5)
        self.BATCH_SIZE = hyper_parameter.get("batch_size", 32)

        self.model = AlbertForSequenceClassification.from_pretrained("albert-base-v2")
        # Replace the classification head with a fresh two-unit linear layer.
        self.model.classifier = torch.nn.Linear(self.model.config.hidden_size, 2)
        # Character-level Keras tokenizer instead of ALBERT's own tokenizer.
        self.tokenizer = Tokenizer(char_level=True)
        # Binary-classification loss; applied to a single logit in __step.
        self.loss = torch.nn.BCEWithLogitsLoss()

    def forward(self, **kwargs):
        """Run the wrapped ALBERT model and return the first element of its output.

        ``__step`` indexes the result as ``output[:, 1]``, i.e. it is used as a
        (batch, 2) logits tensor.
        """
        output = self.model(**kwargs)
        return output[0]

    def __step(self, batch, batch_idx):
        """Shared train/val/test step: compute the loss and hard predictions.

        Returns:
            dict with ``"loss"`` (scalar tensor), ``"y_true"`` and ``"y_pred"``
            (per-sample lists consumed by ``__epoch_end``).
        """
        data, labels = batch
        # NOTE(review): no attention_mask is passed, so padded positions are
        # attended to like real characters — confirm this is intended.
        output = self.forward(input_ids=data)
        logits = output[:, 1]
        loss = self.loss(logits.unsqueeze(-1), labels.unsqueeze(-1))

        # Predict with the same logit the loss optimises. An argmax over both
        # head outputs would compare against index 0, which is never trained.
        preds = (logits > 0).long()

        y_true = list(labels.cpu().numpy())
        y_pred = list(preds.cpu().numpy())

        return {
            "loss": loss,
            "y_true": y_true,
            "y_pred": y_pred
        }

    def training_step(self, batch, batch_idx):
        """Lightning hook: delegate to the shared step."""
        return self.__step(batch, batch_idx)

    def validation_step(self, batch, batch_idx):
        """Lightning hook: delegate to the shared step."""
        return self.__step(batch, batch_idx)

    def test_step(self, batch, batch_idx):
        """Lightning hook: delegate to the shared step."""
        return self.__step(batch, batch_idx)

    def __epoch_end(self, outputs, state="train"):
        """Aggregate step outputs, print epoch metrics and return them.

        Args:
            outputs: list of dicts produced by ``__step``.
            state: label used in the printed line ("train", "val" or "test").

        Returns:
            dict with the mean loss and accuracy / precision / recall / F1.
        """
        loss = torch.tensor(0, dtype=torch.float)
        y_true, y_pred = [], []

        for step_output in outputs:
            loss += step_output["loss"].cpu().detach()
            y_true += step_output["y_true"]
            y_pred += step_output["y_pred"]

        loss = loss / len(outputs)
        cm = confusion_matrix(y_true, y_pred)
        acc = accuracy_score(y_true, y_pred)
        prec = precision_score(y_true, y_pred, labels=np.unique(y_pred), zero_division=1)
        rec = recall_score(y_true, y_pred, labels=np.unique(y_pred), zero_division=1)
        f1 = f1_score(y_true, y_pred, labels=np.unique(y_pred), zero_division=1)

        print(f"[Epoch {self.trainer.current_epoch} {state.upper()}]",
              f"Loss={loss}, Acc={acc}, Prec={prec}, Rec={rec}, F1={f1},",
              "CM={}".format(str(cm).replace("\n", "")))

        return {"loss": loss, "acc": acc, "prec": prec, "rec": rec, "f1": f1}

    def training_epoch_end(self, outputs):
        """Lightning hook: print training metrics for the epoch."""
        self.__epoch_end(outputs, state="train")

    def validation_epoch_end(self, outputs):
        """Lightning hook: print validation metrics for the epoch."""
        self.__epoch_end(outputs, state="val")

    def test_epoch_end(self, outputs):
        """Lightning hook: print test metrics."""
        self.__epoch_end(outputs, state="test")

    def configure_optimizers(self):
        """Build the optimizer named by ``self.OPTIMIZER`` plus an exponential LR decay.

        Raises:
            NotImplementedError: if the optimizer name is not adam / adamw / sgd.
        """
        if self.OPTIMIZER == "adam":
            optimizer = Adam(self.parameters(), lr=self.LEARNING_RATE)
        elif self.OPTIMIZER == "adamw":
            optimizer = AdamW(self.parameters(), lr=self.LEARNING_RATE)
        elif self.OPTIMIZER == "sgd":
            optimizer = SGD(self.parameters(), lr=self.LEARNING_RATE)
        else:
            raise NotImplementedError(f"'{self.OPTIMIZER}' is not available.")

        scheduler = ExponentialLR(optimizer, gamma=self.GAMMA)

        # Lightning looks for the key "lr_scheduler"; under any other key the
        # scheduler is never stepped and the learning rate stays constant.
        return {
            "optimizer": optimizer,
            "lr_scheduler": scheduler
        }

    def encode_data(self, text):
        """Convert an iterable of strings into fixed-length character-id rows.

        Sequences are padded at the end up to ``self.MAX_LENGTH``.
        NOTE(review): truncation of longer inputs uses the ``pad_sequences``
        default — confirm which end of a long URL is kept.
        """
        encoded_text = self.tokenizer.texts_to_sequences(text)
        pad_text = pad_sequences(encoded_text, maxlen=self.MAX_LENGTH, padding='post')
        return pad_text

    def prepare_data(self):
        """Load the CSV, build the train/val/test tensors and fit the tokenizer.

        The 25% test split keeps every URL. Training and validation data are
        restricted to rows with ``shortening_service == 0``.

        NOTE(review): this hook assigns state on ``self``; that is fine for a
        single-process run but Lightning recommends ``setup`` for multi-process
        training — confirm if this is ever run with more than one device.
        """
        df = pd.read_csv(self.DATA_SET)

        df['phishing'] = (df['status'] == 'phishing')
        df = df.drop(columns='status')

        df_URL = df[['url', 'length_url', 'phishing', 'shortening_service']]

        X_train, X_test, y_train, y_test = train_test_split(df_URL[['url', 'shortening_service']],
                                                            df_URL['phishing'],
                                                            test_size=0.25,
                                                            random_state=32)

        X_test = X_test['url']

        # Train only on URLs that do not go through a shortening service.
        X_train = X_train[X_train['shortening_service'] == 0]
        y_train = y_train.loc[X_train.index]
        X_train = X_train['url']

        # Seeded so that the validation split, and the vocabulary fitted on the
        # training part, are reproducible between runs.
        X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                          y_train,
                                                          test_size=0.1,
                                                          random_state=self.VAL_SPLIT_SEED)

        # Lightning calls prepare_data for fit and for test; fit the vocabulary
        # only once so character ids stay the ones the weights were trained on
        # (this also holds for a model restored together with its tokenizer).
        if not self.tokenizer.word_index:
            self.tokenizer.fit_on_texts(X_train)

        train_inputs = self.encode_data(X_train)
        val_inputs = self.encode_data(X_val)
        test_inputs = self.encode_data(X_test)

        self.train_dataset = TensorDataset(torch.tensor(train_inputs, dtype=torch.long),
                                           torch.tensor(y_train.to_list(), dtype=torch.float))
        self.val_dataset = TensorDataset(torch.tensor(val_inputs, dtype=torch.long),
                                         torch.tensor(y_val.to_list(), dtype=torch.float))
        self.test_dataset = TensorDataset(torch.tensor(test_inputs, dtype=torch.long),
                                          torch.tensor(y_test.to_list(), dtype=torch.float))

    def train_dataloader(self):
        """Shuffled loader over the training split."""
        return DataLoader(self.train_dataset, batch_size=self.BATCH_SIZE, shuffle=True, num_workers=32)

    def val_dataloader(self):
        """Ordered loader over the validation split."""
        return DataLoader(self.val_dataset, batch_size=self.BATCH_SIZE, shuffle=False, num_workers=32)

    def test_dataloader(self):
        """Ordered loader over the test split."""
        return DataLoader(self.test_dataset, batch_size=self.BATCH_SIZE, shuffle=False, num_workers=32)

In [None]:
# Run configuration handed to ClassificationModel; keys it does not find fall
# back to the defaults defined in its constructor.
hyper_parameter = dict(
    lr=5e-6,            # starting learning rate
    epochs=10,
    optimizer='adamw',
    gamma=0.9,          # multiplicative factor for ExponentialLR
    max_length=400,     # padded sequence length in characters
    batch_size=64,
    data_path='shortURL/dataset_phishing.csv',
)

In [None]:
# Build the classifier from the run configuration; the constructor loads the
# pretrained "albert-base-v2" weights.
model = ClassificationModel(hyper_parameter)

In [None]:
# Sanity check: report whether a CUDA device is visible to PyTorch.
# (torch is already imported in the first cell; the re-import is a no-op.)
import torch
print(torch.cuda.is_available())

In [None]:
# Target device for the model: the first GPU when CUDA is usable, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
# Move the model to the selected device; as the cell's last expression this
# also displays the module structure.
model.to(device)

In [None]:
from pytorch_lightning.callbacks import ModelCheckpoint

# Checkpoint callback producing files named "epoch<N>.ckpt".
# By default Lightning expands "{epoch}" to "epoch=<N>", which would turn this
# template into "epochepoch=<N>"; disabling the automatic metric-name insertion
# keeps the template literal.
# NOTE(review): the callback is not currently passed to the Trainer.
checkpoint_callback = ModelCheckpoint(
    filename='epoch{epoch}',
    auto_insert_metric_name=False,
)

In [None]:
# Single-device trainer: GPU 0 when CUDA is available, otherwise one CPU process.
# NOTE(review): max_epochs is fixed at 1 and ignores hyper_parameter['epochs'] —
# confirm whether that is a deliberate quick run.
use_gpu = torch.cuda.is_available()

trainer = pl.trainer.Trainer(
    # callbacks=[checkpoint_callback],
    max_epochs=1,
    deterministic=use_gpu,
    accelerator='gpu' if use_gpu else 'cpu',
    # [0] is a list of GPU indices; the CPU accelerator expects a positive
    # integer count instead, so the CPU fallback must not reuse the GPU list.
    devices=[0] if use_gpu else 1,
)

In [None]:
# Shrink ALBERT's token-embedding table to 128 rows, since inputs are ids from
# the character-level tokenizer rather than ALBERT's own vocabulary.
# NOTE(review): assumes the fitted tokenizer never produces an id >= 128; the
# tokenizer is only fitted later (inside prepare_data), so this is not checked
# here — an id outside the table would fail at embedding lookup. TODO confirm.
model.model.resize_token_embeddings(128)

In [None]:
# Train the model; data preparation and the dataloaders are supplied by the
# model's own prepare_data / *_dataloader hooks.
trainer.fit(model)

In [None]:
# Restore a previously saved model object and switch it to evaluation mode.
# The file holds the whole pickled module, so no fresh ClassificationModel has
# to be constructed first (doing so only re-loads pretrained weights that are
# immediately discarded). The class definition must already be in scope.
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
# SECURITY: torch.load unpickles arbitrary objects — only load trusted files.
model = torch.load("shortURL/URL_classification0.pt", map_location=device)
model.eval()

In [None]:
import time

# Time the full test pass. perf_counter() is a wall-clock timer; process_time()
# counts only CPU time of this process, so it misses time spent waiting on the
# GPU and in DataLoader worker processes and does not measure "time elapsed".
start_time = time.perf_counter()

trainer.test(model)

end_time = time.perf_counter()
print(f"time elapsed : {int(round((end_time - start_time) * 1000))}ms")


In [None]:
# Persist the whole model object (not just a state_dict) so it can be restored
# with torch.load; loading it requires the ClassificationModel class to be defined.
torch.save(model, "shortURL/URL_classification1.pt")

In [None]:
# Rebuild the labelled URL table outside the model: derive the boolean
# 'phishing' target from 'status', drop 'status', and keep the columns of interest.
df = (
    pd.read_csv(model.DATA_SET)
    .assign(phishing=lambda frame: frame['status'] == 'phishing')
    .drop(columns='status')
)

url_columns = ['url', 'length_url', 'phishing', 'shortening_service']
df_URL = df[url_columns]

In [None]:
# Reproduce the same 75/25 split used inside ClassificationModel.prepare_data
# (same columns, test_size and random_state), so X_test matches the model's test rows.
X_train, X_test, y_train, y_test = train_test_split(df_URL[['url', 'shortening_service']], 
                                                            df_URL['phishing'], 
                                                            test_size = 0.25, 
                                                            random_state = 32)

In [None]:
# Keep only test URLs that do not use a shortening service, then realign the
# labels to the surviving rows by index.
not_shortened = X_test['shortening_service'] == 0
X_test = X_test.loc[not_shortened]
y_test = y_test.loc[X_test.index]


In [None]:
# Reduce the test frame to the raw URL strings. After this cell X_test is a
# Series, so the filtering cell above cannot be re-run without re-splitting.
X_test = X_test['url']

In [None]:
# Encode the test URLs with the model's tokenizer into padded id rows of
# length model.MAX_LENGTH.
X_test_input=model.encode_data(X_test)

In [None]:
# Wrap only the first encoded test URL and its label, so the cells below can
# run (and time) the model on a single sample.
test_dataset = TensorDataset(torch.tensor(X_test_input[:1], dtype=torch.long), 
                                    torch.tensor(y_test[:1].to_list(), dtype=torch.float))

In [None]:
# Inspect the (inputs, labels) tensors held by the single-sample dataset.
test_dataset[:]

In [None]:
# Number of samples in the dataset built above.
len(test_dataset)

In [None]:
# Number of encoded test URLs available in total.
len(X_test_input)

In [None]:
# Check that the label slice used for the single-sample dataset has one entry.
len(y_test[:1])

In [None]:
import time

# Time a test pass over the single-sample dataset.
# - perf_counter() measures wall-clock time; process_time() only counts CPU
#   time of this process and so does not reflect elapsed time.
# - num_workers=0 loads the one sample in the main process. Starting 32 worker
#   processes for a single item would dominate the measured latency.
single_sample_loader = DataLoader(test_dataset, shuffle=False, num_workers=0)

start_time = time.perf_counter()

trainer.test(model, dataloaders=single_sample_loader)

end_time = time.perf_counter()
print(f"time elapsed : {int(round((end_time - start_time) * 1000))}ms")

In [None]:
def infer(x):
    """Print the predicted class for each row of `x` (1 = phishing, 0 = not phishing).

    Args:
        x: integer tensor of encoded URLs, as produced by ``model.encode_data``
           and wrapped with ``torch.tensor``.

    The model is trained with a binary cross-entropy loss on the logit at
    index 1 only, so the decision is ``logit[:, 1] > 0``. An argmax over both
    head outputs would compare against a logit that never receives a training
    signal.
    """
    logits = model(input_ids=x)
    print((logits[:, 1] > 0).long())

In [None]:
# Show the encoded id row of the first test URL.
X_test_input[:1]

In [None]:
# Time a single direct forward pass through infer(). perf_counter() gives
# wall-clock time; process_time() would only count CPU time of this process.
# Relies on `time` having been imported by an earlier cell.
start_time = time.perf_counter()

infer(torch.tensor(X_test_input[:1]))

end_time = time.perf_counter()
print(f"time elapsed : {int(round((end_time - start_time) * 1000))}ms")