In [None]:
# Install Selenium (plus the Chromium driver) and BeautifulSoup on Colab
!apt-get update
!apt-get install -y chromium-chromedriver
!pip install selenium
!pip install beautifulsoup4
# Install konlpy (original author's note: takes about 3 minutes).
# NOTE(review): this pipes a script fetched from a mutable branch straight into bash;
# consider pinning the URL to a specific commit.
!curl -s https://raw.githubusercontent.com/teddylee777/machine-learning/master/99-Misc/01-Colab/mecab-colab.sh | bash

In [None]:
# Python packages imported by the notebook's import cell: datasets, peft, gdown
!pip install datasets
!pip install peft
!pip install gdown

In [None]:
import re
import time
from collections import Counter

import gdown
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from bs4 import BeautifulSoup
from datasets import Dataset
from konlpy.tag import Okt
from peft import get_peft_model, LoraConfig, TaskType
from selenium import webdriver
from transformers import AutoTokenizer, DistilBertForSequenceClassification, get_scheduler

In [None]:
# Google Drive link the checkpoint file is downloaded from
url = 'https://drive.google.com/uc?id=12MOGiCveDE8CTvtHKqmEhyJIXc3gEscd'

# Device the checkpoint is mapped onto when it is loaded
device = 'cpu'

# Local filename the downloaded checkpoint is written to
model_name = 'TwoStageDistilBERT_LoRA.pt'
# Hugging Face model id used to build the tokenizer
checkpoint = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Fetch the checkpoint file; this runs every time the cell is executed
gdown.download(url, model_name, quiet = False)

# NOTE(review): depending on the installed PyTorch version's `weights_only` default,
# torch.load may unpickle arbitrary objects -- only load files from a trusted source.
model_checkpoint = torch.load(model_name, map_location = device)

In [None]:
class TwoStageDistilBERT_LoRA(nn.Module):
  """Two chained DistilBERT sequence classifiers with LoRA adapters.

  The first classifier runs on the token ids; its last hidden state is then
  passed to the second classifier as `inputs_embeds`.

  Args:
    distilbert_checkpoint: model id/path given to `from_pretrained` for both stages.
    num_labels_1stage: number of output labels of the first classifier.
    num_labels_2stage: number of output labels of the second classifier.
  """

  def __init__(self, distilbert_checkpoint, num_labels_1stage = 2, num_labels_2stage = 3):
    super().__init__()

    # Stage 1: hidden states are requested because forward() feeds the last one to stage 2.
    self.distilbert1 = DistilBertForSequenceClassification.from_pretrained(distilbert_checkpoint,
                                                                           num_labels = num_labels_1stage, ignore_mismatched_sizes = True,
                                                                           output_hidden_states=True)

    lora_config1 = LoraConfig(task_type = TaskType.SEQ_CLS, r = 8, lora_alpha = 32, target_modules = ['q_lin', 'v_lin'], lora_dropout = 0.1 )
    self.distilbert1 = get_peft_model(self.distilbert1, lora_config1)

    # Stage 2
    self.distilbert2 = DistilBertForSequenceClassification.from_pretrained(distilbert_checkpoint,
                                                                           num_labels = num_labels_2stage, ignore_mismatched_sizes = True)

    lora_config2 = LoraConfig(task_type = TaskType.SEQ_CLS, r = 8, lora_alpha = 32, target_modules = ['q_lin', 'v_lin'], lora_dropout = 0.1 )
    # NOTE(review): this wraps `self.distilbert1` a second time; `self.distilbert2` is
    # never passed to get_peft_model. It looks like a typo for `distilbert2`, but it is
    # intentionally left unchanged: the parameter names of this module must keep matching
    # any state dict saved from it, and changing the wrapping would rename them.
    # Confirm against the training code and re-export the checkpoint before fixing.
    self.distilbert1 = get_peft_model(self.distilbert1, lora_config2)


  def forward(self, input_ids,  attention_mask, labels1 = None, labels2 = None):
    """Run both stages and return `(total_loss, logits1, logits2)`.

    Args:
      input_ids: token ids passed to the first classifier.
      attention_mask: attention mask passed to both classifiers.
      labels1: optional labels for the first classifier.
      labels2: optional labels for the second classifier.

    Returns:
      A tuple `(total_loss, logits1, logits2)`. `total_loss` is the sum of the
      losses the two stages reported; it is `None` when neither stage reported a
      loss (e.g. inference without labels), and a single stage's loss when only
      that stage reported one.
    """
    output1 = self.distilbert1(input_ids = input_ids, attention_mask = attention_mask, labels = labels1)
    hidden1 = output1.hidden_states[-1] # last layer's hidden state
    logits1 = output1.logits

    output2 = self.distilbert2(inputs_embeds = hidden1, attention_mask = attention_mask, labels = labels2)
    logits2 = output2.logits

    # Add up only the losses that exist; the previous unconditional
    # `output1.loss + output2.loss` raised TypeError when a loss was None.
    total_loss = None
    for stage_loss in (output1.loss, output2.loss):
      if stage_loss is not None:
        total_loss = stage_loss if total_loss is None else total_loss + stage_loss

    return total_loss, logits1, logits2

In [None]:
def load_checkpoint(model, model_checkpoint):
  """Load the weights stored under `model_checkpoint['model_state_dict']` into `model`.

  Prints a confirmation message and returns the same `model` object.
  """
  saved_weights = model_checkpoint['model_state_dict']
  model.load_state_dict(saved_weights)
  print("Checkpoint loaded!")
  return model


# Build the two-stage model from the base DistilBERT checkpoint id
model = TwoStageDistilBERT_LoRA(distilbert_checkpoint = checkpoint)

# Restore the saved weights from the loaded checkpoint dict
model = load_checkpoint(model, model_checkpoint)

In [None]:
def output2label(logits1, logits2, label_to_site_dict):
  """Combine the two stages' logits into one label per sample.

  For each row, the final class is 0 when the stage-1 argmax is 0; otherwise it
  is the stage-2 argmax for that row.

  Args:
    logits1: array-like of shape (batch, n_stage1_classes).
    logits2: array-like of shape (batch, n_stage2_classes).
    label_to_site_dict: mapping from integer class id to the label to return.

  Returns:
    A list with one `label_to_site_dict` value per row, in input order.
  """
  # np.asarray accepts nested lists as well as arrays, so no `.shape` is required
  # of the inputs themselves.
  stage1_pred = np.argmax(np.asarray(logits1), axis = 1)
  stage2_pred = np.argmax(np.asarray(logits2), axis = 1)

  # Stage 2 only decides for rows that stage 1 did not assign to class 0.
  final_pred = np.where(stage1_pred == 0, 0, stage2_pred)

  return [label_to_site_dict[int(class_id)] for class_id in final_pred]


In [None]:
# Selenium (Chrome) configuration shared by every driver created in this notebook
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # run Chrome without a visible window
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--disable-gpu')
# presumably meant to make the automated browser less detectable -- TODO confirm
options.add_argument('--disable-blink-features=AutomationControlled')

In [None]:
def _top_korean_nouns(texts, max_keywords = 50):
  """Return up to `max_keywords` Korean nouns from `texts`, most frequent first."""
  okt = Okt()
  # Keep only runs of Hangul characters.
  korean_chunks = re.findall('[ㄱ-ㅎ가-힣]+', texts)
  # Okt extracts the nouns of each chunk (original author's example: '페이지를' -> '페이지').
  noun_counts = Counter(noun for chunk in korean_chunks for noun in okt.nouns(chunk))
  # most_common sorts by descending count and keeps first-seen order for ties.
  return [noun for noun, _ in noun_counts.most_common(max_keywords)]


def extraction(url, max_keywords = 50, wait_seconds = 5):
  """Load `url` in headless Chrome and return its most frequent Korean nouns.

  Args:
    url: address of the page to load.
    max_keywords: maximum number of nouns to keep (default 50).
    wait_seconds: seconds to sleep after requesting the page, giving it time
      to load before the HTML is read (default 5).

  Returns:
    A single string of nouns joined by spaces, ordered from most to least
    frequent. Empty string when the page text contains no Korean nouns.
  """
  driver = webdriver.Chrome(options=options)
  try:
    driver.get(url)
    time.sleep(wait_seconds)  # wait for the browser to finish navigating
    html = driver.page_source
  finally:
    # Always shut the browser down, even if loading the page failed.
    driver.quit()

  texts = BeautifulSoup(html, 'html.parser').get_text()  # text content only
  return ' '.join(_top_korean_nouns(texts, max_keywords))

# 도박사이트 분류 시작
> 아래에 있는 url_list에 url을 넣어주세요

> 이 셀 위에 있는 셀들은 첫 실행시에 한 번만 실행하면 됩니다

In [None]:
# URLs to classify; edit this list, then run the cell below
url_list=['https://www.snu.ac.kr/', 'https://newtoki.biz/manhwa', 'https://kkr-0708.com/?ref=1875']

In [None]:
# Maps the final class id to the label that is printed
label_to_site_dict = {0: "일반사이트", 1: "도박사이트", 2: "도박 제외 불법사이트"}

model.eval()


with torch.no_grad():
  # One keyword string per URL, produced by extraction()
  text = []

  for raw_url in url_list:
    text.append(extraction(raw_url))


  # Tokenize the texts
  inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

  # Run inference on the inputs (no gradients are needed for inference)
  input_ids = inputs['input_ids']
  attention_mask = inputs['attention_mask']

  # Feed the stage-1 model
  output1 = model.distilbert1(input_ids=input_ids, attention_mask=attention_mask)
  logits1 = output1.logits

  # Stage 2 takes stage 1's last hidden state as its input embeddings
  hidden1 = output1.hidden_states[-1]  # last layer's hidden state
  output2 = model.distilbert2(inputs_embeds=hidden1, attention_mask=attention_mask)
  logits2 = output2.logits
  result = output2label(logits1, logits2, label_to_site_dict)



  # Print each URL's keyword string, then all labels joined by '/' in url_list order
  for keyword in text:
    print(keyword)
  print(f"Result: {'/'.join(result)}")