In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [3]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f"Device for Train : {device}")

Device for Train : cpu


In [4]:
import os

In [5]:
os.listdir('./')

['.ipynb_checkpoints',
 '2022-10-25_mu_model.pt',
 '2022-10-27_mu_model.pt',
 '25일 기준 자기소개글_가족소개글 검수(프로필 노출 회원 돌리기).ipynb',
 'checkpoint-3500',
 'checkpoint-4000',
 'logs',
 '무성의모델학습데이터(1025).csv',
 '무성의모델학습데이터(1025).xlsx',
 '자기소개글_무성의_모델_학습(1025).ipynb']

In [6]:
data = pd.read_csv('무성의모델학습데이터(1025).csv')
data

Unnamed: 0,conts,label
0,부산 사상구.157/53.간호조무사.세미베지테리언.코로나 이후 간호업으로 전업하게 ...,0
1,부산 사상구15753간호조무사세미베지테리언코로나 이후 간호업으로 전업하게 되었어요 ...,0
2,"가나다라마바사아자차카타파하 잘쓰는 사람이고요,서울 강서구에 삽니다.",1
3,안녕하세요 감사합니다 ㅋㄱㄴ8898858888888888888899955666666...,1
4,Korean-born Canadian. Looking for a serious r...,0
...,...,...
64656,사별후................... 2년 지나고 달로 3년 대가느둥,0
64657,사별후 2년 지나고 달로 3년 대가느둥,0
64658,1남1녀 두아이 아빠임1111111111111111111111111111111111...,0
64659,편안하게 같이 나이를 맞이할수 있으면 좋겠습니다... 다시한번 소중한것들을 가꿔가며...,0


In [7]:
mu_df = data.copy()

In [8]:
mu_df = mu_df.drop_duplicates()

In [9]:
mu_df.isnull().sum()

conts    0
label    0
dtype: int64

In [10]:
mu_df.reset_index(drop=True, inplace=True)

In [11]:
label_0 = mu_df.loc[mu_df['label']==0]
label_1 = mu_df.loc[mu_df['label']==1]

print(label_0.shape, label_1.shape)

final_mu_df = pd.concat([label_0, label_1]).reset_index(drop=True)

(33456, 2) (31205, 2)


In [12]:
# 학습 데이터 : 검증 데이터 8:2

train_data = final_mu_df.sample(frac=0.8, random_state=2022)[['conts','label']]
test_data = final_mu_df.drop(train_data.index)[['conts','label']]

print(train_data.shape, test_data.shape)

(51729, 2) (12932, 2)


In [13]:
model_name = 'beomi/KcELECTRA-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [14]:
# train dataset 토크나이징
tokenized_train_sentence = tokenizer(
    list(train_data['conts']),
    max_length=128,
    return_tensors='pt',  #pyotorch의 tensor 형태로 return
    padding=True,        #제로패딩 설정
    truncation=True,     # max_length 초과 토큰 truncate
    add_special_tokens=True)  # special token 추가


print(tokenized_train_sentence[0])
print(tokenized_train_sentence[0].tokens)
print(tokenized_train_sentence[0].ids)
print(tokenized_train_sentence[0].attention_mask)

Encoding(num_tokens=128, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
['[CLS]', 'ㄱㄱ', '##ㄱㄱ', '##ㄱㄱ', '##ㄱㄱ', '##ㄱㄱ', '##ㄱㄱ', '##ㅣ', '##ㄱ', '##ㅣ', '##ㅣ', '##ㅣ', '##ㅣ', '##ㅣ', '##ㅣ', '##ㅣ', '##ㅣ', 'ㅣ', '##ㄱ', 'ㄱㄱ', '##ㄱㄱ', '##ㄱㄱ', '##ㄱ', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[

In [15]:
# test dataset 토크나이징
tokenized_test_sentence = tokenizer(
    list(test_data['conts']),
    max_length=128,
    return_tensors='pt',
    padding=True,
    truncation=True,
    add_special_tokens=True)

In [16]:
print(tokenized_test_sentence[0])
print(tokenized_test_sentence[0].tokens)
print(tokenized_test_sentence[0].ids)
print(tokenized_test_sentence[0].attention_mask)

Encoding(num_tokens=128, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
['[CLS]', '부산', '사상', '##구', '.', '157', '/', '53', '.', '간호', '##조', '##무사', '.', '세미', '##베', '##지', '##테리', '##언', '.', '코로나', '이후', '간호', '##업', '##으로', '전업', '##하게', '되었', '##어요', '.', '성격', '##은', '애교', '##가', '많고', '솔직', ',', '활발', '##해요', '이상', '##형은', '마르', '##지도', '뚱뚱', '##하지', '않', '##으며', '대화가', '유쾌', '##하고', '다정', '##한', '말투', '##에', '공감', '##력이', '좋고', '무엇보다', '까칠', '##하지', '않은', ',', '이해', '##심', '많은', '분이', '##에요', '.', '정직', ',', '신뢰', ',', '존중', ',', '예의', ',', '배려', '##는', '서로', '##가', '바탕', '##이', '되어야', '##겠죠', '##♡', '##좋아하는', '##것', ':', '여행', ',', '산책', ',', '운동', ',', '카페', ',', '문화', '##생활', ',', '음악', ',', '로맨틱', '##한', '것', ',', '동물', '@', '싫어하는', '##것', ':', '거짓말', ',', '무책임한', ',', '담배', '(', '전담', '##포함', ')', ',', '벌레', ',', '큰', '##목', '##소리', '##와', '소음', ',', '[SEP]']
[2, 9077, 11128, 4230, 18, 34087, 19, 27773, 18, 18815, 4180, 1917

In [17]:
class CustomDataset(torch.utils.data.Dataset) :
    def __init__(self, encodings, labels) :
        self.encodings = encodings
        self.labels = labels
        
    def __len__(self) :
        return len(self.labels)    
        
    def __getitem__(self, idx) :
        item = {key : torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [18]:
# dataset 형성(텐서로 변환)
train_label = train_data['label'].values
test_label = test_data['label'].values

train_dataset = CustomDataset(tokenized_train_sentence, train_label)
test_dataset = CustomDataset(tokenized_test_sentence, test_label)

In [19]:
# 사전학습 모델 로드
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.dense.bias', 'classifier.ou

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(50135, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [20]:
# train option setting
train_arguments = TrainingArguments(
                    output_dir='./',
                    num_train_epochs=5,
                    per_device_train_batch_size=64,
                    per_device_eval_batch_size=64,
                    logging_dir='./logs',
                    logging_steps=500,
                    save_total_limit=2)

In [21]:
# metrics
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, precision_score, recall_score


def compute_metrics(pred) :
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    
    precision = precision_score(labels, preds) 
    recall = recall_score(labels, preds)
    f1 = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    
    return {
        'accuracy' : acc,
        'f1' : f1,
        'precision' : precision,
        'recall' : recall
    }

In [22]:
%%time

# Train
train = Trainer(
                model=model,
                args=train_arguments,
                train_dataset=train_dataset,
                eval_dataset=test_dataset,
                compute_metrics=compute_metrics)

train.train()

***** Running training *****
  Num examples = 51729
  Num Epochs = 5
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 4045


Step,Training Loss
500,0.1488
1000,0.1168
1500,0.1018
2000,0.0888
2500,0.0778
3000,0.0574
3500,0.05
4000,0.0439


Saving model checkpoint to ./checkpoint-500
Configuration saved in ./checkpoint-500\config.json
Model weights saved in ./checkpoint-500\pytorch_model.bin
Saving model checkpoint to ./checkpoint-1000
Configuration saved in ./checkpoint-1000\config.json
Model weights saved in ./checkpoint-1000\pytorch_model.bin
Saving model checkpoint to ./checkpoint-1500
Configuration saved in ./checkpoint-1500\config.json
Model weights saved in ./checkpoint-1500\pytorch_model.bin
Deleting older checkpoint [checkpoint-500] due to args.save_total_limit
Saving model checkpoint to ./checkpoint-2000
Configuration saved in ./checkpoint-2000\config.json
Model weights saved in ./checkpoint-2000\pytorch_model.bin
Deleting older checkpoint [checkpoint-1000] due to args.save_total_limit
Saving model checkpoint to ./checkpoint-2500
Configuration saved in ./checkpoint-2500\config.json
Model weights saved in ./checkpoint-2500\pytorch_model.bin
Deleting older checkpoint [checkpoint-1500] due to args.save_total_limit


CPU times: total: 4d 19h 52min 55s
Wall time: 19h 23min 24s


TrainOutput(global_step=4045, training_loss=0.08514303928252645, metrics={'train_runtime': 69804.617, 'train_samples_per_second': 3.705, 'train_steps_per_second': 0.058, 'total_flos': 1.70130897283968e+16, 'train_loss': 0.08514303928252645, 'epoch': 5.0})

In [23]:
train.evaluate(eval_dataset=test_dataset)

***** Running Evaluation *****
  Num examples = 12932
  Batch size = 64


Trainer is attempting to log a value of "(0.9524273421009904, 0.9363128491620112, 0.9443013522215068, None)" of type <class 'tuple'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.241476371884346,
 'eval_accuracy': 0.9464893287967832,
 'eval_f1': (0.9524273421009904, 0.9363128491620112, 0.9443013522215068, None),
 'eval_precision': 0.9524273421009904,
 'eval_recall': 0.9363128491620112,
 'eval_runtime': 1264.7495,
 'eval_samples_per_second': 10.225,
 'eval_steps_per_second': 0.161,
 'epoch': 5.0}

        {'eval_loss': 0.241476371884346,
         'eval_accuracy': 0.9464893287967832,
         'eval_f1': (0.9524273421009904, 0.9363128491620112, 0.9443013522215068, None),
         'eval_precision': 0.9524273421009904,
         'eval_recall': 0.9363128491620112,
         'eval_runtime': 1264.7495,
         'eval_samples_per_second': 10.225,
         'eval_steps_per_second': 0.161,
         'epoch': 5.0}

In [24]:
import re

class predictModel() :

    def __init__(self, model_path):
        # load model
        self.model_path = model_path
        self.model = torch.load(model_path)
        # set device
        self.device = torch.device('cpu')
        # load tokenizer
        model_name = 'beomi/KcELECTRA-base'
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def predict_sentence(self, sent):
        self.model.eval()
#         sent = self.clean_sentence(sent)
        # tokenizing
        tokenized_sent = self.tokenizer(
            sent,
            return_tensors='pt',
            truncation=True,
            add_special_tokens=True,
            max_length=128
        )
        tokenized_sent.to(self.device)

        # prediction
        with torch.no_grad():
            outputs = self.model(
                input_ids=tokenized_sent['input_ids'],
                attention_mask=tokenized_sent['attention_mask'],
                token_type_ids=tokenized_sent['token_type_ids']
            )

        # result
        if len(sent) < 10 :
            # 10자 이하는 연락처 로 탐지하지 않음
            return 0

        result = outputs[0].detach().cpu().argmax(-1)
        #     print(outputs[0].detach().cpu())

        return int(result)
        # 0 : 정상, 1 : ( 무성의 or 연락처 탐지 )

In [25]:
from datetime import datetime

today = str(datetime.today())[:10]
path = f"{today}_mu_model.pt"

# save model 
torch.save(model, path)

In [26]:
from datetime import datetime

today = str(datetime.today())[:10]
path = f"{today}_mu_model.pt"

# save model 
torch.save(model, path)

In [27]:
# load model
mod = torch.load(f"{today}_mu_model.pt")
mod.eval()

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(50135, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

------

In [6]:
import re

class Preprocessing():

    def __init__(self, sent):
        self.sent = sent

    def clean_sentence(self):
        try:  # 무성의
            self.sent = re.sub('[^\w\s]', ' ', self.sent).strip()
            self.sent = re.sub('[.,?!ᆢ~]', ', ', self.sent)
            self.sent = re.sub('[ㄱ-ㅎ|ㅏ-ㅣ]', 'ㅋ', self.sent)
            self.sent = re.sub('니다', '니다. ', self.sent)
            self.sent = re.sub('어요', '어요. ', self.sent)
            self.sent = re.sub('\n', ' ', self.sent).strip()

        except:
            print('clean_sentence method fail')
            pass

        return self.sent

    def f_clean_sentence(self):
        try:
            self.sent = re.sub('[^\w\s]', ' ', self.sent).strip()
            self.sent = re.sub('[.,?!ᆢ~]', ', ', self.sent)
            self.sent = self.return_text(self.sent)

        except:
            print('f_clean_sentence method fail')
            pass
        return self.sent

    def sub_num(self, sent):
        hannum_list = ['일', '이', '삼', '사', '오', '육', '륙', '칠', '팔', '구', '십', '영']
        sent = re.sub(r'[0-9]', ' ', sent)

        for i in hannum_list:
            sent = re.sub(rf'{i}', '  ', sent)
        return sent

    def return_target_list(self, pattern, sent):
        word_list = []
        for i in pattern.finditer(sent):
            target_word = sent[i.start(): i.end()]
            word_list.append(target_word)
        return word_list

    def return_text(self, sent):

        nam = '[남]+'
        nyeo = '[녀]+'
        yeo = '[여]+'
        son = '[아들]+'
        ddal = '[딸]+'
        num = '[\s]*(\d){1,2}[\s]*'
        han_num = '[일이삼사오육륙칠팔구십]+'

        person_list = [nam, son, nyeo, yeo, ddal]
        num_list = [num, han_num]

        for person_pattern in person_list:
            for num_pattern in num_list:
                pattern_list = [person_pattern + num_pattern, num_pattern + person_pattern]
                target_list = []

                for pattern in pattern_list:
                    add_pattern = re.compile(pattern)
                    m = add_pattern.findall(sent)
                    if m != []:
                        target_words = self.return_target_list(add_pattern, sent)
                        target_list += target_words
                    else:
                        pass

                for i in target_list:
                    sent = sent.replace(i, self.sub_num(i))
        return sent

In [7]:
import torch.nn.functional as F

class predictModel():

    def __init__(self, model_path):
        # load model
        self.model_path = model_path
        self.model = torch.load(model_path)
        # set device
        self.device = torch.device('cpu')
        # load tokenizer
        model_name = 'beomi/KcELECTRA-base'
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def predict_sentence(self, sent):
        self.model.eval()
        Pr = Preprocessing(sent)

        if 'call' in str(self.model_path):
            sent = Pr.f_clean_sentence()

        else:
            sent = Pr.clean_sentence()

        # tokenizing
        tokenized_sent = self.tokenizer(
            sent,
            return_tensors='pt',
            truncation=True,
            add_special_tokens=True,
            max_length=512
        )
        tokenized_sent.to(self.device)

        # prediction
        with torch.no_grad():
            outputs = self.model(
                input_ids=tokenized_sent['input_ids'],
                attention_mask=tokenized_sent['attention_mask'],
                token_type_ids=tokenized_sent['token_type_ids']
            )

        # result
        per = int(str(np.array(F.softmax(outputs[0][0], dim=0).detach().cpu())[1] * 100).split('.')[0])
        result = outputs[0].detach().cpu().argmax(-1)

        if int(per) >= 99:
            return 1
        
        else:
            return 0

In [8]:
os.listdir()

['.ipynb_checkpoints',
 '2022-10-25_mu_model.pt',
 '2022-10-27_mu_model.pt',
 '25일 기준 자기소개글_가족소개글 검수(프로필 노출 회원 돌리기).ipynb',
 'checkpoint-3500',
 'checkpoint-4000',
 'logs',
 '무성의모델학습데이터(1025).csv',
 '무성의모델학습데이터(1025).xlsx',
 '자기소개글_무성의_모델_학습(1025).ipynb']

In [9]:
mu_model = predictModel(model_path = "2022-10-27_mu_model.pt")