In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [3]:
# Use the first CUDA GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
print(f"Device for Train : {device}")

Device for Train : cpu


In [4]:
import os

In [6]:
# List the files available in the local ./data/ directory.
os.listdir('./data/')

['conts_label_data_mu_1006.csv']

In [8]:
# Load the labelled text data from CSV; the bare name on the last line displays the frame.
data = pd.read_csv('./data/conts_label_data_mu_1006.csv')
data

Unnamed: 0.1,Unnamed: 0,conts,label,len
0,10409,ㆍ정상적인상식과노후을행복하게함께동행할친구같은사람이면참좋겠어요그동안 열심히 살아왔는데...,0,1
1,10410,ㆍ다가올 은퇴를 생각합니다 ㆍ생각만 반복하던걸 이제는 할려합니다ㆍ 차가운 계절엔 ...,0,11
2,10411,ㆍ금융회사퇴사후 법인회사설립 공동운영중 임 ㆍ청순하고 말을 예쁘게하는 분 ㆍ사진...,0,7
3,10412,ㆍ금융기관퇴사후 법인회사설립 공동운영중 임 ㆍ청순하고 말을 예쁘게하는 여성분 ㆍ사진...,0,7
4,10413,ㆍ건강은 등산하면서 몸관리잘하구요 ㆍ맘편하신분만나면좋겠네요ㆍ친구같은연인분계실까요ㆍ한...,0,10
...,...,...,...,...
79642,79645,'''''''''''''''''''''''''''''''''''''''''''''',1,46
79643,79646,''''''''''''''''''''''''''''''''''''''''',1,41
79644,79647,'''''''''''''''''''''''''''''''''''''',1,38
79645,79648,''''''''''''''''''''''''''''''''''''',1,37


**`무성의 1, 정상글 0`**

In [9]:
# Number of rows per label value in the raw data.
data['label'].value_counts()

0    51965
1    27682
Name: label, dtype: int64

In [10]:
# Work on a copy so the frame loaded from CSV is left untouched.
mu_df = data.copy()

In [11]:
# Total number of rows in the working frame.
print(len(mu_df))

79647


In [12]:
# Show any rows whose text column is missing.
mu_df[mu_df['conts'].isna()]

Unnamed: 0.1,Unnamed: 0,conts,label,len


In [13]:
# Down-sample label 0 to 35,000 rows and keep every label-1 row.
# A fixed random_state makes the sampled subset reproducible across kernel
# restarts, so the train/validation split and the results built on it are too.
label_0 = mu_df.loc[mu_df['label']==0].sample(n=35000, random_state=2022)
label_1 = mu_df.loc[mu_df['label']==1]

print(label_0.shape, label_1.shape)

# Stack both classes and renumber the index from 0.
final_mu_df = pd.concat([label_0, label_1]).reset_index(drop=True)

(35000, 4) (27682, 4)


In [14]:
# Number of rows per label value after down-sampling label 0.
final_mu_df['label'].value_counts()

0    35000
1    27682
Name: label, dtype: int64

In [15]:
# Training data : validation data = 8:2

# Keep only the text and label columns, draw 80% of the rows for training
# (fixed seed), and use the remaining rows for validation.
train_data = final_mu_df[['conts', 'label']].sample(frac=0.8, random_state=2022)
test_data = final_mu_df[['conts', 'label']].drop(index=train_data.index)

print(train_data.shape, test_data.shape)

(50146, 2) (12536, 2)


In [16]:
# Identifier of the pretrained checkpoint; the matching tokenizer is loaded from it.
model_name = 'beomi/KcELECTRA-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [17]:
# Tokenize the training texts.
tokenized_train_sentence = tokenizer(
    train_data['conts'].tolist(),
    add_special_tokens=True,  # add the special tokens
    truncation=True,          # truncate tokens beyond max_length
    max_length=128,
    padding=True,             # enable padding
    return_tensors='pt',      # return PyTorch tensors
)

# Inspect the first encoded example: summary, tokens, ids and attention mask.
print(
    tokenized_train_sentence[0],
    tokenized_train_sentence[0].tokens,
    tokenized_train_sentence[0].ids,
    tokenized_train_sentence[0].attention_mask,
    sep='\n',
)

Encoding(num_tokens=128, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
['[CLS]', '응', '##포', '##인트', '##그', '##그', '##그', '##그', '##그', '##크', '##그', '##ㅡ', '##그', '##그', '##그', '##그', '##ㅡ', '##그', '##크', '##ㅡ', '##크', '##그', '##ㅡ', '##크', '##극', '##ㅂㄷ', '##ㄱ', '##ㄷ', '##ㄱ', '##ㄷ', '##ㄱ', '##ㄷ', '##ㄱ', '##ㄷ', '##ㄱ', '##ㄷ', '##ㄱ', '##ㅈ', '##ㄱ', '##ㄷ', '##ㄱ', '##ㅈ', '##ㄱ', '##ㅈ', '##ㄱ', '##ㅈ', '##ㄱ', '##ㅈ', '##ㄱ', '##ㄷ', '##ㅋ', '##ㅊ', '##ㄱ', '##ㅈ', '##ㄱ', '##ㅈ', '##ㄱ', '##ㅈ', '##ㄱ', '##ㅈ', '##ㄱ', '##ㅊ', '##ㄱ', '##ㅈ', '##ㄱ', '##ㅈ', '##ㄱ', '##ㅈ', '##ㅋ', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]'

In [18]:
# Tokenize the validation texts with the same settings as the training texts.
tokenized_test_sentence = tokenizer(
    test_data['conts'].tolist(),
    add_special_tokens=True,
    truncation=True,
    max_length=128,
    padding=True,
    return_tensors='pt',
)

In [19]:
# Inspect the first encoded validation example: summary, tokens, ids and attention mask.
print(
    tokenized_test_sentence[0],
    tokenized_test_sentence[0].tokens,
    tokenized_test_sentence[0].ids,
    tokenized_test_sentence[0].attention_mask,
    sep='\n',
)

Encoding(num_tokens=128, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
['[CLS]', '딸', '##2', '##명', '##대학', '##4년', '##졸', '##업', '##후', '결혼해서', '##잘', '##살고', '##있', '##으며', '형제', '##는', '3', '##남', '##3', '##녀', '##중', '5', '##번째', '##이고', '현재', '##혼자', '##서', '직원들', '##과', '##사업', '##을', '##하고있', '##습니다', '##한번', '##씩', '##시간', '##되면', '등산', '##과', '##여행', '##을', '##다니고', '##있습니다', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD

In [20]:
class CustomDataset(torch.utils.data.Dataset) :
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    Args:
        encodings: Mapping (anything exposing ``.items()``) from field name to an
            indexable batch, where ``batch[idx]`` is the value for example ``idx``.
            Values may be tensors or plain nested lists.
        labels: Indexable sequence holding one label per example.
    """

    def __init__(self, encodings, labels) :
        self.encodings = encodings
        self.labels = labels

    def __len__(self) :
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    @staticmethod
    def _as_tensor_copy(value) :
        """Return ``value`` as an independent tensor.

        Calling ``torch.tensor()`` on an existing tensor emits a copy-construct
        UserWarning on every access; ``clone().detach()`` produces the same
        independent copy without it. Non-tensor values are converted as before.
        """
        if torch.is_tensor(value) :
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx) :
        """Return a dict of tensors for example ``idx`` with its label under ``'labels'``."""
        item = {key : self._as_tensor_copy(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor_copy(self.labels[idx])
        return item

In [21]:
# Build the datasets (items are converted to tensors when accessed).
train_label = train_data['label'].to_numpy()
test_label = test_data['label'].to_numpy()

train_dataset = CustomDataset(encodings=tokenized_train_sentence, labels=train_label)
test_dataset = CustomDataset(encodings=tokenized_test_sentence, labels=test_label)

In [22]:
# Load the pretrained model with a 2-label sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Move the model to the selected device; as the cell's last expression this also displays it.
model.to(device)

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.dense.bias', 'classifier.ou

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(50135, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [23]:
# Training options.
# Checkpoints are written to the current directory (output_dir='./') and at most
# 2 are kept (save_total_limit); the training loss is logged every 500 steps.
# NOTE(review): no evaluation strategy is set here, so the eval dataset and metric
# function are presumably only used by an explicit evaluate() call — confirm
# against the installed transformers version.
train_arguments = TrainingArguments(
                    output_dir='./',
                    num_train_epochs=5,
                    per_device_train_batch_size=64,
                    per_device_eval_batch_size=64,
                    logging_dir='./logs',
                    logging_steps=500,
                    save_total_limit=2)

In [24]:
# metrics
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, precision_score, recall_score


def compute_metrics(pred) :
    """Compute binary-classification metrics for a Trainer evaluation.

    Args:
        pred: Prediction object exposing ``label_ids`` (the true labels) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        dict: scalar ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # precision_recall_fscore_support returns (precision, recall, f1, support).
    # Unpack it so 'f1' is the F1 scalar instead of the whole tuple; precision
    # and recall come from the same call, so the three values are consistent.
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)

    return {
        'accuracy' : float(acc),
        'f1' : float(f1),
        'precision' : float(precision),
        'recall' : float(recall)
    }

In [25]:
%%time

# Train
# Build the Trainer from the model, the training options, both datasets and the
# metric function, then run fine-tuning.
train = Trainer(
                model=model,
                args=train_arguments,
                train_dataset=train_dataset,
                eval_dataset=test_dataset,
                compute_metrics=compute_metrics)

train.train()

***** Running training *****
  Num examples = 50146
  Num Epochs = 5
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 3920


Step,Training Loss
500,0.067
1000,0.0409
1500,0.0303
2000,0.0187
2500,0.014
3000,0.0086
3500,0.0059


Saving model checkpoint to ./checkpoint-500
Configuration saved in ./checkpoint-500\config.json
Model weights saved in ./checkpoint-500\pytorch_model.bin
Saving model checkpoint to ./checkpoint-1000
Configuration saved in ./checkpoint-1000\config.json
Model weights saved in ./checkpoint-1000\pytorch_model.bin
Saving model checkpoint to ./checkpoint-1500
Configuration saved in ./checkpoint-1500\config.json
Model weights saved in ./checkpoint-1500\pytorch_model.bin
Deleting older checkpoint [checkpoint-500] due to args.save_total_limit
Saving model checkpoint to ./checkpoint-2000
Configuration saved in ./checkpoint-2000\config.json
Model weights saved in ./checkpoint-2000\pytorch_model.bin
Deleting older checkpoint [checkpoint-1000] due to args.save_total_limit
Saving model checkpoint to ./checkpoint-2500
Configuration saved in ./checkpoint-2500\config.json
Model weights saved in ./checkpoint-2500\pytorch_model.bin
Deleting older checkpoint [checkpoint-1500] due to args.save_total_limit


Wall time: 19h 37min 42s


TrainOutput(global_step=3920, training_loss=0.024182650750997115, metrics={'train_runtime': 70662.6854, 'train_samples_per_second': 3.548, 'train_steps_per_second': 0.055, 'total_flos': 1.64924587276032e+16, 'train_loss': 0.024182650750997115, 'epoch': 5.0})

In [None]:
# train.evaluate(eval_dataset=test_dataset)

In [26]:
class predictModel() :
    """Load a saved classifier and label single sentences as normal (0) or flagged (1).

    Args:
        model_path: Path to a model object written with ``torch.save(model, path)``.
        tokenizer_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
            Defaults to the checkpoint name this class previously hard-coded.
        device: Device used for inference. Defaults to ``'cpu'``.
    """

    def __init__(self, model_path, tokenizer_name='beomi/KcELECTRA-base', device='cpu'):
        self.model_path = model_path
        # set device
        self.device = torch.device(device)
        # load model
        # NOTE: torch.load unpickles the file, which can execute arbitrary code;
        # only load model files from a trusted source.
        # map_location lets a model that was saved on a GPU be loaded onto this device.
        self.model = torch.load(model_path, map_location=self.device)
        self.model.to(self.device)
        # load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    def predict_sentence(self, sent):
        """Classify one sentence.

        Args:
            sent: The text to classify.

        Returns:
            int: 0 for normal, 1 for flagged (low-effort text or contact info
            detected). Inputs shorter than 10 characters always return 0.
        """
        # Inputs shorter than 10 characters are never flagged, so return before
        # spending any time on tokenization or the model forward pass.
        if len(sent) < 10 :
            return 0

        self.model.eval()

        # tokenizing
        tokenized_sent = self.tokenizer(
            sent,
            return_tensors='pt',
            truncation=True,
            add_special_tokens=True,
            max_length=128
        )
        tokenized_sent = tokenized_sent.to(self.device)

        # prediction (no gradients are needed for inference)
        with torch.no_grad():
            outputs = self.model(
                input_ids=tokenized_sent['input_ids'],
                attention_mask=tokenized_sent['attention_mask'],
                token_type_ids=tokenized_sent['token_type_ids']
            )

        # The first output holds the class scores; the arg-max is the predicted label.
        result = outputs[0].detach().cpu().argmax(-1)

        return int(result)

In [27]:
from datetime import datetime

# Date stamp in YYYY-MM-DD form, used as the file-name prefix.
today = datetime.today().date().isoformat()
path = f"{today}_mu_model.pt"

# Serialize the whole model object to the date-stamped file.
torch.save(model, path)

In [28]:
# load model
# Reads the model back from the date-stamped file name built from `today`.
# NOTE: torch.load unpickles the file; only load files from a trusted source.
mod = torch.load(f"{today}_mu_model.pt")
# Switch to evaluation mode; as the cell's last expression this also displays the module.
mod.eval()

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(50135, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm