In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [3]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f"Device for Train : {device}")

Device for Train : cpu


In [4]:
import os

In [5]:
os.listdir('./data/')

['conts_label_data_call_1006.csv']

In [6]:
data = pd.read_csv('./data/conts_label_data_call_1006.csv')
data

Unnamed: 0,conts,label
0,이시세존종삼메안상이기고사리불 삼육구하나 제불지혜심심 부량 이이사사 기지혜문난해난입일...,1
1,안녕하세요? 여자만필요해요 01080075449 다른거는필요가없어요,1
2,연락처주심톡드리겟습니다^^,0
3,전외식업체근무하다퇴직하고지금은 시골에서농사지어요 머소개랄게크게업네요 진실하면그만않닌...,0
4,사별한이후 전원주택에 혼자 생활하고 있습니다 애들은 아들ㆍ딸 직장생활중이구요 직장은...,1
...,...,...
53636,안녕하세요 작으마한 중소기업 다니고 있는 평범한 직딩이구요~일에 묻쳐서 바쁘게 생활...,1
53637,안녕하세요 작으마한 중소기업 다니고 있는 평범한 직딩이구요~일에 묻쳐서 바쁘게 생활...,1
53638,안녕하세요 작으마한 중소기업 다니고 있는 평범한 직딩이구요~일에 묻쳐서 바쁘게 생활...,1
53639,안녕하세요 작으마한 중소기업 다니고 있는 평범한 직딩이구요.바쁘게 생활하다보니 결혼...,1


**`연락처 1, 정상글 0`**

In [7]:
data['label'].value_counts()

0    29998
1    23643
Name: label, dtype: int64

In [8]:
call_df = data.copy()

In [9]:
print(call_df.shape[0])

53641


In [10]:
call_df.loc[call_df['conts'].isnull()]

Unnamed: 0,conts,label


In [11]:
label_0 = call_df.loc[call_df['label']==0]
label_1 = call_df.loc[call_df['label']==1]

print(label_0.shape, label_1.shape)

final_call_df = pd.concat([label_0, label_1]).reset_index(drop=True)

(29998, 2) (23643, 2)


In [13]:
final_call_df['label'].value_counts()

0    29998
1    23643
Name: label, dtype: int64

In [14]:
# 학습 데이터 : 검증 데이터 8:2

train_data = final_call_df.sample(frac=0.8, random_state=2022)[['conts','label']]
test_data = final_call_df.drop(train_data.index)[['conts','label']]

print(train_data.shape, test_data.shape)

(42913, 2) (10728, 2)


In [15]:
model_name = 'beomi/KcELECTRA-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [16]:
# train dataset 토크나이징
tokenized_train_sentence = tokenizer(
    list(train_data['conts']),
    max_length=128,
    return_tensors='pt',  #pyotorch의 tensor 형태로 return
    padding=True,        #제로패딩 설정
    truncation=True,     # max_length 초과 토큰 truncate
    add_special_tokens=True)  # spcial token 추가


print(tokenized_train_sentence[0])
print(tokenized_train_sentence[0].tokens)
print(tokenized_train_sentence[0].ids)
print(tokenized_train_sentence[0].attention_mask)

Encoding(num_tokens=128, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
['[CLS]', '두', '##아들이', '있습니다', '큰', '##아들', '##은', '26', '##살이', '##며', '상주', '##시', '환경', '##사업', '##소', '##다니', '##구요', '작은', '##아들', '##은', '22', '##살이', '##며', '전시', '##근로', '##역', '민', '##방위', '##대원', '##이', '##예요', '두', '##아들이', '성별', '##만', '아들이', '##지', '성격', '##은', '딸', '같아요', '저는', '운명', '##이', '끝', '##없는', '세계', '##대통령이', '##구요', '끝', '##없는', '유엔', '##사무', '##총장이', '##구요', '2022', '##년', '##9', '##월', '##28', '##일', '윤석렬', '##씨', '##와', '바뀌', '##는', '취임식', '##을', '한국', '국회의', '##사', '##당에서', '대통령', '취임식', '##을', '할', '예정', '##이', '##구요', '미국', '##뉴', '##욕', '##유', '##엔', '##본부', '##와', '세계', '##26', '##7', '##개', '##국에', '취임', '##선', '##서', '##와', '해외', '##정치', '##도', '겸하', '##게', '##됩니다', '현재', '뱃속', '##에는', '넷', '##쌍', '##둥', '##아들', '##들이', '임신', '##중이', '##라', '몸무게', '##가', '65', '##k', '##g', '##이', '##예요', '출산', '##은', '취임식', '##이후', '##에', '스위스', '쮜

In [17]:
# test dataset 토크나이징
tokenized_test_sentence = tokenizer(
    list(test_data['conts']),
    max_length=128,
    return_tensors='pt',
    padding=True,
    truncation=True,
    add_special_tokens=True)

In [18]:
print(tokenized_test_sentence[0])
print(tokenized_test_sentence[0].tokens)
print(tokenized_test_sentence[0].ids)
print(tokenized_test_sentence[0].attention_mask)

Encoding(num_tokens=128, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
['[CLS]', '연락처', '##주', '##심', '##톡', '##드리', '##겟', '##습니다', '^', '^', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]'

In [19]:
class CustomDataset(torch.utils.data.Dataset) :
    def __init__(self, encodings, labels) :
        self.encodings = encodings
        self.labels = labels
        
    def __len__(self) :
        return len(self.labels)    
        
    def __getitem__(self, idx) :
        item = {key : torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [20]:
# dataset 형성(텐서로 변환)
train_label = train_data['label'].values
test_label = test_data['label'].values

train_dataset = CustomDataset(tokenized_train_sentence, train_label)
test_dataset = CustomDataset(tokenized_test_sentence, test_label)

In [21]:
# 사전학습 모델 로드
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.out_proj.bias', 'classifier

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(50135, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [22]:
# train option setting
train_arguments = TrainingArguments(
                    output_dir='./',
                    num_train_epochs=5,
                    per_device_train_batch_size=64,
                    per_device_eval_batch_size=64,
                    logging_dir='./logs',
                    logging_steps=500,
                    save_total_limit=2)

In [27]:
# metrics
from sklearn.metrics import precision_recall_fscore_support, accuracy_score


def compute_metrics(pred) :
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    
    precision, recall, ## 3. 모델 학습 및 평가 진행 f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    
    return {
        'accuracy' : acc,
        'f1' : f1,
        'precision' : precision,
        'recall' : recall
    }

In [28]:
%%time

# Train
train = Trainer(
                model=model,
                args=train_arguments,
                train_dataset=train_dataset,
                eval_dataset=test_dataset,
                compute_metrics=compute_metrics)

train.train()

***** Running training *****
  Num examples = 42913
  Num Epochs = 5
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 3355


Step,Training Loss
500,0.0661
1000,0.0208
1500,0.0138
2000,0.0091
2500,0.0057
3000,0.0038


Saving model checkpoint to ./checkpoint-500
Configuration saved in ./checkpoint-500\config.json
Model weights saved in ./checkpoint-500\pytorch_model.bin
Saving model checkpoint to ./checkpoint-1000
Configuration saved in ./checkpoint-1000\config.json
Model weights saved in ./checkpoint-1000\pytorch_model.bin
Saving model checkpoint to ./checkpoint-1500
Configuration saved in ./checkpoint-1500\config.json
Model weights saved in ./checkpoint-1500\pytorch_model.bin
Deleting older checkpoint [checkpoint-500] due to args.save_total_limit
Saving model checkpoint to ./checkpoint-2000
Configuration saved in ./checkpoint-2000\config.json
Model weights saved in ./checkpoint-2000\pytorch_model.bin
Deleting older checkpoint [checkpoint-1000] due to args.save_total_limit
Saving model checkpoint to ./checkpoint-2500
Configuration saved in ./checkpoint-2500\config.json
Model weights saved in ./checkpoint-2500\pytorch_model.bin
Deleting older checkpoint [checkpoint-1500] due to args.save_total_limit


CPU times: total: 4d 4h 22min 21s
Wall time: 16h 44min 45s


TrainOutput(global_step=3355, training_loss=0.017994351109875652, metrics={'train_runtime': 60285.9006, 'train_samples_per_second': 3.559, 'train_steps_per_second': 0.056, 'total_flos': 1.41136058983296e+16, 'train_loss': 0.017994351109875652, 'epoch': 5.0})

In [29]:
train.evaluate(eval_dataset=test_dataset)

***** Running Evaluation *****
  Num examples = 10728
  Batch size = 64


NameError: name 'precision' is not defined

In [30]:
class predictModel() :

    def __init__(self, model_path):
        # load model
        self.model_path = model_path
        self.model = torch.load(model_path)
        # set device
        self.device = torch.device('cpu')
        # load tokenizer
        model_name = 'beomi/KcELECTRA-base'
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def predict_sentence(self, sent):
        self.model.eval()
#         sent = self.clean_sentence(sent)
        # tokenizing
        tokenized_sent = self.tokenizer(
            sent,
            return_tensors='pt',
            truncation=True,
            add_special_tokens=True,
            max_length=128
        )
        tokenized_sent.to(self.device)

        # prediction
        with torch.no_grad():
            outputs = self.model(
                input_ids=tokenized_sent['input_ids'],
                attention_mask=tokenized_sent['attention_mask'],
                token_type_ids=tokenized_sent['token_type_ids']
            )

        # result
        if len(sent) < 10 :
            # 10자 이하는 연락처 로 탐지하지 않음
            return 0

        result = outputs[0].detach().cpu().argmax(-1)
        #     print(outputs[0].detach().cpu())

        return int(result)
        # 0 : 정상, 1 : ( 무성의 or 연락처 탐지 )

In [31]:
from datetime import datetime

today = str(datetime.today())[:10]
path = f"{today}_call_model.pt"

# save model 
torch.save(model, path)

In [32]:
# load model
mod = torch.load(f"{today}_call_model.pt")
mod.eval()

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(50135, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm