In [1]:
import json
import re
import pandas as pd
import numpy as np
import random

import torch
from torch import cuda
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset
import transformers
from transformers import DistilBertModel, DistilBertTokenizer, DistilBertConfig
from transformers import DistilBertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score


In [2]:
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
jd_data = pd.read_json("jd_7632.json")

In [4]:
jd_data.head()

Unnamed: 0,wd_id,position,main_tasks,requirements,preferred_points,category
0,61852,고객 경험(CX) 전략 매니저 (신입/인턴),• 24시간 365일 운영되는 로켓펀치와 집무실 고객 경험 관리\n• 고객 경험 사...,• 소셜 네트워크 또는 플랫폼 커뮤니티 등 온라인 플랫폼 운영에 관심이 있으신 분\...,• 회사와 함께 성장하고자 하는 의지를 가진 분\n• 다양한 이슈에 빠르게 대응하고...,"경영, 비즈니스"
1,61851,집무실 IoT 개발자,"• 집무실 인프라 구축: (IoT)전원 관리 설계 및 구성, QR 코드 기반 출입 ...",• IoT를 통한 공간 관리 및 자동화에 대한 높은 관심이 있어야 합니다.\n• ...,• 적극적으로 서비스 개선에 필요한 업무를 찾고 개선할수 있는 인성과 역량\n• ...,개발
2,61850,공간(인테리어) 디자이너,"- 공간 아이덴티티 : 숙박시설, 공용공간, 객실 등 공간의 아이덴티티를 기획하고 ...","• 여행, 숙박업, 접객업, 프랜차이즈업에 대한 경험과 이해\n• CAD, Sket...",• 숙박 운영 및 관리를 경험하신 분\n• 시니어 디자이너와 팀을 이뤄 프로젝트...,디자인
3,61849,Python 웹 어플리케이션 개발자,• 로켓펀치 웹 서비스 개발,• 웹 개발 업무 2년 이상이 있어야 합니다.\n• Python 및 Django/F...,• 적극적으로 서비스 개선에 필요한 업무를 찾고 개선할수 있는 인성과 역량\n• S...,개발
4,61848,여행 숙박 콘텐츠 마케터,• 투숙고객 대상 브랜딩-프로모션-예약-투숙-후기를 아우르는 일관된 마케팅 캠페인 ...,"• 2~5년간 미디어, PR, 광고, 콘텐츠 기획/제작을 경험하신 분\n• 복잡...","• 숙박운영, 스타트업 업무문화 경험자\n• 유튜브, 브런치 등 뉴미디어를 다양하...","마케팅, 광고"


In [5]:
jd_data["category"].unique()

array(['경영, 비즈니스', '개발', '디자인', '마케팅, 광고'], dtype=object)

In [6]:
# 결측치 확인
jd_data.isnull().sum()

wd_id                 0
position              6
main_tasks            0
requirements          6
preferred_points    265
category              0
dtype: int64

In [7]:
# category 비율 확인
jd_data.category.value_counts()

개발          3419
경영, 비즈니스    1799
마케팅, 광고     1361
디자인         1053
Name: category, dtype: int64

In [8]:
possible_labels = jd_data.category.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
label_dict

{'경영, 비즈니스': 0, '개발': 1, '디자인': 2, '마케팅, 광고': 3}

In [9]:
jd_data['category'] = jd_data.category.replace(label_dict)

In [10]:
# 문장을 함쳐 하나의 문장으로 정리 
jd_data["total"] = jd_data.position + " " + jd_data.main_tasks + " " +jd_data.requirements + " " +jd_data.preferred_points 
jd_data['total']

0       고객 경험(CX) 전략 매니저 (신입/인턴) • 24시간 365일 운영되는 로켓펀치...
1       집무실 IoT 개발자 • 집무실 인프라 구축: (IoT)전원 관리 설계 및 구성, ...
2       공간(인테리어) 디자이너 - 공간 아이덴티티 : 숙박시설, 공용공간, 객실 등 공간...
3       Python 웹 어플리케이션 개발자 • 로켓펀치 웹 서비스 개발 • 웹 개발 업무 ...
4       여행 숙박 콘텐츠 마케터 • 투숙고객 대상 브랜딩-프로모션-예약-투숙-후기를 아우르...
                              ...                        
7627    Senior Product Designer (UX) 1. Product UX Des...
7628    WIPPY, QUAT 데이터 분석가 (시니어) • 서비스 고도화를 위한 현황 분석 ...
7629    UI Designer • 신규 모바일 앱 디자인 및 고도화 • 유저 경험에 기반한 ...
7630    Data Engineer 담당 개발자 1. 운영중인 AWS기반의 ETL/ELT 개발...
7631                                                  NaN
Name: total, Length: 7632, dtype: object

In [11]:
# 무의미한 정보를 정규화 작업을 통해서 제거 
def clean_text(text):
#     text_clean = re.sub("[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]", " ", str(text))
    text_clean = re.sub(r'\d+','',str(text))
    text_clean = re.sub(r"^\s+", '', text_clean) #remove space from start 
    text_clean = re.sub(r'\s+$', '', text_clean) #remove space from the end corpus.append(review) return corpus
    text_clean = re.sub(r'<[^>]+>','',text_clean) #remove Html tags
    text_clean = re.sub(r'^\s*\d+\s*[-\\.)]?\s+|^\s*[-•■⦿*:º○▶️⏩✔•]\s+', '', text_clean).strip()# 불필요한 유니코드 및 특수문자 제거
    text_clean = re.sub(r"[\<\(\[\]\)\>]", " ", text_clean) # 불필요한 괄호 제거
    text_clean = re.sub(r"(http|https)?:\/\/\S+\b|www\.(\w+\.)+\S*", "", text_clean).strip()
    text_clean = re.sub(r"pic\.(\w+\.)+\S*", "", text_clean).strip() # URL형태의 주소 제거
    text_clean = re.sub(r"\s+", " ", text_clean).strip() # 두개이상의 연속된 공백을 제거
    text_clean = text_clean.replace("•", " ") # 슬래쉬 기호 제거
    text_clean = text_clean.replace("/", " ") # 슬래쉬 기호 제거
    return text_clean

In [12]:
# text_clean process
jd_data["total_clean"] = jd_data["total"].map(lambda text : clean_text(text))

In [13]:
df_train = jd_data[['total_clean', 'category']]
df_train.head()

Unnamed: 0,total_clean,category
0,고객 경험 CX 전략 매니저 신입 인턴 시간 일 운영되는 로켓펀치와 집무실 고객...,0
1,"집무실 IoT 개발자 집무실 인프라 구축: IoT 전원 관리 설계 및 구성, Q...",1
2,"공간 인테리어 디자이너 - 공간 아이덴티티 : 숙박시설, 공용공간, 객실 등 공간의...",2
3,Python 웹 어플리케이션 개발자 로켓펀치 웹 서비스 개발 웹 개발 업무 ...,1
4,여행 숙박 콘텐츠 마케터 투숙고객 대상 브랜딩-프로모션-예약-투숙-후기를 아우르...,3


In [14]:
X_train, X_val, y_train, y_val = train_test_split(df_train.index.values, 
                                                  df_train.category.values, 
                                                  test_size=0.15, 
                                                  random_state=42, 
                                                  stratify=df_train.category.values)

df_train['data_type'] = ['not_set']*df_train.shape[0]

df_train.loc[X_train, 'data_type'] = 'train'
df_train.loc[X_val, 'data_type'] = 'val'

df_train.groupby(['category', 'data_type']).count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


Unnamed: 0_level_0,Unnamed: 1_level_0,total_clean
category,data_type,Unnamed: 2_level_1
0,train,1529
0,val,270
1,train,2906
1,val,513
2,train,895
2,val,158
3,train,1157
3,val,204


In [15]:
MAX_LENGTH = 256
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05

In [16]:
MODEL_NAME = 'distilbert-base-multilingual-cased'

In [17]:
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME, do_lower_case=True )

In [18]:
encoded_data_train = tokenizer.batch_encode_plus(
    df_train[df_train.data_type=='train'].total_clean.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    df_train[df_train.data_type=='val'].total_clean.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)


input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df_train[df_train.data_type=='train'].category.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df_train[df_train.data_type=='val'].category.values)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [19]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-multilingual-cased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['pre_classifier.weight', 'pre_cla

In [20]:
batch_size = 3

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

In [21]:
optimizer = AdamW(model.parameters(),
                  lr=1e-5, 
                  eps=1e-8)
                  
epochs = 5

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

In [22]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    print(f'label_dict_inverse: {label_dict_inverse}')
    
#     preds_test = np.argmax(preds, axis=0)w
#     print(f'preds: {preds_test}')www
    preds_flat = np.argmax(preds, axis=1).flatten()
#     print(f'preds_flat: {preds_flat}')
    labels_flat = labels.flatten()
#     print(f'labels_flat: {labels_flat}')

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
#         print(f'Class: {y_preds}')
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [23]:
device = "cuda:0"
model = model.to(device)

In [24]:
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals


In [25]:
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(), f'finetuned_BERT_epoch_{epoch}.pt')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

  0%|                                                                                            | 0/5 [00:00<?, ?it/s]
Epoch 1:   0%|                                                                                | 0/2163 [00:00<?, ?it/s][A
Epoch 1:   0%|                                                           | 0/2163 [00:00<?, ?it/s, training_loss=0.436][A
Epoch 1:   0%|                                                   | 1/2163 [00:00<28:35,  1.26it/s, training_loss=0.436][A
Epoch 1:   0%|                                                   | 1/2163 [00:00<28:35,  1.26it/s, training_loss=0.448][A
Epoch 1:   0%|                                                   | 2/2163 [00:00<16:04,  2.24it/s, training_loss=0.448][A
Epoch 1:   0%|                                                   | 2/2163 [00:01<16:04,  2.24it/s, training_loss=0.432][A
Epoch 1:   0%|                                                   | 3/2163 [00:01<11:58,  3.00it/s, training_loss=0.432][A
Epoch 1:   0%|     


Epoch 1
Training loss: 0.7593340957233742


 20%|████████████████▌                                                                  | 1/5 [07:26<29:45, 446.28s/it]

Validation loss: 0.6598660746291864
F1 Score (Weighted): 0.7885505941312129



Epoch 2:   0%|                                                                                | 0/2163 [00:00<?, ?it/s][A
Epoch 2:   0%|                                                           | 0/2163 [00:00<?, ?it/s, training_loss=0.537][A
Epoch 2:   0%|                                                   | 1/2163 [00:00<07:10,  5.03it/s, training_loss=0.537][A
Epoch 2:   0%|                                                   | 1/2163 [00:00<07:10,  5.03it/s, training_loss=0.069][A
Epoch 2:   0%|                                                   | 2/2163 [00:00<07:08,  5.04it/s, training_loss=0.069][A
Epoch 2:   0%|                                                   | 2/2163 [00:00<07:08,  5.04it/s, training_loss=0.829][A
Epoch 2:   0%|                                                   | 3/2163 [00:00<07:10,  5.02it/s, training_loss=0.829][A
Epoch 2:   0%|                                                   | 3/2163 [00:00<07:10,  5.02it/s, training_loss=0.006][A
Epoch 2:   0%| 


Epoch 2
Training loss: 0.5810916088767578


 40%|█████████████████████████████████▏                                                 | 2/5 [14:50<22:15, 445.11s/it]

Validation loss: 0.6909982910242933
F1 Score (Weighted): 0.7963468312002876



Epoch 3:   0%|                                                                                | 0/2163 [00:00<?, ?it/s][A
Epoch 3:   0%|                                                           | 0/2163 [00:00<?, ?it/s, training_loss=0.142][A
Epoch 3:   0%|                                                   | 1/2163 [00:00<07:11,  5.02it/s, training_loss=0.142][A
Epoch 3:   0%|                                                   | 1/2163 [00:00<07:11,  5.02it/s, training_loss=0.004][A
Epoch 3:   0%|                                                   | 2/2163 [00:00<07:00,  5.14it/s, training_loss=0.004][A
Epoch 3:   0%|                                                   | 2/2163 [00:00<07:00,  5.14it/s, training_loss=0.093][A
Epoch 3:   0%|                                                   | 3/2163 [00:00<07:04,  5.09it/s, training_loss=0.093][A
Epoch 3:   0%|                                                   | 3/2163 [00:00<07:04,  5.09it/s, training_loss=0.574][A
Epoch 3:   0%| 


Epoch 3
Training loss: 0.4909268686808047


 60%|█████████████████████████████████████████████████▊                                 | 3/5 [22:14<14:48, 444.40s/it]

Validation loss: 0.6554167190522775
F1 Score (Weighted): 0.8035043885790801



Epoch 4:   0%|                                                                                | 0/2163 [00:00<?, ?it/s][A
Epoch 4:   0%|                                                           | 0/2163 [00:00<?, ?it/s, training_loss=0.500][A
Epoch 4:   0%|                                                   | 1/2163 [00:00<07:10,  5.02it/s, training_loss=0.500][A
Epoch 4:   0%|                                                   | 1/2163 [00:00<07:10,  5.02it/s, training_loss=0.356][A
Epoch 4:   0%|                                                   | 2/2163 [00:00<07:10,  5.02it/s, training_loss=0.356][A
Epoch 4:   0%|                                                   | 2/2163 [00:00<07:10,  5.02it/s, training_loss=0.002][A
Epoch 4:   0%|                                                   | 3/2163 [00:00<07:03,  5.10it/s, training_loss=0.002][A
Epoch 4:   0%|                                                   | 3/2163 [00:00<07:03,  5.10it/s, training_loss=0.099][A
Epoch 4:   0%| 


Epoch 4
Training loss: 0.4190523163264052


 80%|██████████████████████████████████████████████████████████████████▍                | 4/5 [29:36<07:23, 443.67s/it]

Validation loss: 0.6909795755360697
F1 Score (Weighted): 0.8164110873797517



Epoch 5:   0%|                                                                                | 0/2163 [00:00<?, ?it/s][A
Epoch 5:   0%|                                                           | 0/2163 [00:00<?, ?it/s, training_loss=0.001][A
Epoch 5:   0%|                                                   | 1/2163 [00:00<06:53,  5.23it/s, training_loss=0.001][A
Epoch 5:   0%|                                                   | 1/2163 [00:00<06:53,  5.23it/s, training_loss=0.000][A
Epoch 5:   0%|                                                   | 2/2163 [00:00<06:54,  5.21it/s, training_loss=0.000][A
Epoch 5:   0%|                                                   | 2/2163 [00:00<06:54,  5.21it/s, training_loss=0.000][A
Epoch 5:   0%|                                                   | 3/2163 [00:00<06:55,  5.20it/s, training_loss=0.000][A
Epoch 5:   0%|                                                   | 3/2163 [00:00<06:55,  5.20it/s, training_loss=0.001][A
Epoch 5:   0%| 


Epoch 5
Training loss: 0.3605781163620422


100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [36:58<00:00, 443.71s/it]

Validation loss: 0.7166140007874678
F1 Score (Weighted): 0.8131687857607165





# Loading and Evaluating the Model

In [26]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-multilingual-cased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.to(device)

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['pre_classifier.weight', 'pre_cla

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
      

In [27]:
model.load_state_dict(torch.load('finetuned_BERT_epoch_1.pt', map_location=torch.device('cpu')))

<All keys matched successfully>

In [28]:
_, predictions, true_vals = evaluate(dataloader_validation)

In [29]:
accuracy_per_class(predictions, true_vals)

label_dict_inverse: {0: '경영, 비즈니스', 1: '개발', 2: '디자인', 3: '마케팅, 광고'}
Class: 경영, 비즈니스
Accuracy: 197/270

Class: 개발
Accuracy: 487/513

Class: 디자인
Accuracy: 94/158

Class: 마케팅, 광고
Accuracy: 129/204



# test

In [30]:
test_text =['''
머신러닝 응용 엔지니어 
• 머신러닝 모델 을 이용한 서비스, REST API 개발 및 제공 (Python)
• 머신러닝 인프라 설계 및 운영
• 원티드의 기업문화와 잘 맞는 분 (https://www.wantedlab.team/our-story)
• REST API에 대한 설계 및 개발 경험과 이해
• 클라우드(AWS, GCP, Azure등등)를 이용한 서비스 구축, 운영경험
• Tensorflow 또는 Pytorch 모델 서빙 경험
• MySQL 호환 DB 사용경험
• 머신러닝에 대한 이해
• 추천시스템 구축 경험
• 데이터 파이프라인 구축 경험
• 웹 대시보드 개발 경험
• 서버 외 다른 플랫폼 개발에 대한 오픈 마인드
''']


In [31]:
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME, do_lower_case=True )

# model.load_state_dict(torch.load('finetuned_BERT_epoch_1.pt', map_location=torch.device('cpu')))
model.load_state_dict(torch.load('finetuned_BERT_epoch_1.pt', map_location=torch.device('cpu')))

<All keys matched successfully>

In [32]:
torch.device("cuda" if torch.cuda.is_available() else "cpu")

device(type='cuda')

In [33]:
encoded_inputs = tokenizer.batch_encode_plus(
    test_text,
    max_length=250,
    padding=True,
    truncation=True,
    return_token_type_ids=False,
)

In [34]:
input_ids = torch.tensor(encoded_inputs["input_ids"])
attention_masks = torch.tensor(encoded_inputs["attention_mask"])

In [35]:
def evaluate1(dataloader_val):
    model.eval()
    predictions, probs = [], []
    
    for batch in dataloader_val:
        batch = tuple(b.to("cuda") for b in batch)
#         batch = tuple(b for b in batch)
        inputs = {"input_ids": batch[0], "attention_mask": batch[1]}

        with torch.no_grad():
            outputs = model(**inputs)

        logits = outputs[0]
        predictions.append(logits.detach().cpu().numpy())
#         predictions.append(logits.detach().numpy())
        softmax = torch.nn.Softmax(dim=1)
        probs.append(softmax(logits).detach().cpu().numpy())
#         probs.append(softmax(logits).detach().numpy())

    predictions = np.concatenate(predictions, axis=0)
    probs = np.concatenate(probs, axis=0)
    return predictions, probs

In [36]:
tensor_ds = TensorDataset(input_ids, attention_masks)
data_loader = DataLoader(
    tensor_ds, sampler=SequentialSampler(tensor_ds), batch_size=1
)
classes, probs = evaluate1(data_loader)

In [37]:
class_mapping = {
        0: '경영, 비즈니스',
        1: "개발",
        2: "디자인",
        3: "마케팅, 광고",
    }

# {'경영, 비즈니스': 0, '개발': 1, '디자인': 2, '마케팅, 광고': 3}

In [38]:
results = []


for predicted_class, prob in zip(classes, probs):
#     print(np.argmax(predicted_class))
#     print(float(np.max(prob)))
    predicted_class = class_mapping[int(np.argmax(predicted_class))]
    confidence = float(np.max(prob))
    results.append(
        {"class": predicted_class, "confidence": confidence,}
    )
#     client.report_insights({"model_confidence": confidence})

print(results)

[{'class': '개발', 'confidence': 0.9987719655036926}]
