In [1]:
import tensorflow as tf

In [2]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, embedding_dim, num_heads=8,** kwargs):
        super(MultiHeadAttention, self).__init__()
        self.embedding_dim = embedding_dim # d_model
        self.num_heads = num_heads

        assert embedding_dim % self.num_heads == 0

        self.projection_dim = embedding_dim // num_heads
        self.query_dense = tf.keras.layers.Dense(embedding_dim)
        self.key_dense = tf.keras.layers.Dense(embedding_dim)
        self.value_dense = tf.keras.layers.Dense(embedding_dim)
        self.dense = tf.keras.layers.Dense(embedding_dim)

    def scaled_dot_product_attention(self, query, key, value):
        matmul_qk = tf.matmul(query, key, transpose_b=True)
        depth = tf.cast(tf.shape(key)[-1], tf.float32)
        logits = matmul_qk / tf.math.sqrt(depth)
        attention_weights = tf.nn.softmax(logits, axis=-1)
        output = tf.matmul(attention_weights, value)
        return output, attention_weights

    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])
    
    def get_config(self) :
        config = super().get_config().copy()
        config.update({
            'embedding_dim' : self.embedding_dim,
            'num_heads' :self.num_heads,
            'projection_dim' : self.projection_dim,
            'query_dense' :self.query_dense,
            'key_dense' : self.key_dense,
            'value_dense' :self.value_dense,
            'dense' :self.dense,
        })
        return config

    def call(self, inputs):
        # x.shape = [batch_size, seq_len, embedding_dim]
        batch_size = tf.shape(inputs)[0]

        # (batch_size, seq_len, embedding_dim)
        query = self.query_dense(inputs)
        key = self.key_dense(inputs)
        value = self.value_dense(inputs)

        # (batch_size, num_heads, seq_len, projection_dim)
        query = self.split_heads(query, batch_size)  
        key = self.split_heads(key, batch_size)
        value = self.split_heads(value, batch_size)

        scaled_attention, _ = self.scaled_dot_product_attention(query, key, value)
        # (batch_size, seq_len, num_heads, projection_dim)
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  

        # (batch_size, seq_len, embedding_dim)
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.embedding_dim))
        outputs = self.dense(concat_attention)
        return outputs


In [3]:
class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embedding_dim, num_heads, dff, rate=0.1,** kwargs):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadAttention(embedding_dim, num_heads)
        self.ffn = tf.keras.Sequential(
            [tf.keras.layers.Dense(dff, activation="relu"),
             tf.keras.layers.Dense(embedding_dim),]
        )
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        
    def get_config(self) :
        config = super().get_config().copy()
        config.update({
            'att' : self.att,
            'ffn' :self.ffn,
            'layernorm1' : self.layernorm1,
            'layernorm2' :self.layernorm2,
            'dropout1' : self.dropout1,
            'dropout2' :self.dropout2,
        })
        return config

    def call(self, inputs, training):
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [4]:
class TokenAndPositionEmbedding(tf.keras.layers.Layer):
    def __init__(self, max_len, vocab_size, embedding_dim,** kwargs):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.pos_emb = tf.keras.layers.Embedding(max_len, embedding_dim)
    
    def get_config(self) :
        config = super().get_config().copy()
        config.update({
            'token_emb' : self.token_emb,
            'pos_emb' :self.pos_emb,
        })
        return config

    def call(self, x):
        max_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=max_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import time
import tensorflow_datasets as tfds
import tensorflow as tf
from konlpy.tag import Okt
import time
from tqdm import tqdm
from random import sample

converter = Okt()

In [6]:
total_df = pd.read_csv('total_df.csv')
shuffle_df1 = total_df.sample(frac=1).reset_index(drop=True)
shuffle_df2 = total_df.sample(frac=1).reset_index(drop=True)
shuffle_df3 = total_df.sample(frac=1).reset_index(drop=True)

title = total_df['title']
labels = pd.read_csv('label.csv')

sector_dict = dict(zip(list(labels['label']),list(labels['index'])))

sector_label = list(total_df['sector'])
shuffle1_label = list(shuffle_df1['sector'])
shuffle2_label = list(shuffle_df2['sector'])
shuffle3_label = list(shuffle_df3['sector'])
sector_label_num = list(total_df['sector_label'])
shuffle1_label_num = list(shuffle_df1['sector_label'])
shuffle2_label_num = list(shuffle_df2['sector_label'])
shuffle3_label_num = list(shuffle_df3['sector_label'])

In [7]:
sentences = []
label = []
for index,title in tqdm(enumerate(total_df['title'])) : #토큰
    okt_title = converter.pos(title)
    sentences.append(' '.join([tup[0].upper() for tup in okt_title if tup[1] == 'Noun' or tup[1] == 'Alpha']))

for index,title in tqdm(enumerate(shuffle_df1['title'])) : #전체 문장
    okt_title = converter.pos(title)
    token = [tup[0].upper() for tup in okt_title if tup[1] == 'Noun' or tup[1] == 'Alpha']
    sentences.append(' '.join(sample(token,len(token))))
    
for index,title in tqdm(enumerate(shuffle_df2['title'])) : #전체 문장 섞음
    okt_title = converter.pos(title)
    token = [tup[0].upper() for tup in okt_title if tup[1] == 'Noun' or tup[1] == 'Alpha']
    sentences.append(' '.join(sample(token,len(token))))
sentences

3481it [00:04, 777.51it/s] 
3481it [00:01, 2466.63it/s]
3481it [00:01, 2674.63it/s]


['NODE JS BACKEND 개발자',
 '단비 NODE JS REACT TYPESCRIPT 개발자',
 'JAVA 개발 비즈니스 플랫폼',
 'COUPANG PAY PRINCIPAL TECHNICAL PROGRAM MANAGEMENT',
 '핀 테크 파운트 FULL STACK ENGINEER',
 '핀 테크 파운트 프론트엔드 개발자',
 '백엔드 서비스 개발',
 '다짐 백엔드 개발자 이상',
 'PHP MYSQL JAVA 정규 경력 개발자 모집',
 '프론트엔드 개발자 신입 채용',
 '크립 토네이도 서버 개발자',
 '블록 체인 서비스 백엔드 개발자',
 '더 리움 클레이튼 기반 플랫폼 개발자',
 '어드민 풀 스택 개발자',
 'QA 담당자',
 '코리아 웹개발자 앱 개발자 경력 채용',
 '인슈 테크 스타트업 주 웰그램 개발자',
 '프리 윌린 프론트엔드 개발자 주니어',
 'QA QA ENGINEER 매 플랫',
 '프리 윌린 프론트엔드 개발자 시니어',
 '개발 IOS 개발자 B C',
 '프리 윌린 백엔드 개발자 시니어',
 'QA 엔지니어',
 'IOS 개발자',
 'BACKEND DEVELOPER',
 'FRONTEND ENGINEER',
 '사업 TYPESCRIPT BACKEND 엔지니어',
 'FRONT END DEVELOPER ANGULAR',
 '개발 팀 시니어 FULL STACK 개발자 정규직 경력 모집',
 '개발 팀 시니어 FRONT END 개발자 정규직 경력 모집',
 '개발 팀 사내 프로젝트 서버 개발 정규직 경력 모집',
 '프론트엔드 엔지니어 시니어',
 '데이터 엔지니어',
 '백엔드 엔지니어 시니어',
 'IOS 엔지니어',
 '안드로이드 엔지니어',
 '임 포트 개발 기획 동료 지원',
 '차이 카드 페이 QA ENGINEER',
 '차이 카드 페이 DEVOPS ENGINEER',
 '임 포트 SENIOR FRONT END ENGINEER',
 '임 포트 BACK END ENGINEER',
 '임 포트 IOS EN

In [8]:
sector_label = sector_label+shuffle1_label+shuffle2_label

In [9]:
sector_label_num = sector_label_num+shuffle1_label_num+shuffle2_label_num

In [10]:
sector_label_num = np.array(sector_label_num)

In [11]:
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    sentences + sector_label, target_vocab_size=2**13)

In [12]:
START_TOKEN, END_TOKEN = [tokenizer.vocab_size], [tokenizer.vocab_size + 1]
VOCAB_SIZE = tokenizer.vocab_size + 2

print('시작 토큰 번호 :',START_TOKEN)
print('종료 토큰 번호 :',END_TOKEN)
print('단어 집합의 크기 :',VOCAB_SIZE)

시작 토큰 번호 : [2814]
종료 토큰 번호 : [2815]
단어 집합의 크기 : 2816


In [13]:
# 최대 길이를 40으로 정의
MAX_LENGTH = 20

# 토큰화 / 정수 인코딩 / 시작 토큰과 종료 토큰 추가 / 패딩
def tokenize_and_filter(inputs, outputs):
    tokenized_inputs, tokenized_outputs = [], []

    for (sentence1, sentence2) in zip(inputs, outputs):
        # encode(토큰화 + 정수 인코딩), 시작 토큰과 종료 토큰 추가
        sentence1 = START_TOKEN + tokenizer.encode(sentence1) + END_TOKEN
        sentence2 = START_TOKEN + tokenizer.encode(sentence2) + END_TOKEN

        tokenized_inputs.append(sentence1)
        tokenized_outputs.append(sentence2)

        # 패딩
    tokenized_inputs = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_inputs, maxlen=MAX_LENGTH, padding='post')
    tokenized_outputs = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_outputs, maxlen=MAX_LENGTH, padding='post')

    return tokenized_inputs, tokenized_outputs

In [14]:
questions, answers = tokenize_and_filter(sentences, sector_label)

In [15]:
print('질문 데이터의 크기(shape) :', questions.shape)
print('답변 데이터의 크기(shape) :', answers.shape)
print('답변 숫자 데이터의 크기(shape) :', sector_label_num.shape)

질문 데이터의 크기(shape) : (10443, 20)
답변 데이터의 크기(shape) : (10443, 20)
답변 숫자 데이터의 크기(shape) : (10443,)


In [16]:
sector_label_num[:5]

array([ 0,  2, 21, 19,  2])

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_val, Y_train, Y_val = train_test_split(questions,sector_label_num, test_size = 0.1,random_state = 0)
X_train.shape,X_val.shape,Y_train.shape,Y_val.shape

((9398, 20), (1045, 20), (9398,), (1045,))

In [18]:
embedding_dim = 64  # Embedding size for each token
num_heads = 8  # Number of attention heads
dff = 512  # Hidden layer size in feed forward network inside transformer
max_len = MAX_LENGTH
vocab_size = VOCAB_SIZE

inputs = tf.keras.layers.Input(shape=(max_len,))
embedding_layer = TokenAndPositionEmbedding(max_len, vocab_size, embedding_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embedding_dim, num_heads, dff)
x = transformer_block(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x)
x = tf.keras.layers.Dropout(0.1)(x)
x = tf.keras.layers.Dense(20, activation="relu")(x)
x = tf.keras.layers.Dropout(0.1)(x)
outputs = tf.keras.layers.Dense(26, activation="softmax")(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

In [19]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 20)]              0         
_________________________________________________________________
token_and_position_embedding (None, 20, 64)            181504    
_________________________________________________________________
transformer_block (Transform (None, 20, 64)            83008     
_________________________________________________________________
global_average_pooling1d (Gl (None, 64)                0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 20)                1300      
_________________________________________________________________
dropout_3 (Dropout)          (None, 20)                0     

In [20]:
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

model.compile("adam", "sparse_categorical_crossentropy", metrics=["accuracy"])
reduceLROnPlat = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, 
                                   verbose=1, mode='auto', min_delta=0.005, cooldown=5, min_lr=0.0001)

modelCheckpoint = ModelCheckpoint('best_test.h5', monitor='val_loss', verbose=1, save_best_only=True)

callbacks = [reduceLROnPlat, modelCheckpoint]

history = model.fit(X_train, Y_train, 
                    batch_size=64, 
                    epochs=10,
                    validation_data = (X_val, Y_val),
                    callbacks = callbacks)

Epoch 1/10

Epoch 00001: val_loss improved from inf to 1.22456, saving model to best_test.h5
Epoch 2/10

Epoch 00002: val_loss improved from 1.22456 to 0.36845, saving model to best_test.h5
Epoch 3/10

Epoch 00003: val_loss improved from 0.36845 to 0.22498, saving model to best_test.h5
Epoch 4/10

Epoch 00004: val_loss improved from 0.22498 to 0.15130, saving model to best_test.h5
Epoch 5/10

Epoch 00005: val_loss improved from 0.15130 to 0.13384, saving model to best_test.h5
Epoch 6/10

Epoch 00006: val_loss improved from 0.13384 to 0.12154, saving model to best_test.h5
Epoch 7/10

Epoch 00007: val_loss improved from 0.12154 to 0.10075, saving model to best_test.h5
Epoch 8/10

Epoch 00008: val_loss did not improve from 0.10075
Epoch 9/10

Epoch 00009: val_loss did not improve from 0.10075
Epoch 10/10

Epoch 00010: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 00010: val_loss did not improve from 0.10075


In [21]:
def proc_eval(inputs):
    inputs = [inputs]
    tokenized_inputs = []

    for sentence1 in inputs:
        # encode(토큰화 + 정수 인코딩), 시작 토큰과 종료 토큰 추가
        sentence1 = START_TOKEN + tokenizer.encode(sentence1) + END_TOKEN

        tokenized_inputs.append(sentence1)

        # 패딩
    tokenized_inputs = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_inputs, maxlen=MAX_LENGTH, padding='post')

    return tokenized_inputs

In [22]:
def predict(sentence) :
    okt_title = converter.pos(sentence)
    sentence = ' '.join([tup[0].upper() for tup in okt_title if tup[1] == 'Noun' or tup[1] == 'Alpha'])
    pred = model.predict(proc_eval(sentence))
    return list(labels['label'])[np.argmax(pred)]


In [23]:
predict("데이터 프로덕트 매니저")

'Data-engineer'

In [24]:
predict("클라이언트 개발자 (캐주얼 게임)")

'Game'

In [25]:
predict("Tech Lead Manager")

'CTO'

In [26]:
predict("머신러닝/데이터 분석가")

'Embedded/Robotics'

In [27]:
predict("풀스택 엔지니어(2년 이상)")

'WEB/Full-stack'

In [28]:
correct = 0
wrong = 0

for index,title in tqdm(enumerate(total_df['title'])) : 
    output = predict(title)
    if output == sector_label[index] :
        correct += 1
    else :
        wrong +=1
#         print('title :',title)
#         print('true :', sector_label[index])
#         print('pred :',output)
        
print(f'correct : {correct}, wrong : {wrong}')
print(f'real acc : {correct/3481}')

3481it [01:08, 50.58it/s]

correct : 3454, wrong : 27
real acc : 0.9922436081585752





In [29]:
predict("C++ 개발자")

'C#/C++/C'

In [30]:
predict("Exchange Admin Console Developer")

'C#/C++/C'

In [31]:
predict("UX 디자인")

'Web-publisher'

In [32]:
predict("커머스 앱 및 시스템 앱을 위한 React Native 개발자")

'Mobile'

In [33]:
import pickle

# saving
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [34]:
model.save("sector_classifier.h5")

In [35]:
# loading
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [36]:
embedding_dim = 64  # Embedding size for each token
num_heads = 8  # Number of attention heads
dff = 512  # Hidden layer size in feed forward network inside transformer
max_len = MAX_LENGTH
vocab_size = VOCAB_SIZE

inputs = tf.keras.layers.Input(shape=(max_len,))
embedding_layer = TokenAndPositionEmbedding(max_len, vocab_size, embedding_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embedding_dim, num_heads, dff)
x = transformer_block(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x)
x = tf.keras.layers.Dropout(0.1)(x)
x = tf.keras.layers.Dense(20, activation="relu")(x)
x = tf.keras.layers.Dropout(0.1)(x)
outputs = tf.keras.layers.Dense(26, activation="softmax")(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

In [37]:
model.summary()
model.load_weights('sector_classifier.h5')

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 20)]              0         
_________________________________________________________________
token_and_position_embedding (None, 20, 64)            181504    
_________________________________________________________________
transformer_block_1 (Transfo (None, 20, 64)            83008     
_________________________________________________________________
global_average_pooling1d_1 ( (None, 64)                0         
_________________________________________________________________
dropout_6 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_14 (Dense)             (None, 20)                1300      
_________________________________________________________________
dropout_7 (Dropout)          (None, 20)                0   

In [38]:
predict("Exchange Admin Console Developer")

'C#/C++/C'