In [1]:
import torch

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import random
import time
import datetime


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# !pip install transformers

In [3]:
# !pip install torch

In [4]:
# import tensorflow as tf

# # GPU 디바이스 목록 가져오기
# gpu_devices = tf.config.list_physical_devices('GPU')

# if len(gpu_devices) > 0:
#     print("사용 가능한 GPU가 있습니다.")
#     for device in gpu_devices:
#         print("GPU 디바이스 이름:", device.name)
# else:
#     print("사용 가능한 GPU가 없습니다.")


In [5]:
# !pip install bert-for-tf2
# !pip install tensorflow_hub

In [6]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
import bert

In [7]:
data = pd.read_csv('../data/raw_postpro.csv') # 오류가나면 추가해주세요 .encoding = 'cp949'
# 컬럼 삭제
df = data.drop(['청구서번호','No.',  '선박입고','완료 여부','리드타임_음수제거','청구량','견적','견적수량','견적화폐','견적단가','발주번호','발주','발주수량','발주금액','미입고 기간','리드타임','창고입고','창고입고수량','입고창고','창고출고','창고출고수량','출고선박','출고운반선','선박입고','선박입고수량','완료 여부'], axis=1)

In [8]:
delete_list = ['COMPRESSOR', 'SEAT', 'TURBINE', 'LINE', 'ANODES', 'DAMPER', 'CARD', 'BELT', 'ARM', 'SWITCH',
 'CLIP', 'BATTERY', 'ADAPTER', 'TOOL', 'CONTROL', 'BRAKE', 'TRANSFORMER', 'WINCH']
df = df[~df['key2'].isin(delete_list)]

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19367 entries, 0 to 20516
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Subject      19367 non-null  object
 1   Machinery    19367 non-null  object
 2   Assembly     19367 non-null  object
 3   청구품목         19367 non-null  object
 4   Part No.1    19367 non-null  object
 5   Part No.2    19367 non-null  object
 6   key1         19367 non-null  object
 7   key2         19367 non-null  object
 8   발주처          19367 non-null  object
 9   D/T          19367 non-null  object
 10  Control No.  19367 non-null  object
 11  leadtime     19367 non-null  int64 
dtypes: int64(1), object(11)
memory usage: 1.9+ MB


In [10]:
df = df[['청구품목','발주처','Machinery', 'Assembly' , "key1",'key2',"Part No.1", "Part No.2"]]
# 'Machinery', 'Assembly', '청구품목', 'Part No.1', 'Part No.2', 'key1', '발주처'

In [11]:
from sklearn import preprocessing
label_encoders = {}  # 각 열에 대한 LabelEncoder를 저장하기 위한 딕셔너리
columns_to_encode = ['key2']  # 인코딩을 수행할 열의 이름 리스트

for column in columns_to_encode:
    le = preprocessing.LabelEncoder()
    le.fit(df[column])
    label_encoders[column] = le # 딕셔너리에 저장
    df[column+"_encoded"] = le.transform(df[column]) # 새로운 encoding 된 컬럼 추가

In [12]:
df = df.drop(['key2'], axis=1)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19367 entries, 0 to 20516
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   청구품목          19367 non-null  object
 1   발주처           19367 non-null  object
 2   Machinery     19367 non-null  object
 3   Assembly      19367 non-null  object
 4   key1          19367 non-null  object
 5   Part No.1     19367 non-null  object
 6   Part No.2     19367 non-null  object
 7   key2_encoded  19367 non-null  int32 
dtypes: int32(1), object(7)
memory usage: 1.3+ MB


In [14]:
# text = df[[ '청구품목', '발주처','Machinery', 'Assembly' , "key1"]].apply(lambda row: ' '.join(row), axis=1)
# df_text = df[['청구품목', '발주처', 'Machinery', 'Assembly']].apply(lambda row: ' '.join(row), axis=1).to_frame(name='text')
df_text = df[['청구품목', '발주처','Machinery' , "Part No.1", "Part No.2"]].apply(lambda row: ' '.join(row), axis=1).to_frame(name='text')
df_text['key2'] = df['key2_encoded']

In [15]:
df_text[:20]

Unnamed: 0,text,key2
0,SEAL-O-RING-STOR HAEIN Coporation_Cheonan NO.1...,8
1,OIL COOLER & LINES HAEIN Coporation_Cheonan NO...,8
2,WASHER HAEIN Coporation_Cheonan NO.2 GENERATOR...,8
3,BOLT-HIGH TEMP HAEIN Coporation_Cheonan NO.1 G...,8
4,SEAL HAEIN Coporation_Cheonan NO.1 GENERATOR E...,8
5,CORE CHARGES FOR CYLINDER PACK AS HAEIN Copora...,8
6,PUMP GP-F TFR-REMAN HAEIN Coporation_Cheonan N...,8
7,GEAR-WTR PUMP DR HAEIN Coporation_Cheonan NO.1...,8
8,GEAR-WTR PUMP DR HAEIN Coporation_Cheonan NO.3...,8
9,GEAR-WTR PUMP DR HAEIN Coporation_Cheonan NO.3...,8


In [90]:
import re
def preprocess_text(sen):
    sentence = remove_tags(sen)
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)
    sentence = re.sub(r'\s+', ' ', sentence)
    return sentence

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)

en_text = []
sentences = list(df_text['text'])
for sen in sentences:
    en_text.append(preprocess_text(sen))

print(df_text.columns.values)

['text' 'key2']


In [91]:
en_text[0]

'SEAL RING STOR HAEIN Coporation Cheonan NO GENERATOR ENGINE '

In [92]:
y = df_text.key2.values

In [93]:
y[:20]

array([ 8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, 33,  0,  0,  0,
       14, 14, 14])

In [94]:
from tensorflow.keras.utils import to_categorical
y_train_categorical = to_categorical(y, num_classes=43)

In [95]:
BertTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

In [21]:
# !pip uninstall -y tensorflow

In [22]:
# !pip3 install -U "tensorflow==2.11.1"

In [23]:
import tensorflow as tf
print(tf.__version__)
print(tf.__file__)

2.10.0
D:\ShipSupplies\DA\venv\lib\site-packages\tensorflow\__init__.py


In [96]:
def tokenize_text(text):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
tokenized_text = [tokenize_text(en) for en in en_text]

In [97]:
print('문장의 최대 길이 :',max(len(l) for l in tokenized_text))
print('문장의 평균 길이 :',sum(map(len, tokenized_text))/len(tokenized_text))

문장의 최대 길이 : 30
문장의 평균 길이 : 11.870191562967936


In [26]:
# max_len = 30

# tokenized_text = pad_sequences(tokenized_text, maxlen = max_len)
# X_test = pad_sequences(X_test, maxlen = max_len)

In [98]:
en_text[2],tokenized_text[2]

('WASHER HAEIN Coporation Cheonan NO GENERATOR ENGINE ',
 [9378, 2121, 5292, 12377, 8872, 21223, 18178, 7856, 2078, 2053, 13103, 3194])

In [99]:
# reviews_with_len = [[text, y[i], len(text)] # 토큰화된 text, key값, text 길이
#                  for i, text in enumerate(tokenized_text)]
# reviews_with_len[:5]

reviews_with_len = [[text, y_train_categorical[i], len(text)] # 토큰화된 text, key값, text 길이
                 for i, text in enumerate(tokenized_text)]

In [100]:
# 섞고 길이 기준으로 정렬
import random
random.shuffle(reviews_with_len)
reviews_with_len.sort(key=lambda x: x[2])
reviews_with_len[:10]

[[[23365, 3796, 3796],
  array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
  3],
 [[11307, 2364, 3194],
  array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
  3],
 [[7744, 5658, 4049, 3194],
  array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
  4],
 [[7682, 5658, 4049, 3194],
  array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
  4],
 [[10764, 

In [101]:

# sorted_text_labels = [(review_lab[0], review_lab[1]) for review_lab in reviews_with_len]
# processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_text_labels, output_types=(tf.int32, tf.int32))
# BATCH_SIZE = 32
# batched_dataset = processed_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))
# next(iter(batched_dataset))

sorted_text_labels = [(review_lab[0], review_lab[1]) for review_lab in reviews_with_len if review_lab[0] is not None]
processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_text_labels, output_types=(tf.int32, tf.int32))
BATCH_SIZE = 32
padded_shapes = ((None,), (43,))
batched_dataset = processed_dataset.padded_batch(BATCH_SIZE, padded_shapes=padded_shapes)
next(iter(batched_dataset))


(<tf.Tensor: shape=(32, 4), dtype=int32, numpy=
 array([[23365,  3796,  3796,     0],
        [11307,  2364,  3194,     0],
        [ 7744,  5658,  4049,  3194],
        [ 7682,  5658,  4049,  3194],
        [10764, 16215,  8879, 10122],
        [ 5747,  5658,  4049,  3194],
        [ 7744,  5658,  4049,  3194],
        [21290,  5658,  4049,  3194],
        [ 2553,  5658,  4049,  3194],
        [ 5747,  5658,  4049,  3194],
        [27000, 16215,  8879, 10122],
        [10764,  5658,  4049,  3194],
        [ 9093,  5658,  4049,  3194],
        [ 3614,  5658,  4049,  3194],
        [ 7682,  8722,  2663,  2818],
        [ 8667,  5658,  4049,  3194],
        [ 3614,  5658,  4049,  3194],
        [ 9093,  5658,  4049,  3194],
        [ 2586,  5658,  4049,  3194],
        [ 7682,  8722,  2663,  2818],
        [ 3500,  5658,  4049,  3194],
        [ 3500,  5658,  4049,  3194],
        [10053,  5658,  4049,  3194],
        [ 1051,  3614,  2364,  3194],
        [ 7682,  5658,  4049,  3194],
  

In [102]:
import math

TOTAL_BATCHES = math.ceil(len(sorted_text_labels) / BATCH_SIZE)
TEST_BATCHES = TOTAL_BATCHES // 10
batched_dataset.shuffle(TOTAL_BATCHES)
test_data = batched_dataset.take(TEST_BATCHES)
train_data = batched_dataset.skip(TEST_BATCHES)

In [103]:
test_data

<TakeDataset element_spec=(TensorSpec(shape=(None, None), dtype=tf.int32, name=None), TensorSpec(shape=(None, 43), dtype=tf.int32, name=None))>

In [33]:
# for inputs, targets in train_data:
#     print(targets)


In [104]:
TOTAL_BATCHES, TEST_BATCHES

(606, 60)

In [145]:
class TEXT_MODEL(tf.keras.Model):
    def __init__(self,
                 vocabulary_size,
                 embedding_dimensions=128,
                 cnn_filters=50,
                 dnn_units=512,
                 model_output_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="text_model"):
        super(TEXT_MODEL, self).__init__(name=name)
        self.embedding = tf.keras.layers.Embedding(vocabulary_size,
                                          embedding_dimensions)
        self.cnn_layer1 = tf.keras.layers.Conv1D(filters=cnn_filters,
                                        kernel_size=3,
                                        padding="valid",
                                        activation="relu")

        self.lstm = tf.keras.layers.LSTM(128)
        
        self.pool = tf.keras.layers.GlobalMaxPool1D()
        self.dense_1 = tf.keras.layers.Dense(units=dnn_units, activation="relu")
        self.last_dense = tf.keras.layers.Dense(units=model_output_classes,
                                                activation="softmax")
    
    def call(self, inputs, training):
        l = self.embedding(inputs)
        l_1 = self.cnn_layer1(l) 
        l_1 = self.pool(l_1) 

        concatenated = tf.concat([l_1], axis=-1) 
        concatenated = self.dense_1(concatenated)
        lstm_output = self.lstm(tf.expand_dims(concatenated, axis=1))
        
        model_output = self.last_dense(lstm_output)
        return model_output

In [146]:
VOCAB_LENGTH = len(tokenizer.vocab)
EMB_DIM = 100 #200
CNN_FILTERS = 50 #100
DNN_UNITS = 128 #256
OUTPUT_CLASSES = 43
DROPOUT_RATE = 0.1 # 0.2
NB_EPOCHS = 15
VOCAB_LENGTH
# 100 50 128 61 0.1 10 =>0.88

30522

In [147]:
text_model = TEXT_MODEL(vocabulary_size=VOCAB_LENGTH,
                        embedding_dimensions=EMB_DIM,
                        cnn_filters=CNN_FILTERS,
                        dnn_units=DNN_UNITS,
                        model_output_classes=OUTPUT_CLASSES,
                        dropout_rate=DROPOUT_RATE)

In [148]:
if OUTPUT_CLASSES == 2:
    text_model.compile(loss="binary_crossentropy",
                       optimizer="adam",
                       metrics=["accuracy"])
else:
    text_model.compile(loss="categorical_crossentropy",
                       optimizer="adam",
                       metrics=["categorical_accuracy"])

text_model.fit(train_data, epochs=NB_EPOCHS)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x1e2681cd060>

In [149]:
results = text_model.evaluate(test_data)
print("15번 학습 : ", results)

15번 학습 :  [0.34367772936820984, 0.9286458492279053]


## pred 는각각 61개 컬럼 라벨에서의 확률

In [150]:
pred = text_model.predict(test_data)



In [151]:
pred.shape

(1920, 43)

In [60]:
# pred[0]

# predicted_result 50개의 배치(배치 사이즈 : 32) 데이터 마다 예측 라벨링을 리스트에 저장 

In [152]:
predicted_result = []
for j in range(60):
    temp=[]
    for i in range(32) :   
        predicted_class = tf.argmax(pred[i+ j*32]).numpy() ## 가장 높은 확률의 라벨링 데이터를 구함
        temp.append(predicted_class)
    predicted_result.append(temp)
        # print(predicted_class)

In [69]:
# for lis in predicted_result:
    # print(lis)

# true_result : 테스트 데이터에서의 실제 라벨링

In [153]:
true_result =[]
for inputs, targets in test_data.take(60):
    # 첫 번째 데이터 샘플에 대한 입력(inputs)과 라벨(targets)을 확인

    # Convert EagerTensor to numpy array
    targets_numpy = np.array(targets)

    # Convert numpy array to list
    targets_list = targets_numpy.tolist()
    true_result.append(targets_list)
    # print("Targets:", targets_list)


In [76]:
# for lis in true_result:
#     print(lis)

In [154]:
# 원핫 인코딩 사용시에 실행 
for lis in true_result:
    for i in range(32):
        lis[i] = tf.argmax(lis[i]).numpy()

In [82]:
# true_result

In [155]:
from sklearn.metrics import f1_score,classification_report

In [156]:
# Calculate F1 score
f1 = f1_score([item for sublist in true_result for item in sublist],
              [item for sublist in predicted_result for item in sublist], average='macro')
print("F1 score:", f1)

F1 score: 0.8590643353638463


In [157]:
print(classification_report([item for sublist in predicted_result for item in sublist],
                            [item for sublist in true_result for item in sublist]))

              precision    recall  f1-score   support

           0       0.98      0.93      0.95       221
           2       0.90      1.00      0.95        19
           3       1.00      0.95      0.98       104
           4       1.00      0.91      0.95        46
           5       0.94      0.89      0.92        19
           6       1.00      0.67      0.80         3
           7       1.00      1.00      1.00         7
           8       0.59      1.00      0.74        10
           9       0.96      0.96      0.96        48
          10       0.81      0.92      0.86        24
          11       1.00      1.00      1.00        34
          12       1.00      1.00      1.00        27
          13       0.90      0.97      0.93       324
          14       0.95      0.86      0.90        42
          15       0.71      0.94      0.81        34
          16       0.50      1.00      0.67         3
          17       0.00      0.00      0.00         0
          18       0.92    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
