In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
data = pd.read_csv('../data/raw_postpro.csv', encoding = 'cp949')
# 컬럼 삭제
df = data.drop(['청구서번호','No.',  '선박입고','완료 여부','리드타임_음수제거','청구량','견적','견적수량','견적화폐','견적단가','발주번호','발주','발주수량','발주금액','미입고 기간','리드타임','창고입고','창고입고수량','입고창고','창고출고','창고출고수량','출고선박','출고운반선','선박입고','선박입고수량','완료 여부'], axis=1)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20517 entries, 0 to 20516
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Subject      20517 non-null  object
 1   Machinery    20517 non-null  object
 2   Assembly     20517 non-null  object
 3   청구품목         20517 non-null  object
 4   Part No.1    20517 non-null  object
 5   Part No.2    20517 non-null  object
 6   key1         20517 non-null  object
 7   key2         20517 non-null  object
 8   발주처          20517 non-null  object
 9   D/T          20517 non-null  object
 10  Control No.  20517 non-null  object
dtypes: object(11)
memory usage: 1.7+ MB


In [4]:
df = df[['청구품목','발주처','key2']]

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20517 entries, 0 to 20516
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   청구품목    20517 non-null  object
 1   발주처     20517 non-null  object
 2   key2    20517 non-null  object
dtypes: object(3)
memory usage: 481.0+ KB


In [6]:
from sklearn import preprocessing
label_encoders = {}  # 각 열에 대한 LabelEncoder를 저장하기 위한 딕셔너리
columns_to_encode = ['key2']  # 인코딩을 수행할 열의 이름 리스트

for column in columns_to_encode:
    le = preprocessing.LabelEncoder()
    le.fit(df[column])
    label_encoders[column] = le # 딕셔너리에 저장
    df[column+"_encoded"] = le.transform(df[column]) # 새로운 encoding 된 컬럼 추가

In [9]:
df = df.drop(['key2'], axis=1)

In [80]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20517 entries, 0 to 20516
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   청구품목          20517 non-null  object
 1   발주처           20517 non-null  object
 2   key2_encoded  20517 non-null  int32 
dtypes: int32(1), object(2)
memory usage: 400.8+ KB


In [11]:
df_desc = df[[ '청구품목', '발주처']].apply(lambda row: ' '.join(row), axis=1)

In [12]:
df_desc.head()

0      SEAL-O-RING-STOR HAEIN Coporation_Cheonan
1    OIL COOLER & LINES HAEIN Coporation_Cheonan
2                WASHER HAEIN Coporation_Cheonan
3        BOLT-HIGH TEMP HAEIN Coporation_Cheonan
4                  SEAL HAEIN Coporation_Cheonan
dtype: object

In [13]:
vocab_list = []
for words in df_desc.str.split():
  if isinstance(words, list):
    #print(type(words), words)
    vocab_list = vocab_list + words
len(vocab_list)

97928

In [14]:
desc = df_desc.to_frame()

In [16]:
desc.columns = ['origin']

In [17]:
desc.head()

Unnamed: 0,origin
0,SEAL-O-RING-STOR HAEIN Coporation_Cheonan
1,OIL COOLER & LINES HAEIN Coporation_Cheonan
2,WASHER HAEIN Coporation_Cheonan
3,BOLT-HIGH TEMP HAEIN Coporation_Cheonan
4,SEAL HAEIN Coporation_Cheonan


In [18]:
desc['split_desc'] = desc.origin.str.split()
desc

Unnamed: 0,origin,split_desc
0,SEAL-O-RING-STOR HAEIN Coporation_Cheonan,"[SEAL-O-RING-STOR, HAEIN, Coporation_Cheonan]"
1,OIL COOLER & LINES HAEIN Coporation_Cheonan,"[OIL, COOLER, &, LINES, HAEIN, Coporation_Cheo..."
2,WASHER HAEIN Coporation_Cheonan,"[WASHER, HAEIN, Coporation_Cheonan]"
3,BOLT-HIGH TEMP HAEIN Coporation_Cheonan,"[BOLT-HIGH, TEMP, HAEIN, Coporation_Cheonan]"
4,SEAL HAEIN Coporation_Cheonan,"[SEAL, HAEIN, Coporation_Cheonan]"
...,...,...
20512,HYD. CYLINDER_NET BOAT DAVIT MARINE HYDROTEC C...,"[HYD., CYLINDER_NET, BOAT, DAVIT, MARINE, HYDR..."
20513,HYD. CYLINDER_N.D CRANE_BOOM_LIFTER MARINE HYD...,"[HYD., CYLINDER_N.D, CRANE_BOOM_LIFTER, MARINE..."
20514,"PULLY PORT RELIEF ENGINEERING CO.,LTD.","[PULLY, PORT, RELIEF, ENGINEERING, CO.,LTD.]"
20515,"CYLINDER PORT RELIEF ENGINEERING CO.,LTD.","[CYLINDER, PORT, RELIEF, ENGINEERING, CO.,LTD.]"


In [19]:
pre_desc = desc.dropna(subset=['split_desc'])

In [20]:
pre_desc

Unnamed: 0,origin,split_desc
0,SEAL-O-RING-STOR HAEIN Coporation_Cheonan,"[SEAL-O-RING-STOR, HAEIN, Coporation_Cheonan]"
1,OIL COOLER & LINES HAEIN Coporation_Cheonan,"[OIL, COOLER, &, LINES, HAEIN, Coporation_Cheo..."
2,WASHER HAEIN Coporation_Cheonan,"[WASHER, HAEIN, Coporation_Cheonan]"
3,BOLT-HIGH TEMP HAEIN Coporation_Cheonan,"[BOLT-HIGH, TEMP, HAEIN, Coporation_Cheonan]"
4,SEAL HAEIN Coporation_Cheonan,"[SEAL, HAEIN, Coporation_Cheonan]"
...,...,...
20512,HYD. CYLINDER_NET BOAT DAVIT MARINE HYDROTEC C...,"[HYD., CYLINDER_NET, BOAT, DAVIT, MARINE, HYDR..."
20513,HYD. CYLINDER_N.D CRANE_BOOM_LIFTER MARINE HYD...,"[HYD., CYLINDER_N.D, CRANE_BOOM_LIFTER, MARINE..."
20514,"PULLY PORT RELIEF ENGINEERING CO.,LTD.","[PULLY, PORT, RELIEF, ENGINEERING, CO.,LTD.]"
20515,"CYLINDER PORT RELIEF ENGINEERING CO.,LTD.","[CYLINDER, PORT, RELIEF, ENGINEERING, CO.,LTD.]"


In [21]:
# 학습에 사용할 단어장
tokenizer = tf.keras.preprocessing.text.Tokenizer()

# Updates internal vocabulary based on a list of sequences.
tokenizer.fit_on_texts(pre_desc['split_desc'])

In [22]:
# 단어장을 이용해 문장을 숫자로 변경
encoded_text=tokenizer.texts_to_sequences(pre_desc['split_desc'])
pre_desc['encode_desc'] = tokenizer.texts_to_sequences(pre_desc['split_desc'])

In [23]:
len_list = pre_desc['encode_desc'].apply(lambda x: len(x))
max(len_list), sum(len_list)/len(len_list)

(21, 4.7730174976848465)

In [24]:
top_words = 1000
max_len = 10
vector_len = 30

In [25]:
# OOV: Out-Of-Vocabulary
# 사전에서 빈도수 높은 단어 1,000 개만 사용
# 사전에 없는 단어 <OOV> 처리
tokenizer2 = tf.keras.preprocessing.text.Tokenizer(num_words=top_words, oov_token='<OOV>')
tokenizer2.fit_on_texts(pre_desc['split_desc'])
# print('tokenizer2.index_docs', tokenizer2.index_docs)
# print('tokenizer2.index_word', tokenizer2.index_word)
# print('tokenizer2.word_index', tokenizer2.word_index)
pre_desc['encode_desc_1000'] = tokenizer2.texts_to_sequences(pre_desc['split_desc'])

In [26]:
pre_desc.head(10)

Unnamed: 0,origin,split_desc,encode_desc,encode_desc_1000
0,SEAL-O-RING-STOR HAEIN Coporation_Cheonan,"[SEAL-O-RING-STOR, HAEIN, Coporation_Cheonan]","[101, 1, 2]","[102, 2, 3]"
1,OIL COOLER & LINES HAEIN Coporation_Cheonan,"[OIL, COOLER, &, LINES, HAEIN, Coporation_Cheo...","[16, 99, 63, 126, 1, 2]","[17, 100, 64, 127, 2, 3]"
2,WASHER HAEIN Coporation_Cheonan,"[WASHER, HAEIN, Coporation_Cheonan]","[53, 1, 2]","[54, 2, 3]"
3,BOLT-HIGH TEMP HAEIN Coporation_Cheonan,"[BOLT-HIGH, TEMP, HAEIN, Coporation_Cheonan]","[309, 182, 1, 2]","[310, 183, 2, 3]"
4,SEAL HAEIN Coporation_Cheonan,"[SEAL, HAEIN, Coporation_Cheonan]","[14, 1, 2]","[15, 2, 3]"
5,CORE CHARGES FOR CYLINDER PACK AS HAEIN Copora...,"[CORE, CHARGES, FOR, CYLINDER, PACK, AS, HAEIN...","[28, 46, 24, 33, 92, 55, 1, 2]","[29, 47, 25, 34, 93, 56, 2, 3]"
6,PUMP GP-F TFR-REMAN HAEIN Coporation_Cheonan,"[PUMP, GP-F, TFR-REMAN, HAEIN, Coporation_Cheo...","[18, 242, 1099, 1, 2]","[19, 243, 1, 2, 3]"
7,GEAR-WTR PUMP DR HAEIN Coporation_Cheonan,"[GEAR-WTR, PUMP, DR, HAEIN, Coporation_Cheonan]","[685, 18, 246, 1, 2]","[686, 19, 247, 2, 3]"
8,GEAR-WTR PUMP DR HAEIN Coporation_Cheonan,"[GEAR-WTR, PUMP, DR, HAEIN, Coporation_Cheonan]","[685, 18, 246, 1, 2]","[686, 19, 247, 2, 3]"
9,GEAR-WTR PUMP DR HAEIN Coporation_Cheonan,"[GEAR-WTR, PUMP, DR, HAEIN, Coporation_Cheonan]","[685, 18, 246, 1, 2]","[686, 19, 247, 2, 3]"


In [27]:
# 문장의 길이가 다르므로 일치
encode_desc_padding = tf.keras.preprocessing.sequence.pad_sequences(pre_desc['encode_desc_1000'], maxlen=max_len)

In [28]:
encode_desc_padding.shape

(20517, 10)

In [29]:
encode_desc_padding[:5]

array([[  0,   0,   0,   0,   0,   0,   0, 102,   2,   3],
       [  0,   0,   0,   0,  17, 100,  64, 127,   2,   3],
       [  0,   0,   0,   0,   0,   0,   0,  54,   2,   3],
       [  0,   0,   0,   0,   0,   0, 310, 183,   2,   3],
       [  0,   0,   0,   0,   0,   0,   0,  15,   2,   3]])

In [30]:
embedding = tf.keras.layers.Embedding(input_dim =top_words, output_dim=vector_len)(encode_desc_padding)
embedding.shape

TensorShape([20517, 10, 30])

## 임베딩 데이터 셋

In [38]:
batch_size = 5
def embedding_dataset(data, shuffle=True, batch_size=32):
  ds = tf.data.Dataset.from_tensor_slices({'desc': data})
  if shuffle:
    ds = ds.shuffle(buffer_size=len(data))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(batch_size)
  return ds

In [39]:
embedding_dataset = embedding_dataset(encode_desc_padding, batch_size=batch_size)

In [40]:
for batch in embedding_dataset.take(1):
  print(batch)

{'desc': <tf.Tensor: shape=(5, 10), dtype=int32, numpy=
array([[  0,   0,   0,   0,   0,   0,   1,  35,   4,   5],
       [  0,   0,   0, 246,  17,   1,  70, 115, 325,  40],
       [  0,   0,   0,   0,   0,  32,  36,  37,   9,  38],
       [  0,   0,   0,   0,   0,   0,  22,   1,   1, 112],
       [  0,   0,   0,   0,   0, 529,  11,  10,   9,   8]])>}


In [41]:
embedding = tf.keras.layers.Embedding(input_dim =top_words, output_dim=vector_len)

In [42]:
embedding_layer = lambda feature: embedding(feature)

In [44]:
embedding_layer(encode_desc_padding).shape

TensorShape([20517, 10, 30])

In [45]:
embedding_dataset.element_spec['desc'].shape[1]

10

In [47]:
embedding_col = tf.keras.Input(shape=(), name='desc', dtype='int64')

In [49]:
def get_text_encoding_layer(name, dataset, top_words, vector_len):
   # Prepare a `tf.data.Dataset` that only yields the feature.
  feature_ds = dataset.map(lambda x: x[name])
  embedding = tf.keras.layers.Embedding(input_dim =top_words, output_dim=vector_len)
  return lambda feature: embedding(feature)

In [50]:
embedding_layer = get_text_encoding_layer('desc', embedding_dataset, top_words, vector_len)

In [51]:
embedding_layer(embedding_col)

<KerasTensor: shape=(None, 30) dtype=float32 (created by layer 'embedding_2')>

In [46]:
all_inputs = []
encoded_features = []

In [52]:
# text features.
for header in ['desc']:
  embedding_col = tf.keras.Input(shape=(), name='desc', dtype='int64')
  embedding_layer = get_text_encoding_layer('desc', embedding_dataset, top_words, vector_len)
  encoded_embedding_col = embedding_layer(embedding_col)
  all_inputs.append(embedding_col)
  encoded_features.append(encoded_embedding_col)

In [63]:
for d in all_inputs:
  print(type(d), d.shape)

<class 'keras.engine.keras_tensor.KerasTensor'> (None,)


In [62]:
for d in encoded_features:
  print(type(d), d.shape)

<class 'keras.engine.keras_tensor.KerasTensor'> (None, 30)


In [77]:
from keras.layers import Input, Embedding, Dense
all_features = tf.keras.layers.concatenate(encoded_features)
x = tf.keras.layers.Dense(64, activation="relu")(all_features)
# x = tf.keras.layers.Dropout(0.5)(x)
# output = tf.keras.layers.Dense(1)(x)
outputs = Dense(61, activation='softmax', name="Prediction_output")(x)

model2 = tf.keras.Model(all_inputs, outputs)

model2.summary()

Model: "model_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 desc (InputLayer)           [(None,)]                 0         
                                                                 
 embedding_3 (Embedding)     (None, 30)                30000     
                                                                 
 concatenate_12 (Concatenate  (None, 30)               0         
 )                                                               
                                                                 
 dense_12 (Dense)            (None, 64)                1984      
                                                                 
 Prediction_output (Dense)   (None, 61)                3965      
                                                                 
Total params: 35,949
Trainable params: 35,949
Non-trainable params: 0
______________________________________________________

In [78]:
model2.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
                       optimizer=tf.keras.optimizers.Adam(),
                       metrics=['accuracy'])

In [81]:
df2 = pd.DataFrame()
df2['desc'] = pre_desc['origin']
df2['y'] = df['key2_encoded']

In [83]:
y = df2['y'].values

In [84]:
y

array([18, 18, 18, ...,  5, 20, 20])

In [85]:
X = df2['desc']

In [89]:
# 데이터를 학습용과 검증용으로 나누기
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42,shuffle=True)

In [90]:
# 딥러닝 모델을 만들기 위해 categorical 함수 사용
from tensorflow.keras.utils import to_categorical
y_train_categorical = to_categorical(y_train, num_classes=61)
y_test_categorical = to_categorical(y_test, num_classes=61)

In [93]:
X_train

10862     O-RING, 1AG45.0 PORT RELIEF ENGINEERING CO.,LTD.
2088     SEAL..O-RING-CLAMP PLATE TO PIPE FLANGE MATSUI...
14882                                   BUSHING K.TH MARCO
981                     MCKISSICK 419 SNATCH BLOCK (주)고려기업
6473           CLIP, HOSE PORT RELIEF ENGINEERING CO.,LTD.
                               ...                        
11284         STUD M10X75 PORT RELIEF ENGINEERING CO.,LTD.
11964             SENSOR GP-PRESS HAEIN Coporation_Cheonan
5390                           #2 COMPRESSION RING (주)우림공사
860                                  DIVIDER PLATE (주)선진종합
15795                PLATE-SPACER HAEIN Coporation_Cheonan
Name: desc, Length: 14361, dtype: object

In [97]:
# train set
train_features = X_train.values
train_labels = y_train_categorical
# features= {
#    'desc': X_train.values,
# }
# labels = y_train_categorical

# tf.data.Dataset 생성
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))

In [98]:
model2.fit(train_dataset, epochs=10, batch_size=32)

Epoch 1/10


ValueError: in user code:

    File "D:\ShipSupplies\DA\venv\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "D:\ShipSupplies\DA\venv\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "D:\ShipSupplies\DA\venv\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "D:\ShipSupplies\DA\venv\lib\site-packages\keras\engine\training.py", line 993, in train_step
        y_pred = self(x, training=True)
    File "D:\ShipSupplies\DA\venv\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "D:\ShipSupplies\DA\venv\lib\site-packages\keras\engine\input_spec.py", line 250, in assert_input_compatibility
        raise ValueError(

    ValueError: Exception encountered when calling layer "model_10" "                 f"(type Functional).
    
    Input 0 of layer "dense_12" is incompatible with the layer: expected min_ndim=2, found ndim=1. Full shape received: (30,)
    
    Call arguments received by layer "model_10" "                 f"(type Functional):
      • inputs=tf.Tensor(shape=(), dtype=string)
      • training=True
      • mask=None


In [95]:
# test set
predict_features = {
   'desc': X_test.values,
}

In [96]:
# Apply the preprocessing in tf.data.Dataset.map.
dataset = tf.data.Dataset.from_tensor_slices((features, labels)).batch(1)
dataset = dataset.map(lambda x, y: (preprocessing_model(x), y),
                      num_parallel_calls=tf.data.AUTOTUNE)
# Display a preprocessed input sample.
next(dataset.take(1).as_numpy_iterator())

NameError: in user code:

    File "C:\Users\SW\AppData\Local\Temp\ipykernel_5500\1429949455.py", line 3, in None  *
        lambda x, y: (preprocessing_model(x), y)

    NameError: name 'preprocessing_model' is not defined


In [79]:
pre_desc.head(10)

Unnamed: 0,origin,split_desc,encode_desc,encode_desc_1000
0,SEAL-O-RING-STOR HAEIN Coporation_Cheonan,"[SEAL-O-RING-STOR, HAEIN, Coporation_Cheonan]","[101, 1, 2]","[102, 2, 3]"
1,OIL COOLER & LINES HAEIN Coporation_Cheonan,"[OIL, COOLER, &, LINES, HAEIN, Coporation_Cheo...","[16, 99, 63, 126, 1, 2]","[17, 100, 64, 127, 2, 3]"
2,WASHER HAEIN Coporation_Cheonan,"[WASHER, HAEIN, Coporation_Cheonan]","[53, 1, 2]","[54, 2, 3]"
3,BOLT-HIGH TEMP HAEIN Coporation_Cheonan,"[BOLT-HIGH, TEMP, HAEIN, Coporation_Cheonan]","[309, 182, 1, 2]","[310, 183, 2, 3]"
4,SEAL HAEIN Coporation_Cheonan,"[SEAL, HAEIN, Coporation_Cheonan]","[14, 1, 2]","[15, 2, 3]"
5,CORE CHARGES FOR CYLINDER PACK AS HAEIN Copora...,"[CORE, CHARGES, FOR, CYLINDER, PACK, AS, HAEIN...","[28, 46, 24, 33, 92, 55, 1, 2]","[29, 47, 25, 34, 93, 56, 2, 3]"
6,PUMP GP-F TFR-REMAN HAEIN Coporation_Cheonan,"[PUMP, GP-F, TFR-REMAN, HAEIN, Coporation_Cheo...","[18, 242, 1099, 1, 2]","[19, 243, 1, 2, 3]"
7,GEAR-WTR PUMP DR HAEIN Coporation_Cheonan,"[GEAR-WTR, PUMP, DR, HAEIN, Coporation_Cheonan]","[685, 18, 246, 1, 2]","[686, 19, 247, 2, 3]"
8,GEAR-WTR PUMP DR HAEIN Coporation_Cheonan,"[GEAR-WTR, PUMP, DR, HAEIN, Coporation_Cheonan]","[685, 18, 246, 1, 2]","[686, 19, 247, 2, 3]"
9,GEAR-WTR PUMP DR HAEIN Coporation_Cheonan,"[GEAR-WTR, PUMP, DR, HAEIN, Coporation_Cheonan]","[685, 18, 246, 1, 2]","[686, 19, 247, 2, 3]"


In [None]:
# 입력 데이터와 레이블 데이터 준비
train_features = ...
train_labels = ...

# tf.data.Dataset 생성
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))

In [32]:
df.head()

Unnamed: 0,청구품목,발주처,key2_encoded
0,SEAL-O-RING-STOR,HAEIN Coporation_Cheonan,18
1,OIL COOLER & LINES,HAEIN Coporation_Cheonan,18
2,WASHER,HAEIN Coporation_Cheonan,18
3,BOLT-HIGH TEMP,HAEIN Coporation_Cheonan,18
4,SEAL,HAEIN Coporation_Cheonan,18


In [33]:
df2= df.copy()

In [34]:
df2

Unnamed: 0,청구품목,발주처,key2_encoded
0,SEAL-O-RING-STOR,HAEIN Coporation_Cheonan,18
1,OIL COOLER & LINES,HAEIN Coporation_Cheonan,18
2,WASHER,HAEIN Coporation_Cheonan,18
3,BOLT-HIGH TEMP,HAEIN Coporation_Cheonan,18
4,SEAL,HAEIN Coporation_Cheonan,18
...,...,...,...
20512,HYD. CYLINDER_NET BOAT DAVIT,"MARINE HYDROTEC CO.,LTD.",20
20513,HYD. CYLINDER_N.D CRANE_BOOM_LIFTER,"MARINE HYDROTEC CO.,LTD.",20
20514,PULLY,"PORT RELIEF ENGINEERING CO.,LTD.",5
20515,CYLINDER,"PORT RELIEF ENGINEERING CO.,LTD.",20
