In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [3]:
data = pd.read_csv('../data/raw_postpro.csv', encoding = 'cp949')
# 컬럼 삭제
df = data.drop(['청구서번호','No.',  '선박입고','완료 여부','리드타임_음수제거','청구량','견적','견적수량','견적화폐','견적단가','발주번호','발주','발주수량','발주금액','미입고 기간','리드타임','창고입고','창고입고수량','입고창고','창고출고','창고출고수량','출고선박','출고운반선','선박입고','선박입고수량','완료 여부'], axis=1)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20517 entries, 0 to 20516
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Subject      20517 non-null  object
 1   Machinery    20517 non-null  object
 2   Assembly     20517 non-null  object
 3   청구품목         20517 non-null  object
 4   Part No.1    20517 non-null  object
 5   Part No.2    20517 non-null  object
 6   key1         20517 non-null  object
 7   key2         20517 non-null  object
 8   발주처          20517 non-null  object
 9   D/T          20517 non-null  object
 10  Control No.  20517 non-null  object
dtypes: object(11)
memory usage: 1.7+ MB


In [5]:
df = df[['Machinery', '청구품목', 'Part No.1', 'Part No.2', '발주처','key2']]

In [6]:
from sklearn import preprocessing
label_encoders = {}  # 각 열에 대한 LabelEncoder를 저장하기 위한 딕셔너리
columns_to_encode = ['key2']  # 인코딩을 수행할 열의 이름 리스트

for column in columns_to_encode:
    le = preprocessing.LabelEncoder()
    le.fit(df[column])
    label_encoders[column] = le # 딕셔너리에 저장
    df[column+"_encoded"] = le.transform(df[column]) # 새로운 encoding 된 컬럼 추가

In [7]:
df

Unnamed: 0,Machinery,청구품목,Part No.1,Part No.2,발주처,key2,key2_encoded
0,NO.1 GENERATOR ENGINE,SEAL-O-RING-STOR,7.00E-275,0,HAEIN Coporation_Cheonan,COOLER,18
1,NO.2 GENERATOR ENGINE,OIL COOLER & LINES,7.00E-275,0,HAEIN Coporation_Cheonan,COOLER,18
2,NO.2 GENERATOR ENGINE,WASHER,7.00E-275,0,HAEIN Coporation_Cheonan,COOLER,18
3,NO.1 GENERATOR ENGINE,BOLT-HIGH TEMP,7.00E-275,0,HAEIN Coporation_Cheonan,COOLER,18
4,NO.1 GENERATOR ENGINE,SEAL,7.00E-275,0,HAEIN Coporation_Cheonan,COOLER,18
...,...,...,...,...,...,...,...
20512,HYD. SYSTEM,HYD. CYLINDER_NET BOAT DAVIT,φ150 × φ85 × 695st,FF-SE-1CB150BB695ABD-YP2J,"MARINE HYDROTEC CO.,LTD.",CYLINDER,20
20513,HYD. SYSTEM,HYD. CYLINDER_N.D CRANE_BOOM_LIFTER,φ160 × φ112 × 1130st,S-2CA160A20.6N1130-1,"MARINE HYDROTEC CO.,LTD.",CYLINDER,20
20514,6M NET BOAT,PULLY,Φ305X5V-3,0,"PORT RELIEF ENGINEERING CO.,LTD.",BELT,5
20515,7M NET BOAT,CYLINDER,Φ50 x S195 CYLINDER,0,"PORT RELIEF ENGINEERING CO.,LTD.",CYLINDER,20


In [8]:
df = df.drop(['key2'], axis=1)

In [9]:
df

Unnamed: 0,Machinery,청구품목,Part No.1,Part No.2,발주처,key2_encoded
0,NO.1 GENERATOR ENGINE,SEAL-O-RING-STOR,7.00E-275,0,HAEIN Coporation_Cheonan,18
1,NO.2 GENERATOR ENGINE,OIL COOLER & LINES,7.00E-275,0,HAEIN Coporation_Cheonan,18
2,NO.2 GENERATOR ENGINE,WASHER,7.00E-275,0,HAEIN Coporation_Cheonan,18
3,NO.1 GENERATOR ENGINE,BOLT-HIGH TEMP,7.00E-275,0,HAEIN Coporation_Cheonan,18
4,NO.1 GENERATOR ENGINE,SEAL,7.00E-275,0,HAEIN Coporation_Cheonan,18
...,...,...,...,...,...,...
20512,HYD. SYSTEM,HYD. CYLINDER_NET BOAT DAVIT,φ150 × φ85 × 695st,FF-SE-1CB150BB695ABD-YP2J,"MARINE HYDROTEC CO.,LTD.",20
20513,HYD. SYSTEM,HYD. CYLINDER_N.D CRANE_BOOM_LIFTER,φ160 × φ112 × 1130st,S-2CA160A20.6N1130-1,"MARINE HYDROTEC CO.,LTD.",20
20514,6M NET BOAT,PULLY,Φ305X5V-3,0,"PORT RELIEF ENGINEERING CO.,LTD.",5
20515,7M NET BOAT,CYLINDER,Φ50 x S195 CYLINDER,0,"PORT RELIEF ENGINEERING CO.,LTD.",20


In [145]:
train, val, test = np.split(df.sample(frac=1), [int(0.8*len(df)), int(0.9*len(df))])

In [78]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  df = dataframe.copy()
  labels = df.pop('key2_encoded')
  # df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}
  df = {key: np.expand_dims(value, axis=1) for key, value in df.items()}  # np.expand_dims를 사용하여 NumPy 배열로 변환
  ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(batch_size)
  return ds

In [79]:
batch_size = 5
train_ds = df_to_dataset(train, batch_size=batch_size)

In [146]:
train

Unnamed: 0,Machinery,청구품목,Part No.1,Part No.2,발주처,key2_encoded
15228,POWER BLOCK,VAVLE C.BAL MARCO H-1180,60.229.02000.950,0,K.TH MARCO,57
5721,7M NET BOAT ENGINE,WASHER,123672-21861,0,"PORT RELIEF ENGINEERING CO.,LTD.",58
192,NO.1 MAIN AIR COMPRESSOR,"PRESSURE GAUGE, HP",3773,0,EAST WIND Gmbh,25
254,NO.1 MAIN AIR COMPRESSOR,OIL LEVEL SWITCH 3/ BSP VH 600M,4356,0,EAST WIND Gmbh,51
13824,NO.3 GENERATOR ENGINE,FERRULE,4N-0933,0,HAEIN Coporation_Cheonan,24
...,...,...,...,...,...,...
13506,NO.1 GENERATOR ENGINE,GASKET KIT SINGLE CYLINDER HEAD,466-0684,0,HAEIN Coporation_Cheonan,7
16465,NO.1 GENERATOR ENGINE,GUIDE,76.60.504,0,"SUNJIN ETECH Co.,Ltd.",25
16370,NO.1 GENERATOR ENGINE,"""O"" RING 189,3X5,7",72.16.171,0,"SUNJIN ETECH Co.,Ltd.",40
14469,NO.3 GENERATOR ENGINE,PUMP AS-F PRM,5M-2894,0,HAEIN Coporation_Cheonan,58


In [80]:
a = [(train_features, label_batch)] = train_ds.take(1)
print('Every feature:', list(train_features.keys()))
print('A batch of Machinery:', train_features['Machinery'])
print('A batch of targets:', label_batch )
# a

Every feature: ['Machinery', '청구품목', 'Part No.1', 'Part No.2', '발주처']
A batch of Machinery: tf.Tensor(
[[b'STBD BOOM NO.1 CARGO WINCH ']
 [b'PURSE WINCH']
 [b'HYD. DRIVE ENGINE']
 [b'NO.1 GENERATOR ENGINE']
 [b'PURSE WINCH']], shape=(5, 1), dtype=string)
A batch of targets: tf.Tensor([59 23 30 28  4], shape=(5,), dtype=int32)


# 2개 컬럼 합치고 사전으로 만들기

In [81]:
df_desc = df[[ '청구품목', '발주처']].apply(lambda row: ' '.join(row), axis=1)

In [82]:
df_desc.head()

0      SEAL-O-RING-STOR HAEIN Coporation_Cheonan
1    OIL COOLER & LINES HAEIN Coporation_Cheonan
2                WASHER HAEIN Coporation_Cheonan
3        BOLT-HIGH TEMP HAEIN Coporation_Cheonan
4                  SEAL HAEIN Coporation_Cheonan
dtype: object

In [83]:
# Xdf_desc
df_desc

0                SEAL-O-RING-STOR HAEIN Coporation_Cheonan
1              OIL COOLER & LINES HAEIN Coporation_Cheonan
2                          WASHER HAEIN Coporation_Cheonan
3                  BOLT-HIGH TEMP HAEIN Coporation_Cheonan
4                            SEAL HAEIN Coporation_Cheonan
                               ...                        
20512    HYD. CYLINDER_NET BOAT DAVIT MARINE HYDROTEC C...
20513    HYD. CYLINDER_N.D CRANE_BOOM_LIFTER MARINE HYD...
20514               PULLY PORT RELIEF ENGINEERING CO.,LTD.
20515           CYLINDER  PORT RELIEF ENGINEERING CO.,LTD.
20516    SEAL KIT FOR CYLINDER PORT RELIEF ENGINEERING ...
Length: 20517, dtype: object

In [84]:
vocab_list = []
for words in df_desc.str.split():
  if isinstance(words, list):
    #print(type(words), words)
    vocab_list = vocab_list + words
len(vocab_list)

97928

In [85]:
df_desc.head()

0      SEAL-O-RING-STOR HAEIN Coporation_Cheonan
1    OIL COOLER & LINES HAEIN Coporation_Cheonan
2                WASHER HAEIN Coporation_Cheonan
3        BOLT-HIGH TEMP HAEIN Coporation_Cheonan
4                  SEAL HAEIN Coporation_Cheonan
dtype: object

In [86]:
desc = df_desc.to_frame()

In [87]:
desc.columns = ['origin']

In [88]:
desc.head()

Unnamed: 0,origin
0,SEAL-O-RING-STOR HAEIN Coporation_Cheonan
1,OIL COOLER & LINES HAEIN Coporation_Cheonan
2,WASHER HAEIN Coporation_Cheonan
3,BOLT-HIGH TEMP HAEIN Coporation_Cheonan
4,SEAL HAEIN Coporation_Cheonan


In [89]:
desc['split_desc'] = desc.origin.str.split()
desc

Unnamed: 0,origin,split_desc
0,SEAL-O-RING-STOR HAEIN Coporation_Cheonan,"[SEAL-O-RING-STOR, HAEIN, Coporation_Cheonan]"
1,OIL COOLER & LINES HAEIN Coporation_Cheonan,"[OIL, COOLER, &, LINES, HAEIN, Coporation_Cheo..."
2,WASHER HAEIN Coporation_Cheonan,"[WASHER, HAEIN, Coporation_Cheonan]"
3,BOLT-HIGH TEMP HAEIN Coporation_Cheonan,"[BOLT-HIGH, TEMP, HAEIN, Coporation_Cheonan]"
4,SEAL HAEIN Coporation_Cheonan,"[SEAL, HAEIN, Coporation_Cheonan]"
...,...,...
20512,HYD. CYLINDER_NET BOAT DAVIT MARINE HYDROTEC C...,"[HYD., CYLINDER_NET, BOAT, DAVIT, MARINE, HYDR..."
20513,HYD. CYLINDER_N.D CRANE_BOOM_LIFTER MARINE HYD...,"[HYD., CYLINDER_N.D, CRANE_BOOM_LIFTER, MARINE..."
20514,"PULLY PORT RELIEF ENGINEERING CO.,LTD.","[PULLY, PORT, RELIEF, ENGINEERING, CO.,LTD.]"
20515,"CYLINDER PORT RELIEF ENGINEERING CO.,LTD.","[CYLINDER, PORT, RELIEF, ENGINEERING, CO.,LTD.]"


In [90]:
pre_desc = desc.dropna(subset=['split_desc'])

In [91]:
pre_desc

Unnamed: 0,origin,split_desc
0,SEAL-O-RING-STOR HAEIN Coporation_Cheonan,"[SEAL-O-RING-STOR, HAEIN, Coporation_Cheonan]"
1,OIL COOLER & LINES HAEIN Coporation_Cheonan,"[OIL, COOLER, &, LINES, HAEIN, Coporation_Cheo..."
2,WASHER HAEIN Coporation_Cheonan,"[WASHER, HAEIN, Coporation_Cheonan]"
3,BOLT-HIGH TEMP HAEIN Coporation_Cheonan,"[BOLT-HIGH, TEMP, HAEIN, Coporation_Cheonan]"
4,SEAL HAEIN Coporation_Cheonan,"[SEAL, HAEIN, Coporation_Cheonan]"
...,...,...
20512,HYD. CYLINDER_NET BOAT DAVIT MARINE HYDROTEC C...,"[HYD., CYLINDER_NET, BOAT, DAVIT, MARINE, HYDR..."
20513,HYD. CYLINDER_N.D CRANE_BOOM_LIFTER MARINE HYD...,"[HYD., CYLINDER_N.D, CRANE_BOOM_LIFTER, MARINE..."
20514,"PULLY PORT RELIEF ENGINEERING CO.,LTD.","[PULLY, PORT, RELIEF, ENGINEERING, CO.,LTD.]"
20515,"CYLINDER PORT RELIEF ENGINEERING CO.,LTD.","[CYLINDER, PORT, RELIEF, ENGINEERING, CO.,LTD.]"


In [92]:
# 학습에 사용할 단어장
tokenizer = tf.keras.preprocessing.text.Tokenizer()

# Updates internal vocabulary based on a list of sequences.
tokenizer.fit_on_texts(pre_desc['split_desc'])
# print('tokenizer.index_docs', tokenizer.index_docs)
# print('tokenizer.index_word', tokenizer.index_word)
# print('tokenizer.word_index', tokenizer.word_index)
# print('tokenizer.word_docs', tokenizer.word_docs)
# print('tokenizer.word_counts', tokenizer.word_counts)
# print('tokenizer.word_counts', sorted(tokenizer.word_counts.items(), key=lambda x: x[1], reverse=True))

In [93]:
# 단어장을 이용해 문장을 숫자로 변경
encoded_text=tokenizer.texts_to_sequences(pre_desc['split_desc'])
pre_desc['encode_desc'] = tokenizer.texts_to_sequences(pre_desc['split_desc'])

In [94]:
pre_desc.head(10)

Unnamed: 0,origin,split_desc,encode_desc
0,SEAL-O-RING-STOR HAEIN Coporation_Cheonan,"[SEAL-O-RING-STOR, HAEIN, Coporation_Cheonan]","[101, 1, 2]"
1,OIL COOLER & LINES HAEIN Coporation_Cheonan,"[OIL, COOLER, &, LINES, HAEIN, Coporation_Cheo...","[16, 99, 63, 126, 1, 2]"
2,WASHER HAEIN Coporation_Cheonan,"[WASHER, HAEIN, Coporation_Cheonan]","[53, 1, 2]"
3,BOLT-HIGH TEMP HAEIN Coporation_Cheonan,"[BOLT-HIGH, TEMP, HAEIN, Coporation_Cheonan]","[309, 182, 1, 2]"
4,SEAL HAEIN Coporation_Cheonan,"[SEAL, HAEIN, Coporation_Cheonan]","[14, 1, 2]"
5,CORE CHARGES FOR CYLINDER PACK AS HAEIN Copora...,"[CORE, CHARGES, FOR, CYLINDER, PACK, AS, HAEIN...","[28, 46, 24, 33, 92, 55, 1, 2]"
6,PUMP GP-F TFR-REMAN HAEIN Coporation_Cheonan,"[PUMP, GP-F, TFR-REMAN, HAEIN, Coporation_Cheo...","[18, 242, 1099, 1, 2]"
7,GEAR-WTR PUMP DR HAEIN Coporation_Cheonan,"[GEAR-WTR, PUMP, DR, HAEIN, Coporation_Cheonan]","[685, 18, 246, 1, 2]"
8,GEAR-WTR PUMP DR HAEIN Coporation_Cheonan,"[GEAR-WTR, PUMP, DR, HAEIN, Coporation_Cheonan]","[685, 18, 246, 1, 2]"
9,GEAR-WTR PUMP DR HAEIN Coporation_Cheonan,"[GEAR-WTR, PUMP, DR, HAEIN, Coporation_Cheonan]","[685, 18, 246, 1, 2]"


In [95]:
len_list = pre_desc['encode_desc'].apply(lambda x: len(x))
max(len_list), sum(len_list)/len(len_list)

(21, 4.7730174976848465)

In [96]:
top_words = 1000
max_len = 10
vector_len = 30

In [97]:
# OOV: Out-Of-Vocabulary
# 사전에서 빈도수 높은 단어 1,000 개만 사용
# 사전에 없는 단어 <OOV> 처리
tokenizer2 = tf.keras.preprocessing.text.Tokenizer(num_words=top_words, oov_token='<OOV>')
tokenizer2.fit_on_texts(pre_desc['split_desc'])
# print('tokenizer2.index_docs', tokenizer2.index_docs)
# print('tokenizer2.index_word', tokenizer2.index_word)
# print('tokenizer2.word_index', tokenizer2.word_index)
pre_desc['encode_desc_1000'] = tokenizer2.texts_to_sequences(pre_desc['split_desc'])

In [98]:
pre_desc.head(10)

Unnamed: 0,origin,split_desc,encode_desc,encode_desc_1000
0,SEAL-O-RING-STOR HAEIN Coporation_Cheonan,"[SEAL-O-RING-STOR, HAEIN, Coporation_Cheonan]","[101, 1, 2]","[102, 2, 3]"
1,OIL COOLER & LINES HAEIN Coporation_Cheonan,"[OIL, COOLER, &, LINES, HAEIN, Coporation_Cheo...","[16, 99, 63, 126, 1, 2]","[17, 100, 64, 127, 2, 3]"
2,WASHER HAEIN Coporation_Cheonan,"[WASHER, HAEIN, Coporation_Cheonan]","[53, 1, 2]","[54, 2, 3]"
3,BOLT-HIGH TEMP HAEIN Coporation_Cheonan,"[BOLT-HIGH, TEMP, HAEIN, Coporation_Cheonan]","[309, 182, 1, 2]","[310, 183, 2, 3]"
4,SEAL HAEIN Coporation_Cheonan,"[SEAL, HAEIN, Coporation_Cheonan]","[14, 1, 2]","[15, 2, 3]"
5,CORE CHARGES FOR CYLINDER PACK AS HAEIN Copora...,"[CORE, CHARGES, FOR, CYLINDER, PACK, AS, HAEIN...","[28, 46, 24, 33, 92, 55, 1, 2]","[29, 47, 25, 34, 93, 56, 2, 3]"
6,PUMP GP-F TFR-REMAN HAEIN Coporation_Cheonan,"[PUMP, GP-F, TFR-REMAN, HAEIN, Coporation_Cheo...","[18, 242, 1099, 1, 2]","[19, 243, 1, 2, 3]"
7,GEAR-WTR PUMP DR HAEIN Coporation_Cheonan,"[GEAR-WTR, PUMP, DR, HAEIN, Coporation_Cheonan]","[685, 18, 246, 1, 2]","[686, 19, 247, 2, 3]"
8,GEAR-WTR PUMP DR HAEIN Coporation_Cheonan,"[GEAR-WTR, PUMP, DR, HAEIN, Coporation_Cheonan]","[685, 18, 246, 1, 2]","[686, 19, 247, 2, 3]"
9,GEAR-WTR PUMP DR HAEIN Coporation_Cheonan,"[GEAR-WTR, PUMP, DR, HAEIN, Coporation_Cheonan]","[685, 18, 246, 1, 2]","[686, 19, 247, 2, 3]"


In [99]:
# 문장의 길이가 다르므로 일치
encode_desc_padding = tf.keras.preprocessing.sequence.pad_sequences(pre_desc['encode_desc_1000'], maxlen=max_len)

In [100]:
encode_desc_padding.shape

(20517, 10)

In [101]:
encode_desc_padding[:5]

array([[  0,   0,   0,   0,   0,   0,   0, 102,   2,   3],
       [  0,   0,   0,   0,  17, 100,  64, 127,   2,   3],
       [  0,   0,   0,   0,   0,   0,   0,  54,   2,   3],
       [  0,   0,   0,   0,   0,   0, 310, 183,   2,   3],
       [  0,   0,   0,   0,   0,   0,   0,  15,   2,   3]])

In [102]:
embedding = tf.keras.layers.Embedding(input_dim =top_words, output_dim=vector_len)(encode_desc_padding)
embedding.shape

TensorShape([20517, 10, 30])

## Make a Dataset with Embedding

In [152]:
df['desc'] = pre_desc['origin']

In [153]:
df

Unnamed: 0,Machinery,청구품목,Part No.1,Part No.2,발주처,key2_encoded,desc
0,NO.1 GENERATOR ENGINE,SEAL-O-RING-STOR,7.00E-275,0,HAEIN Coporation_Cheonan,18,SEAL-O-RING-STOR HAEIN Coporation_Cheonan
1,NO.2 GENERATOR ENGINE,OIL COOLER & LINES,7.00E-275,0,HAEIN Coporation_Cheonan,18,OIL COOLER & LINES HAEIN Coporation_Cheonan
2,NO.2 GENERATOR ENGINE,WASHER,7.00E-275,0,HAEIN Coporation_Cheonan,18,WASHER HAEIN Coporation_Cheonan
3,NO.1 GENERATOR ENGINE,BOLT-HIGH TEMP,7.00E-275,0,HAEIN Coporation_Cheonan,18,BOLT-HIGH TEMP HAEIN Coporation_Cheonan
4,NO.1 GENERATOR ENGINE,SEAL,7.00E-275,0,HAEIN Coporation_Cheonan,18,SEAL HAEIN Coporation_Cheonan
...,...,...,...,...,...,...,...
20512,HYD. SYSTEM,HYD. CYLINDER_NET BOAT DAVIT,φ150 × φ85 × 695st,FF-SE-1CB150BB695ABD-YP2J,"MARINE HYDROTEC CO.,LTD.",20,HYD. CYLINDER_NET BOAT DAVIT MARINE HYDROTEC C...
20513,HYD. SYSTEM,HYD. CYLINDER_N.D CRANE_BOOM_LIFTER,φ160 × φ112 × 1130st,S-2CA160A20.6N1130-1,"MARINE HYDROTEC CO.,LTD.",20,HYD. CYLINDER_N.D CRANE_BOOM_LIFTER MARINE HYD...
20514,6M NET BOAT,PULLY,Φ305X5V-3,0,"PORT RELIEF ENGINEERING CO.,LTD.",5,"PULLY PORT RELIEF ENGINEERING CO.,LTD."
20515,7M NET BOAT,CYLINDER,Φ50 x S195 CYLINDER,0,"PORT RELIEF ENGINEERING CO.,LTD.",20,"CYLINDER PORT RELIEF ENGINEERING CO.,LTD."


In [156]:
df = df.drop(['청구품목','발주처'] , axis=1)

KeyError: "['청구품목', '발주처'] not found in axis"

In [157]:
df

Unnamed: 0,청구서번호,No.,Subject,Machinery,Assembly,Part No.1,Part No.2,key1,key2,리드타임_음수제거,...,창고입고수량,Control No.,입고창고,창고출고,창고출고수량,출고선박,출고운반선,선박입고,선박입고수량,완료 여부
0,ELS-BS-ESP-2004002,12,NO.1 A/E C18 MAJOR O/H PARTS,NO.1 GENERATOR ENGINE,323-6480 LINES GP-FUEL,7.00E-275,0,7.00E-275,COOLER,194,...,1,혜인 발주,BS,2020-09-19,1,BLO,본선 선적,0,0,0
1,OCA-BS-ESP-2010004,5,운전시간 대비 #2 AUX ENG TOP END O/H PARTS,NO.2 GENERATOR ENGINE,GASKET KIT,7.00E-275,0,7.00E-275,COOLER,59,...,1,혜인 발주,BS,2020-09-18,1,JBG,본선 선적,0,0,0
2,MIR-BS-ESP-2011002,119,운전시간 대비 #2 AUX ENG(CAT 3512C) MAJOR O/H PARTS,NO.2 GENERATOR ENGINE,285-8374 MANIFOLD GP-EXH,7.00E-275,0,7.00E-275,COOLER,61,...,1,0,BS,2020-12-12,1,BON,SANWA FONTAINE,0,0,0
3,BLO-BS-ESP-2003002,16,#1 AUX ENG TOP END O/H PARTS,NO.1 GENERATOR ENGINE,159-8828 TURBO GP,7.00E-275,0,7.00E-275,COOLER,212,...,1,혜인 발주,BS,2021-06-16,1,MIR,체항수리 시 사용,0,0,0
4,DEO-BS-ESP-2105008,97,DEO-E-210512-01,NO.1 GENERATOR ENGINE,2N4727 INSTRUMNT PANEL GP,7.00E-275,0,7.00E-275,COOLER,220,...,1,0,BS,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20512,BLO-BS-ESP-2006006,1,"BLO-F-ENGINE-200515-03 BOAT DAVIT CYL 외, AIRCO...",HYD. SYSTEM,5-753-2E_BOAT DAVIT & N.D CRANE LINE,φ150 × φ85 × 695st,FF-SE-1CB150BB695ABD-YP2J,φ150 × φ85 × 695stFF-SE-1CB150BB695ABD-YP2J,CYLINDER,95,...,2,20-09-161,BW,0,0,0,0,0,0,0
20513,BLO-BS-ESP-2004008,1,BLO-F-ENGINE-200403-01 (NAVI DECK CRANE DERRIN...,HYD. SYSTEM,5-753-2E_BOAT DAVIT & N.D CRANE LINE,φ160 × φ112 × 1130st,S-2CA160A20.6N1130-1,φ160 × φ112 × 1130stS-2CA160A20.6N1130-1,CYLINDER,67,...,1,20-07-114,BW,2020-08-26,1,BLO,BLO 상가시 선적,0,0,0
20514,ELS-BS-ESP-1908002,2,ELS-F-ENG-190628-02,6M NET BOAT,ELECTRIC CLUTCH,Φ305X5V-3,0,Φ305X5V-3,BELT,25,...,1,19-08-164,BW,2019-09-04,1,ELS,ONENESS,2019-10-02,1,완료
20515,BLO-BS-ESP-1906014,2,BLO-F-ENGINE-190617-01 (NET BOAT PARTS),7M NET BOAT,HAND HYDRAULIC STEERING SYSTEM,Φ50 x S195 CYLINDER,0,Φ50 x S195 CYLINDER,CYLINDER,25,...,1,19-07-134,BW,2019-08-20,1,BLO,세이셀 컨테이너,2019-10-12,1,완료


In [104]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  df = dataframe.copy()
  labels = df.pop('key2_encoded')
  # df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}
  df = {key: np.expand_dims(value, axis=1) for key, value in df.items()}  # np.expand_dims를 사용하여 NumPy 배열로 변환
  ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(batch_size)
  return ds

In [105]:
batch_size = 5
all_dataset = df_to_dataset(df, batch_size=batch_size)

In [106]:
[(train_features, label_batch)] = all_dataset.take(1)
print('Every feature:', list(train_features.keys()))
print('A batch of Machinery:', train_features['청구품목'])
print('A batch of targets:', label_batch )

Every feature: ['Machinery', '청구품목', 'Part No.1', 'Part No.2', '발주처']
A batch of Machinery: tf.Tensor(
[[b'FLOAT SWITCH']
 [b'VALVE, SHUTTLE 1/4" BSP']
 [b'STERN TUBE OIL']
 [b'O-RING']
 [b'METAL, MAIN']], shape=(5, 1), dtype=string)
A batch of targets: tf.Tensor([51 57 34 40 44], shape=(5,), dtype=int32)


In [150]:
# all_dataset

In [107]:
all_inputs = []
encoded_features = []

In [108]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
  # Create a layer that turns strings into integer indices.
  if dtype == 'string':
    index = layers.StringLookup(max_tokens=max_tokens)
  # Otherwise, create a layer that turns integer values into integer indices.
  else:
    index = layers.IntegerLookup(max_tokens=max_tokens)

  # Prepare a `tf.data.Dataset` that only yields the feature.
  feature_ds = dataset.map(lambda x, y: x[name])

  # Learn the set of possible values and assign them a fixed integer index.
  index.adapt(feature_ds)

  # Encode the integer indices.
  encoder = layers.CategoryEncoding(num_tokens=index.vocabulary_size())

  # Apply multi-hot encoding to the indices. The lambda function captures the
  # layer, so you can use them, or include them in the Keras Functional model later.
  return lambda feature: encoder(index(feature))

In [109]:
categorical_cols = ['Machinery', 'Part No.1', 'Part No.2']

for header in categorical_cols:
  categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='string')
  encoding_layer = get_category_encoding_layer(name=header,
                                               dataset=all_dataset,
                                               dtype='string')
  encoded_categorical_col = encoding_layer(categorical_col)
  all_inputs.append(categorical_col)
  encoded_features.append(encoded_categorical_col)

In [110]:
for d in all_inputs:
  print(type(d), d.shape)

<class 'keras.engine.keras_tensor.KerasTensor'> (None, 1)
<class 'keras.engine.keras_tensor.KerasTensor'> (None, 1)
<class 'keras.engine.keras_tensor.KerasTensor'> (None, 1)


In [111]:
for d in encoded_features:
  print(type(d), d.shape)

<class 'keras.engine.keras_tensor.KerasTensor'> (None, 175)
<class 'keras.engine.keras_tensor.KerasTensor'> (None, 8129)
<class 'keras.engine.keras_tensor.KerasTensor'> (None, 1284)


In [112]:
type(encode_desc_padding)

numpy.ndarray

In [113]:
def df_to_dataset(data, shuffle=True, batch_size=32):
  ds = tf.data.Dataset.from_tensor_slices({'desc': data})
  if shuffle:
    ds = ds.shuffle(buffer_size=len(data))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(batch_size)
  return ds

In [114]:
embedding_dataset = df_to_dataset(encode_desc_padding, batch_size=batch_size)

In [115]:
for batch in embedding_dataset.take(1):
  print(batch)

{'desc': <tf.Tensor: shape=(5, 10), dtype=int32, numpy=
array([[  0,   0,   0,   0,   0,   0, 252,  43,   4,   5],
       [  0,   0,   0,   0,   0,   0,   0,  18,   7,   6],
       [  0,   0,   0,   0,   0,   0,   1, 526,   4,   5],
       [  0,   0,   0,   0,   0,   0,   0,   0,  32,  40],
       [  0,   0,   0,   0,   0,   0,   0,  15,   2,   3]])>}


In [116]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
  # Create a layer that turns strings into integer indices.
  if dtype == 'string':
    index = layers.StringLookup(max_tokens=max_tokens)
  # Otherwise, create a layer that turns integer values into integer indices.
  else:
    index = layers.IntegerLookup(max_tokens=max_tokens)

  # Prepare a `tf.data.Dataset` that only yields the feature.
  feature_ds = dataset.map(lambda x, y: x[name])

  # Learn the set of possible values and assign them a fixed integer index.
  index.adapt(feature_ds)

  # Encode the integer indices.
  encoder = layers.CategoryEncoding(num_tokens=index.vocabulary_size())

  # Apply multi-hot encoding to the indices. The lambda function captures the
  # layer, so you can use them, or include them in the Keras Functional model later.
  return lambda feature: encoder(index(feature))

In [117]:
for col in categorical_cols:
  test_type_col = df[col]
  test_type_layer = get_category_encoding_layer(name=col,
                                                dataset=all_dataset,
                                                dtype='string')
  print(test_type_layer(test_type_col))

tf.Tensor(
[0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1.], shape=(175,), dtype=float32)
tf.Tensor([0. 1. 1. ... 1. 1. 1.], shape=(8129,), dtype=float32)
tf.Tensor([0. 1. 1. ... 1. 1. 1.], shape=(1284,), dtype=float32)


In [118]:
for d in all_inputs:
  print(type(d), d.shape)

<class 'keras.engine.keras_tensor.KerasTensor'> (None, 1)
<class 'keras.engine.keras_tensor.KerasTensor'> (None, 1)
<class 'keras.engine.keras_tensor.KerasTensor'> (None, 1)


In [119]:
for d in encoded_features:
  print(type(d), d.shape)

<class 'keras.engine.keras_tensor.KerasTensor'> (None, 175)
<class 'keras.engine.keras_tensor.KerasTensor'> (None, 8129)
<class 'keras.engine.keras_tensor.KerasTensor'> (None, 1284)


In [120]:
def get_text_encoding_layer(name, dataset, top_words, vector_len):
   # Prepare a `tf.data.Dataset` that only yields the feature.
  feature_ds = dataset.map(lambda x: x[name])
  embedding = tf.keras.layers.Embedding(input_dim =top_words, output_dim=vector_len)
  return lambda feature: embedding(feature)

In [121]:
embedding = tf.keras.layers.Embedding(input_dim =top_words, output_dim=vector_len)

In [122]:
embedding_layer = lambda feature: embedding(feature)

In [123]:
embedding_layer(encode_desc_padding).shape

TensorShape([20517, 10, 30])

In [124]:
embedding_dataset.element_spec['desc'].shape[1]

10

In [125]:
embedding_col = tf.keras.Input(shape=(), name='desc', dtype='int64')

In [126]:
embedding_layer = get_text_encoding_layer('desc', embedding_dataset, top_words, vector_len)

In [127]:
embedding_layer(embedding_col)

<KerasTensor: shape=(None, 30) dtype=float32 (created by layer 'embedding_3')>

In [128]:
# text features.
for header in ['desc']:
  embedding_col = tf.keras.Input(shape=(), name='desc', dtype='int64')
  embedding_layer = get_text_encoding_layer('desc', embedding_dataset, top_words, vector_len)
  encoded_embedding_col = embedding_layer(embedding_col)
  all_inputs.append(embedding_col)
  encoded_features.append(encoded_embedding_col)

In [129]:
for d in all_inputs:
  print(type(d), d.shape)

<class 'keras.engine.keras_tensor.KerasTensor'> (None, 1)
<class 'keras.engine.keras_tensor.KerasTensor'> (None, 1)
<class 'keras.engine.keras_tensor.KerasTensor'> (None, 1)
<class 'keras.engine.keras_tensor.KerasTensor'> (None,)


In [130]:
for d in encoded_features:
  print(type(d), d.shape)

<class 'keras.engine.keras_tensor.KerasTensor'> (None, 175)
<class 'keras.engine.keras_tensor.KerasTensor'> (None, 8129)
<class 'keras.engine.keras_tensor.KerasTensor'> (None, 1284)
<class 'keras.engine.keras_tensor.KerasTensor'> (None, 30)


In [133]:
all_features = tf.keras.layers.concatenate(encoded_features)
x = tf.keras.layers.Dense(32, activation="relu")(all_features)
x = tf.keras.layers.Dropout(0.5)(x)
output = tf.keras.layers.Dense(1)(x)

model = tf.keras.Model(all_inputs, output)

In [134]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Machinery (InputLayer)         [(None, 1)]          0           []                               
                                                                                                  
 Part No.1 (InputLayer)         [(None, 1)]          0           []                               
                                                                                                  
 Part No.2 (InputLayer)         [(None, 1)]          0           []                               
                                                                                                  
 string_lookup_9 (StringLookup)  (None, 1)           0           ['Machinery[0][0]']              
                                                                                              

In [135]:
tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [136]:
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
                       optimizer=tf.keras.optimizers.Adam(),
                       metrics=['accuracy'])

In [143]:
train_ds

<PrefetchDataset element_spec=({'Machinery': TensorSpec(shape=(None, 1), dtype=tf.string, name=None), '청구품목': TensorSpec(shape=(None, 1), dtype=tf.string, name=None), 'Part No.1': TensorSpec(shape=(None, 1), dtype=tf.string, name=None), 'Part No.2': TensorSpec(shape=(None, 1), dtype=tf.string, name=None), '발주처': TensorSpec(shape=(None, 1), dtype=tf.string, name=None)}, TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

In [151]:
model.fit(all_dataset, epochs= 10, batch_size=32)

Epoch 1/10


ValueError: in user code:

    File "D:\ShipSupplies\DA\venv\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "D:\ShipSupplies\DA\venv\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "D:\ShipSupplies\DA\venv\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "D:\ShipSupplies\DA\venv\lib\site-packages\keras\engine\training.py", line 993, in train_step
        y_pred = self(x, training=True)
    File "D:\ShipSupplies\DA\venv\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "D:\ShipSupplies\DA\venv\lib\site-packages\keras\engine\input_spec.py", line 197, in assert_input_compatibility
        raise ValueError(

    ValueError: Missing data for input "desc". You passed a data dictionary with keys ['Machinery', '청구품목', 'Part No.1', 'Part No.2', '발주처']. Expected the following keys: ['Machinery', 'Part No.1', 'Part No.2', 'desc']


In [142]:
train.dtypes

Machinery       object
청구품목            object
Part No.1       object
Part No.2       object
발주처             object
key2_encoded     int32
dtype: object