In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [177]:
data = pd.read_csv('../data/raw_postpro.csv', encoding = 'cp949')
# 컬럼 삭제
df = data.drop(['청구서번호','No.',  '선박입고','완료 여부','리드타임_음수제거','청구량','견적','견적수량','견적화폐','견적단가','발주번호','발주','발주수량','발주금액','미입고 기간','리드타임','창고입고','창고입고수량','입고창고','창고출고','창고출고수량','출고선박','출고운반선','선박입고','선박입고수량','완료 여부'], axis=1)

In [178]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20517 entries, 0 to 20516
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Subject      20517 non-null  object
 1   Machinery    20517 non-null  object
 2   Assembly     20517 non-null  object
 3   청구품목         20517 non-null  object
 4   Part No.1    20517 non-null  object
 5   Part No.2    20517 non-null  object
 6   key1         20517 non-null  object
 7   key2         20517 non-null  object
 8   발주처          20517 non-null  object
 9   D/T          20517 non-null  object
 10  Control No.  20517 non-null  object
dtypes: object(11)
memory usage: 1.7+ MB


In [179]:
df = df[['Machinery', '청구품목', 'Part No.1', 'Part No.2', '발주처','key2']]

In [180]:
from sklearn import preprocessing
label_encoders = {}  # 각 열에 대한 LabelEncoder를 저장하기 위한 딕셔너리
columns_to_encode = ['key2']  # 인코딩을 수행할 열의 이름 리스트

for column in columns_to_encode:
    le = preprocessing.LabelEncoder()
    le.fit(df[column])
    label_encoders[column] = le # 딕셔너리에 저장
    df[column+"_encoded"] = le.transform(df[column]) # 새로운 encoding 된 컬럼 추가

In [181]:
df

Unnamed: 0,Machinery,청구품목,Part No.1,Part No.2,발주처,key2,key2_encoded
0,NO.1 GENERATOR ENGINE,SEAL-O-RING-STOR,7.00E-275,0,HAEIN Coporation_Cheonan,COOLER,18
1,NO.2 GENERATOR ENGINE,OIL COOLER & LINES,7.00E-275,0,HAEIN Coporation_Cheonan,COOLER,18
2,NO.2 GENERATOR ENGINE,WASHER,7.00E-275,0,HAEIN Coporation_Cheonan,COOLER,18
3,NO.1 GENERATOR ENGINE,BOLT-HIGH TEMP,7.00E-275,0,HAEIN Coporation_Cheonan,COOLER,18
4,NO.1 GENERATOR ENGINE,SEAL,7.00E-275,0,HAEIN Coporation_Cheonan,COOLER,18
...,...,...,...,...,...,...,...
20512,HYD. SYSTEM,HYD. CYLINDER_NET BOAT DAVIT,φ150 × φ85 × 695st,FF-SE-1CB150BB695ABD-YP2J,"MARINE HYDROTEC CO.,LTD.",CYLINDER,20
20513,HYD. SYSTEM,HYD. CYLINDER_N.D CRANE_BOOM_LIFTER,φ160 × φ112 × 1130st,S-2CA160A20.6N1130-1,"MARINE HYDROTEC CO.,LTD.",CYLINDER,20
20514,6M NET BOAT,PULLY,Φ305X5V-3,0,"PORT RELIEF ENGINEERING CO.,LTD.",BELT,5
20515,7M NET BOAT,CYLINDER,Φ50 x S195 CYLINDER,0,"PORT RELIEF ENGINEERING CO.,LTD.",CYLINDER,20


In [182]:
df = df.drop(['key2'], axis=1)

In [183]:
df

Unnamed: 0,Machinery,청구품목,Part No.1,Part No.2,발주처,key2_encoded
0,NO.1 GENERATOR ENGINE,SEAL-O-RING-STOR,7.00E-275,0,HAEIN Coporation_Cheonan,18
1,NO.2 GENERATOR ENGINE,OIL COOLER & LINES,7.00E-275,0,HAEIN Coporation_Cheonan,18
2,NO.2 GENERATOR ENGINE,WASHER,7.00E-275,0,HAEIN Coporation_Cheonan,18
3,NO.1 GENERATOR ENGINE,BOLT-HIGH TEMP,7.00E-275,0,HAEIN Coporation_Cheonan,18
4,NO.1 GENERATOR ENGINE,SEAL,7.00E-275,0,HAEIN Coporation_Cheonan,18
...,...,...,...,...,...,...
20512,HYD. SYSTEM,HYD. CYLINDER_NET BOAT DAVIT,φ150 × φ85 × 695st,FF-SE-1CB150BB695ABD-YP2J,"MARINE HYDROTEC CO.,LTD.",20
20513,HYD. SYSTEM,HYD. CYLINDER_N.D CRANE_BOOM_LIFTER,φ160 × φ112 × 1130st,S-2CA160A20.6N1130-1,"MARINE HYDROTEC CO.,LTD.",20
20514,6M NET BOAT,PULLY,Φ305X5V-3,0,"PORT RELIEF ENGINEERING CO.,LTD.",5
20515,7M NET BOAT,CYLINDER,Φ50 x S195 CYLINDER,0,"PORT RELIEF ENGINEERING CO.,LTD.",20


In [184]:
train, val, test = np.split(df.sample(frac=1), [int(0.8*len(df)), int(0.9*len(df))])

In [185]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  df = dataframe.copy()
  labels = df.pop('key2_encoded')
  # df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}
  df = {key: np.expand_dims(value, axis=1) for key, value in df.items()}  # np.expand_dims를 사용하여 NumPy 배열로 변환
  ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(batch_size)
  return ds

In [186]:
batch_size = 5
train_ds = df_to_dataset(train, batch_size=batch_size)

In [187]:
train

Unnamed: 0,Machinery,청구품목,Part No.1,Part No.2,발주처,key2_encoded
17601,M/E R/G L.O TRANSFER PUMP,SPARE PART KIT,90259-KB,0,EURO KYTEX ENGINEERING BV,24
7441,POWER BLOCK,SEAL,13.211.39676.000,0,K.TH MARCO,24
18980,STEERING GEAR,"CYLINDER ""O"" RING",JB-201263,0,MATSUI(U.S.A) COROPRATION,20
5659,HYD. DRIVE ENGINE,LINE AS-FUEL INJECTION,122-8856,0,HAEIN Coporation_Cheonan,24
1686,SKIFF BOAT ENGINE,"VALVE, EXHAUST",3803528,0,MATSUI(U.S.A) COROPRATION,57
...,...,...,...,...,...,...
17037,NO.1 REF COMPRESSOR,"GASKET NA, OIL TANK FLANGE A",82098413,REF NO. 7Z332,(주)우림공사,24
14148,NO.1 F.O PURIFIER,ELBOW,524773 02,0,EAST WIND Gmbh,42
5746,6M NET BOAT ENGINE,GASKET,123688-18201,123688-18200,(주)프러스엔지니어링,24
17492,NO.2 GENERATOR ENGINE,RING-PISTON,8N-7810,0,HAEIN Coporation_Cheonan,24


In [109]:
a = [(train_features, label_batch)] = train_ds.take(1)
print('Every feature:', list(train_features.keys()))
print('A batch of Machinery:', train_features['Machinery'])
print('A batch of targets:', label_batch )
# a

Every feature: ['Machinery', '청구품목', 'Part No.1', 'Part No.2', '발주처']
A batch of Machinery: tf.Tensor(
[[b'BINOCULAR']
 [b'STBD BOOM NO.1 CARGO WINCH ']
 [b'STEERING GEAR']
 [b'MAIN ENGINE']
 [b'NO.1 F.O PURIFIER']], shape=(5, 1), dtype=string)
A batch of targets: tf.Tensor([ 6 22 40 57 22], shape=(5,), dtype=int32)


# 2개 컬럼 합치고 사전으로 만들기

In [110]:
df_desc = df[[ '청구품목', '발주처']].apply(lambda row: ' '.join(row), axis=1)

In [111]:
df_desc.head()

0      SEAL-O-RING-STOR HAEIN Coporation_Cheonan
1    OIL COOLER & LINES HAEIN Coporation_Cheonan
2                WASHER HAEIN Coporation_Cheonan
3        BOLT-HIGH TEMP HAEIN Coporation_Cheonan
4                  SEAL HAEIN Coporation_Cheonan
dtype: object

In [112]:
# Xdf_desc
df_desc

0                SEAL-O-RING-STOR HAEIN Coporation_Cheonan
1              OIL COOLER & LINES HAEIN Coporation_Cheonan
2                          WASHER HAEIN Coporation_Cheonan
3                  BOLT-HIGH TEMP HAEIN Coporation_Cheonan
4                            SEAL HAEIN Coporation_Cheonan
                               ...                        
20512    HYD. CYLINDER_NET BOAT DAVIT MARINE HYDROTEC C...
20513    HYD. CYLINDER_N.D CRANE_BOOM_LIFTER MARINE HYD...
20514               PULLY PORT RELIEF ENGINEERING CO.,LTD.
20515           CYLINDER  PORT RELIEF ENGINEERING CO.,LTD.
20516    SEAL KIT FOR CYLINDER PORT RELIEF ENGINEERING ...
Length: 20517, dtype: object

In [113]:
vocab_list = []
for words in df_desc.str.split():
  if isinstance(words, list):
    #print(type(words), words)
    vocab_list = vocab_list + words
len(vocab_list)

97928

In [114]:
df_desc.head()

0      SEAL-O-RING-STOR HAEIN Coporation_Cheonan
1    OIL COOLER & LINES HAEIN Coporation_Cheonan
2                WASHER HAEIN Coporation_Cheonan
3        BOLT-HIGH TEMP HAEIN Coporation_Cheonan
4                  SEAL HAEIN Coporation_Cheonan
dtype: object

In [115]:
desc = df_desc.to_frame()

In [116]:
desc.columns = ['origin']

In [117]:
desc.head()

Unnamed: 0,origin
0,SEAL-O-RING-STOR HAEIN Coporation_Cheonan
1,OIL COOLER & LINES HAEIN Coporation_Cheonan
2,WASHER HAEIN Coporation_Cheonan
3,BOLT-HIGH TEMP HAEIN Coporation_Cheonan
4,SEAL HAEIN Coporation_Cheonan


In [118]:
desc['split_desc'] = desc.origin.str.split()
desc

Unnamed: 0,origin,split_desc
0,SEAL-O-RING-STOR HAEIN Coporation_Cheonan,"[SEAL-O-RING-STOR, HAEIN, Coporation_Cheonan]"
1,OIL COOLER & LINES HAEIN Coporation_Cheonan,"[OIL, COOLER, &, LINES, HAEIN, Coporation_Cheo..."
2,WASHER HAEIN Coporation_Cheonan,"[WASHER, HAEIN, Coporation_Cheonan]"
3,BOLT-HIGH TEMP HAEIN Coporation_Cheonan,"[BOLT-HIGH, TEMP, HAEIN, Coporation_Cheonan]"
4,SEAL HAEIN Coporation_Cheonan,"[SEAL, HAEIN, Coporation_Cheonan]"
...,...,...
20512,HYD. CYLINDER_NET BOAT DAVIT MARINE HYDROTEC C...,"[HYD., CYLINDER_NET, BOAT, DAVIT, MARINE, HYDR..."
20513,HYD. CYLINDER_N.D CRANE_BOOM_LIFTER MARINE HYD...,"[HYD., CYLINDER_N.D, CRANE_BOOM_LIFTER, MARINE..."
20514,"PULLY PORT RELIEF ENGINEERING CO.,LTD.","[PULLY, PORT, RELIEF, ENGINEERING, CO.,LTD.]"
20515,"CYLINDER PORT RELIEF ENGINEERING CO.,LTD.","[CYLINDER, PORT, RELIEF, ENGINEERING, CO.,LTD.]"


In [119]:
pre_desc = desc.dropna(subset=['split_desc'])

In [120]:
pre_desc

Unnamed: 0,origin,split_desc
0,SEAL-O-RING-STOR HAEIN Coporation_Cheonan,"[SEAL-O-RING-STOR, HAEIN, Coporation_Cheonan]"
1,OIL COOLER & LINES HAEIN Coporation_Cheonan,"[OIL, COOLER, &, LINES, HAEIN, Coporation_Cheo..."
2,WASHER HAEIN Coporation_Cheonan,"[WASHER, HAEIN, Coporation_Cheonan]"
3,BOLT-HIGH TEMP HAEIN Coporation_Cheonan,"[BOLT-HIGH, TEMP, HAEIN, Coporation_Cheonan]"
4,SEAL HAEIN Coporation_Cheonan,"[SEAL, HAEIN, Coporation_Cheonan]"
...,...,...
20512,HYD. CYLINDER_NET BOAT DAVIT MARINE HYDROTEC C...,"[HYD., CYLINDER_NET, BOAT, DAVIT, MARINE, HYDR..."
20513,HYD. CYLINDER_N.D CRANE_BOOM_LIFTER MARINE HYD...,"[HYD., CYLINDER_N.D, CRANE_BOOM_LIFTER, MARINE..."
20514,"PULLY PORT RELIEF ENGINEERING CO.,LTD.","[PULLY, PORT, RELIEF, ENGINEERING, CO.,LTD.]"
20515,"CYLINDER PORT RELIEF ENGINEERING CO.,LTD.","[CYLINDER, PORT, RELIEF, ENGINEERING, CO.,LTD.]"


In [121]:
# 학습에 사용할 단어장
tokenizer = tf.keras.preprocessing.text.Tokenizer()

# Updates internal vocabulary based on a list of sequences.
tokenizer.fit_on_texts(pre_desc['split_desc'])
# print('tokenizer.index_docs', tokenizer.index_docs)
# print('tokenizer.index_word', tokenizer.index_word)
# print('tokenizer.word_index', tokenizer.word_index)
# print('tokenizer.word_docs', tokenizer.word_docs)
# print('tokenizer.word_counts', tokenizer.word_counts)
# print('tokenizer.word_counts', sorted(tokenizer.word_counts.items(), key=lambda x: x[1], reverse=True))

In [122]:
# 단어장을 이용해 문장을 숫자로 변경
encoded_text=tokenizer.texts_to_sequences(pre_desc['split_desc'])
pre_desc['encode_desc'] = tokenizer.texts_to_sequences(pre_desc['split_desc'])

In [123]:
pre_desc.head(10)

Unnamed: 0,origin,split_desc,encode_desc
0,SEAL-O-RING-STOR HAEIN Coporation_Cheonan,"[SEAL-O-RING-STOR, HAEIN, Coporation_Cheonan]","[101, 1, 2]"
1,OIL COOLER & LINES HAEIN Coporation_Cheonan,"[OIL, COOLER, &, LINES, HAEIN, Coporation_Cheo...","[16, 99, 63, 126, 1, 2]"
2,WASHER HAEIN Coporation_Cheonan,"[WASHER, HAEIN, Coporation_Cheonan]","[53, 1, 2]"
3,BOLT-HIGH TEMP HAEIN Coporation_Cheonan,"[BOLT-HIGH, TEMP, HAEIN, Coporation_Cheonan]","[309, 182, 1, 2]"
4,SEAL HAEIN Coporation_Cheonan,"[SEAL, HAEIN, Coporation_Cheonan]","[14, 1, 2]"
5,CORE CHARGES FOR CYLINDER PACK AS HAEIN Copora...,"[CORE, CHARGES, FOR, CYLINDER, PACK, AS, HAEIN...","[28, 46, 24, 33, 92, 55, 1, 2]"
6,PUMP GP-F TFR-REMAN HAEIN Coporation_Cheonan,"[PUMP, GP-F, TFR-REMAN, HAEIN, Coporation_Cheo...","[18, 242, 1099, 1, 2]"
7,GEAR-WTR PUMP DR HAEIN Coporation_Cheonan,"[GEAR-WTR, PUMP, DR, HAEIN, Coporation_Cheonan]","[685, 18, 246, 1, 2]"
8,GEAR-WTR PUMP DR HAEIN Coporation_Cheonan,"[GEAR-WTR, PUMP, DR, HAEIN, Coporation_Cheonan]","[685, 18, 246, 1, 2]"
9,GEAR-WTR PUMP DR HAEIN Coporation_Cheonan,"[GEAR-WTR, PUMP, DR, HAEIN, Coporation_Cheonan]","[685, 18, 246, 1, 2]"


In [124]:
len_list = pre_desc['encode_desc'].apply(lambda x: len(x))
max(len_list), sum(len_list)/len(len_list)

(21, 4.7730174976848465)

In [125]:
top_words = 1000
max_len = 10
vector_len = 30

In [126]:
# OOV: Out-Of-Vocabulary
# 사전에서 빈도수 높은 단어 1,000 개만 사용
# 사전에 없는 단어 <OOV> 처리
tokenizer2 = tf.keras.preprocessing.text.Tokenizer(num_words=top_words, oov_token='<OOV>')
tokenizer2.fit_on_texts(pre_desc['split_desc'])
# print('tokenizer2.index_docs', tokenizer2.index_docs)
# print('tokenizer2.index_word', tokenizer2.index_word)
# print('tokenizer2.word_index', tokenizer2.word_index)
pre_desc['encode_desc_1000'] = tokenizer2.texts_to_sequences(pre_desc['split_desc'])

In [127]:
pre_desc.head(10)

Unnamed: 0,origin,split_desc,encode_desc,encode_desc_1000
0,SEAL-O-RING-STOR HAEIN Coporation_Cheonan,"[SEAL-O-RING-STOR, HAEIN, Coporation_Cheonan]","[101, 1, 2]","[102, 2, 3]"
1,OIL COOLER & LINES HAEIN Coporation_Cheonan,"[OIL, COOLER, &, LINES, HAEIN, Coporation_Cheo...","[16, 99, 63, 126, 1, 2]","[17, 100, 64, 127, 2, 3]"
2,WASHER HAEIN Coporation_Cheonan,"[WASHER, HAEIN, Coporation_Cheonan]","[53, 1, 2]","[54, 2, 3]"
3,BOLT-HIGH TEMP HAEIN Coporation_Cheonan,"[BOLT-HIGH, TEMP, HAEIN, Coporation_Cheonan]","[309, 182, 1, 2]","[310, 183, 2, 3]"
4,SEAL HAEIN Coporation_Cheonan,"[SEAL, HAEIN, Coporation_Cheonan]","[14, 1, 2]","[15, 2, 3]"
5,CORE CHARGES FOR CYLINDER PACK AS HAEIN Copora...,"[CORE, CHARGES, FOR, CYLINDER, PACK, AS, HAEIN...","[28, 46, 24, 33, 92, 55, 1, 2]","[29, 47, 25, 34, 93, 56, 2, 3]"
6,PUMP GP-F TFR-REMAN HAEIN Coporation_Cheonan,"[PUMP, GP-F, TFR-REMAN, HAEIN, Coporation_Cheo...","[18, 242, 1099, 1, 2]","[19, 243, 1, 2, 3]"
7,GEAR-WTR PUMP DR HAEIN Coporation_Cheonan,"[GEAR-WTR, PUMP, DR, HAEIN, Coporation_Cheonan]","[685, 18, 246, 1, 2]","[686, 19, 247, 2, 3]"
8,GEAR-WTR PUMP DR HAEIN Coporation_Cheonan,"[GEAR-WTR, PUMP, DR, HAEIN, Coporation_Cheonan]","[685, 18, 246, 1, 2]","[686, 19, 247, 2, 3]"
9,GEAR-WTR PUMP DR HAEIN Coporation_Cheonan,"[GEAR-WTR, PUMP, DR, HAEIN, Coporation_Cheonan]","[685, 18, 246, 1, 2]","[686, 19, 247, 2, 3]"


In [128]:
# 문장의 길이가 다르므로 일치
encode_desc_padding = tf.keras.preprocessing.sequence.pad_sequences(pre_desc['encode_desc_1000'], maxlen=max_len)

In [129]:
encode_desc_padding.shape

(20517, 10)

In [130]:
encode_desc_padding[:5]

array([[  0,   0,   0,   0,   0,   0,   0, 102,   2,   3],
       [  0,   0,   0,   0,  17, 100,  64, 127,   2,   3],
       [  0,   0,   0,   0,   0,   0,   0,  54,   2,   3],
       [  0,   0,   0,   0,   0,   0, 310, 183,   2,   3],
       [  0,   0,   0,   0,   0,   0,   0,  15,   2,   3]])

In [131]:
embedding = tf.keras.layers.Embedding(input_dim =top_words, output_dim=vector_len)(encode_desc_padding)
embedding.shape

TensorShape([20517, 10, 30])

## Make a Dataset with Embedding

In [188]:
df['desc'] = pre_desc['origin']

In [189]:
df

Unnamed: 0,Machinery,청구품목,Part No.1,Part No.2,발주처,key2_encoded,desc
0,NO.1 GENERATOR ENGINE,SEAL-O-RING-STOR,7.00E-275,0,HAEIN Coporation_Cheonan,18,SEAL-O-RING-STOR HAEIN Coporation_Cheonan
1,NO.2 GENERATOR ENGINE,OIL COOLER & LINES,7.00E-275,0,HAEIN Coporation_Cheonan,18,OIL COOLER & LINES HAEIN Coporation_Cheonan
2,NO.2 GENERATOR ENGINE,WASHER,7.00E-275,0,HAEIN Coporation_Cheonan,18,WASHER HAEIN Coporation_Cheonan
3,NO.1 GENERATOR ENGINE,BOLT-HIGH TEMP,7.00E-275,0,HAEIN Coporation_Cheonan,18,BOLT-HIGH TEMP HAEIN Coporation_Cheonan
4,NO.1 GENERATOR ENGINE,SEAL,7.00E-275,0,HAEIN Coporation_Cheonan,18,SEAL HAEIN Coporation_Cheonan
...,...,...,...,...,...,...,...
20512,HYD. SYSTEM,HYD. CYLINDER_NET BOAT DAVIT,φ150 × φ85 × 695st,FF-SE-1CB150BB695ABD-YP2J,"MARINE HYDROTEC CO.,LTD.",20,HYD. CYLINDER_NET BOAT DAVIT MARINE HYDROTEC C...
20513,HYD. SYSTEM,HYD. CYLINDER_N.D CRANE_BOOM_LIFTER,φ160 × φ112 × 1130st,S-2CA160A20.6N1130-1,"MARINE HYDROTEC CO.,LTD.",20,HYD. CYLINDER_N.D CRANE_BOOM_LIFTER MARINE HYD...
20514,6M NET BOAT,PULLY,Φ305X5V-3,0,"PORT RELIEF ENGINEERING CO.,LTD.",5,"PULLY PORT RELIEF ENGINEERING CO.,LTD."
20515,7M NET BOAT,CYLINDER,Φ50 x S195 CYLINDER,0,"PORT RELIEF ENGINEERING CO.,LTD.",20,"CYLINDER PORT RELIEF ENGINEERING CO.,LTD."


In [190]:
df = df.drop(['청구품목','발주처' ] , axis=1)

In [191]:
df #= df.drop(['key2_encoded' ] , axis=1)

Unnamed: 0,Machinery,Part No.1,Part No.2,key2_encoded,desc
0,NO.1 GENERATOR ENGINE,7.00E-275,0,18,SEAL-O-RING-STOR HAEIN Coporation_Cheonan
1,NO.2 GENERATOR ENGINE,7.00E-275,0,18,OIL COOLER & LINES HAEIN Coporation_Cheonan
2,NO.2 GENERATOR ENGINE,7.00E-275,0,18,WASHER HAEIN Coporation_Cheonan
3,NO.1 GENERATOR ENGINE,7.00E-275,0,18,BOLT-HIGH TEMP HAEIN Coporation_Cheonan
4,NO.1 GENERATOR ENGINE,7.00E-275,0,18,SEAL HAEIN Coporation_Cheonan
...,...,...,...,...,...
20512,HYD. SYSTEM,φ150 × φ85 × 695st,FF-SE-1CB150BB695ABD-YP2J,20,HYD. CYLINDER_NET BOAT DAVIT MARINE HYDROTEC C...
20513,HYD. SYSTEM,φ160 × φ112 × 1130st,S-2CA160A20.6N1130-1,20,HYD. CYLINDER_N.D CRANE_BOOM_LIFTER MARINE HYD...
20514,6M NET BOAT,Φ305X5V-3,0,5,"PULLY PORT RELIEF ENGINEERING CO.,LTD."
20515,7M NET BOAT,Φ50 x S195 CYLINDER,0,20,"CYLINDER PORT RELIEF ENGINEERING CO.,LTD."


In [192]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  df = dataframe.copy()
  labels = df.pop('key2_encoded')
  # df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}
  df = {key: np.expand_dims(value, axis=1) for key, value in df.items()}  # np.expand_dims를 사용하여 NumPy 배열로 변환
  ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(batch_size)
  return ds

In [193]:
batch_size = 5
all_dataset = df_to_dataset(df, batch_size=batch_size)

In [194]:
[(train_features, label_batch)] = all_dataset.take(1)
print('Every feature:', list(train_features.keys()))
print('A batch of Machinery:', train_features['Machinery'])
print('A batch of targets:', label_batch )

Every feature: ['Machinery', 'Part No.1', 'Part No.2', 'desc']
A batch of Machinery: tf.Tensor(
[[b'MAIN ENGINE']
 [b'LUB. OIL']
 [b'RING WINCH']
 [b'MAIN ENGINE']
 [b'MAIN BOOM PORT VANG WINCH ']], shape=(5, 1), dtype=string)
A batch of targets: tf.Tensor([ 8 34  5 11 24], shape=(5,), dtype=int32)


In [150]:
# all_dataset

In [141]:
all_inputs = []
encoded_features = []

In [142]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
  # Create a layer that turns strings into integer indices.
  if dtype == 'string':
    index = layers.StringLookup(max_tokens=max_tokens)
  # Otherwise, create a layer that turns integer values into integer indices.
  else:
    index = layers.IntegerLookup(max_tokens=max_tokens)

  # Prepare a `tf.data.Dataset` that only yields the feature.
  feature_ds = dataset.map(lambda x, y: x[name])

  # Learn the set of possible values and assign them a fixed integer index.
  index.adapt(feature_ds)

  # Encode the integer indices.
  encoder = layers.CategoryEncoding(num_tokens=index.vocabulary_size())

  # Apply multi-hot encoding to the indices. The lambda function captures the
  # layer, so you can use them, or include them in the Keras Functional model later.
  return lambda feature: encoder(index(feature))

In [143]:
categorical_cols = ['Machinery', 'Part No.1', 'Part No.2']

for header in categorical_cols:
  categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='string')
  encoding_layer = get_category_encoding_layer(name=header,
                                               dataset=all_dataset,
                                               dtype='string')
  encoded_categorical_col = encoding_layer(categorical_col)
  all_inputs.append(categorical_col)
  encoded_features.append(encoded_categorical_col)

In [144]:
for d in all_inputs:
  print(type(d), d.shape)

<class 'keras.engine.keras_tensor.KerasTensor'> (None, 1)
<class 'keras.engine.keras_tensor.KerasTensor'> (None, 1)
<class 'keras.engine.keras_tensor.KerasTensor'> (None, 1)


In [145]:
for d in encoded_features:
  print(type(d), d.shape)

<class 'keras.engine.keras_tensor.KerasTensor'> (None, 175)
<class 'keras.engine.keras_tensor.KerasTensor'> (None, 8129)
<class 'keras.engine.keras_tensor.KerasTensor'> (None, 1284)


In [146]:
type(encode_desc_padding)

numpy.ndarray

In [147]:
def df_to_dataset(data, shuffle=True, batch_size=32):
  ds = tf.data.Dataset.from_tensor_slices({'desc': data})
  if shuffle:
    ds = ds.shuffle(buffer_size=len(data))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(batch_size)
  return ds

In [148]:
embedding_dataset = df_to_dataset(encode_desc_padding, batch_size=batch_size)

In [149]:
for batch in embedding_dataset.take(1):
  print(batch)

{'desc': <tf.Tensor: shape=(5, 10), dtype=int32, numpy=
array([[  0,   0,   0,   0,   0,   0,   0,   1,   2,   3],
       [  0,   0,   0,   0,   1, 229, 271,  43,   4,   5],
       [  0,   0,   0,   0,   0,   0,   0, 229,   7,   6],
       [  0,   0,   0,   0,   0,   0,   0,  22,   1, 112],
       [  0,   0,   0,   0,  41,  19,  11,  10,   9,   8]])>}


In [150]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
  # Create a layer that turns strings into integer indices.
  if dtype == 'string':
    index = layers.StringLookup(max_tokens=max_tokens)
  # Otherwise, create a layer that turns integer values into integer indices.
  else:
    index = layers.IntegerLookup(max_tokens=max_tokens)

  # Prepare a `tf.data.Dataset` that only yields the feature.
  feature_ds = dataset.map(lambda x, y: x[name])

  # Learn the set of possible values and assign them a fixed integer index.
  index.adapt(feature_ds)

  # Encode the integer indices.
  encoder = layers.CategoryEncoding(num_tokens=index.vocabulary_size())

  # Apply multi-hot encoding to the indices. The lambda function captures the
  # layer, so you can use them, or include them in the Keras Functional model later.
  return lambda feature: encoder(index(feature))

In [151]:
for col in categorical_cols:
  test_type_col = df[col]
  test_type_layer = get_category_encoding_layer(name=col,
                                                dataset=all_dataset,
                                                dtype='string')
  print(test_type_layer(test_type_col))

tf.Tensor(
[0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1.], shape=(175,), dtype=float32)
tf.Tensor([0. 1. 1. ... 1. 1. 1.], shape=(8129,), dtype=float32)
tf.Tensor([0. 1. 1. ... 1. 1. 1.], shape=(1284,), dtype=float32)


In [152]:
for d in all_inputs:
  print(type(d), d.shape)

<class 'keras.engine.keras_tensor.KerasTensor'> (None, 1)
<class 'keras.engine.keras_tensor.KerasTensor'> (None, 1)
<class 'keras.engine.keras_tensor.KerasTensor'> (None, 1)


In [153]:
for d in encoded_features:
  print(type(d), d.shape)

<class 'keras.engine.keras_tensor.KerasTensor'> (None, 175)
<class 'keras.engine.keras_tensor.KerasTensor'> (None, 8129)
<class 'keras.engine.keras_tensor.KerasTensor'> (None, 1284)


In [154]:
def get_text_encoding_layer(name, dataset, top_words, vector_len):
   # Prepare a `tf.data.Dataset` that only yields the feature.
  feature_ds = dataset.map(lambda x: x[name])
  embedding = tf.keras.layers.Embedding(input_dim =top_words, output_dim=vector_len)
  return lambda feature: embedding(feature)

In [155]:
embedding = tf.keras.layers.Embedding(input_dim =top_words, output_dim=vector_len)

In [156]:
embedding_layer = lambda feature: embedding(feature)

In [157]:
embedding_layer(encode_desc_padding).shape

TensorShape([20517, 10, 30])

In [158]:
embedding_dataset.element_spec['desc'].shape[1]

10

In [206]:
embedding_col = tf.keras.Input(shape=(), name='desc', dtype='int64')

In [207]:
embedding_layer = get_text_encoding_layer('desc', embedding_dataset, top_words, vector_len)

In [208]:
embedding_layer(embedding_col)

<KerasTensor: shape=(None, 30) dtype=float32 (created by layer 'embedding_7')>

In [162]:
# text features.
for header in ['desc']:
  embedding_col = tf.keras.Input(shape=(), name='desc', dtype='int64')
  embedding_layer = get_text_encoding_layer('desc', embedding_dataset, top_words, vector_len)
  encoded_embedding_col = embedding_layer(embedding_col)
  all_inputs.append(embedding_col)
  encoded_features.append(encoded_embedding_col)

In [163]:
for d in all_inputs:
  print(type(d), d.shape)

<class 'keras.engine.keras_tensor.KerasTensor'> (None, 1)
<class 'keras.engine.keras_tensor.KerasTensor'> (None, 1)
<class 'keras.engine.keras_tensor.KerasTensor'> (None, 1)
<class 'keras.engine.keras_tensor.KerasTensor'> (None,)


In [164]:
for d in encoded_features:
  print(type(d), d.shape)

<class 'keras.engine.keras_tensor.KerasTensor'> (None, 175)
<class 'keras.engine.keras_tensor.KerasTensor'> (None, 8129)
<class 'keras.engine.keras_tensor.KerasTensor'> (None, 1284)
<class 'keras.engine.keras_tensor.KerasTensor'> (None, 30)


In [165]:
all_features = tf.keras.layers.concatenate(encoded_features)
x = tf.keras.layers.Dense(32, activation="relu")(all_features)
x = tf.keras.layers.Dropout(0.5)(x)
output = tf.keras.layers.Dense(1)(x)

model = tf.keras.Model(all_inputs, output)

In [166]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Machinery (InputLayer)         [(None, 1)]          0           []                               
                                                                                                  
 Part No.1 (InputLayer)         [(None, 1)]          0           []                               
                                                                                                  
 Part No.2 (InputLayer)         [(None, 1)]          0           []                               
                                                                                                  
 string_lookup (StringLookup)   (None, 1)            0           ['Machinery[0][0]']              
                                                                                              

In [135]:
tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [167]:
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
                       optimizer=tf.keras.optimizers.Adam(),
                       metrics=['accuracy'])

In [168]:
train_ds

<PrefetchDataset element_spec=({'Machinery': TensorSpec(shape=(None, 1), dtype=tf.string, name=None), '청구품목': TensorSpec(shape=(None, 1), dtype=tf.string, name=None), 'Part No.1': TensorSpec(shape=(None, 1), dtype=tf.string, name=None), 'Part No.2': TensorSpec(shape=(None, 1), dtype=tf.string, name=None), '발주처': TensorSpec(shape=(None, 1), dtype=tf.string, name=None)}, TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

In [202]:
model.fit(train_dataset, epochs= 10, batch_size=32)

Epoch 1/10


UnimplementedError: Graph execution error:

Detected at node 'model/Cast' defined at (most recent call last):
    File "C:\Python310\lib\runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "C:\Python310\lib\runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "D:\ShipSupplies\DA\venv\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "D:\ShipSupplies\DA\venv\lib\site-packages\traitlets\config\application.py", line 1043, in launch_instance
      app.start()
    File "D:\ShipSupplies\DA\venv\lib\site-packages\ipykernel\kernelapp.py", line 725, in start
      self.io_loop.start()
    File "D:\ShipSupplies\DA\venv\lib\site-packages\tornado\platform\asyncio.py", line 195, in start
      self.asyncio_loop.run_forever()
    File "C:\Python310\lib\asyncio\base_events.py", line 603, in run_forever
      self._run_once()
    File "C:\Python310\lib\asyncio\base_events.py", line 1909, in _run_once
      handle._run()
    File "C:\Python310\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "D:\ShipSupplies\DA\venv\lib\site-packages\ipykernel\kernelbase.py", line 513, in dispatch_queue
      await self.process_one()
    File "D:\ShipSupplies\DA\venv\lib\site-packages\ipykernel\kernelbase.py", line 502, in process_one
      await dispatch(*args)
    File "D:\ShipSupplies\DA\venv\lib\site-packages\ipykernel\kernelbase.py", line 409, in dispatch_shell
      await result
    File "D:\ShipSupplies\DA\venv\lib\site-packages\ipykernel\kernelbase.py", line 729, in execute_request
      reply_content = await reply_content
    File "D:\ShipSupplies\DA\venv\lib\site-packages\ipykernel\ipkernel.py", line 422, in do_execute
      res = shell.run_cell(
    File "D:\ShipSupplies\DA\venv\lib\site-packages\ipykernel\zmqshell.py", line 540, in run_cell
      return super().run_cell(*args, **kwargs)
    File "D:\ShipSupplies\DA\venv\lib\site-packages\IPython\core\interactiveshell.py", line 3009, in run_cell
      result = self._run_cell(
    File "D:\ShipSupplies\DA\venv\lib\site-packages\IPython\core\interactiveshell.py", line 3064, in _run_cell
      result = runner(coro)
    File "D:\ShipSupplies\DA\venv\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "D:\ShipSupplies\DA\venv\lib\site-packages\IPython\core\interactiveshell.py", line 3269, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "D:\ShipSupplies\DA\venv\lib\site-packages\IPython\core\interactiveshell.py", line 3448, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "D:\ShipSupplies\DA\venv\lib\site-packages\IPython\core\interactiveshell.py", line 3508, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\SW\AppData\Local\Temp\ipykernel_5056\3170525730.py", line 1, in <module>
      model.fit(all_dataset, epochs= 10, batch_size=32)
    File "D:\ShipSupplies\DA\venv\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "D:\ShipSupplies\DA\venv\lib\site-packages\keras\engine\training.py", line 1564, in fit
      tmp_logs = self.train_function(iterator)
    File "D:\ShipSupplies\DA\venv\lib\site-packages\keras\engine\training.py", line 1160, in train_function
      return step_function(self, iterator)
    File "D:\ShipSupplies\DA\venv\lib\site-packages\keras\engine\training.py", line 1146, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "D:\ShipSupplies\DA\venv\lib\site-packages\keras\engine\training.py", line 1135, in run_step
      outputs = model.train_step(data)
    File "D:\ShipSupplies\DA\venv\lib\site-packages\keras\engine\training.py", line 993, in train_step
      y_pred = self(x, training=True)
    File "D:\ShipSupplies\DA\venv\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "D:\ShipSupplies\DA\venv\lib\site-packages\keras\engine\training.py", line 557, in __call__
      return super().__call__(*args, **kwargs)
    File "D:\ShipSupplies\DA\venv\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "D:\ShipSupplies\DA\venv\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "D:\ShipSupplies\DA\venv\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "D:\ShipSupplies\DA\venv\lib\site-packages\keras\engine\functional.py", line 510, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "D:\ShipSupplies\DA\venv\lib\site-packages\keras\engine\functional.py", line 649, in _run_internal_graph
      y = self._conform_to_reference_input(y, ref_input=x)
    File "D:\ShipSupplies\DA\venv\lib\site-packages\keras\engine\functional.py", line 761, in _conform_to_reference_input
      tensor = tf.cast(tensor, dtype=ref_input.dtype)
Node: 'model/Cast'
Cast string to int64 is not supported
	 [[{{node model/Cast}}]] [Op:__inference_train_function_51177]

In [196]:
train.dtypes

Machinery       object
청구품목            object
Part No.1       object
Part No.2       object
발주처             object
key2_encoded     int32
dtype: object

In [197]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  df = dataframe.copy()
  labels = df.pop('key2_encoded')
  # df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}
  df = {key: np.expand_dims(value, axis=1) for key, value in df.items()}  # np.expand_dims를 사용하여 NumPy 배열로 변환
  ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(batch_size)
  return ds

<PrefetchDataset element_spec=({'Machinery': TensorSpec(shape=(None, 1), dtype=tf.string, name=None), 'Part No.1': TensorSpec(shape=(None, 1), dtype=tf.string, name=None), 'Part No.2': TensorSpec(shape=(None, 1), dtype=tf.string, name=None), 'desc': TensorSpec(shape=(None, 1), dtype=tf.string, name=None)}, TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

In [212]:
batch_size = 5
train_dataset = df_to_dataset(df, batch_size=batch_size)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).

In [200]:
train_dataset

<PrefetchDataset element_spec=({'Machinery': TensorSpec(shape=(None, 1), dtype=tf.string, name=None), 'Part No.1': TensorSpec(shape=(None, 1), dtype=tf.string, name=None), 'Part No.2': TensorSpec(shape=(None, 1), dtype=tf.string, name=None), 'desc': TensorSpec(shape=(None, 1), dtype=tf.string, name=None)}, TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

In [210]:
df['desc'] = pre_desc['encode_desc_1000']

In [211]:
df

Unnamed: 0,Machinery,Part No.1,Part No.2,key2_encoded,desc
0,NO.1 GENERATOR ENGINE,7.00E-275,0,18,"[102, 2, 3]"
1,NO.2 GENERATOR ENGINE,7.00E-275,0,18,"[17, 100, 64, 127, 2, 3]"
2,NO.2 GENERATOR ENGINE,7.00E-275,0,18,"[54, 2, 3]"
3,NO.1 GENERATOR ENGINE,7.00E-275,0,18,"[310, 183, 2, 3]"
4,NO.1 GENERATOR ENGINE,7.00E-275,0,18,"[15, 2, 3]"
...,...,...,...,...,...
20512,HYD. SYSTEM,φ150 × φ85 × 695st,FF-SE-1CB150BB695ABD-YP2J,20,"[293, 1, 374, 1, 33, 48, 8]"
20513,HYD. SYSTEM,φ160 × φ112 × 1130st,S-2CA160A20.6N1130-1,20,"[293, 1, 1, 33, 48, 8]"
20514,6M NET BOAT,Φ305X5V-3,0,5,"[1, 11, 10, 9, 8]"
20515,7M NET BOAT,Φ50 x S195 CYLINDER,0,20,"[34, 11, 10, 9, 8]"


In [209]:
pre_desc

Unnamed: 0,origin,split_desc,encode_desc,encode_desc_1000
0,SEAL-O-RING-STOR HAEIN Coporation_Cheonan,"[SEAL-O-RING-STOR, HAEIN, Coporation_Cheonan]","[101, 1, 2]","[102, 2, 3]"
1,OIL COOLER & LINES HAEIN Coporation_Cheonan,"[OIL, COOLER, &, LINES, HAEIN, Coporation_Cheo...","[16, 99, 63, 126, 1, 2]","[17, 100, 64, 127, 2, 3]"
2,WASHER HAEIN Coporation_Cheonan,"[WASHER, HAEIN, Coporation_Cheonan]","[53, 1, 2]","[54, 2, 3]"
3,BOLT-HIGH TEMP HAEIN Coporation_Cheonan,"[BOLT-HIGH, TEMP, HAEIN, Coporation_Cheonan]","[309, 182, 1, 2]","[310, 183, 2, 3]"
4,SEAL HAEIN Coporation_Cheonan,"[SEAL, HAEIN, Coporation_Cheonan]","[14, 1, 2]","[15, 2, 3]"
...,...,...,...,...
20512,HYD. CYLINDER_NET BOAT DAVIT MARINE HYDROTEC C...,"[HYD., CYLINDER_NET, BOAT, DAVIT, MARINE, HYDR...","[292, 5057, 373, 1210, 32, 47, 7]","[293, 1, 374, 1, 33, 48, 8]"
20513,HYD. CYLINDER_N.D CRANE_BOOM_LIFTER MARINE HYD...,"[HYD., CYLINDER_N.D, CRANE_BOOM_LIFTER, MARINE...","[292, 5058, 5059, 32, 47, 7]","[293, 1, 1, 33, 48, 8]"
20514,"PULLY PORT RELIEF ENGINEERING CO.,LTD.","[PULLY, PORT, RELIEF, ENGINEERING, CO.,LTD.]","[5060, 10, 9, 8, 7]","[1, 11, 10, 9, 8]"
20515,"CYLINDER PORT RELIEF ENGINEERING CO.,LTD.","[CYLINDER, PORT, RELIEF, ENGINEERING, CO.,LTD.]","[33, 10, 9, 8, 7]","[34, 11, 10, 9, 8]"
