# 전복 데이터 LSTM 분류

## 1. 전복 데이터 불러오기

In [2]:
import pandas as pd

df = pd.read_csv(r"C:\Users\DoHyeonjik\GachonUniv\3-2\datasets\DL\abalone.csv")
print(df.columns, df.shape)

Index(['id', 'Sex', 'Length', 'Diameter', 'Height', 'Whole_weight',
       'Shucked_weight', 'Viscera_weight', 'Shell_weight', 'Rings'],
      dtype='object') (4177, 10)


## 2. 데이터 전처리

In [3]:
# 'sex' 열 데이터 라벨인코딩
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
print(df['Sex'])

0       2
1       2
2       0
3       2
4       1
       ..
4172    0
4173    2
4174    2
4175    0
4176    2
Name: Sex, Length: 4177, dtype: int32


In [4]:
# 분류 문제 ->레이블 범주화
print(df['Rings'].min(),"~",df['Rings'].max())
bins=[0,7,14, float('inf')]
df['Rings'] = pd.cut(df['Rings'], bins=bins, labels=[0,1,2], right=False)

1 ~ 29


In [5]:
print(df['Rings'])

0       2
1       1
2       1
3       1
4       1
       ..
4172    1
4173    1
4174    1
4175    1
4176    1
Name: Rings, Length: 4177, dtype: category
Categories (3, int64): [0 < 1 < 2]


## 3. 데이터 준비

In [6]:
# a. 데이터와 레이블로 분리
df_data = df.drop(columns=['Rings','id'], axis=1)
print(df_data.head())

df_label = df['Rings']
print(df_label.head())

   Sex  Length  Diameter  Height  Whole_weight  Shucked_weight  \
0    2   0.455     0.365   0.095        0.5140          0.2245   
1    2   0.350     0.265   0.090        0.2255          0.0995   
2    0   0.530     0.420   0.135        0.6770          0.2565   
3    2   0.440     0.365   0.125        0.5160          0.2155   
4    1   0.330     0.255   0.080        0.2050          0.0895   

   Viscera_weight  Shell_weight  
0          0.1010         0.150  
1          0.0485         0.070  
2          0.1415         0.210  
3          0.1140         0.155  
4          0.0395         0.055  
0    2
1    1
2    1
3    1
4    1
Name: Rings, dtype: category
Categories (3, int64): [0 < 1 < 2]


In [7]:
# b. 훈련데이터와 테스트 데이터로 분리
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df_data, df_label, test_size=0.2, random_state=42)
print(x_train.shape)
print(x_test.shape)

(3341, 8)
(836, 8)


In [8]:
# c. LSTM에 사용하기 위해 split_sequence 적용
# c-1) 레이블과 데이터 병합
import numpy as np

y_train = np.array(y_train)
train_data = np.c_[x_train, y_train]

y_test = np.array(y_test)
test_data = np.c_[x_test, y_test]

In [9]:
print(train_data.shape)
print(test_data.shape)

(3341, 9)
(836, 9)


In [12]:
# c-2)split_sequence 함수
from collections import Counter
import numpy as np

def split_sequence(sequences, n_steps):
    x, y = list(), list()
    for i in range(len(sequences)):
        end_idx = i + n_steps
        if end_idx > len(sequences):
            break
        seq_x = sequences[i:end_idx, :-1]
        seq_y_values = sequences[i:end_idx, -1]

        most_common_label = Counter(seq_y_values).most_common(1)[0][0]
        x.append(seq_x)
        y.append(most_common_label)

    return np.array(x), np.array(y)


In [13]:
# c-3)split_sequence 적용
n_steps = 5
x_train, y_train = split_sequence(train_data, n_steps)
x_test, y_test = split_sequence(test_data, n_steps)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(3337, 5, 8) (3337,)
(832, 5, 8) (832,)


In [14]:
# d) 원핫인코딩
y_train = pd.get_dummies(y_train)
y_test = pd.get_dummies(y_test)
y_train.shape

(3337, 3)

## 4. 모델 생성 및 학습

In [18]:
#a. LSTM 모델 생성
from tensorflow import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense

model = Sequential()
model.add(LSTM(128, input_shape=(5,8)))
model.add(Dense(128, activation='relu'))
model.add(Dense(3, activation='softmax'))
model.compile(optimizer='adam', metrics=['accuracy'], loss='categorical_crossentropy')

model.summary()

  super().__init__(**kwargs)


In [19]:
#b. 학습
history = model.fit(x_train, y_train, epochs=20, batch_size=32, validation_split=0.1, shuffle=True)

Epoch 1/20
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.9302 - loss: 0.4527 - val_accuracy: 0.9551 - val_loss: 0.2185
Epoch 2/20
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9439 - loss: 0.2570 - val_accuracy: 0.9551 - val_loss: 0.2196
Epoch 3/20
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9470 - loss: 0.2363 - val_accuracy: 0.9551 - val_loss: 0.2067
Epoch 4/20
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9414 - loss: 0.2490 - val_accuracy: 0.9551 - val_loss: 0.2069
Epoch 5/20
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9396 - loss: 0.2518 - val_accuracy: 0.9551 - val_loss: 0.2022
Epoch 6/20
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9359 - loss: 0.2556 - val_accuracy: 0.9551 - val_loss: 0.2014
Epoch 7/20
[1m94/94[0m [32m━━━━━━━━━━

In [23]:
print('[loss, acc]:',model.evaluate(x_test, y_test))

[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9278 - loss: 0.2452 
[loss, acc]: [0.225311279296875, 0.9338942170143127]
