# 전복데이터 CNN 분류

## 1. 데이터 불러오기

In [120]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the abalone dataset. NOTE(review): the path is an absolute location on
# the author's machine; adjust it before running elsewhere.
df = pd.read_csv(r"C:\Users\DoHyeonjik\GachonUniv\3-2\datasets\DL\abalone.csv")
# Bare expression: shows the DataFrame as the notebook cell output.
df

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings
0,0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...,...
4172,4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [121]:
# Quick sanity check of the loaded frame: column names and (rows, columns).
print(df.columns)
print(df.shape)

Index(['id', 'Sex', 'Length', 'Diameter', 'Height', 'Whole_weight',
       'Shucked_weight', 'Viscera_weight', 'Shell_weight', 'Rings'],
      dtype='object')
(4177, 10)


## 2. 데이터 전처리


In [122]:
# a. Label-encode the Sex column (string categories -> integer codes).
#    Per the printed result, M becomes 2, F becomes 0 and I becomes 1.
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
df['Sex'] = labelencoder.fit_transform(df['Sex'])
print(df['Sex'])

0       2
1       2
2       0
3       2
4       1
       ..
4172    0
4173    2
4174    2
4175    0
4176    2
Name: Sex, Length: 4177, dtype: int32


In [123]:
# b. Classification task => bin the Rings label into categories.
# Print the observed range of Rings first to check that the bins cover it.
print(df['Rings'].min(),"~",df['Rings'].max())
# With right=False the intervals are half-open on the right:
# [0, 7) -> 0, [7, 14) -> 1, [14, inf) -> 2.
bins =[0,7,14, float('inf')]
df['Rings'] = pd.cut(df['Rings'], bins=bins, labels=[0,1,2], right=False)


1 ~ 29


In [124]:
# Show the binned label column (category dtype with classes 0, 1, 2).
print(df['Rings'])

0       2
1       1
2       1
3       1
4       1
       ..
4172    1
4173    1
4174    1
4175    1
4176    1
Name: Rings, Length: 4177, dtype: category
Categories (3, int64): [0 < 1 < 2]


## 3. 데이터 준비

In [125]:
# a. Split df into the feature data and the label.
df_data = df.drop(columns=['Rings','id'], axis=1) # features: every column except the label (Rings) and id
print(df_data.head())

df_label = df['Rings'] # label column only
print(df_label.head())

   Sex  Length  Diameter  Height  Whole_weight  Shucked_weight  \
0    2   0.455     0.365   0.095        0.5140          0.2245   
1    2   0.350     0.265   0.090        0.2255          0.0995   
2    0   0.530     0.420   0.135        0.6770          0.2565   
3    2   0.440     0.365   0.125        0.5160          0.2155   
4    1   0.330     0.255   0.080        0.2050          0.0895   

   Viscera_weight  Shell_weight  
0          0.1010         0.150  
1          0.0485         0.070  
2          0.1415         0.210  
3          0.1140         0.155  
4          0.0395         0.055  
0    2
1    1
2    1
3    1
4    1
Name: Rings, dtype: category
Categories (3, int64): [0 < 1 < 2]


In [126]:
# c. Split into training and test data (25% held out for testing;
#    random_state fixed so the split is reproducible).
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df_data, df_label, test_size=0.25, random_state=42)
print(len(x_train))
print(len(x_test))

3132
1045


In [127]:
# d. Apply split_sequence so the data can be fed to the CNN.
# d_1) Merge the label and the feature data into one array.
y_train = np.array(y_train) # convert to a NumPy array
train_data = np.c_[x_train, y_train] # append the label as the last column

y_test = np.array(y_test)
test_data = np.c_[x_test, y_test]



In [128]:
# Expect (rows, 8 features + 1 label column).
print(train_data.shape)

(3132, 9)


In [129]:
# d_2) Define the split_sequence function (!!!! needs revision !!!!)
from collections import Counter
import numpy as np

def split_sequence(sequence, n_steps):
    """Cut a 2-D array into overlapping windows of ``n_steps`` consecutive rows.

    The last column of ``sequence`` is treated as the label; all other
    columns are features. Consecutive windows start one row apart.

    Args:
        sequence: 2-D NumPy array of shape (rows, features + 1).
        n_steps: Number of rows per window; must be at least 1.

    Returns:
        A tuple ``(x, y)`` of NumPy arrays. ``x`` has shape
        (rows - n_steps + 1, n_steps, features) and ``y`` holds one label per
        window: the most frequent label among the window's rows (ties go to
        the label seen first in the window). When ``n_steps`` exceeds the
        number of rows both arrays are empty.

    Raises:
        ValueError: If ``n_steps`` is smaller than 1.
    """
    if n_steps < 1:
        # A non-positive window would otherwise fail later with an obscure
        # IndexError (or silently build meaningless windows).
        raise ValueError(f"n_steps must be a positive integer, got {n_steps!r}")

    x, y = [], []
    # Visit only the start positions at which a full window still fits.
    for start in range(len(sequence) - n_steps + 1):
        window = sequence[start:start + n_steps]
        x.append(window[:, :-1])  # every column except the label

        # NOTE(review): the way a window's label is chosen probably needs
        # rethinking. n_steps is small, and for tabular CSV rows with no
        # sequential relationship a majority vote over neighbouring rows is
        # unlikely to be meaningful.
        y.append(Counter(window[:, -1]).most_common(1)[0][0])

    return np.array(x), np.array(y)

In [130]:
# d_3) Apply split_sequence to the train and test arrays.
# Likewise, the number of steps is raised slightly, to 5.
n_steps=5
x_train, y_train = split_sequence(train_data, n_steps)
x_test, y_test = split_sequence(test_data, n_steps)
print(x_train.shape, y_train.shape)

(3128, 5, 8) (3128,)


In [131]:
# b. One-hot encode the labels.
# Pin the category set to the three Rings classes so that train and test
# always produce the same three columns. Calling get_dummies independently on
# each split drops the column of any class that never appears in that split
# (quite possible after the majority vote), which would no longer match the
# 3-unit softmax output of the model.
label_classes = [0, 1, 2]
y_train = pd.get_dummies(pd.Categorical(np.asarray(y_train).astype(int), categories=label_classes))
y_test = pd.get_dummies(pd.Categorical(np.asarray(y_test).astype(int), categories=label_classes))
y_train.shape


(3128, 3)

## 4. 모델 생성 및 학습

In [132]:
# a. Build the CNN model.
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Dropout

model = Sequential()

# Each sample is treated as a 5 x 8 single-channel "image": 5 rows per window
# and 8 feature columns.
# NOTE(review): x_train is printed as (3128, 5, 8), i.e. without the trailing
# channel axis declared here; confirm the framework adds it, or reshape.
model.add(Conv2D(32, kernel_size=(3,3), activation='relu', input_shape=(5,8,1)))

model.add(MaxPooling2D(pool_size=(2,2)))

model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
# Three output units, one per binned Rings class (0, 1, 2).
model.add(Dense(3, activation='softmax'))

# categorical_crossentropy matches the one-hot encoded labels.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()


  super().__init__(


In [133]:
# Train for 10 epochs in batches of 32, with validation_split=0.1 reserving
# a tenth of the training windows for validation.
history = model.fit(x_train, y_train, epochs=10, batch_size=32, validation_split=0.1, shuffle=True)


Epoch 1/10
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8528 - loss: 0.5561 - val_accuracy: 0.9521 - val_loss: 0.2214
Epoch 2/10
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9443 - loss: 0.2674 - val_accuracy: 0.9521 - val_loss: 0.2183
Epoch 3/10
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9361 - loss: 0.2863 - val_accuracy: 0.9521 - val_loss: 0.2223
Epoch 4/10
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9383 - loss: 0.2733 - val_accuracy: 0.9521 - val_loss: 0.2216
Epoch 5/10
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9413 - loss: 0.2620 - val_accuracy: 0.9521 - val_loss: 0.2199
Epoch 6/10
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9449 - loss: 0.2500 - val_accuracy: 0.9521 - val_loss: 0.2163
Epoch 7/10
[1m88/88[0m [32m━━━━━━━━━━

In [117]:
# Evaluate on the held-out test windows; prints [loss, accuracy].
print(model.evaluate(x_test,y_test))

[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 877us/step - accuracy: 0.9287 - loss: 0.2471
[0.22969314455986023, 0.9346781969070435]
