# 1. 데이터 불러오기

In [2]:
import pandas as pd


# Load the diamonds sample dataset from the seaborn-data repository into a DataFrame.
DIAMONDS_CSV_URL = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/diamonds.csv"
diamond = pd.read_csv(DIAMONDS_CSV_URL)
diamond

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [3]:
# Inspect the DataFrame's shape as (rows, columns)
diamond.shape

(53940, 10)

In [4]:
# List the column (feature) names
diamond.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [5]:
# Show the distinct values of the label column ('price')
print(diamond['price'].unique())

[ 326  327  334 ... 2753 2755 2756]


# 2. 데이터 EDA

In [6]:
# Label-encode the 'cut' feature (string categories -> integer codes).
# NOTE(review): the printed classes_ come out in sorted (alphabetical) order, so
# the integer codes carry no grade ordering -- confirm that is acceptable here.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
cut_codes = le.fit_transform(diamond['cut'])
diamond['cut'] = cut_codes
print(le.classes_)
print(diamond['cut'])

['Fair' 'Good' 'Ideal' 'Premium' 'Very Good']
0        2
1        3
2        1
3        3
4        1
        ..
53935    2
53936    1
53937    4
53938    3
53939    2
Name: cut, Length: 53940, dtype: int32


In [7]:
# Label-encode 'color' and 'clarity' the same way as 'cut'.
# A fresh encoder is fitted per column; after the loop `le` is the 'clarity' encoder.
for column in ('color', 'clarity'):
    le = LabelEncoder()
    diamond[column] = le.fit_transform(diamond[column])
    print(le.classes_)
    print(diamond[column])

['D' 'E' 'F' 'G' 'H' 'I' 'J']
0        1
1        1
2        1
3        5
4        6
        ..
53935    0
53936    0
53937    0
53938    4
53939    0
Name: color, Length: 53940, dtype: int32
['I1' 'IF' 'SI1' 'SI2' 'VS1' 'VS2' 'VVS1' 'VVS2']
0        3
1        2
2        4
3        5
4        3
        ..
53935    2
53936    2
53937    2
53938    3
53939    3
Name: clarity, Length: 53940, dtype: int32


In [8]:
# Separate the target ('price') from the feature columns
data = diamond.drop(columns='price')
label = diamond['price']
print(label.shape)
print(data.shape)

(53940,)
(53940, 9)


In [9]:
# Split into training and test sets (75% / 25%, the scikit-learn default),
# with a fixed seed so the split is reproducible.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    data,
    label,
    test_size=0.25,
    random_state=42,
)
print(x_train.shape, x_test.shape)
print(y_train.shape, y_test.shape)

(40455, 9) (13485, 9)
(40455,) (13485,)


In [10]:
# Preview the training features (categoricals are already integer-encoded)
x_train

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
35965,0.25,1,1,7,64.9,58.0,3.95,3.97,2.57
52281,0.84,2,6,2,61.8,56.0,6.04,6.07,3.74
6957,1.05,3,6,5,61.1,58.0,6.56,6.51,3.99
9163,1.02,2,2,3,60.7,56.0,6.53,6.50,3.95
50598,0.61,2,2,4,61.8,57.0,5.43,5.47,3.37
...,...,...,...,...,...,...,...,...,...
11284,1.05,4,5,5,62.4,59.0,6.48,6.51,4.05
44732,0.47,2,0,4,61.0,55.0,5.03,5.01,3.06
38158,0.33,4,2,1,60.3,58.0,4.49,4.46,2.70
860,0.90,3,6,2,62.8,59.0,6.13,6.03,3.82


In [11]:
# Targets as plain NumPy arrays.
# NOTE: only the last expression of the cell is echoed, so the output shown
# below is y_test.values; the y_train.values line produces no visible output.
y_train.values
y_test.values

array([ 559, 2201, 1238, ..., 9215, 4416, 3564], dtype=int64)

## 3-3) split_sequence를 이용한 DL regression

In [25]:
# Append the target as the last column so split_sequence can slice features
# and label out of a single 2-D array.
# NOTE: run this cell before x_train / y_train are rebound to the windowed
# arrays returned by split_sequence; with a 3-D x_train, np.c_ raises the
# "same number of dimensions" ValueError.
import numpy as np

train_set = np.c_[x_train, y_train]
test_set = np.c_[x_test, y_test]

for merged in (train_set, test_set):
    print(merged.shape)

ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 3 dimension(s) and the array at index 1 has 2 dimension(s)

In [26]:
#split sequence 함수 
from collections import Counter

def split_sequence(sequence, n_steps):
    """Slice a 2-D array into overlapping windows of ``n_steps`` consecutive rows.

    The last column of ``sequence`` is treated as the target and every other
    column as a feature. Each window is labelled with the most frequent target
    value among its rows; ties go to the value that appears first in the window.

    Args:
        sequence: 2-D NumPy array of shape (n_rows, n_features + 1).
        n_steps: Number of consecutive rows per window; must be at least 1.

    Returns:
        A tuple ``(x, y)`` of NumPy arrays. ``x`` has shape
        (n_windows, n_steps, n_features) and ``y`` has shape (n_windows,),
        where ``n_windows = n_rows - n_steps + 1``. Both are empty 1-D arrays
        when ``sequence`` has fewer than ``n_steps`` rows.

    Raises:
        ValueError: If ``n_steps`` is smaller than 1.
    """
    # A window size of 0 would hit an IndexError on the empty Counter and a
    # negative one would silently build malformed windows, so reject both early.
    if n_steps < 1:
        raise ValueError(f"n_steps must be a positive integer, got {n_steps!r}")

    # A non-positive window count yields an empty range, i.e. no windows.
    n_windows = len(sequence) - n_steps + 1
    windows, labels = [], []
    for start in range(n_windows):
        window = sequence[start:start + n_steps]
        windows.append(window[:, :-1])
        # most_common(1) keeps the first-seen value when counts are tied.
        labels.append(Counter(window[:, -1]).most_common(1)[0][0])

    return np.array(windows), np.array(labels)

In [33]:
# Apply split_sequence with a window of 5 rows.
# NOTE: this rebinds x_train / y_train / x_test / y_test to the windowed arrays,
# so the earlier 2-D frames are no longer reachable under these names and the
# cell that builds train_set / test_set cannot be re-run afterwards.
n_steps = 5
x_train, y_train = split_sequence(train_set, n_steps)
x_test, y_test = split_sequence(test_set, n_steps)


# 모델 생성 : CNN

In [34]:
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Input

# 2-D CNN regressor over (5 rows x 9 features x 1 channel) windows with a single
# linear output unit for the price.
# The input shape is declared with an explicit Input object; passing
# `input_shape=` to the first layer is deprecated in Keras 3.
# NOTE(review): split_sequence yields windows of shape (5, 9) with no channel
# axis -- consider feeding x_train[..., np.newaxis] so the rank matches explicitly.
model = Sequential([
    Input(shape=(5, 9, 1)),
    Conv2D(32, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(1),
])

model.compile(optimizer='rmsprop', loss='mse')
model.summary()

In [35]:
# Train the Conv2D model for 20 epochs, holding out 10% of the training windows for validation.
history = model.fit(x_train, y_train, epochs=20, batch_size= 32, validation_split=0.1, shuffle=True)

Epoch 1/20
1138/1138 ━━━━━━━━━━━━━━━━━━━━ 2s 1ms/step - loss: 18815462.0000 - val_loss: 16499704.0000
Epoch 2/20
1138/1138 ━━━━━━━━━━━━━━━━━━━━ 2s 1ms/step - loss: 15978939.0000 - val_loss: 16474923.0000
Epoch 3/20
1138/1138 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 16003522.0000 - val_loss: 16422447.0000
Epoch 4/20
1138/1138 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 15647109.0000 - val_loss: 16437291.0000
Epoch 5/20
1138/1138 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 15751703.0000 - val_loss: 16287311.0000
Epoch 6/20
1138/1138 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 15105841.0000 - val_loss: 15650833.0000
Epoch 7/20
1138/1138 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 14754327.0000 - val_loss: 14323131.0000
Epoch 8/20
1138/1138 ━━━━━━━━━━━━━━ … (output truncated)

In [19]:
# Predictions of the Conv2D model on the test windows
preds = model.predict(x_test)

422/422 ━━━━━━━━━━━━━━━━━━━━ 0s 837us/step


In [20]:
# Mean squared error of the predictions on the test windows.
# scikit-learn's signature is (y_true, y_pred); MSE is symmetric, so the value
# is unchanged, but the documented argument order keeps the call unambiguous.
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, preds)

In [21]:
# Show the raw predictions followed by the test-set MSE
print(preds)
print(mse)

[[ 980.0762]
 [1440.5908]
 [ 600.3467]
 ...
 [6733.7476]
 [2883.3098]
 [6217.7505]]
9179236.237420237


In [37]:
## Same regressor rebuilt with Conv1D
from keras.layers import Conv1D, MaxPooling1D, Input

# 1-D CNN over windows of 5 rows x 9 features: the convolution slides along the
# row axis and treats the 9 features as channels.
# The input shape is declared with an explicit Input object; passing
# `input_shape=` to the first layer is deprecated in Keras 3.
model_1d = Sequential([
    Input(shape=(5, 9)),
    Conv1D(32, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(1),
])

model_1d.compile(optimizer='rmsprop', loss='mse')
model_1d.summary()

In [38]:
# Train the Conv1D model with the same settings; note this rebinds `history`.
history = model_1d.fit(x_train, y_train, epochs=20, batch_size=32, validation_split=0.1, shuffle=True)

Epoch 1/20
1138/1138 ━━━━━━━━━━━━━━━━━━━━ 2s 1ms/step - loss: 18944966.0000 - val_loss: 16034785.0000
Epoch 2/20
1138/1138 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 15082123.0000 - val_loss: 15094717.0000
Epoch 3/20
1138/1138 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 13392576.0000 - val_loss: 11417073.0000
Epoch 4/20
1138/1138 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 9994884.0000 - val_loss: 9972874.0000
Epoch 5/20
1138/1138 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 9181634.0000 - val_loss: 8863031.0000
Epoch 6/20
1138/1138 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 8488996.0000 - val_loss: 7911832.0000
Epoch 7/20
1138/1138 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 7609373.0000 - val_loss: 7438864.5000
Epoch 8/20
1138/1138 ━━━━━━━━━━━━━━━━━━━━ … (output truncated)

In [39]:
# Predictions of the Conv1D model on the test windows (rebinds `preds`)
preds = model_1d.predict(x_test)

422/422 ━━━━━━━━━━━━━━━━━━━━ 0s 892us/step


In [40]:
# Test-set MSE for the Conv1D model; arguments follow scikit-learn's
# (y_true, y_pred) order (the value is the same either way for MSE).
mse = mean_squared_error(y_test, preds)

In [43]:
# Show the Conv1D predictions followed by their test-set MSE
print(preds)
print(mse)

[[ 1039.5659]
 [ 2933.8423]
 [ 1472.0989]
 ...
 [10604.112 ]
 [ 3377.7104]
 [ 2323.0425]]
3409148.367083478
