### **책의 제목을 보고 책의 가격을 예측하기**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Flatten
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

##### Load dataset

In [2]:
# Mount Google Drive at /content/drive so the dataset stored there can be read.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime, so this cell ties the notebook to Colab — confirm that is acceptable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Location of the book dataset on the mounted Drive.
path = '/content/drive/MyDrive/Samples/book.csv'
# The CSV is decoded as EUC-KR instead of pandas' default UTF-8.
df_1 = pd.read_csv(path, encoding='euc-kr')

##### Data Preprocessing

In [4]:
# Clean the price column and keep only rows usable for training:
#   1. Drop rows that are missing a title or a price.
#   2. Strip every character except digits and '.' from the price text and
#      parse the remainder as a number. regex=True is passed explicitly because
#      pandas >= 2.0 treats the pattern as a literal string by default, which
#      would leave the text untouched and turn such prices into NaN.
#      A raw string avoids the invalid '\d' escape sequence.
#   3. Drop rows whose price still could not be parsed (coerced to NaN), so
#      no NaN target ever reaches the model.
# The frame is rebound instead of mutated in place, and astype(str) keeps the
# cell re-runnable after Price has already been converted to a numeric dtype.
df_1 = (
    df_1
    .dropna(subset=['Title', 'Price'])
    .assign(
        Price=lambda frame: pd.to_numeric(
            frame['Price'].astype(str).str.replace(r'[^\d.]', '', regex=True),
            errors='coerce',
        )
    )
    .dropna(subset=['Price'])
)

  df_1['Price'] = pd.to_numeric(df_1['Price'].str.replace('[^\d.]', ''), errors='coerce')


In [5]:
# Keep only the books whose title contains at least one Hangul syllable (가-힣).
df = df_1.loc[df_1['Title'].str.contains('[가-힣]', regex=True)]

In [6]:
# Model input: the title text. Regression target: the price column.
X = df['Title']
y = df['Price']

##### Tokenization, Vectorization, Padding

In [7]:
# Build a word -> integer-id vocabulary from the titles (Tokenizer defaults),
# then convert every title into its list of word ids.
# NOTE(review): the vocabulary is fitted on all titles, before the train/test
# split performed in a later cell, so words that occur only in test titles are
# still part of the vocabulary — confirm this is intended.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)

In [8]:
# Bring every id sequence to a fixed length of 30 (pad_sequences defaults for
# padding/truncating side). The value 30 must stay in sync with the Embedding
# layer's input_length=30 in the modeling cell.
X_pad = pad_sequences(X_seq, maxlen=30)

In [9]:
# Hold out 20% of the samples as a test set; random_state=42 fixes the shuffle
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)

##### Modeling

In [10]:
# 1D-CNN regressor over the padded title sequences (length 30).
# The comment above each layer gives its output length and/or parameter count;
# the embedding figure is for the 25495-word vocabulary of the recorded run.
model_cnn = Sequential([
    # params: (25495 + 1) * 128 = 3263488
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=30),
    # length: (30 - 5) / 1 + 1 = 26; params: (5 * 128 + 1) * 64 = 41024
    Conv1D(filters=64, kernel_size=5, activation='relu'),
    # length: (26 - 2) / 2 + 1 = 13
    MaxPooling1D(pool_size=2),
    # length: (13 - 3) / 1 + 1 = 11; params: (3 * 64 + 1) * 32 = 6176
    Conv1D(filters=32, kernel_size=3, activation='relu'),
    # max over the 11 positions -> 1 * 32 = 32 features
    GlobalMaxPooling1D(),
    # params: (32 + 1) * 64 = 2112
    Dense(64, activation='relu'),
    # single linear output for the predicted price; params: (64 + 1) * 1 = 65
    Dense(1, activation='linear'),
])
model_cnn.compile(loss='mean_squared_error', optimizer='adam')
model_cnn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 30, 128)           3263488   
                                                                 
 conv1d (Conv1D)             (None, 26, 64)            41024     
                                                                 
 max_pooling1d (MaxPooling1  (None, 13, 64)            0         
 D)                                                              
                                                                 
 conv1d_1 (Conv1D)           (None, 11, 32)            6176      
                                                                 
 global_max_pooling1d (Glob  (None, 32)                0         
 alMaxPooling1D)                                                 
                                                                 
 dense (Dense)               (None, 64)                2

In [11]:
# Train for 10 epochs in mini-batches of 32; validation_split=0.2 reserves 20%
# of the training data for per-epoch validation.
# NOTE(review): no random seed is set anywhere in the visible cells, so the
# training result is presumably not reproducible run-to-run — confirm.
model_cnn.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7a8f20f014b0>

##### Evaluation

In [12]:
# Loss as computed by Keras on the held-out test set (MSE, per compile()).
loss = model_cnn.evaluate(X_test, y_test, verbose=0)
print(f'Test Loss: {loss}')

# Predictions come back with shape (n, 1); flatten them to 1-D for sklearn.
y_pred = model_cnn.predict(X_test).flatten()

# Compute all regression metrics first, then report them together.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'R-squared (R^2): {r2}')

Test Loss: 318504928.0
Mean Squared Error (MSE): 318504944.5007661
Root Mean Squared Error (RMSE): 17846.70682509146
Mean Absolute Error (MAE): 6347.199897144459
R-squared (R^2): 0.15631425310685443
