In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

##### Load dataset

In [2]:
# Colab-only step: attach Google Drive to the runtime's filesystem so the
# dataset under /content/drive can be read by the next cell.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [3]:
# Location of the raw book listing on the mounted Drive.
path = '/content/drive/MyDrive/Samples/book.csv'

# The file is decoded as EUC-KR rather than pandas' default UTF-8.
df_1 = pd.read_csv(filepath_or_buffer=path, encoding='euc-kr')

##### Data Preprocessing

In [4]:
# Clean the target column.
#
# 1. Strip every character that is not a digit or a dot from Price.
#    - The pattern is a raw string, so `\d` is not an invalid escape sequence.
#    - `regex=True` is passed explicitly: the implicit default is deprecated
#      (see the FutureWarning this cell used to emit) and newer pandas treats
#      the pattern as a literal string, which would strip nothing.
#    - `astype(str)` lets the cell be re-run after Price is already numeric.
# 2. Convert to numbers; anything unparseable (including missing values,
#    which end up as an empty string) becomes NaN.
# 3. Drop rows with a missing Title or a missing/unparseable Price, so no NaN
#    target can reach the model.
#
# A new frame is bound to `df_1` instead of mutating in place.
price_text = df_1['Price'].astype(str).str.replace(r'[^\d.]', '', regex=True)
df_1 = (
    df_1
    .assign(Price=pd.to_numeric(price_text, errors='coerce'))
    .dropna(subset=['Title', 'Price'])
)

  df_1['Price'] = pd.to_numeric(df_1['Price'].str.replace('[^\d.]', ''), errors='coerce')


In [5]:
# Keep only rows whose title contains at least one character in the range
# 가-힣 (precomposed Hangul syllables), i.e. titles with some Korean text.
has_hangul = df_1['Title'].str.contains('[가-힣]', regex=True)
df = df_1[has_hangul]

In [6]:
# Model input is the raw title text; the regression target is the numeric price.
X, y = df['Title'], df['Price']

##### Tokenization, Vectorization, Padding

In [7]:
# Build the word -> integer-id vocabulary from the titles (Tokenizer defaults),
# then encode each title as a list of ids.
# NOTE(review): the vocabulary is fit on all of X, before the train/test split
# performed further down, so test-set titles contribute to it.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=X)
X_seq = tokenizer.texts_to_sequences(texts=X)

In [8]:
# Force every encoded title to exactly 30 ids. The padding/truncating modes
# are spelled out here; 'pre' is the library default for both, so shorter
# titles get zeros at the front and longer ones lose their leading ids.
X_pad = pad_sequences(
    X_seq,
    maxlen=30,
    padding='pre',
    truncating='pre',
)

In [9]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# split itself repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_pad,
    y,
    test_size=0.2,
    random_state=42,
)

##### Modeling

In [10]:
# Seed Python, NumPy and TensorFlow before any weights are created so that
# initialisation (and the shuffling done later during fit) is repeatable on
# Restart & Run All.
set_random_seed(42)

# +1 because Tokenizer ids start at 1 and id 0 is the padding value.
vocab_size = len(tokenizer.word_index) + 1
# Take the sequence length from the padded data instead of repeating the
# literal used when padding, so the two can never drift apart.
seq_len = X_pad.shape[1]

# Title ids -> 128-d embeddings -> single SimpleRNN state -> dense head that
# outputs one unbounded value (the predicted price).
# Parameter counts per layer are noted so they can be checked against summary().
model_rnn = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=seq_len),  # vocab_size * 128
    SimpleRNN(64, return_sequences=False),                                  # (128 + 64 + 1) * 64 = 12352
    Dense(64, activation='relu'),                                           # (64 + 1) * 64 = 4160
    Dense(1, activation='linear'),                                          # (64 + 1) * 1 = 65
])
model_rnn.compile(optimizer='adam', loss='mean_squared_error')
model_rnn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 30, 128)           3263488   
                                                                 
 simple_rnn (SimpleRNN)      (None, 64)                12352     
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 3280065 (12.51 MB)
Trainable params: 3280065 (12.51 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [11]:
# Train for 10 epochs in mini-batches of 32, with 20% of the training data
# set aside by Keras for per-epoch validation.
# The returned History is bound to a name so the per-epoch losses stay
# available (history.history) and its object repr is not echoed as cell output.
history = model_rnn.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7d49932bca60>

##### Evaluation

In [12]:
# Loss on the held-out set as computed by Keras (the model was compiled with
# mean squared error).
loss = model_rnn.evaluate(X_test, y_test, verbose=0)
print(f'Test Loss: {loss}')

# Flatten the predictions to 1-D before comparing them with y_test.
y_pred = model_rnn.predict(X_test).flatten()

# Compute every regression metric first, then report them in one place.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)  # same units as the price itself
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'R-squared (R^2): {r2}')

Test Loss: 392022976.0
Mean Squared Error (MSE): 392022990.8972734
Root Mean Squared Error (RMSE): 19799.57047254494
Mean Absolute Error (MAE): 6406.823593252677
R-squared (R^2): -0.03842723821091387
