### 책의 제목을 보고 책의 가격을 예측하기

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

##### Load dataset

In [2]:
# Colab-only step: attach Google Drive to the runtime so that files under
# /content/drive can be read by the cells below.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [9]:
# Location of the book listing CSV on the mounted Drive.
path = '/content/drive/MyDrive/Samples/book.csv'

# The file is decoded as EUC-KR; df_1 holds the raw, unprocessed table.
df_1 = pd.read_csv(filepath_or_buffer=path, encoding='euc-kr')


##### Data Preprocessing

In [11]:
# Discard rows that lack a title or a price: both are required, since the
# title is the model input and the price is the regression target.
# Rebinding the name (rather than inplace=True) leaves the originally loaded
# object untouched and keeps the cell safe to re-run.
df_1 = df_1.dropna(subset=['Title', 'Price'])

In [12]:
# Convert the raw price text into numbers.
#
# Every character that is not a digit or '.' is removed before parsing.
# - regex=True is passed explicitly: the default of Series.str.replace
#   changed to a literal match in pandas 2.0, which would leave the text
#   unchanged and make to_numeric coerce those prices to NaN.
# - The pattern is a raw string so that '\d' is not treated as a (invalid)
#   Python string escape.
# - The dtype guard makes the cell idempotent: on a re-run Price is already
#   numeric and the .str accessor would otherwise raise AttributeError.
if not pd.api.types.is_numeric_dtype(df_1['Price']):
    price_text = df_1['Price'].astype(str).str.replace(r'[^\d.]', '', regex=True)
    df_1['Price'] = pd.to_numeric(price_text, errors='coerce')

# errors='coerce' turns unparseable prices into NaN; drop them so that the
# regression target never contains NaN (which would make the MSE loss NaN).
df_1 = df_1.dropna(subset=['Price'])

  df_1['Price'] = pd.to_numeric(df_1['Price'].str.replace('[^\d.]', ''), errors='coerce')


In [13]:
# Keep only the rows whose title contains at least one Hangul syllable
# (the character range 가-힣).
has_hangul = df_1['Title'].str.contains('[가-힣]', regex=True)
df = df_1.loc[has_hangul]

In [14]:
# Model input (book titles) and regression target (prices), taken from the
# filtered frame.
X, y = df['Title'], df['Price']

##### Tokenization, Vectorization

**토큰화**: 문장을 단어나 형태소 같은 작은 단위(토큰)로 나누는 것. 아래 예시는 형태소로 분석하여 나눈 경우이며, 이 노트북에서 사용하는 Keras `Tokenizer`는 기본적으로 공백을 기준으로 나눈다.

ex) 가방에 들어가신다 → 가방 / 에 / 들어가 / 시 / ㄴ다

**벡터화**: 단어를 추출하여 수치화하는 것 (토큰화된 텍스트를 벡터로 변환하는 것)

one-hot encoding: 각 단어를 하나의 인덱스가 1이고 나머지는 0인 벡터로 표현

word embedding: 각 단어를 고정된 크기의 벡터로 표현

In [16]:
# Fit a Keras Tokenizer on all titles to build the word -> integer index,
# then encode every title as a list of those integer ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=X)
X_seq = tokenizer.texts_to_sequences(texts=X)

In [18]:
# Bring every encoded title to a fixed length of 30 ids so the batch forms a
# rectangular array; shorter titles are padded and longer ones truncated
# (Keras defaults decide which side).
X_pad = pad_sequences(sequences=X_seq, maxlen=30)

In [19]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# split itself reproducible.
(
    X_train,
    X_test,
    y_train,
    y_test,
) = train_test_split(
    X_pad,
    y,
    test_size=0.2,
    random_state=42,
)

##### Modeling

In [21]:
# Feed-forward regressor over title tokens:
#   Embedding -> Flatten -> Dense(64, relu) -> Dense(1, linear)
# The embedding table has one row per word in the tokenizer index plus one
# extra row, because Tokenizer ids start at 1 and id 0 is left for padding.
# input_length=30 matches the maxlen used when padding the sequences.
model_fnn = Sequential([
    Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=128, input_length=30),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1, activation='linear'),
])

# Single linear output trained with mean squared error (price regression).
model_fnn.compile(optimizer='adam', loss='mean_squared_error')
model_fnn.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 30, 128)           3263488   
                                                                 
 flatten_1 (Flatten)         (None, 3840)              0         
                                                                 
 dense_2 (Dense)             (None, 64)                245824    
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 3509377 (13.39 MB)
Trainable params: 3509377 (13.39 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [23]:
# Train for 10 epochs in mini-batches of 32; 20% of the training data is set
# aside by Keras as a validation set during fitting.
model_fnn.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7b42b0dc8f70>

##### Evaluation

In [24]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [25]:
# Evaluate the trained model on the held-out test split.
# The model was compiled with mean squared error as its loss, so this value
# should agree with the sklearn MSE below up to float precision.
loss = model_fnn.evaluate(X_test, y_test, verbose=0)
print(f'Test Loss: {loss}')

# The output layer has a single unit, so predict() yields one column;
# flatten it to a 1-D array that lines up with y_test.
y_pred = model_fnn.predict(X_test).flatten()

mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error (MSE): {mse}')

# RMSE is in the same unit as the price, which makes it easier to interpret.
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error (RMSE): {rmse}')

mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error (MAE): {mae}')

# Fixed label typo: previously printed as 'R-sqared'.
r2 = r2_score(y_test, y_pred)
print(f'R-squared (R^2): {r2}')

Test Loss: 336550208.0
Mean Squared Error (MSE): 336550262.4616745
Root Mean Squared Error (RMSE): 18345.306278764456
Mean Absolute Error (MAE): 6779.866832116357
R-squared (R^2): 0.10851412370655034


In [26]:
# Show the predicted prices for the test split (NumPy abbreviates long arrays).
y_pred

array([ 8438.515 ,  7607.8726,  5258.3374, ...,  8050.1753,  6341.824 ,
       10663.306 ], dtype=float32)

In [27]:
# Average price over the filtered dataset, shown as a reference scale for the
# error metrics reported above.
price_column = df['Price']
price_column.mean()

10204.678457335987