# 로이터 기사 데이터셋을 불러와 분류

In [1]:
NUM_WORDS = 2500

In [2]:
# 데이터셋 불러오기
from tensorflow.keras.datasets import reuters
(X_train, y_train), (X_test, y_test) = reuters.load_data(num_words=NUM_WORDS)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(8982,) (8982,) (2246,) (2246,)


In [3]:
# 데이터 라벨 확인하기
labels = reuters.get_label_names()
print(len(labels), labels)

46 ('cocoa', 'grain', 'veg-oil', 'earn', 'acq', 'wheat', 'copper', 'housing', 'money-supply', 'coffee', 'sugar', 'trade', 'reserves', 'ship', 'cotton', 'carcass', 'crude', 'nat-gas', 'cpi', 'money-fx', 'interest', 'gnp', 'meal-feed', 'alum', 'oilseed', 'gold', 'tin', 'strategic-metal', 'livestock', 'retail', 'ipi', 'iron-steel', 'rubber', 'heat', 'jobs', 'lei', 'bop', 'zinc', 'orange', 'pet-chem', 'dlr', 'gas', 'silver', 'wpi', 'hog', 'lead')


In [4]:
# 데이터 라벨 확인하기
import numpy as np
index, counts = np.unique(y_train, return_counts=True)
for i in range(len(labels)):
    print(i, labels[i], counts[i])

0 cocoa 55
1 grain 432
2 veg-oil 74
3 earn 3159
4 acq 1949
5 wheat 17
6 copper 48
7 housing 16
8 money-supply 139
9 coffee 101
10 sugar 124
11 trade 390
12 reserves 49
13 ship 172
14 cotton 26
15 carcass 20
16 crude 444
17 nat-gas 39
18 cpi 66
19 money-fx 549
20 interest 269
21 gnp 100
22 meal-feed 15
23 alum 41
24 oilseed 62
25 gold 92
26 tin 24
27 strategic-metal 15
28 livestock 48
29 retail 19
30 ipi 45
31 iron-steel 39
32 rubber 32
33 heat 11
34 jobs 50
35 lei 10
36 bop 49
37 zinc 19
38 orange 19
39 pet-chem 24
40 dlr 36
41 gas 30
42 silver 13
43 wpi 21
44 hog 12
45 lead 18


In [5]:
# DNN 모델 구현하기
from tensorflow.keras import Sequential, layers

model = Sequential([
    layers.Input(shape=(2500,)),
    layers.Dense(512, activation='relu'),
    layers.Dropout(0,5),
    layers.Dense(46, activation='softmax')
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 512)               1280512   
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 46)                23598     
                                                                 
Total params: 1304110 (4.97 MB)
Trainable params: 1304110 (4.97 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [6]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [7]:
# # 입력의 길이를 일정하게 - Tokenizer의 sequencts_to_matrix 사용
from tensorflow.keras.preprocessing.text import Tokenizer
tok = Tokenizer(num_words=NUM_WORDS)
X_train_pad = tok.sequences_to_matrix(X_train, mode='count')
X_test_pad = tok.sequences_to_matrix(X_test, mode='count')

In [8]:
# # 입력의 길이를 일정하게 - pad_sequences 사용
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# X_train_pad = pad_sequences(X_train, maxlen=NUM_WORDS)
# X_test_pad = pad_sequences(X_test, maxlen=NUM_WORDS)

In [9]:
X_train_pad = X_train_pad.reshape(-1, 2500)
X_test_pad = X_test_pad.reshape(-1, 2500)

In [10]:
model.fit(X_train_pad, y_train, epochs=20, batch_size=64)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x222313f76d0>

In [11]:
model.evaluate(X_test_pad, y_test)



[1.1223628520965576, 0.7951914668083191]

In [12]:
sample = X_test_pad[333, :].reshape(-1, 2500)

pred = model.predict(sample)
# print(pred)
import numpy as np
print(np.argmax(pred, axis=1))

[3]


## CNN으로 구현해보기

In [13]:
# Conv2D는 모델을 개선할 목적으로 추가한 것이 아닙니다.
# 인공신경망의 기본 이론을 알고 있으면 어떤 모델을 사용하더라도
# 입력과 출력의 shape만 맞춰주면 학습은 된다는 것을 보여주기 위함입니다.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten

model = Sequential()
model.add(Conv2D(32, (3,3), input_shape=(50,50,1), activation='relu'))
model.add(MaxPooling2D())
model.add(Conv2D(64, (3,3), activation='relu'))
model.add(MaxPooling2D())
model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
#  model.add(Dense(256, activation='relu'))
model.add(Dense(46, activation='softmax'))
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 48, 48, 32)        320       
                                                                 
 max_pooling2d (MaxPooling2  (None, 24, 24, 32)        0         
 D)                                                              
                                                                 
 conv2d_1 (Conv2D)           (None, 22, 22, 64)        18496     
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 11, 11, 64)        0         
 g2D)                                                            
                                                                 
 flatten (Flatten)           (None, 7744)              0         
                                                                 
 dense_2 (Dense)             (None, 512)              

In [14]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [15]:
# # 입력의 길이를 일정하게 - Tokenizer의 sequencts_to_matrix 사용
from tensorflow.keras.preprocessing.text import Tokenizer
tok = Tokenizer(num_words=NUM_WORDS)
X_train_pad = tok.sequences_to_matrix(X_train, mode='count')
X_test_pad = tok.sequences_to_matrix(X_test, mode='count')

In [16]:
# # 입력의 길이를 일정하게
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# X_train_pad = pad_sequences(X_train, maxlen=NUM_WORDS)
# X_test_pad = pad_sequences(X_test, maxlen=NUM_WORDS)

In [17]:
type(X_train_pad), X_train_pad.shape

(numpy.ndarray, (8982, 2500))

In [18]:
X_train_pad = X_train_pad.reshape(-1, 50, 50, 1)

In [19]:
# 모델 학습
model.fit(X_train_pad, y_train, epochs=20, batch_size=64)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x2222f581950>

In [20]:
# 모델 평가
X_train_pad = X_test_pad.reshape(-1, 50, 50, 1)
model.evaluate(X_train_pad, y_test)



[1.2723637819290161, 0.7764915227890015]

In [21]:
# 샘플 데이터를 이용한 예측
sample_no = 333
# X_train_pad[sample_no].reshape(-1, NUM_WORDS).shape
sample = X_train_pad[sample_no].reshape(-1, 50, 50, 1)
pred = model.predict(sample)
y_pred = np.argmax(pred, axis=1)
print(y_test[sample_no], y_pred, labels[y_pred[0]])

3 [3] earn
