In [1]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, LSTM, Embedding, Bidirectional

In [2]:
# Text classifier: token ids -> embeddings -> bidirectional LSTM -> class probabilities.
model = Sequential([
    Input(shape=(200,)),                        # each article is limited to 200 tokens
    Embedding(input_dim=5000, output_dim=64),   # 5000-entry vocabulary, 64-d vectors
    Dropout(0.5),
    Bidirectional(LSTM(100)),                   # 100 units per direction
    Dense(6, activation='softmax'),             # one probability per output unit
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 64)           320000    
                                                                 
 dropout (Dropout)           (None, 200, 64)           0         
                                                                 
 bidirectional (Bidirection  (None, 200)               132000    
 al)                                                             
                                                                 
 dense (Dense)               (None, 6)                 1206      
                                                                 
Total params: 453206 (1.73 MB)
Trainable params: 453206 (1.73 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [3]:
import nltk
# Fetch the NLTK stop-word corpus so the word list below can be read.
nltk.download('stopwords')
# English stop words, held in a set for membership checks and iteration later on.
stopwords = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Show the full stop-word set for a quick visual check.
print(stopwords)

{'weren', 'theirs', 'that', 'his', 'about', 'being', 'here', 'am', "couldn't", 'shouldn', 'which', 'when', 'themselves', 'all', 'they', 'wasn', 'did', 'only', 't', 'who', "doesn't", 'during', 'ain', "you've", 'your', 'too', "won't", 'and', 'haven', 'from', 'as', 'then', 'be', 'yours', 'under', 're', 'this', 'itself', 'of', 've', 'myself', 'above', 'very', 'between', 'you', 'don', 'whom', 'hasn', 'not', 'me', 'him', 'aren', 'm', 'just', 'having', 'ma', 'again', "you'll", 'yourselves', 'ours', 'in', "haven't", 'mustn', 'most', "don't", 'further', 'there', 'how', 'same', 'should', 'while', 'into', 'why', 'y', 'to', "didn't", 'is', 'so', 'no', 'than', 'my', "you're", 'up', 'where', 'what', "you'd", 'each', "it's", 'at', 'own', 'both', "wouldn't", 'were', 'against', 'a', 'by', 'for', 'its', 'these', 'on', 'd', 'i', 'needn', 'if', 'doesn', 'she', 'them', 'our', 'yourself', 'he', "should've", 'below', 'now', 'her', 'over', 'ourselves', 'their', 's', "mightn't", 'll', 'wouldn', 'have', 'couldn

In [5]:
import csv

original = []   # raw article text
processed = []  # article text after stop-word removal
labels = []     # article category

# The encoding is stated explicitly so decoding does not depend on the OS locale
# (assumes the CSV is UTF-8 -- adjust if the file was saved differently).
# newline='' lets the csv module handle line breaks inside quoted fields itself.
with open('bbc-text.csv', 'r', encoding='utf-8', newline='') as file:
    reader = csv.reader(file)
    next(reader, None)  # the first row is the header, so skip it
    for row in reader:
        if len(row) < 2:
            continue  # blank or malformed row: nothing to label or tokenize
        labels.append(row[0])
        original.append(row[1])

        # Drop stop words token by token. Unlike replacing ' word ' substrings,
        # this also catches stop words at the very start/end of the text, next to
        # newlines or tabs, and directly repeated ones, in a single pass.
        news = ' '.join(word for word in row[1].split() if word not in stopwords)
        processed.append(news)

In [6]:
# print(labels[0], processed[0])

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
# Tokenizer for the article bodies, fitted on the stop-word-filtered texts.
A_token = Tokenizer(num_words=5000, oov_token='OOV') # tokens missing from the vocabulary are marked as OOV
A_token.fit_on_texts(processed)
# Convert every article into its list of integer token ids.
A_tokenized = A_token.texts_to_sequences(processed)

In [8]:
# print(A_tokenized[0])

In [9]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Bring every article to exactly 200 token ids: shorter sequences are padded at the
# end ('post') and longer ones are cut at the end ('post').
A_tokenized_seq = pad_sequences(A_tokenized, maxlen=200, padding='post', truncating='post')
# Display the container type of the padded result.
type(A_tokenized_seq)

numpy.ndarray

In [10]:
import numpy as np

# A second tokenizer turns each category name into an integer id.
C_token = Tokenizer()
C_token.fit_on_texts(labels)
C_tokenized = C_token.texts_to_sequences(labels)
print(type(C_tokenized))

# Collapse the nested id lists into one flat 1-D array of labels.
C_tokenized = np.asarray(C_tokenized).ravel()
print(C_tokenized.shape)

<class 'list'>
(2225,)


In [11]:
# Invert the label tokenizer's word -> index mapping so predictions can be decoded.
idx_to_label = {index: label for label, index in C_token.word_index.items()}
print(idx_to_label)  # label indices start at 1, hence 6 neurons in the output layer

{1: 'sport', 2: 'business', 3: 'politics', 4: 'tech', 5: 'entertainment'}


In [12]:
from sklearn.model_selection import train_test_split

# A fixed random_state makes the split the same on every run; without it each
# execution shuffles differently, which changes the held-out set.
# stratify keeps the class proportions the same in both partitions.
# test_size=0.25 is scikit-learn's default, written out for clarity.
X_train, X_test, y_train, y_test = train_test_split(
    A_tokenized_seq,
    C_tokenized,
    test_size=0.25,
    random_state=42,
    stratify=C_tokenized,
)

In [13]:
# The sparse categorical loss takes the integer class ids in y directly, so the
# labels need no one-hot encoding; accuracy is tracked as an extra metric.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [14]:
# Train on the training partition for 20 epochs in mini-batches of 200 samples.
model.fit(X_train, y_train, epochs=20, batch_size=200)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x13f9b1e5590>

In [15]:
# Score the trained model on the held-out partition; the result is [loss, accuracy].
model.evaluate(X_test, y_test)



[0.16394966840744019, 0.9551166892051697]

In [16]:
# Unseen sample article (a cricket news story) used to try the trained classifier.
# Kept as a one-element list because the tokenizer expects a list of texts.
news = ['''
Fast bowler Josh Hazlewood has admitted that it is in Australia's "best interest" for England to be eliminated in the T20 World Cup group stage.

Australia qualified for the Super 8s with a comprehensive nine-wicket win over Namibia, leaving England and Scotland to battle it out for second place.

Jos Buttler's side need to beat Oman and Namibia to have any chance of progressing but, even if they do, net run-rate could determine who goes through.

That would give Australia, who face Scotland on Sunday, a potentially key role in determining who else will advance from Group B.


A Scotland win would send them through and eliminate England but, with Richie Berrington's team currently boasting a vastly superior net run-rate to England, even a narrow Australia victory could be enough to knock out the defending champions.

Asked if Australia would try to make things as difficult as possible for England, Hazlewood said: "Yeah, I think so.

"In this tournament you potentially come up against England at some stage again and they're probably one of the top few teams on their day.

"We've had some real struggles against them in T20 cricket so if we can get them out of the tournament that's in our best interest, as well as probably everyone else's."
''']

In [17]:
import re

# Remove punctuation, then lowercase. The stop-word list is all lowercase, so
# without lowercasing, capitalised stop words ("In", "A", "That", ...) would slip
# through. The tokenizer lowercases text anyway, so this only affects stop-word
# matching, not the token ids of the remaining words.
cleaned = re.sub(r'[^\w\s]', '', news[0]).lower()

# Filter token by token so stop words at the start/end of the text or next to
# newlines are removed too (a ' word ' substring replace misses those).
news[0] = ' '.join(word for word in cleaned.split() if word not in stopwords)
# print(news[0])

In [18]:
# Encode the sample with the tokenizer fitted on the training articles, then
# pad/truncate it with the same settings used for the training sequences
# (200 ids, 'post' padding and truncation).
news_seq = A_token.texts_to_sequences(news)
news_padded = pad_sequences(news_seq, maxlen=200, padding='post', truncating='post')

In [19]:
pred = model.predict(news_padded)

# The softmax layer has one more unit than there are labels: label indices start
# at 1, so unit 0 never corresponds to a category. Taking the argmax over units
# 1.. only (and shifting back by one) guarantees the lookup key exists instead of
# raising KeyError whenever unit 0 happens to score highest.
best_idx = int(np.argmax(pred[0][1:])) + 1
print(idx_to_label[best_idx])

sport
