In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split  #scikit-learn
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
import pickle
from konlpy.tag import Kkma, Okt

In [2]:
df = pd.read_csv('./crawling_data/naver_news_20220330.csv')
pd.set_option('display.unicode.east_asian_width',True)
df.reset_index(drop=False ,inplace=True)
df= df[['index','title','category']]
df.set_index('index',inplace=True)
X = df.title
Y = df.category
df.info()
X
Y

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13931 entries, 0 to 13930
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     13931 non-null  object
 1   category  13931 non-null  object
dtypes: object(2)
memory usage: 326.5+ KB


index
0        Economic
1        Economic
2        Economic
3        Economic
4        Economic
           ...   
13926     Culture
13927     Culture
13928     Culture
13929     Culture
13930     Culture
Name: category, Length: 13931, dtype: object

In [3]:
from tensorflow.python.keras.utils.np_utils import to_categorical
encoder = LabelEncoder()
Label_Y = encoder.fit_transform(Y)
print(encoder.classes_)
print(Label_Y[:5])
with open('./output/encoder.pickle','wb') as f:
    pickle.dump(encoder, f)
onehot_Y = to_categorical(Label_Y)
print(onehot_Y)


['Culture' 'Economic' 'IT' 'Politics' 'Social' 'World']
[1 1 1 1 1]
[[0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 ...
 [1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0.]]


In [4]:
okt = Okt()
for i in range(len(X)):
    X[i] = okt.morphs(X[i], stem=True)

In [5]:
stopwords = pd.read_csv('./crawling_data/stopwords.csv' ,index_col=0)

for j in range(len(X)):
    words = []
    for i in range(len(X[j])):
        if len(X[j][i]) > 1:
            if X[j][i] not in list(stopwords['stopword']):
                words.append(X[j][i])
    X[j] = ' '.join(words)
print(X[1])


핫스탁 안랩 남선 미우다 플랜 티넷 종합


In [6]:
token = Tokenizer()
token.fit_on_texts(X)
tokened_X = token.texts_to_sequences(X)
print(tokened_X[0])
with open('./output/news_token.pickle', 'wb') as f:
    pickle.dump(token, f)
wordsize = len(token.word_index) + 1
print(wordsize)
print(token.word_index)


[5796, 4647, 584, 365, 1105, 7763]
12831
{'크다': 1, '삼성': 2, '대다': 3, '이준석': 4, '정부': 5, '코로나': 6, 'lg': 7, '종합': 8, '인수위': 9, '러시아': 10, '날씨': 11, '대통령': 12, '만원': 13, 'sk': 14, '논란': 15, '당선인': 16, '만에': 17, '시위': 18, '김정숙': 19, '투자': 20, '단독': 21, '내일': 22, '가능성': 23, '개발': 24, '에도': 25, '검토': 26, '돼다': 27, '장애인': 28, '푸틴': 29, '세계': 30, '카카오': 31, '전국': 32, '가다': 33, '총리': 34, '오늘': 35, '대표': 36, '윤석열': 37, '받다': 38, '사업': 39, '시장': 40, '기업': 41, '장연': 42, '추진': 43, '나오다': 44, '뉴스': 45, '바이오': 46, '한국': 47, '여사': 48, '국민': 49, '영상': 50, '협상': 51, '진자': 52, '주가': 53, '치료': 54, '사망': 55, '이유': 56, '출마': 57, '서울': 58, '신규': 59, '바이든': 60, '인수': 61, '서다': 62, '체인': 63, '블록': 64, '출시': 65, '확진': 66, '옷값': 67, '공개': 68, '건설': 69, '이전': 70, '최대': 71, '정치': 72, '준비': 73, '게임': 74, '안철수': 75, '통화': 76, '민주당': 77, '지원': 78, '쌍용차': 79, '직원': 80, '완화': 81, '렌스': 82, '반도체': 83, '먹다': 84, '우려': 85, '미국': 86, '알다': 87, '가상': 88, '스퀘어': 89, '오미크론': 90, '산업': 91, '만들다': 92, '접종': 93, '폐지': 94, '경기':

In [7]:
max = 0
for i in range(len(tokened_X)):
    if max < len(tokened_X[i]):
        max = len(tokened_X[i])
print(max)

29


In [8]:
X_pad = pad_sequences(tokened_X, max)
print(X_pad[:10])

X_train, X_test, Y_train, Y_test = train_test_split(
    X_pad, onehot_Y, test_size=0.1)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

xy = X_train, X_test, Y_train, Y_test
np.save(
    './crawling_data/news_data_max_{}_wordsize_{}'.format(max, wordsize), xy)

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0 5796 4647  584  365 1105
  7763]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0 3938  763 3939 4648 3940 5797
     8]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0 2722  145   70  229 1549 1292 1293  435  136
   109]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0 7764 3941 5798   13 1455   58
  3942]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0  969 1106   69  737 3036
  1369]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0  420  436  887  285  854 1008 7765  232  394
  1107]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0