In [2]:
import pandas as pd
import numpy as np
import urllib.request
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [3]:
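# One-time setup: uncomment and run these on a fresh environment to fetch the
# NLTK resources used below (tokenizer models, WordNet, stop-word list).
# They are left disabled here because the data has already been downloaded.
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')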

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\008yo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\008yo\AppData\Roaming\nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\008yo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [6]:
# Read the headline CSV from the working directory and show its
# (rows, columns) shape.
DATA_PATH = "abcnews-date-text.csv"
df = pd.read_csv(DATA_PATH)
df.shape

(1082168, 2)

In [7]:
# Preview the first five rows of the loaded data.
df.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [8]:
# Work on an independent one-column frame so the loaded `df` is left untouched.
text = df.loc[:, ['headline_text']].copy()
text.head()

Unnamed: 0,headline_text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers


In [9]:
text.nunique() # Count the distinct headlines (duplicates are counted once)

headline_text    1054983
dtype: int64

In [10]:
# Drop duplicate headlines and renumber the remaining rows from 0.
# Rebinding the name (instead of inplace=True) keeps the step chainable and
# avoids mutating a frame that an earlier cell already displayed.
text = text.drop_duplicates().reset_index(drop=True)
text.shape

(1054983, 1)

### 데이터 정제 및 정규화
* NLTK의 토크나이저를 이용해 전체 텍스트 데이터에 대해서 단어 토큰화를 수행
* NLTK가 제공하는 불용어 리스트를 사용하여 불용어를 제거

In [11]:
# Tokenize every headline with NLTK's word tokenizer. Applying the function to
# the column itself avoids building a row object per headline, which is what
# DataFrame.apply(..., axis=1) does.
text['headline_text'] = text['headline_text'].apply(nltk.word_tokenize)

# Remove English stop words. `stop_words` stays a list as before; a set copy is
# used for the membership test so each lookup is O(1) instead of a list scan.
stop_words = stopwords.words('english')
stop_word_set = set(stop_words)
text['headline_text'] = text['headline_text'].apply(
    lambda tokens: [word for word in tokens if word not in stop_word_set]
)

text.head()

Unnamed: 0,headline_text
0,"[aba, decides, community, broadcasting, licence]"
1,"[act, fire, witnesses, must, aware, defamation]"
2,"[g, calls, infrastructure, protection, summit]"
3,"[air, nz, staff, aust, strike, pay, rise]"
4,"[air, nz, strike, affect, australian, travellers]"


* 동일한 단어지만 다른 표현을 가지는 단어들은 하나의 단어로 통합하는 단어 정규화 과정
* 길이가 1~2인 단어를 제거

In [12]:
# Word normalization: lemmatize each token as a verb (pos='v'),
# e.g. "decides" -> "decide", "broadcasting" -> "broadcast".
# The lemmatizer is created once and reused instead of once per token.
lemmatizer = WordNetLemmatizer()
text['headline_text'] = text['headline_text'].apply(
    lambda tokens: [lemmatizer.lemmatize(word, pos='v') for word in tokens]
)

# Drop tokens of length 1-2.
# NOTE: this rebinds `text` from a DataFrame to the Series of token lists, so
# re-running this cell requires re-running the cells that build `text` first.
text = text['headline_text'].apply(lambda tokens: [word for word in tokens if len(word) > 2])
print(text[:5])

0     [aba, decide, community, broadcast, licence]
1    [act, fire, witness, must, aware, defamation]
2       [call, infrastructure, protection, summit]
3            [air, staff, aust, strike, pay, rise]
4    [air, strike, affect, australian, travellers]
Name: headline_text, dtype: object


#### 역토큰화 및 DTM 생성
* 토큰화 과정을 역으로 되돌리는 역토큰화 과정

In [13]:
# Detokenize: join each token list back into one space-separated string.
# Iterating over the values (rather than looking up text[i] for i in
# range(len(text))) does not depend on the index labels being exactly 0..n-1.
detokenized_doc = [' '.join(tokens) for tokens in text]

train_data = detokenized_doc

In [14]:
# Check the final result of the preprocessing
train_data[:5]

['aba decide community broadcast licence',
 'act fire witness must aware defamation',
 'call infrastructure protection summit',
 'air staff aust strike pay rise',
 'air strike affect australian travellers']

In [15]:
# Build the document-term matrix, keeping only the 5000 most frequent terms
# (with scikit-learn's built-in English stop-word filter applied).
MAX_FEATURES = 5000
c_vectorizer = CountVectorizer(max_features=MAX_FEATURES, stop_words='english')
document_term_matrix = c_vectorizer.fit_transform(train_data)

In [16]:
# Report the shape of the document-term matrix (documents x vocabulary terms).
print('행렬의 크기 :',document_term_matrix.shape)

행렬의 크기 : (1054983, 5000)


### scikit-learn TruncatedSVD 활용

In [17]:
from sklearn.decomposition import TruncatedSVD

n_topics = 10
# TruncatedSVD's default solver is randomized; fixing random_state makes the
# extracted topics reproducible across kernel restarts.
lsa_model = TruncatedSVD(n_components=n_topics, random_state=42)
# Fit the model; the displayed array is the document-by-topic projection.
lsa_model.fit_transform(document_term_matrix)

array([[ 0.01203015, -0.00371729,  0.01834612, ...,  0.00326195,
         0.00342136,  0.00894811],
       [ 0.0290546 , -0.01084706,  0.01819475, ...,  0.00153605,
        -0.01239243, -0.00781251],
       [ 0.0050321 , -0.00203204,  0.00977834, ..., -0.00235831,
         0.00195627,  0.00150727],
       ...,
       [ 0.02972299,  0.0041781 ,  0.02509516, ...,  0.0319858 ,
         0.00921418, -0.02023933],
       [ 0.06183063, -0.00513236,  0.13610724, ...,  0.95695778,
         0.74001662, -0.15627101],
       [ 0.0713694 ,  0.02823953,  0.00097514, ...,  0.00630366,
         0.0173362 ,  0.01154871]])

In [18]:
# Shape of the fitted components: one row per topic, one column per vocabulary term.
print(lsa_model.components_.shape)

(10, 5000)


In [19]:
terms = c_vectorizer.get_feature_names_out() # The vocabulary: the 5,000 terms kept by the vectorizer.

def get_topics(components, feature_names, n=5):
    """Print the ``n`` highest-weighted terms of every topic.

    Args:
        components: 2-D NumPy array with one row of term weights per topic.
        feature_names: Sequence mapping a column index to its term.
        n: Number of top terms to print per topic.

    Prints one line per topic in the form
    ``Topic <k>: [(term, weight), ...]``, topics numbered from 1 and weights
    rounded to 5 decimals. Terms and weights are converted to built-in
    ``str``/``float`` so the printed list looks the same on every NumPy
    version (NumPy >= 2 would otherwise show ``np.float64(...)``).
    """
    for idx, topic in enumerate(components):
        # argsort is ascending; the reversed slice takes the n largest, biggest first.
        top_indices = topic.argsort()[:-n - 1:-1]
        top_terms = [(str(feature_names[i]), float(topic[i].round(5))) for i in top_indices]
        print("Topic %d:" % (idx + 1), top_terms)
# Print the top terms of each LSA topic.
get_topics(lsa_model.components_, terms)

Topic 1: [('police', 0.74637), ('man', 0.45358), ('charge', 0.21091), ('new', 0.14091), ('court', 0.11134)]
Topic 2: [('man', 0.69434), ('charge', 0.30044), ('court', 0.16815), ('face', 0.1139), ('murder', 0.10635)]
Topic 3: [('new', 0.8365), ('plan', 0.23647), ('say', 0.18248), ('council', 0.11034), ('govt', 0.10974)]
Topic 4: [('say', 0.73847), ('plan', 0.35842), ('govt', 0.16857), ('council', 0.12749), ('urge', 0.07491)]
Topic 5: [('plan', 0.73121), ('council', 0.1763), ('govt', 0.14209), ('urge', 0.08408), ('water', 0.07652)]
Topic 6: [('govt', 0.50939), ('court', 0.27096), ('urge', 0.25224), ('fund', 0.22815), ('face', 0.16761)]
Topic 7: [('charge', 0.51894), ('court', 0.44928), ('face', 0.34175), ('plan', 0.12773), ('murder', 0.12518)]
Topic 8: [('win', 0.67142), ('court', 0.32503), ('crash', 0.1203), ('kill', 0.09795), ('face', 0.0947)]
Topic 9: [('win', 0.53942), ('charge', 0.49807), ('council', 0.21431), ('sydney', 0.07153), ('cup', 0.06658)]
Topic 10: [('council', 0.76832), (