In [2]:
import pandas as pd
import numpy as np
import urllib.request
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

## LSA

In [3]:
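# One-time setup: uncomment and run once if these NLTK resources (tokenizer
# models, WordNet, stop-word list) are not installed yet.
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')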

In [4]:
# Load the headline CSV (relative path, so it is resolved against the working
# directory) and show its (rows, columns) shape.
df = pd.read_csv("abcnews-date-text.csv")
df.shape

(1082168, 2)

In [5]:
# Preview the first rows to see which columns the dataset provides.
df.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [6]:
# Work on an independent one-column frame so that `df` itself is never modified.
text = df.loc[:, ['headline_text']].copy()
text.head()

Unnamed: 0,headline_text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers


In [7]:
text.nunique() # number of distinct headline strings (each duplicate counted once)

headline_text    1054983
dtype: int64

In [8]:
# Remove duplicate headlines, then renumber the remaining rows from 0.
text = text.drop_duplicates().reset_index(drop=True)
text.shape

(1054983, 1)

### 데이터 정제 및 정규화
* NLTK의 토크나이저를 이용해 전체 텍스트 데이터에 대해서 단어 토큰화를 수행
* NLTK가 제공하는 불용어 리스트를 사용하여 불용어를 제거

In [9]:
# NLTK 토크나이저를 이용해서 토큰화
# Tokenise every headline with the NLTK word tokenizer. The function is applied
# to the column directly; DataFrame.apply(axis=1) would build a row object for
# each record only to read this one field back out of it.
text['headline_text'] = text['headline_text'].apply(nltk.word_tokenize)

# Remove English stop words. `stop_words` stays a list as before; the set copy
# gives constant-time membership tests inside the per-token filter.
stop_words = stopwords.words('english')
stop_word_set = set(stop_words)
text['headline_text'] = text['headline_text'].apply(
    lambda tokens: [token for token in tokens if token not in stop_word_set]
)

text.head()

Unnamed: 0,headline_text
0,"[aba, decides, community, broadcasting, licence]"
1,"[act, fire, witnesses, must, aware, defamation]"
2,"[g, calls, infrastructure, protection, summit]"
3,"[air, nz, staff, aust, strike, pay, rise]"
4,"[air, nz, strike, affect, australian, travellers]"


* 동일한 단어지만 다른 표현을 가지는 단어들을 하나의 단어로 통합하는 단어 정규화 과정
* 길이가 1~2인 단어를 제거

In [10]:
# 단어 정규화. 3인칭 단수 표현 -> 1인칭 변환, 과거형 동사 -> 현재형 동사 등을 수행한다.
# Word normalisation: lemmatise every token as a verb, so inflected forms such
# as third-person singular or past tense collapse to the base form.
# One lemmatizer instance is shared; the original constructed a new one per token.
lemmatizer = WordNetLemmatizer()
lemmatized = text['headline_text'].apply(
    lambda tokens: [lemmatizer.lemmatize(token, pos='v') for token in tokens]
)

# Drop tokens of length 1 or 2.
# NOTE: from here on `text` is a Series of token lists, not a DataFrame, so
# this cell cannot be re-run without re-running the cells above it first.
text = lemmatized.apply(lambda tokens: [token for token in tokens if len(token) > 2])
print(text[:5])

0     [aba, decide, community, broadcast, licence]
1    [act, fire, witness, must, aware, defamation]
2       [call, infrastructure, protection, summit]
3            [air, staff, aust, strike, pay, rise]
4    [air, strike, affect, australian, travellers]
Name: headline_text, dtype: object


#### 역토큰화 및 DTM 생성
* 토큰화 과정을 역으로 되돌리는 역토큰화 과정

In [11]:
# 역토큰화 (토큰화 작업을 역으로 수행)
# Detokenise: join each token list back into one space-separated string.
# Iterating over the Series values avoids a label lookup (text[i]) for every
# row and does not rely on the index being exactly 0..n-1.
detokenized_doc = [' '.join(tokens) for tokens in text]

train_data = detokenized_doc

In [12]:
# 전처리 최종물 확인
# Inspect the first few fully preprocessed documents.
train_data[:5]

['aba decide community broadcast licence',
 'act fire witness must aware defamation',
 'call infrastructure protection summit',
 'air staff aust strike pay rise',
 'air strike affect australian travellers']

In [13]:
# 상위 5000개의 단어만 사용
# Build the document-term count matrix, restricted to the top 5,000 terms and
# with scikit-learn's built-in English stop-word list applied.
c_vectorizer = CountVectorizer(
    max_features=5000,
    stop_words='english',
)
document_term_matrix = c_vectorizer.fit_transform(train_data)

In [14]:
# Report the shape of the document-term matrix (one row per document, one column per term).
print('행렬의 크기 :',document_term_matrix.shape)

행렬의 크기 : (1054983, 5000)


### scikit-learn TruncatedSVD 활용

In [15]:
from sklearn.decomposition import TruncatedSVD

n_topics = 10
# TruncatedSVD uses a randomized solver by default, so the seed is fixed to make
# the factorisation repeatable across runs (777 is the seed the LDA model in
# this notebook already uses).
lsa_model = TruncatedSVD(n_components = n_topics, random_state=777)
# Fit the model and display the document-topic matrix.
lsa_model.fit_transform(document_term_matrix)

array([[ 0.0120269 , -0.00366278,  0.01833436, ...,  0.0040925 ,
         0.00252895,  0.01211236],
       [ 0.0290427 , -0.01065042,  0.01833555, ..., -0.00491488,
        -0.00598788, -0.00213485],
       [ 0.00502692, -0.00201313,  0.0097928 , ..., -0.0034884 ,
         0.00200884,  0.00261587],
       ...,
       [ 0.02971472,  0.00460226,  0.02523923, ...,  0.02343761,
         0.01736954,  0.02910524],
       [ 0.06171363, -0.00611369,  0.13742009, ...,  0.8604459 ,
         0.76776338, -0.42321487],
       [ 0.07141915,  0.02863788,  0.00160812, ..., -0.00352558,
         0.0098141 ,  0.06181563]])

In [16]:
# components_ has one row per topic and one column per vocabulary term.
print(lsa_model.components_.shape)

(10, 5000)


In [17]:
terms = c_vectorizer.get_feature_names_out() # vocabulary: the 5,000 terms kept by the vectorizer

def get_topics(components, feature_names, n=5):
    """Print the n highest-weighted terms of every topic.

    Args:
        components: 2-D array iterated row by row; each row must provide
            ``argsort()`` (for example a fitted model's ``components_``).
        feature_names: sequence mapping a column index to its term.
        n: number of top terms shown per topic (default 5).

    Returns:
        None. One line per topic, numbered from 1, is written to stdout as a
        list of ``(term, weight rounded to 5 decimals)`` pairs.
    """
    for topic_no, weights in enumerate(components, start=1):
        # argsort is ascending, so reverse it and keep the first n indices.
        top_indices = weights.argsort()[::-1][:n]
        top_terms = []
        for term_idx in top_indices:
            top_terms.append((feature_names[term_idx], weights[term_idx].round(5)))
        print("Topic %d:" % topic_no, top_terms)
# Show the top terms of each LSA topic.
get_topics(components=lsa_model.components_, feature_names=terms)

Topic 1: [('police', 0.74636), ('man', 0.45355), ('charge', 0.21098), ('new', 0.14092), ('court', 0.11147)]
Topic 2: [('man', 0.69448), ('charge', 0.30029), ('court', 0.16857), ('face', 0.11254), ('murder', 0.10653)]
Topic 3: [('new', 0.83679), ('plan', 0.23604), ('say', 0.18242), ('govt', 0.10996), ('council', 0.10868)]
Topic 4: [('say', 0.73941), ('plan', 0.35729), ('govt', 0.16455), ('council', 0.13044), ('fund', 0.0797)]
Topic 5: [('plan', 0.73259), ('council', 0.1739), ('govt', 0.13586), ('urge', 0.0822), ('fund', 0.06208)]
Topic 6: [('govt', 0.55633), ('urge', 0.26972), ('court', 0.24011), ('fund', 0.1693), ('win', 0.15865)]
Topic 7: [('charge', 0.54799), ('court', 0.43272), ('face', 0.37049), ('murder', 0.11129), ('plan', 0.09978)]
Topic 8: [('win', 0.60617), ('court', 0.33921), ('kill', 0.23939), ('council', 0.17761), ('face', 0.12636)]
Topic 9: [('win', 0.58054), ('charge', 0.48159), ('water', 0.083), ('murder', 0.07007), ('fund', 0.06419)]
Topic 10: [('council', 0.55438), ('k

## LDA

#### TF-IDF 행렬 생성

In [18]:
# 상위 5,000개의 단어만 사용
# Weight the same documents by TF-IDF, again restricted to the top 5,000 terms
# with scikit-learn's built-in English stop-word list applied.
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)
tf_idf_matrix = tfidf_vectorizer.fit_transform(train_data)

# Check the shape of the TF-IDF matrix.
print('행렬의 크기 :', tf_idf_matrix.shape)

행렬의 크기 : (1054983, 5000)


#### scikit-learn LDA Model 활용

In [19]:
from sklearn.decomposition import LatentDirichletAllocation

# Ten topics, online learning, max_iter=1 and a fixed seed.
# NOTE(review): the model is fitted on TF-IDF weights here, whereas
# scikit-learn's own LDA example uses raw term counts - confirm that TF-IDF
# input is intended.
lda_model = LatentDirichletAllocation(
    n_components=10,
    max_iter=1,
    learning_method='online',
    random_state=777,
)
# Fit the model and display the document-topic matrix.
lda_model.fit_transform(tf_idf_matrix)

array([[0.0335099 , 0.0335099 , 0.0335099 , ..., 0.17024867, 0.0335099 ,
        0.0335099 ],
       [0.03365631, 0.03365631, 0.03365631, ..., 0.03365631, 0.03365631,
        0.03365631],
       [0.25184095, 0.0366096 , 0.0366096 , ..., 0.0366096 , 0.0366096 ,
        0.0366096 ],
       ...,
       [0.26687206, 0.02914502, 0.02914502, ..., 0.13007484, 0.02916018,
        0.28739608],
       [0.10378115, 0.02637829, 0.12325014, ..., 0.02637829, 0.02637829,
        0.02637829],
       [0.03376055, 0.03376055, 0.2255442 , ..., 0.03376055, 0.03376055,
        0.03376055]])

In [20]:
# components_ has one row per topic and one column per vocabulary term.
print(lda_model.components_.shape)

(10, 5000)


In [23]:
# Vocabulary of the TF-IDF vectorizer whose matrix the LDA model was fitted on.
terms = tfidf_vectorizer.get_feature_names_out()

# get_topics is already defined in the LSA section of this notebook with an
# identical body, so it is reused here rather than defined a second time
# (a later duplicate definition would silently shadow the earlier one).
get_topics(lda_model.components_, terms)

Topic 1: [('australia', 9359.06334), ('sydney', 5854.97288), ('attack', 4784.76322), ('change', 4193.63035), ('year', 3924.88997)]
Topic 2: [('government', 6344.07413), ('charge', 5947.12292), ('man', 4519.7974), ('state', 3658.16422), ('live', 3625.10473)]
Topic 3: [('australian', 7666.65651), ('say', 7561.01807), ('police', 5513.22932), ('home', 4048.38409), ('report', 3796.04446)]
Topic 4: [('melbourne', 5298.35047), ('south', 4844.59835), ('death', 4281.78433), ('china', 3214.44581), ('women', 3029.28443)]
Topic 5: [('win', 5704.0914), ('canberra', 4322.0963), ('die', 4025.63057), ('open', 3771.65243), ('warn', 3577.47151)]
Topic 6: [('court', 5246.3124), ('world', 4536.86331), ('country', 4166.34794), ('woman', 3983.97748), ('crash', 3793.50267)]
Topic 7: [('election', 5418.5038), ('adelaide', 4864.95604), ('house', 4478.6135), ('school', 3966.82676), ('2016', 3955.11155)]
Topic 8: [('trump', 8189.58575), ('new', 6625.2724), ('north', 3705.40987), ('rural', 3521.42659), ('donald',