In [1]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes')) # 헤더, 푸터, 문장부호 삭제
documents = dataset.data
len(documents)

11314

In [2]:
news_df = pd.DataFrame({'document': documents})
news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ") # 알파벳만 두고 나머지는 공백으로
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3])) # 단어에 글자수가 4글자 이상인 것만 놔둠
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower()) # 단어를 전부 소문자로

  news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ") # 알파벳만 두고 나머지는 공백으로


In [3]:
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split()) # 공백 기준으로 단어를 다 나눔
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words]) # 불용어 처리

In [4]:
detokenized_doc = []
for i in range(len(news_df)):
    t = ' '.join(tokenized_doc[i]) # 공백을 기준으로 나뉘어있던 단어를 다시 공백을 통해 조인
    detokenized_doc.append(t)

news_df['clean_doc'] = detokenized_doc

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', max_features=1000) # 전체 단어에 대해서 다 하는 게 아니라 단어 수 상위 1000개만
X = vectorizer.fit_transform(news_df['clean_doc'])
X.shape

(11314, 1000)

In [6]:
from sklearn.decomposition import TruncatedSVD

svd_model = TruncatedSVD(n_components=20) # 행렬 특이값 분해. 차원 축소 개념. 11314개의 행을 20개로 압축
svd_model.fit(X)
len(svd_model.components_)

20

In [7]:
import numpy as np

np.shape(svd_model.components_)

(20, 1000)

In [8]:
terms = vectorizer.get_feature_names()

def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print("Topic %d: "% (idx+1),[(feature_names[i],topic[i].round(5)) for i in topic.argsort()[: -n -1:-1]])

get_topics(svd_model.components_,terms)
# 토픽의 단어와 그 단어가 차지하는 중요도가 출력됨

Topic 1:  [('like', 0.21386), ('know', 0.20046), ('people', 0.19293), ('think', 0.17805), ('good', 0.15128)]
Topic 2:  [('thanks', 0.32891), ('windows', 0.29072), ('card', 0.18078), ('drive', 0.17451), ('mail', 0.1511)]
Topic 3:  [('game', 0.37063), ('team', 0.32403), ('year', 0.28147), ('games', 0.25328), ('season', 0.18408)]
Topic 4:  [('drive', 0.53088), ('scsi', 0.20404), ('hard', 0.15551), ('disk', 0.15503), ('card', 0.14439)]
Topic 5:  [('windows', 0.40909), ('file', 0.25523), ('window', 0.17776), ('files', 0.16544), ('program', 0.13993)]
Topic 6:  [('mail', 0.16663), ('government', 0.15703), ('chip', 0.14934), ('space', 0.14539), ('information', 0.13861)]
Topic 7:  [('like', 0.66955), ('bike', 0.14254), ('know', 0.11227), ('chip', 0.11018), ('sounds', 0.10378)]
Topic 8:  [('card', 0.42788), ('sale', 0.22683), ('video', 0.20916), ('monitor', 0.15626), ('price', 0.15573)]
Topic 9:  [('know', 0.42763), ('card', 0.34943), ('people', 0.19555), ('chip', 0.16402), ('government', 0.1635



In [9]:
# 잠재 디리클레 할당 LDA

In [10]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

stm = PorterStemmer()
stopwords = set(stopwords.words('english'))
pattern = re.compile('[a-zA-Z][-_a-zA-Z0-9.]*')

def tokenize(sentence):
    def stem(w):
        try: return stm.stem(w)
        except: return w
    return [stem(w) for w in word_tokenize(sentence.lower())
            if w not in stopwords and pattern.match(w)]


In [11]:
import tomotopy as tp

model = tp.LDAModel(k=20, min_cf=5)
for i, line in enumerate(open('/Users/ian/Desktop/Study/data/text/trumph.txt', encoding='cp949')):
    model.add_doc(tokenize(line))

In [12]:
model.train(0)
print('Total docs: ',len(model.docs))
print('Total words: ',model.num_words)
print('Vocab size: ',model.num_vocabs)

Total docs:  1
Total words:  162
Vocab size:  21


In [13]:
model.train(200)
for i in range(model.k):
    res = model.get_topic_words(i, top_n=10)
    print('Topic #{0}'.format(i), end='\t')
    print(', '.join(w for w, p in res))

Topic #0	american, peopl, one, back, great, world, everi, protect, countri, nation
Topic #1	america, american, nation, countri, peopl, one, everi, protect, world, great
Topic #2	america, american, nation, countri, peopl, one, everi, protect, world, great
Topic #3	god, american, nation, countri, peopl, one, everi, protect, world, great
Topic #4	america, american, nation, countri, peopl, one, everi, protect, world, great
Topic #5	countri, american, nation, america, peopl, one, everi, protect, world, great
Topic #6	dream, right, nation, countri, peopl, one, everi, protect, world, great
Topic #7	everi, new, nation, countri, peopl, one, american, protect, world, great
Topic #8	america, american, nation, countri, peopl, one, everi, protect, world, great
Topic #9	america, today, new, countri, peopl, one, everi, protect, world, great
Topic #10	never, make, nation, countri, peopl, one, everi, protect, world, great
Topic #11	nation, across, make, countri, peopl, one, everi, protect, world, great

In [14]:
import nltk

emma_raw = nltk.corpus.gutenberg.raw('austen-emma.txt')

In [15]:
import tomotopy as tp

model = tp.LDAModel(k=5, min_cf=5)
model.add_doc(tokenize(emma_raw))

model.train(0)
print('Total docs: ',len(model.docs))
print('Total words: ',model.num_words)
print('Vocab size: ',model.num_vocabs)

Total docs:  1
Total words:  67220
Vocab size:  1776


In [16]:
model.train(100)
for i in range(model.k):
    res = model.get_topic_words(i, top_n=2)
    print('Topic #{0}'.format(i), end='\t')
    print(', '.join(w for w, p in res))

Topic #0	knightley, mr.
Topic #1	mr., could
Topic #2	harriet, would
Topic #3	alway, one
Topic #4	emma, thing


In [18]:
import tomotopy as tp
import re
from konlpy.tag import Hannanum

han = Hannanum()

model = tp.LDAModel(k=10, min_cf=2)

for i, line in enumerate(open('/Users/ian/Desktop/Study/data/text/news1.txt', encoding='utf-8')):
    sentence = re.sub('[^가-힣ㄱ-ㅎㅏ-ㅣa-zA-Z]', ' ', line)
    a = sentence.strip()
    n = han.nouns(a)
    n2 = [x for x in n if len(x)>1]
    if len(n2) > 0:
        model.add_doc(n2)

model.train(0)
print('Total docs: ',len(model.docs))
print('Total words: ',model.num_words)
print('Vocab size: ',model.num_vocabs)

model.train(100)
for i in range(model.k):
    res = model.get_topic_words(i, top_n=10)
    print('Topic #{0}'.format(i), end='\t')
    print(', '.join(w for w, p in res))

Total docs:  27
Total words:  320
Vocab size:  101
Topic #0	여성, 불안, 취업, 청년들, 선택, 분야, 자리, 서류전형, 이상, 영향
Topic #1	여성들, 노동환경, 가치, 때문, 증명, 경력, 기회, 직장, 좌절, 시간
Topic #2	여성, 참여자, 아이, 팬데믹, 성차별, 자신, 정도, 심층, 인터뷰, 청년
Topic #3	회사, 세대, 전망, 퇴사, 조직문화, 권리, 년대생, 직업적, 보장, 추구
Topic #4	애교, 강요, 사장님, 존재, 거래처, 회식, 직원, 이슈, 거예요, 재롱
Topic #5	이직, 비율, 만원, 년대생, 한번, 이해, 결혼, 심층, 인터뷰, 청년
Topic #6	노동, 여성, 노동자들, 환경, 결혼, 미래, 연구원, 현실, 비전, 고군분투
Topic #7	문제, 청년, 사회, 결과, 진행, 심각, 관계, 응답, 설문조사, 부적응자
Topic #8	인터뷰, 심층, 정규직, 위치, 최혜영, 실태조사, 직장, 경우, 부담, 청년
Topic #9	상황, 분석, 하나, 사람, 참여, 조건, 경제적, 어려움, 사회적, 요구
