In [1]:
import pandas as pd
from nltk.tokenize import WordPunctTokenizer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from gensim import corpora
from gensim import models



In [2]:
def read_texts(df):
    texts=[]
    for i in range(len(df)):
        words =  WordPunctTokenizer().tokenize(df["text"][i])
        texts.append(words)
    print(texts)
    return texts


def tokenize_pos_tag(df):
    res=[]
    for i in range(len(df)):
        sentence = df["text"][i]
        tokenize_pos = pos_tag(WordPunctTokenizer().tokenize(sentence))
        res.append(tokenize_pos)
    return res


def ANRV(words_pos):
    res = []
    for i in range(len(words_pos)):
        tmp=[]
        for j in range(len(words_pos[i])):
            if words_pos[i][j][1][0] in ["J","R","V"]:
                tmp.append((words_pos[i][j][0].lower(), words_pos[i][j][1][0].lower()))
            elif words_pos[i][j][1][0] == "N":
                if words_pos[i][j][1] == "NNP" or words_pos[i][j][1] == "NNPS":
                    tmp.append((words_pos[i][j][0].lower(), words_pos[i][j][1][0].lower()))
                else:
                    tmp.append((words_pos[i][j][0].lower(), words_pos[i][j][1][0].lower()))
        res.append(tmp)
    return res


def Lemma(words_anrv):
    lm = WordNetLemmatizer()
    res=[]
    for i in range(len(words_anrv)):
        tmp=[]
        for j in range(len(words_anrv[i])):
            #print(words_anrv[i][j][0])
            if words_anrv[i][j][1] == "j":
                pos = "a"
            else:
                pos=words_anrv[i][j][1]
            #print(lm.lemmatize(words_anrv[i][j][0], pos=pos))
            tmp.append(lm.lemmatize(words_anrv[i][j][0], pos=pos))
        res.append(tmp)
    return res


def clean_stopword(words):
    stop_words = stopwords.words("english")
    res = []
    for i in range(len(words)):
        res.append([w for w in words[i] if w not in stop_words and len(w) > 3])
    return res
    

In [3]:
df1 = pd.read_csv("21_9,10.csv")
df2 = pd.read_csv("21_7,8.csv")
df3 = pd.read_csv("21_5,6.csv")
df4 = pd.read_csv("21_3,4.csv")
df5 = pd.read_csv("21_1,2.csv")
df6 = pd.read_csv("20_11,12.csv")
df7 = pd.read_csv("20_9,10.csv")
df8 = pd.read_csv("20_7,8.csv")
df9 = pd.read_csv("20_5,6.csv")
df10 = pd.read_csv("20_3,4.csv")
df11 = pd.read_csv("20_2.csv")

df = pd.concat([df1,df2,df3,df4,df5,df6,df7,df8,df9,df10,df11], ignore_index=True)
df=df[["date", "title", "text"]]
len(df)

1050

In [5]:
import pickle

In [6]:
with open("cnn_news_content.pk", "wb") as f:
    pickle.dump(df, f)

In [8]:
with open('cnn_news_content.pk', 'rb') as f:
    df=pickle.load(f)

df

Unnamed: 0,date,title,text
0,2021-10-15,FDA vaccine advisers recommend emergency use a...,Vaccine advisers to the US Food and Drug Admin...
1,2021-10-8,Studies confirm waning immunity from Pfizer's ...,Two real-world studies published Wednesday con...
2,2021-10-8,Here's what having a Covid-19 vaccine for chil...,Pfizer said Thursday it's asked the US Food an...
3,2021-10-12,Texas governor bans Covid-19 vaccine mandates ...,Texas Gov. Greg Abbott on Monday issued an exe...
4,2021-10-4,NYC vaccine mandate takes effect with 96% of t...,New York City Mayor Bill de Blasio said 96% of...
...,...,...,...
1045,2020-2-8,New study an eye-opener on how coronavirus is ...,A study published Friday in the medical journa...
1046,2020-2-3,A soldier surprised his mom as she was sworn i...,Erika Benning's heart was already racing as sh...
1047,2020-2-19,An American evacuated from Japan on a US chart...,An American who was evacuated on a US-chartere...
1048,2020-2-15,US to evacuate Americans on cruise ship quaran...,The US government is preparing to evacuate Ame...


In [5]:
tagged_list = tokenize_pos_tag(df)
tagged_list[0][:10]

[('Vaccine', 'NN'),
 ('advisers', 'NNS'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('US', 'NNP'),
 ('Food', 'NNP'),
 ('and', 'CC'),
 ('Drug', 'NNP'),
 ('Administration', 'NNP'),
 ('voted', 'VBD')]

In [6]:
words_anrv = ANRV(tagged_list)
words_anrv[0][:10]

[('vaccine', 'n'),
 ('advisers', 'n'),
 ('us', 'n'),
 ('food', 'n'),
 ('drug', 'n'),
 ('administration', 'n'),
 ('voted', 'v'),
 ('unanimously', 'r'),
 ('thursday', 'n'),
 ('recommend', 'v')]

In [7]:
words_lemma = Lemma(words_anrv)
words_lemma[0][:10]

['vaccine',
 'adviser',
 'u',
 'food',
 'drug',
 'administration',
 'vote',
 'unanimously',
 'thursday',
 'recommend']

In [8]:
clean_words = clean_stopword(words_lemma)
clean_words[0][:10]

['vaccine',
 'adviser',
 'food',
 'drug',
 'administration',
 'vote',
 'unanimously',
 'thursday',
 'recommend',
 'emergency']

In [9]:
#LDA

In [10]:
dictionary = corpora.Dictionary(clean_words)
corpus = [dictionary.doc2bow(text) for text in clean_words]
print(dictionary)

Dictionary(16733 unique tokens: ['administer', 'administration', 'advancing', 'adviser', 'advisory']...)


In [16]:
corpus[2][:10]

[(0, 1),
 (1, 4),
 (3, 2),
 (4, 2),
 (6, 1),
 (8, 1),
 (12, 1),
 (13, 5),
 (14, 1),
 (16, 2)]

In [17]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [20]:
corpus_tfidf[2][:5]

[(0, 0.012605276328268081),
 (1, 0.03340969791709795),
 (3, 0.03766389133909657),
 (4, 0.03809195490228862),
 (6, 0.011899727205065257)]

In [33]:
dictionary[0]

'administer'

In [21]:
model = models.ldamodel.LdaModel(corpus_tfidf, num_topics=3, id2word=dictionary)

In [22]:
model.show_topic(0, 10)

[('vaccine', 0.0017570015),
 ('child', 0.0011231644),
 ('trial', 0.0010340727),
 ('test', 0.0009340742),
 ('patient', 0.00087849103),
 ('johnson', 0.0008600754),
 ('pfizer', 0.0008548169),
 ('case', 0.00082761823),
 ('fauci', 0.00081298366),
 ('vaccinate', 0.0008020318)]

In [31]:
# 토픽 개수, 키워드 개수를 정해주는 변수를 추가.
NUM_TOPICS = 2

NUM_TOPIC_WORDS = 30

def build_doc_term_mat(documents):
    # 문서-단어 행렬 만들어주는 함수.
    print("Building document-term matrix.")
    dictionary = corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(document) for document in documents]

    return corpus, dictionary


def print_topic_words(model): # model = LDA된 결과 

    # 토픽 모델링 결과를 출력해 주는 함수.
    print("\nPrinting topic words.\n")

    for topic_id in range(model.num_topics): 
        topic_word_probs = model.show_topic(topic_id, NUM_TOPIC_WORDS)
        print('Topic ID: {}'.format(topic_id))

        for topic_word, prob in topic_word_probs:
            print('\t{}\t{}'.format(topic_word, prob))

        print('\n')

# document-term matrix를 만들고,
corpus, dictionary = build_doc_term_mat(clean_words)
# LDA를 실행.
model = models.ldamodel.LdaModel(corpus_tfidf, num_topics=NUM_TOPICS, id2word=dictionary, alpha='auto', eta='auto')
# 결과를 출력.
print_topic_words(model)

Building document-term matrix.

Printing topic words.

Topic ID: 0
	vaccine	0.0012849392369389534
	variant	0.00085486751049757
	county	0.0007782333996146917
	vaccinate	0.0007690583006478846
	case	0.0007555091287940741
	child	0.000707088562194258
	trial	0.0006583714275620878
	trump	0.0006556896842084825
	johnson	0.0006492329994216561
	study	0.0006478257710114121
	test	0.0006471372325904667
	vaccination	0.0006405414897017181
	country	0.0006243169191293418
	mandate	0.0006168331601656973
	student	0.0005933884531259537
	school	0.0005865216953679919
	mask	0.0005857038195244968
	worker	0.000570275413338095
	patient	0.0005646080826409161
	pfizer	0.0005589401116594672
	moderna	0.0005523429135791957
	dose	0.0005511166527867317
	state	0.0005450519965961576
	official	0.0005353802116587758
	death	0.0005297913448885083
	fauci	0.0005281884805299342
	york	0.0005196359124965966
	infection	0.000513516366481781
	china	0.000510829093400389
	report	0.0005034664063714445


Topic ID: 1
	vaccine	0.00147526548

In [32]:
# pyLDAvis 불러오기
import pyLDAvis
import pyLDAvis.gensim
# pyLDAvis를 jupyter notebook에서 실행할 수 있게 활성화.
pyLDAvis.enable_notebook()

# pyLDAvis 실행.
data = pyLDAvis.gensim.prepare(model, corpus, dictionary)
data # print X 그냥 실행