# 4. BERTopic을 이용한 8월 CNBC기사 토픽모델링
- CNBC 경제 뉴스 사이트에서 최근 인기 있는 기사 토픽을 파악하고자 함.
- 대회 시작 당시 9월 초였기에 '최근'의 기준을 8월 1달로 한정함.

In [None]:
!pip install bertopic

In [None]:
#전처리 패키지
import pandas as pd
import numpy as np
from datetime import datetime
import re
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

#토픽 모델링 패키지
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
import hdbscan
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from bertopic.representation import MaximalMarginalRelevance

#interactive plot 시각화
import plotly.express as px

import plotly.io as pio
pio.renderers.default = "notebook_connected"

In [97]:
import bertopic

try:
    version = SentenceTransformer.__version__
except AttributeError:
    try:
        from importlib.metadata import version
        version = version('bertopic')
    except ImportError:
        version = "버전을 확인할 수 없음"

BERTopic = 0.15.0


In [87]:
import sys

print('Python =', sys.version)


Python = 3.11.4 (main, Jul  5 2023, 08:54:11) [Clang 14.0.6 ]


In [94]:
#전체 패키지 버전
print('pandas = ',pd.__version__)
print('numpy = ',np.__version__)
print('re = ',re.__version__)
print('spacy = ',spacy.__version__)
print('sklearn = ', sklearn.__version__)
print('BERTopic =', version)
print('Python =', sys.version)

pandas =  1.5.3
numpy =  1.24.3
re =  2.2.1
spacy =  3.7.0
sklearn =  1.3.0
BERTopic = 0.15.0
Python = 3.11.4 (main, Jul  5 2023, 08:54:11) [Clang 14.0.6 ]


## 데이터 준비

In [8]:
df = pd.read_csv('NHIS_BDC_2023/Round1/cnbc_newsdata_final.csv')
df

Unnamed: 0,title,date,category,key_points,text,url
0,‘I work just 5 hours a week': A 39-year-old wh...,2023-01-01,Success,,"Graham Cochrane, Founder of The Recording Revo...",https://www.cnbc.com/2023/01/01/39-year-old-wh...
1,Chinese state media seek to reassure public ov...,2023-01-01,Asia-Pacific News,Chinese state media sought to reassure the pub...,Revelers prepare to release balloons to celebr...,https://www.cnbc.com/2023/01/01/chinese-state-...
2,Should you get creative with your resume? Expe...,2023-01-01,Land the Job,,Mature businessman congratulating young profes...,https://www.cnbc.com/2023/01/01/cv-will-a-crea...
3,Market misery deals sovereign wealth funds his...,2023-01-01,Markets,Heavy falls in stock and bond markets over the...,A trader works on the floor of the New York St...,https://www.cnbc.com/2023/01/01/market-misery-...
4,More social media regulation is coming in 2023...,2023-01-01,Tech,Days after Congress passed a bipartisan spendi...,"The U.K.'s Online Safety Bill, which aims to r...",https://www.cnbc.com/2023/01/01/more-social-me...
...,...,...,...,...,...,...
5628,63% of workers unable to pay a $500 emergency ...,2023-08-31,Personal Finance,Workers are reporting financial stress amid hi...,A shopper makes their way through a grocery st...,https://www.cnbc.com/2023/08/31/63percent-of-w...
5629,"This 22-year-old earns $194,000 at Google and ...",2023-08-31,Millennial Money,,This story is part of CNBC Make It's Millennia...,https://www.cnbc.com/2023/08/31/22-year-old-ea...
5630,China's factory activity shrinks for a fifth s...,2023-08-31,China Economy,The official manufacturing purchasing managers...,People walk through a gate in the Forbidden Ci...,https://www.cnbc.com/2023/08/31/china-economy-...
5631,Southeast Asia's first high-speed train – a bo...,2023-08-31,Access ASEAN,A 142-kilometre rail line linking Jakarta with...,Indonesia is starting trial runs for its first...,https://www.cnbc.com/2023/08/31/indonesias-chi...


In [96]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5633 entries, 0 to 5632
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   title       5633 non-null   object        
 1   date        5633 non-null   datetime64[ns]
 2   category    5633 non-null   object        
 3   key_points  5633 non-null   object        
 4   text        5633 non-null   object        
 5   url         5633 non-null   object        
dtypes: datetime64[ns](1), object(5)
memory usage: 264.2+ KB


In [97]:
#잘못 크롤링된 9월 데이터 제거
df = df[df['date'].dt.strftime('%Y-%m') != '2023-09']
df.shape

(5632, 6)

## 카테고리 값 중 월별로 5회 미만 언급된 기사 drop
- 월에 5번도 언급되지 않은 카테고리는 주가 분석에 있어 중요하지 않은 기사라고 판단해 삭제함.

In [98]:
# 'date' 컬럼에서 월 정보를 추출하여 'month' 컬럼 생성
df['month'] = df['date'].dt.month

# 월 별로 카테고리 등장 횟수 계산
monthly_counts = df.groupby(['month', 'category']).size().reset_index(name='count')

# 월 별로 5번 이상 등장한 카테고리 찾기
valid_categories = monthly_counts[monthly_counts['count'] >= 5]['category'].unique()

final_df = df[df['category'].isin(valid_categories)]

# 결과 출력
final_df

Unnamed: 0,title,date,category,key_points,text,url,month
0,‘I work just 5 hours a week': A 39-year-old wh...,2023-01-01,Success,,"Graham Cochrane, Founder of The Recording Revo...",https://www.cnbc.com/2023/01/01/39-year-old-wh...,1
1,Chinese state media seek to reassure public ov...,2023-01-01,Asia-Pacific News,Chinese state media sought to reassure the pub...,Revelers prepare to release balloons to celebr...,https://www.cnbc.com/2023/01/01/chinese-state-...,1
2,Should you get creative with your resume? Expe...,2023-01-01,Land the Job,,Mature businessman congratulating young profes...,https://www.cnbc.com/2023/01/01/cv-will-a-crea...,1
3,Market misery deals sovereign wealth funds his...,2023-01-01,Markets,Heavy falls in stock and bond markets over the...,A trader works on the floor of the New York St...,https://www.cnbc.com/2023/01/01/market-misery-...,1
4,More social media regulation is coming in 2023...,2023-01-01,Tech,Days after Congress passed a bipartisan spendi...,"The U.K.'s Online Safety Bill, which aims to r...",https://www.cnbc.com/2023/01/01/more-social-me...,1
...,...,...,...,...,...,...,...
5625,Biden says he will visit Florida this weekend ...,2023-08-31,Politics,President Joe Biden said he would be traveling...,WASHINGTON — President Joe Biden made a surpri...,https://www.cnbc.com/2023/08/31/biden-says-he-...,8
5626,Baidu's Ernie bot jumps to the top of Apple's ...,2023-08-31,Tech,Chinese tech giant Baidu announced Thursday it...,Pictured here is the Ernie bot mobile interfac...,https://www.cnbc.com/2023/08/31/baidu-gets-chi...,8
5627,Apple reportedly tests 3D printing to manufact...,2023-08-31,Tech,Apple is reportedly testing using 3D printing ...,Apple is testing the use of 3D printers to mak...,https://www.cnbc.com/2023/08/31/apple-is-testi...,8
5628,63% of workers unable to pay a $500 emergency ...,2023-08-31,Personal Finance,Workers are reporting financial stress amid hi...,A shopper makes their way through a grocery st...,https://www.cnbc.com/2023/08/31/63percent-of-w...,8


In [99]:
# 2023년 8월 이후 데이터만 선택
start_date = '2023-08-01'
final_df = final_df[final_df['date'] >= start_date]
final_df.shape

(616, 7)

In [100]:
final_df.date

4946   2023-08-01
4947   2023-08-01
4948   2023-08-01
4949   2023-08-01
4950   2023-08-01
          ...    
5625   2023-08-31
5626   2023-08-31
5627   2023-08-31
5628   2023-08-31
5630   2023-08-31
Name: date, Length: 616, dtype: datetime64[ns]

## 데이터 전처리
- 크롤링 데이터의 특수문자와 문장부호 제거
- 불용어 제거(n차 토픽모델링 후 후처리 진행)
- 불용어 제거 및 토큰화 : 상업용으로 사용하기에 사용성과 성능 면에서 nltk보다 spacy가 우수해 spacy 모델을 사용했다. 긴 텍스트(기사 본문)를 처리해야 하므로 en_core_web_lg 모델을 사용했다. 또한 tokenizer 함수를 생성할 때는 명사만 추출하도록 했으며, 개체명은 'TIME','CARDINAL','DATE'을 제외한 모든 entity를 사용했다.

In [101]:
# 특수 문자와 문장 부호 제거 함수 정의
def remove_special_characters(text):
    cleaned_text = re.sub(r'[^\w\s]', '', text)
    return cleaned_text

# 데이터 전처리 함수 적용
final_df['text'] = final_df['text'].apply(remove_special_characters)

In [102]:
#spacy 모델 다운로드
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.6.0/en_core_web_lg-3.6.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.6.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [103]:
#spacy 모델 정의
nlp = spacy.load("en_core_web_lg")

#불용어 리스트
spacy_stopwords_list = list(nlp.Defaults.stop_words)

#처리할 문자열 최대 길이
nlp.max_length = 10000000

#토픽모델링 후 추가 불용어 삭제
new_stopwords_list = ['cnbc', 'share', 'earning', 'revenue', 'premarket', 'stock', 'chart icon', 'stock chart', 'chart', 'icon', 'company', 'store', 'wedding', 'food', 'economy', 'bank']
spacy_stopwords_list_new = spacy_stopwords_list + new_stopwords_list

In [104]:
#tokenizer 함수 생성
#1. 명사 추출
#2. 소문자 변환
#3. 불용어 제거
#4. 개체명 time, cardinal, date 제거
def tokenizer(text):
    words=[]
    doc=nlp(text)
    for token in doc:
        if token.tag_[0] in ['N'] and token.lemma_.lower() not in spacy_stopwords_list_new and token.ent_type_ not in ['TIME','CARDINAL','DATE']:
                if len(token.lemma_.lower())>1:
                    words.append(token.lemma_.lower())

    return words

## 토픽모델링
- 임베딩 모델 선정 기준 : sbert.net의 sentencetransformer 중 가장 Performance가 높은 모델(all-mpnet-base-v2) 선정
- 5배나 빠른 속도에 정확도가 높은 all-MiniLM-L6-v2로도 시도해봤지만 성능이 좋지 않았음.

In [105]:
#topic_modeling data 생성
all_texts = final_df['text'].values #전체 뉴스 데이터
timestamps = final_df['date'].to_list() #뉴스 발행일자

In [106]:
embedding_model = SentenceTransformer("all-mpnet-base-v2") #임베딩 모델
embeddings = embedding_model.encode(all_texts, show_progress_bar=True) #임베딩 미리 계산(파라미터 수정 용이 위함)

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

**파라미터 튜닝 결과 아래의 파라미터로 하는 것이 가장 토픽을 잘 찾는다고 판단함.**
- UMAP(n_neighbors=8, min_dist=0.1, n_components=2)
- HDBSCAN(min_cluster_size=5)


- TfidfVectorizer 사용 이유 : 단어의 빈도 뿐만 아니라, 그 단어가 전체 문서 집합에서 얼마나 중요한지를 고려하기에 토픽모델링 시에 해당 모델을 사용하는 것이 적합함.(문서 집합에서 단어의 출현 빈도만을 고려하는 CountVectorizer는 불용어까지 토픽 키워드로 선정할 수 있다고 판단.)
- MaximalMarginalRelevance 사용 이유 : 토픽의 키워드를 통해 관련주를 찾아내야 하므로, 토픽 키워드를 추출하는 것이 정교해야 한다고 판단함. 또한 diversity를 0.2로 설정해 토픽과 관련된 키워드를 다소 다양하게 뽑고자 했음.(여러 번의 시행착오 결과 최적의 파라미터가 0.2라고 판단.)

In [108]:
from umap.umap_ import UMAP

In [109]:
def pipeline_models():
    # UMAP 모델 정의
    umap_model = UMAP(n_neighbors=8, min_dist=0.1, n_components=2, random_state=42, metric='cosine')

    # HDBSCAN 모델 정의
    hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

    # TfidfVectorizer 모델 정의
    vectorizer_model = TfidfVectorizer(tokenizer=tokenizer, stop_words="english", ngram_range=(1, 2), min_df=2)

    # MaximalMarginalRelevance 모델 정의
    representation_model = MaximalMarginalRelevance(diversity=0.2)

    return umap_model, hdbscan_model, vectorizer_model, representation_model

# 모델 생성
umap_model, hdbscan_model, vectorizer_model, representation_model = pipeline_models()

In [110]:
def create_topic_model(embedding_model, umap_model, hdbscan_model, vectorizer_model, representation_model):
    # BERTopic 모델 정의
    topic_model = BERTopic(
        # Pipeline models
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        representation_model=representation_model,
        # Hyperparameters
        top_n_words=10,
        min_topic_size=5,
        verbose=True
    )

    return topic_model

# 모델 생성
topic_model = create_topic_model(embedding_model, umap_model, hdbscan_model, vectorizer_model, representation_model)

In [111]:
topics, probs = topic_model.fit_transform(all_texts, embeddings)

2023-10-15 22:16:06,996 - BERTopic - Reduced dimensionality
2023-10-15 22:16:07,047 - BERTopic - Clustered reduced embeddings


In [112]:
#share값과 토픽과 연관된 문장의 단어
def get_topic_stats(topic_model):
    topics_info_df = topic_model.get_topic_info().sort_values('Count', ascending = False)
    topics_info_df['Share'] = 100.*topics_info_df['Count']/topics_info_df['Count'].sum()
    topics_info_df['CumulativeShare'] = 100.*topics_info_df['Count'].cumsum()/topics_info_df['Count'].sum()
    return topics_info_df[['Topic', 'Count', 'Share', 'CumulativeShare', 'Name', 'Representation']]

In [113]:
#상위 20개 토픽 추출
topic_stat = get_topic_stats(topic_model).head(20).set_index('Topic')
topic_stat

Unnamed: 0_level_0,Count,Share,CumulativeShare,Name,Representation
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-1,128,20.779221,20.779221,-1_bitcoin_resume_oracle_market,"[bitcoin, resume, oracle, market, sec, etf, ap..."
0,34,5.519481,26.298701,0_analyst refinitiv_analyst_cramer_price target,"[analyst refinitiv, analyst, cramer, price tar..."
1,29,4.707792,31.006494,1_ai_chatgpt_google_ai model,"[ai, chatgpt, google, ai model, openai, chatbo..."
2,28,4.545455,35.551948,2_retailer_walmart_foot locker_merchandise,"[retailer, walmart, foot locker, merchandise, ..."
3,24,3.896104,39.448052,3_pfizer_vaccine_pharmacy_medication,"[pfizer, vaccine, pharmacy, medication, obesit..."
4,24,3.896104,43.344156,4_cnn_disney_microsoft_activision,"[cnn, disney, microsoft, activision, espn, sal..."
5,23,3.733766,47.077922,5_election_president donald_indictment_case,"[election, president donald, indictment, case,..."
6,21,3.409091,50.487013,6_iphone_apple_ipad_smartphone,"[iphone, apple, ipad, smartphone, apple iphone..."
7,20,3.246753,53.733766,7_playlist_schwartz_taylor_feedback,"[playlist, schwartz, taylor, feedback, billion..."
8,20,3.246753,56.980519,8_china_beijing_chinas_economist,"[china, beijing, chinas, economist, yuan, peop..."


### 기술 관련 토픽의 키워드 확인
- 토픽을 확인해본 결과, 토픽 0, 3, 6, 11, 15이 기술과 관련이 있다고 판단함

In [114]:
topic_model.get_topic(0)

[('analyst refinitiv', 0.03372675178059123),
 ('analyst', 0.03358069195842215),
 ('cramer', 0.029788355301943913),
 ('price target', 0.02431798295350915),
 ('cent analyst', 0.02416615560766607),
 ('analyst factset', 0.01646362837866513),
 ('wall street', 0.01471902800971651),
 ('nvidia', 0.0134541032796249),
 ('refinitiv cent', 0.01326958911577641),
 ('trade alert', 0.013018680675714253)]

In [115]:
topic_model.get_topic(3)

[('pfizer', 0.03984946568083507),
 ('vaccine', 0.036666046213431495),
 ('pharmacy', 0.031354301864470226),
 ('medication', 0.031088154769561154),
 ('obesity', 0.02947804525251283),
 ('novo', 0.023910544694010386),
 ('cvs', 0.02173974013150382),
 ('telegram', 0.020354560583392255),
 ('marketplace', 0.020174584958086234),
 ('covid', 0.017711835197507627)]

In [116]:
topic_model.get_topic(6)

[('iphone', 0.12257195997513622),
 ('apple', 0.106950452167445),
 ('ipad', 0.04105567881255938),
 ('smartphone', 0.040866859685472635),
 ('apple iphone', 0.035399444958679),
 ('huawei', 0.03232852302698514),
 ('device', 0.03137481122840396),
 ('tablet', 0.02255925250547163),
 ('android', 0.02098821779092229),
 ('iphone model', 0.019903119080530076)]

In [117]:
topic_model.get_topic(11)

[('oil', 0.0765774770724255),
 ('vessel', 0.07464874138775855),
 ('port', 0.02821272545509403),
 ('sailing', 0.02493859355202305),
 ('coast', 0.02378349325924912),
 ('gulf', 0.021537729715534585),
 ('ocean', 0.01934077079955524),
 ('el niño', 0.019109865744352927),
 ('waterway', 0.018559762799953514),
 ('supply', 0.01829513187074302)]

In [118]:
topic_model.get_topic(15)

[('rent', 0.06075332393046261),
 ('city', 0.048954732448530394),
 ('new york', 0.034744671447512525),
 ('housing', 0.034673745873007096),
 ('cost living', 0.034558645706768726),
 ('median', 0.0341092208218325),
 ('apartment', 0.028312383664619207),
 ('home price', 0.026276175593222588),
 ('california', 0.025645654719534255),
 ('angeles', 0.022325803145221985)]

In [171]:
import plotly

In [170]:
topic_model.visualize_hierarchy()

### 상위 토픽 + 관련된 토픽 확인

- 위의 계층적 군집분석 결과를 보면 topic3과 topic21, topic0과 topic6이 묶임

In [126]:
topic_model.get_topic(0)

[('analyst refinitiv', 0.03372675178059123),
 ('analyst', 0.03358069195842215),
 ('cramer', 0.029788355301943913),
 ('price target', 0.02431798295350915),
 ('cent analyst', 0.02416615560766607),
 ('analyst factset', 0.01646362837866513),
 ('wall street', 0.01471902800971651),
 ('nvidia', 0.0134541032796249),
 ('refinitiv cent', 0.01326958911577641),
 ('trade alert', 0.013018680675714253)]

In [127]:
topic_model.get_topic(6)

[('iphone', 0.12257195997513622),
 ('apple', 0.106950452167445),
 ('ipad', 0.04105567881255938),
 ('smartphone', 0.040866859685472635),
 ('apple iphone', 0.035399444958679),
 ('huawei', 0.03232852302698514),
 ('device', 0.03137481122840396),
 ('tablet', 0.02255925250547163),
 ('android', 0.02098821779092229),
 ('iphone model', 0.019903119080530076)]

토픽 0, 6은 관련도가 낮은 토픽이므로, merge하지 않기로 함.

In [128]:
topic_model.get_topic(3)

[('pfizer', 0.03984946568083507),
 ('vaccine', 0.036666046213431495),
 ('pharmacy', 0.031354301864470226),
 ('medication', 0.031088154769561154),
 ('obesity', 0.02947804525251283),
 ('novo', 0.023910544694010386),
 ('cvs', 0.02173974013150382),
 ('telegram', 0.020354560583392255),
 ('marketplace', 0.020174584958086234),
 ('covid', 0.017711835197507627)]

In [129]:
topic_model.get_topic(21)

[('bric', 0.0945983815734802),
 ('russia', 0.08458820400459105),
 ('ukraine', 0.0673767073142626),
 ('ukraines', 0.03782669575290358),
 ('arabia', 0.03613214881783308),
 ('saudi arabia', 0.03613214881783308),
 ('moscow', 0.031989020757038834),
 ('president vladimir', 0.03146708884101227),
 ('china russia', 0.025810295868082356),
 ('war ukraine', 0.0210772127345319)]

토픽 3과 21이 ai 테마로 묶인다고 판단되어 두 토픽의 keyword를 추출하고 merge 후 토픽 share값을 확인함.

In [134]:
# 키워드 추출
topic_0_keyword = [tu[0] for tu in topic_model.get_topic(3)]
topic_21_keyword = [tu[0] for tu in topic_model.get_topic(21)]
keyword = topic_0_keyword + topic_21_keyword
print(keyword)

['pfizer', 'vaccine', 'pharmacy', 'medication', 'obesity', 'novo', 'cvs', 'telegram', 'marketplace', 'covid', 'bric', 'russia', 'ukraine', 'ukraines', 'arabia', 'saudi arabia', 'moscow', 'president vladimir', 'china russia', 'war ukraine']


In [135]:
# topic merge(ai, gpu)
topics_to_merge = [3, 21]
topic_model.merge_topics(all_texts, topics_to_merge)

In [136]:
#merge 후 share 값 확인
#토픽 3, 21을 합친 토픽이 가장 높은 share를 차지함
topic_stat = get_topic_stats(topic_model).head(20).set_index('Topic')
topic_stat

Unnamed: 0_level_0,Count,Share,CumulativeShare,Name,Representation
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-1,128,20.779221,20.779221,-1_bitcoin_resume_oracle_market,"[bitcoin, resume, oracle, market, sec, etf, ap..."
0,34,5.519481,26.298701,0_analyst refinitiv_price target_cent analyst_...,"[analyst refinitiv, price target, cent analyst..."
1,34,5.519481,31.818182,1_bric_russia_ukraine_pfizer,"[bric, russia, ukraine, pfizer, pharmacy, saud..."
2,29,4.707792,36.525974,2_ai_chatgpt_google_ai model,"[ai, chatgpt, google, ai model, openai, chatbo..."
3,28,4.545455,41.071429,3_retailer_walmart_foot locker_merchandise,"[retailer, walmart, foot locker, merchandise, ..."
4,24,3.896104,44.967532,4_cnn_disney_microsoft_activision,"[cnn, disney, microsoft, activision, espn, sal..."
5,23,3.733766,48.701299,5_election_president donald_indictment_case,"[election, president donald, indictment, case,..."
6,21,3.409091,52.11039,6_iphone_apple_smartphone_apple watch,"[iphone, apple, smartphone, apple watch, apple..."
7,20,3.246753,55.357143,7_playlist_schwartz_taylor_mindset,"[playlist, schwartz, taylor, mindset, billiona..."
8,20,3.246753,58.603896,8_china_beijing_chinas_economist,"[china, beijing, chinas, economist, yuan, peop..."


In [137]:
# 8월 간 토픽 언급량 추이 시각화
topics_over_time = topic_model.topics_over_time(all_texts, timestamps,
                                                global_tuning=True, evolution_tuning=True, nr_bins=8)


8it [04:40, 35.06s/it]


In [138]:
#상위 5개 토픽
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=5)

등락이 크지 않고, 꾸준히 상승하고 있는 ai로 테마를 선정

In [139]:
#모델 저장
topic_model.save(os.path.join(PATH, "NH_topics_model"))

In [140]:
print(keyword)

['pfizer', 'vaccine', 'pharmacy', 'medication', 'obesity', 'novo', 'cvs', 'telegram', 'marketplace', 'covid', 'bric', 'russia', 'ukraine', 'ukraines', 'arabia', 'saudi arabia', 'moscow', 'president vladimir', 'china russia', 'war ukraine']


In [73]:
# 유의미한 키워드만 추출
best_keywords = ['ai', 'google', 'chatgpt', 'chatbot', 'openai', 'czech', 'ai model', 'language model', 'generative ai', 'aws', 'gpu', 'amd', 'processing unit', 'vmware', 'graphic processing', 'computing', 'ai model', 'micro device', 'advanced micro']

# 5. 언급량이 많은 토픽 기반 기업 찾기

## 데이터 전처리(복수형, 동의어)

In [70]:
!pip install inflect
!pip install nltk

Collecting inflect
  Downloading inflect-6.0.5-py3-none-any.whl.metadata (21 kB)
Downloading inflect-6.0.5-py3-none-any.whl (34 kB)
Installing collected packages: inflect
Successfully installed inflect-6.0.5


In [71]:
import pandas as pd
from nltk.corpus import wordnet as wn
import nltk
import inflect


nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/notebook/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /home/notebook/nltk_data...


True

In [74]:
print(best_keywords)

['ai', 'google', 'chatgpt', 'chatbot', 'openai', 'czech', 'ai model', 'language model', 'generative ai', 'aws', 'gpu', 'amd', 'processing unit', 'vmware', 'graphic processing', 'computing', 'ai model', 'micro device', 'advanced micro']


In [75]:
combined_list = []

for word in best_keywords:
    lower_word = word.lower()
    synsets = wn.synsets(lower_word)
    if synsets:
        first_synset = synsets[0]
        combined_list.extend(first_synset.lemma_names())
    combined_list.append(word)

# 결과 리스트 출력
combined_list = list(set([word for word in combined_list if "_" not in word]))
print(combined_list)

['ai', 'advanced micro', 'graphic processing', 'ai model', 'Google', 'processing unit', 'openai', 'vmware', 'AMD', 'language model', 'micro device', 'aws', 'chatbot', 'czech', 'generative ai', 'google', 'gpu', 'amd', 'Czech', 'computing', 'chatgpt', 'AI']


In [76]:
#복수형 생성
def singular_to_plural(word):
    p = inflect.engine()
    return p.plural(word)

new_list = []

for word in combined_list:
    new_word = singular_to_plural(word)
    new_list.append(new_word)

all_list = list(set(combined_list + new_list))

In [77]:
# 최종 키워드
print(all_list)

['ai', 'advanced micro', 'AMDS', 'computings', 'graphic processing', 'ai model', 'advanced micros', 'AIS', 'vmwares', 'processing unit', 'openai', 'amds', 'vmware', 'AMD', 'openais', 'language model', 'micro devices', 'processing units', 'chatbots', 'AI', 'chatgpts', 'micro device', 'aws', 'chatbot', 'aw', 'czech', 'language models', 'generative ai', 'Czechs', 'Googles', 'google', 'gpu', 'amd', 'graphic processings', 'googles', 'generative ais', 'Czech', 'computing', 'czechs', 'chatgpt', 'ais', 'ai models', 'gpus', 'Google']


## 토픽 키워드와 연관된 기업 찾기

In [78]:
# 기업 description 불러오기(yfinance crawling)
stock_info_df = pd.read_csv(os.path.join(PATH, 'stock_description.csv'))

In [79]:
# 토픽 관련 기업 찾기
def retrieve_companies_by_keywords(keywords):
    keywords_set = set([word.lower() for word in keywords])

    # 기업 리스트
    cp = []

    for i, row in stock_info_df.iterrows():
        description = row['description']

        #단어 추출
        if isinstance(description, str):
            description_words = set(description.replace(",", '').lower().replace('.', '').split(" "))

            # 토픽 키워드와 description이 겹치는 기업 찾기
            if description_words & keywords_set:
                cp.append(row['tck_iem_cd'])

    return list(set(cp))

In [80]:
len(retrieve_companies_by_keywords(all_list))

100

In [81]:
# 키워드 언급횟수 count
def count_companies_by_keywords(keywords):
    keywords_set = set([word.lower() for word in keywords])

    # 키워드 언급횟수 딕셔너리 생성
    keyword_counts = {word: 0 for word in keywords_set}

    for i, row in stock_info_df.iterrows():
        description = row['description']

        if isinstance(description, str):
            description_words = set(description.replace(",", '').lower().replace('.', '').split(" "))

            # 키워드 횟수 카운트
            for word in keywords_set:
                if word in description_words:
                    keyword_counts[word] += 1

    for keyword, count in keyword_counts.items():
        print(f"{keyword}: {count} companies")

In [82]:
count_companies_by_keywords(all_list)

ai: 40 companies
advanced micro: 0 companies
computings: 0 companies
graphic processing: 0 companies
ai model: 0 companies
advanced micros: 0 companies
vmwares: 0 companies
processing unit: 0 companies
openai: 0 companies
amds: 0 companies
vmware: 0 companies
openais: 0 companies
language model: 0 companies
micro devices: 0 companies
processing units: 0 companies
chatbots: 0 companies
chatgpts: 0 companies
micro device: 0 companies
aws: 2 companies
chatbot: 0 companies
aw: 0 companies
czech: 4 companies
language models: 0 companies
generative ai: 0 companies
google: 9 companies
gpu: 1 companies
amd: 3 companies
googles: 0 companies
graphic processings: 0 companies
generative ais: 0 companies
computing: 52 companies
czechs: 0 companies
chatgpt: 1 companies
ais: 0 companies
ai models: 0 companies
gpus: 1 companies


In [83]:
cp_list = retrieve_companies_by_keywords(all_list)

In [84]:
#토픽과 관련있는 기업 추출
print(cp_list)

['QMCO', 'NEWT', 'KTOS', 'NTGR', 'CDNS', 'ZBRA', 'NUWE', 'RGTI', 'KC', 'OCGN', 'QUBT', 'VUZI', 'CTSH', 'GOOG', 'TTMI', 'EXAI', 'CNXA', 'GRRR', 'WETG', 'LNTH', 'NICE', 'BRQS', 'MCLD', 'HUT', 'NTAP', 'PRST', 'NNOX', 'ECX', 'KOD', 'AUUD', 'MSFT', 'CRNC', 'INTA', 'RBBN', 'NSIT', 'THRM', 'LVOX', 'AMD', 'VERI', 'SOUN', 'ALTR', 'AOSL', 'FWRG', 'AMPG', 'APLD', 'PERI', 'SMCI', 'XRX', 'MCHP', 'GFAI', 'CCCS', 'CSCO', 'MPWR', 'LIZI', 'STCN', 'MLGO', 'VOD', 'XNET', 'DRS', 'FFIV', 'LWLG', 'BELFB', 'KOSS', 'ABSI', 'RTC', 'TROO', 'CD', 'AIMD', 'LKCO', 'DIOD', 'RIOT', 'INTC', 'MARK', 'SANM', 'PALT', 'OTEX', 'REKR', 'VRNT', 'AMKR', 'POAI', 'RNLX', 'GLMD', 'API', 'PLTK', 'CEVA', 'AKAM', 'LSCC', 'SCSC', 'AEHR', 'AVPT', 'EGIO', 'ANSS', 'TASK', 'SASI', 'NVDA', 'WDC', 'SGH', 'LKQ', 'RDFN', 'TXN']


In [155]:
len(cp_list)

100