# 0. 크롤링 데이터 불러오기

In [1]:
import pandas as pd

df = pd.read_csv("뉴스 크롤링.csv", encoding="utf-8")

df

Unnamed: 0,date,title
0,2018-01-01,"우체국 1년정기예금금리0.3%p 인상, 연 1.6%→1.9%"
1,2018-01-01,[2018 경제기상도] 미국 기준금리상승 압력…韓銀 하반기 인상에 무게
2,2018-01-01,[2018 경제기상도] 보유세·금리인상…불확실성 커진 주택시장
3,2018-01-02,학자금 대출금리2.20%로 인하…3일부터 신청
4,2018-01-02,"이주열 ""기준금리추가 인상, 경기지표·상황 뒷받쳐 주면 하는 것"""
...,...,...
8414,2025-08-19,“중대재해 터진 기업 자금난 각오해야”...대출금리·한도·만기 다 조인다
8415,2025-08-19,‘중대재해 엄벌’ 코드 맞추는 금융위...“사고 땐 대출금리·한도 페널티”
8416,2025-08-19,사망사고 낸 기업 대출엔금리올리고 한도 깎는다
8417,2025-08-19,"중대재해 기업엔 대출금리높이고, 한도 줄인다는데"


# 1. 감성분석(KR-FinBERT 모델 활용-한국어 금융 텍스트 특화)

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F
from tqdm import tqdm  

# 빈 제목 제거
df = df.dropna(subset=["title"])

# 모델 로드 (KR-FinBERT)
model_name = "snunlp/KR-FinBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# 감성 분석 함수
def get_sentiment_scores(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = F.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
    
    hawkish_score = probs[0]   # LABEL_0 → Hawkish
    dovish_score = probs[1]    # LABEL_1 → Dovish
    tone_score = hawkish_score - dovish_score
    
    return hawkish_score, dovish_score, tone_score

# tqdm 진행상황 표시
scores = []
for title in tqdm(df["title"], desc="뉴스 제목 감성분석 진행중"):
    scores.append(get_sentiment_scores(title))

df[["Hawkish_score", "Dovish_score", "Tone_score"]] = pd.DataFrame(scores, index=df.index)

df

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at snunlp/KR-FinBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
뉴스 제목 감성분석 진행중:   0%|                       | 0/8419 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
뉴스 제목 감성분석 진행중: 100%|████████████| 8419/8419 [04:17<00:00, 32.68it/s]


Unnamed: 0,date,title,Hawkish_score,Dovish_score,Tone_score
0,2018-01-01,"우체국 1년정기예금금리0.3%p 인상, 연 1.6%→1.9%",0.595807,0.404193,0.191613
1,2018-01-01,[2018 경제기상도] 미국 기준금리상승 압력…韓銀 하반기 인상에 무게,0.514450,0.485550,0.028900
2,2018-01-01,[2018 경제기상도] 보유세·금리인상…불확실성 커진 주택시장,0.597593,0.402407,0.195185
3,2018-01-02,학자금 대출금리2.20%로 인하…3일부터 신청,0.626408,0.373592,0.252816
4,2018-01-02,"이주열 ""기준금리추가 인상, 경기지표·상황 뒷받쳐 주면 하는 것""",0.526504,0.473496,0.053008
...,...,...,...,...,...
8414,2025-08-19,“중대재해 터진 기업 자금난 각오해야”...대출금리·한도·만기 다 조인다,0.573563,0.426437,0.147125
8415,2025-08-19,‘중대재해 엄벌’ 코드 맞추는 금융위...“사고 땐 대출금리·한도 페널티”,0.545848,0.454152,0.091696
8416,2025-08-19,사망사고 낸 기업 대출엔금리올리고 한도 깎는다,0.586583,0.413417,0.173166
8417,2025-08-19,"중대재해 기업엔 대출금리높이고, 한도 줄인다는데",0.555093,0.444907,0.110186


# 2. 전처리(데이터 클리닝, 결측값 처리)

In [4]:
# date 컬럼을 datetime 형식으로 변환 후 인덱스로 지정
df["date"] = pd.to_datetime(df["date"])
df = df.set_index("date")   # 인덱스로 지정 → 기존 date 칼럼은 제거됨

# 전체 날짜 범위 생성
full_range = pd.date_range(start="2018-01-01", end="2025-08-19", freq="D")

# 날짜별 Tone_score 평균 계산
daily_tone = df.groupby(df.index)["Tone_score"].mean()

# 전체 날짜 인덱스 맞추기 (뉴스 없는 날은 NaN)
daily_tone = daily_tone.reindex(full_range)

daily_tone.index.name = "date"
daily_tone = daily_tone.to_frame(name="Tone_score_mean")

KeyError: 'date'

In [5]:
daily_tone

Unnamed: 0_level_0,Tone_score_mean
date,Unnamed: 1_level_1
2018-01-01,0.138566
2018-01-02,0.165511
2018-01-03,0.238423
2018-01-04,0.102867
2018-01-05,0.150966
2018-01-06,
2018-01-07,
2018-01-08,0.212867
2018-01-09,0.145854
2018-01-10,0.108553


In [6]:
# 시간 기반 결측치 보간 (NaN → 날짜 간격 고려한 값으로 채움)
daily_tone_interp = daily_tone.interpolate(method="time")

daily_tone_interp

Unnamed: 0_level_0,Tone_score_mean
date,Unnamed: 1_level_1
2018-01-01,0.138566
2018-01-02,0.165511
2018-01-03,0.238423
2018-01-04,0.102867
2018-01-05,0.150966
2018-01-06,0.1716
2018-01-07,0.192234
2018-01-08,0.212867
2018-01-09,0.145854
2018-01-10,0.108553


In [7]:
daily_tone_interp.to_csv("감성분석 톤 점수 결과.csv", encoding="utf-8-sig")