In [1]:
import json
import pandas as pd
import numpy as np
import re
import hanlp
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
# 📌 Read the news dataset (UTF-8 encoded JSON) from disk
file_path = "../data/news_최종.json"
with open(file_path, encoding="utf-8") as news_file:
    data = json.load(news_file)

# 📌 Build a DataFrame from the parsed JSON
df = pd.DataFrame(data)

In [3]:
# Discard any previously stored tokenization; a missing column is simply ignored.
df = df.drop(columns=["tokenized_content"], errors="ignore")

In [4]:
# 📌 1. Load the HanLP tokenization model.
# "CTB9_TOK_ELECTRA_BASE_CRF" is the model identifier handed to hanlp.load;
# the resulting object is stored globally as `segmenter`.
segmenter = hanlp.load("CTB9_TOK_ELECTRA_BASE_CRF")

  from .autonotebook import tqdm as notebook_tqdm
                                   

In [5]:
# The first four characters of the quarter label are the year ("2006Q3" → 2006).
df["year"] = df["quarter"].str[:4].astype(int)

# Articles from 2006 through 2009 (inclusive) are pooled into a single period;
# every other article keeps its own quarter label (YYYYQ#) / year.
is_pooled_period = df["year"].between(2006, 2009)
df["input_quarter"] = df["quarter"].mask(is_pooled_period, "2006-2009Q")
df["input_year"] = df["year"].astype(str).mask(is_pooled_period, "2006-2009")


In [6]:
 # Stopword list: tokens in this set are dropped from the filtered token lists.
chinese_stopwords =  set([
    # Roughly: particles, pronouns and connectives
    "的", "了", "在", "是", "和", "与", "也", "有", "对", "就", "以", "将", "要",
"但", "其", "而", "此", "我们", "他们", "你们", "可以", "但是", "这样", "这个",
    "其中", "其中之一", "包括", "根据", "由于", "通过", "此外", "同时",
    # Roughly: journalism / reporting vocabulary (reporter, news, said, pointed out, ...)
    "记者", "新闻", "媒体", "采访", "发表", "宣布", "透露", "介绍", "报道称",
    "指出", "强调", "证实", "承认", "评价", "评论", "提到", "解释", "分析",
    "总结", "预测", "预计", "关注", "反映", "说明", "进一步",
    # Roughly: relative time expressions (today, last week, this year, ...)
    "今天", "昨天", "明天", "日前", "近日", "本周", "上周", "下周", "目前",
    "现在", "过去", "未来", "今年", "去年", "明年", "此前", "随后", "当地",
    # Roughly: generic officialdom terms (department, official, spokesperson, ...)
    "相关", "部门", "官员", "发言人", "部长", "事务",
    # Miscellaneous high-frequency words, numerals, measure words and adverbs
    "一", "二", "这", "不", "可能", "都", "才", "可", "一直", "到", "如果", "带来",
    "十分", "称", "个", "次", "因", "因此", "最", "即", "当天", "因为", "曾",
    "号", "第", "相当", "两", "很", "所以", "各种", "从而", "仍", "为了", "以及",
    "据", "并", "过", "几", "立即", "着", "于", "为", "说", "却", "使", "还",
    "来说", "至", "会", "除了", "被", "外", "若", "更", "已经", "已", "大大", "不仅",
    "能够", "再", "一些", "明确", "作为", "向",  "是否", "积极", "讨论",
    "越来越", "甚至", "出于", "以来", "重启", "近期", "依然", "影响", "之间", "重点",
    "方面", "力度", "报道", "如何", "周年", "文章", "前往", "通讯社", "事情",
    "社论", "非常", "上午", "时间", "它们", "应该", "不断", "结束", "愿意",
    "最后", "开始", "缺乏", "自己", "调查", "今后", "乃至", "特别", "深化",
    "即将", "类似", "什么", "没有", "方向", "更为", "需要", "做出",
    # Company / outlet name fragments and a separator glyph.
    # NOTE: "将" already appears in the first group; the repeat is harmless in a set.
    "公司", "平台", "环球", "时报", "人民", "日报", "新华社", "看起来",
    "凡是", "将", "丨"
])


In [8]:
# Matches a run of digits followed by a day/month/year marker, e.g. "5日", "7月", "2024年".
_DATE_PATTERN = re.compile(r'\d+[日月年]')


def tokenize_and_process(text, chinese_stopwords, tokenizer=None):
    """Tokenize one document and derive a date-free, stopword-free token list.

    :param text: Raw document text. Anything that is not a non-blank string
        (None, NaN, "", whitespace only) yields two empty lists instead of
        being passed to the tokenizer.
    :param chinese_stopwords: Collection of tokens to drop; membership is
        tested after the date pattern has been stripped from the token.
    :param tokenizer: Optional callable mapping a string to a sequence of
        tokens. Defaults to the notebook-level ``segmenter`` so existing
        two-argument calls behave as before.
    :return: ``(token_list, filtered_list)`` where ``token_list`` is the
        tokenizer output untouched (dates included) and ``filtered_list`` is
        the same sequence with date patterns removed, emptied tokens dropped
        and stopwords dropped.
    """
    # Guard against missing values so one bad row cannot abort a whole DataFrame pass.
    if not isinstance(text, str) or not text.strip():
        return [], []

    if tokenizer is None:
        tokenizer = segmenter

    # 1️⃣ Tokenize; the raw tokens (with dates) are returned as-is.
    token_list = tokenizer(text)

    filtered_list = []
    for token in token_list:
        # 2️⃣ Strip date fragments from inside the token ("5日电" → "电").
        without_date = _DATE_PATTERN.sub('', token)
        # 3️⃣ Skip tokens that became empty/blank, 4️⃣ and skip stopwords.
        if without_date.strip() and without_date not in chinese_stopwords:
            filtered_list.append(without_date)

    return token_list, filtered_list

# Tokenize every article, then split the (raw tokens, filtered tokens) pairs into two new columns.
token_pairs = [tokenize_and_process(article_text, chinese_stopwords) for article_text in df["cleaned_content"]]
df["tokenized_content"], df["filtered_content"] = zip(*token_pairs)

In [20]:
# Persist the current DataFrame as a list of row records (UTF-8, non-ASCII kept readable).
json_filename = "../data/tokenized_data.json"
row_records = df.to_dict(orient="records")
with open(json_filename, mode="w", encoding="utf-8") as out_file:
    json.dump(row_records, out_file, ensure_ascii=False, indent=4)

In [12]:
def convert_list_to_string(df, columns):
    """
    Join list-valued cells of the given columns into space-separated strings.

    The DataFrame is modified in place and also returned. Column names that
    are not present in ``df`` are skipped; cells that are not lists are left
    unchanged.

    :param df: DataFrame to convert
    :param columns: column names to convert (e.g. ["tokenized_content", "filtered_content"])
    :return: the same DataFrame, after conversion
    """
    def _join_if_list(cell):
        if isinstance(cell, list):
            return " ".join(cell)
        return cell

    present_columns = [name for name in columns if name in df.columns]
    for name in present_columns:
        df[name] = df[name].apply(_join_if_list)
    return df

# Convert the token lists in both columns into space-separated strings
df = convert_list_to_string(df, ["tokenized_content", "filtered_content"])


In [13]:
# Display the DataFrame to inspect the derived columns
df

Unnamed: 0,title,link,date,content,source,quarter,cleaned_content,year,input_quarter,input_year,tokenized_content,filtered_content
0,朝鲜试射导弹亚太骤起波澜,https://www.gmw.cn/01gmrb/2006-07/12/content_4...,2006-07-12,7月5日，朝鲜连续发射7枚导弹，引起国际社会震惊。近日，美日在安理会积极推动制裁朝鲜的议案，...,光明网,2006Q3,7月5日朝鲜连续发射7枚导弹引起国际社会震惊。近日美日在安理会积极推动制裁朝鲜的议案并加紧协...,2006,2006-2009Q,2006-2009,7月 5日 朝鲜 连续 发射 7 枚 导弹 引起 国际 社会 震惊 。 近日 美 日 在 安...,朝鲜 连续 发射 7 枚 导弹 引起 国际 社会 震惊 。 美 日 安理会 推动 制裁 朝鲜...
1,美朝关系开始解冻,https://www.gmw.cn/01gmrb/2007-03/08/content_5...,2007-03-08,被外界视为“破冰之旅”的朝鲜副外相金桂冠3月5至6日在纽约与美国助理国务卿希尔就两国关系正常...,光明网,2007Q1,被外界视为破冰之旅的朝鲜副外相金桂冠3月5至6日在纽约与美国助理国务卿希尔就两国关系正常化进...,2007,2006-2009Q,2006-2009,被 外界 视为 破冰 之 旅 的 朝鲜 副外相 金桂冠 3月 5 至 6日 在 纽约 与 美...,外界 视为 破冰 之 旅 朝鲜 副外相 金桂冠 5 纽约 美国 助理 国务卿 希尔 国 关系...
2,美日韩间谍云集朝鲜周边空中侦察地面窃听 - 国际经济,http://intl.ce.cn/zj/200809/19/t20080919_16855...,2008-09-19,神秘，这是朝鲜留给外界的印象。 正因为神秘，一些国家的情报机构费尽心机，通过投放卫星、派遣侦...,中国经济网,2008Q3,神秘这是朝鲜留给外界的印象。 正因为神秘一些国家的情报机构费尽心机通过投放卫星派遣侦察机甚至...,2008,2006-2009Q,2006-2009,神秘 这 是 朝鲜 留给 外界 的 印象 。 正 因为 神秘 一些 国家 的 情报 机构 费...,神秘 朝鲜 留给 外界 印象 。 正 神秘 国家 情报 机构 费尽心机 投放 卫星 派遣 侦...
3,美日韩紧盯朝鲜射导弹(图) - 国际经济,http://intl.ce.cn/zj/200902/11/t20090211_18171...,2009-02-11,改进型“大浦洞2号”据说可攻击美本土，甚至可能具有搭载核弹头技术 朝鲜政府最近频繁发出朝韩关...,中国经济网,2009Q1,改进型大浦洞2号据说可攻击美本土甚至可能具有搭载核弹头技术 朝鲜政府最近频繁发出朝韩关系恶化...,2009,2006-2009Q,2006-2009,改进型 大浦洞 2 号 据说 可 攻击 美 本土 甚至 可能 具有 搭载 核 弹头 技术 朝...,改进型 大浦洞 2 据说 攻击 美 本土 具有 搭载 核 弹头 技术 朝鲜 政府 最近 频繁...
4,美国朝鲜问题特使称美愿与朝对话,https://world.huanqiu.com/article/9CaKrnJlEWs,2009-03-07,新华网首尔3月7日电 (记者李拯宇 干玉兰) 美国朝鲜问题特使斯蒂芬·博斯沃思7日在韩国说，...,环球网,2009Q1,新华网首尔3月7日电 记者李拯宇 干玉兰 美国朝鲜问题特使斯蒂芬博斯沃思7日在韩国说美国愿意...,2009,2006-2009Q,2006-2009,新华网 首尔 3月 7日 电 记者 李拯宇 干玉兰 美国 朝鲜 问题 特使 斯蒂芬博斯沃思 ...,新华网 首尔 电 李拯宇 干玉兰 美国 朝鲜 问题 特使 斯蒂芬博斯沃思 韩国 美国 朝鲜 ...
...,...,...,...,...,...,...,...,...,...,...,...,...
1019,詹德斌：韩国政局变动，美国要做的不应是施压,https://hqtime.huanqiu.com/share/article/4Ky1A...,2025-01-07,美国国务卿布林肯1月5日抵达韩国。这看上去似乎是一次礼仪性道别之旅，但防止韩国“后弹劾政局”...,环球网,2025Q1,美国国务卿布林肯1月5日抵达韩国。这看上去似乎是一次礼仪性道别之旅但防止韩国后弹劾政局对美韩...,2025,2025Q1,2025,美国 国务卿 布林肯 1月 5日 抵达 韩国 。 这 看上去 似乎 是 一 次 礼仪性 道别...,美国 国务卿 布林肯 抵达 韩国 。 看上去 似乎 礼仪性 道别 之 旅 防止 韩国 后 弹...
1020,朝鲜谴责美韩军事挑衅导致地区局势恶化,http://www.xinhuanet.com/20250126/55a39b230ccf...,2025-01-26,新华社首尔1月26日电 据朝中社26日报道，朝鲜外务省对外政策室长当天发表谈话，谴责美韩近期...,新华网,2025Q1,新华社首尔1月26日电 据朝中社26日报道朝鲜外务省对外政策室长当天发表谈话谴责美韩近期对朝...,2025,2025Q1,2025,新华社 首尔 1月 26日 电 据 朝中社 26日 报道 朝鲜 外务省 对外 政策 室长 当...,首尔 电 朝中社 朝鲜 外务省 对外 政策 室长 谈话 谴责 美 韩 朝 军事 挑衅 行为 ...
1021,朝鲜外务省：美韩军事挑衅行为导致地区局势恶化 - 国际频道,https://world.gmw.cn/2025-01/26/content_378203...,2025-01-26,中新网1月26日电据朝中社报道，当地时间1月26日，朝鲜外务省对外政策室长发表谈话，谴责美韩...,光明网,2025Q1,中新网1月26日电据朝中社报道当地时间1月26日朝鲜外务省对外政策室长发表谈话谴责美韩近期对...,2025,2025Q1,2025,中新网 1月 26日 电 据 朝中社 报道 当地 时间 1月 26日 朝鲜 外务省 对外 政...,中新网 电 朝中社 朝鲜 外务省 对外 政策 室长 谈话 谴责 美 韩 朝 军事 挑衅 行为...
1022,石破茂与特朗普会面，美日“小集团”谋划“大算盘” | 国际识局,http://intl.ce.cn/qqss/202502/09/t20250209_392...,2025-02-09,中新网2月9日电(记者 孟湘君)特朗普当选新一任美国总统后，意大利、以色列等国领导人先后与其...,中国经济网,2025Q1,中新网2月9日电记者 孟湘君特朗普当选新一任美国总统后意大利以色列等国领导人先后与其会面。近...,2025,2025Q1,2025,中新网 2月 9日 电 记者 孟湘君 特朗普 当选 新 一 任 美国 总统 后 意大利 以色...,中新网 电 孟湘君 特朗普 当选 新 任 美国 总统 后 意大利 以色列 等 国 领导人 先...


In [15]:
# 2️⃣ IDF computed across quarters (each quarter counts as one "document")

# All distinct quarter labels and how many there are
quarters = df["input_quarter"].unique()
Q = len(quarters)

# For every term, count the number of quarters in which it occurs at least once
term_quarter_count = {}

for quarter_label in quarters:
    in_quarter = df["input_quarter"] == quarter_label
    # Merge all filtered articles of the quarter; tokens are whitespace-separated
    merged_text = " ".join(df.loc[in_quarter, "filtered_content"].tolist())
    for term in set(merged_text.split()):
        if term in term_quarter_count:
            term_quarter_count[term] += 1
        else:
            term_quarter_count[term] = 1

# idf(term) = ln(Q / number of quarters containing the term)
idf_scores = {}
for term, quarters_with_term in term_quarter_count.items():
    idf_scores[term] = np.log(Q / quarters_with_term)


In [16]:
# 3️⃣ Per-quarter TF (each quarter is treated as a single document)

# Concatenate all filtered articles of a quarter into one space-separated document.
quarter_texts = df.groupby("input_quarter")["filtered_content"].apply(lambda x: " ".join(x)).reset_index()

# TF only (use_idf=False); tokens are the space-separated pieces produced upstream.
# lowercase=False is required: the vectorizer lowercases by default, while the keys of
# `idf_scores` keep their original case, so any token containing an uppercase letter
# would otherwise never be found in `idf_scores` and would silently skip IDF weighting.
# NOTE: with use_idf=False the vectorizer still applies its default row normalisation,
# so these are normalised term frequencies, not raw counts.
vectorizer = TfidfVectorizer(use_idf=False, lowercase=False, token_pattern=r'[^ ]+')
tf_matrix = vectorizer.fit_transform(quarter_texts["filtered_content"])
vocabulary = vectorizer.get_feature_names_out()
tf_df = pd.DataFrame(tf_matrix.toarray(), columns=vocabulary)

# Terms the vectorizer produced but the IDF table does not know about. They keep their
# plain TF value (weight 1.0), exactly as before; inspect this list if it is not empty.
terms_without_idf = [term for term in vocabulary if term not in idf_scores]

# Apply the IDF weights to all columns in one vectorised multiplication.
idf_weights = np.array([idf_scores.get(term, 1.0) for term in vocabulary], dtype=float)
tf_df = tf_df.mul(idf_weights, axis="columns")

# Attach the quarter label of each row.
tf_df = pd.concat([tf_df, quarter_texts[['input_quarter']].reset_index(drop=True)], axis=1)

In [28]:
# Keep only the terms that have a non-zero score in at least 3 quarters.
score_columns = tf_df.drop(columns=['input_quarter'])
word_counts = score_columns.gt(0).sum()
frequent_terms = word_counts.index[word_counts >= 3].tolist()
tf_df_filtered = tf_df.loc[:, frequent_terms + ['input_quarter']]
print(tf_df_filtered)

          01        02        08       08版         1        10       100  \
0   0.007806  0.007806  0.000000  0.000000  0.000000  0.002048  0.011877   
1   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
2   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
3   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
4   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
5   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
6   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
7   0.000000  0.000000  0.000000  0.000000  0.000000  0.004601  0.013338   
8   0.000000  0.000000  0.000000  0.000000  0.000953  0.000000  0.000000   
9   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.001791   
10  0.000000  0.000000  0.000000  0.000000  0.006714  0.002656  0.005774   
11  0.000000  0.000000  0.000000  0.000000  0.000000  0.001186  0.000000   
12  0.004559

In [29]:
# Display the filtered TF-IDF table
tf_df_filtered

Unnamed: 0,01,02,08,08版,1,10,100,1000,1000万,1000亿,...,齐心协力,龃龉,龙,龙山,！,２０,４,５０,？,input_quarter
0,0.007806,0.007806,0.0,0.0,0.0,0.002048,0.011877,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.007806,0.000541,2006-2009Q
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2010Q1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001617,2010Q3
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2010Q4
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.028802,0.0,0.0,0.0,0.000656,2011Q1
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2011Q3
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2011Q4
7,0.0,0.0,0.0,0.0,0.0,0.004601,0.013338,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001214,2012Q1
8,0.0,0.0,0.0,0.0,0.000953,0.0,0.0,0.0,0.0,0.004763,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005575,2012Q2
9,0.0,0.0,0.0,0.0,0.0,0.0,0.001791,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.001791,0.0,0.0,0.0,0.003915,2012Q3


In [19]:
# 🔹 For every quarter, list the 20 terms with the highest TF-IDF score (score shown in parentheses)
report_blocks = []
for quarter in tf_df['input_quarter'].unique():
    in_quarter = tf_df['input_quarter'] == quarter
    quarter_scores = tf_df.loc[in_quarter].drop(columns=['input_quarter'])
    # Number of source articles that fall into this quarter
    num_documents = int((df['input_quarter'] == quarter).sum())
    # Column-wise total over the quarter's rows, highest scores first
    top_words = quarter_scores.sum(axis=0).sort_values(ascending=False).head(20)
    keyword_line = ", ".join(f"{kw}({round(top_words[kw], 4)}점)" for kw in top_words.index)
    report_blocks.append(f"{quarter}분기 중요 키워드 (문서 수: {num_documents}개):\n{keyword_line}\n\n")

result_text = "".join(report_blocks)
print(result_text)

2006-2009Q분기 중요 키워드 (문서 수: 14개):
间谍(0.0551점), 葛瑞格森(0.0527점), 澳门(0.0525점), 侦察机(0.0509점), 金正日(0.0505점), 金桂冠(0.0438점), 发射(0.0424점), 卫星(0.0423점), 汇业(0.0422점), 金正男(0.0422점), 纽金特(0.0422점), 大浦洞(0.0414점), 总参谋部(0.0383점), 希尔(0.0348점), 导弹(0.0325점), 谍报(0.0316점), 第１７１８(0.0316점), 自救(0.0316점), 安理会(0.0316점), 试射(0.0312점)

2010Q1분기 중요 키워드 (문서 수: 1개):
金色(0.5229점), 眼镜蛇(0.4158점), 本杰明密克森(0.21점), 观察员国(0.1743점), 宋伟钢(0.1743점), 观摩(0.1386점), 泰国(0.1344점), 中将(0.1271점), 救灾(0.1098점), 海外(0.0927점), 人道主义(0.0641점), 实习(0.0611점), 增多(0.0611점), 陆军(0.0611점), 演习(0.0597점), 机会(0.0568점), 军演(0.0558점), 身份(0.0532점), 军队(0.0453점), 参加(0.0426점)

2010Q3분기 중요 키워드 (문서 수: 2개):
军演(0.0843점), 希拉里(0.0732점), 猛禽(0.0696점), 盖茨(0.0635점), 反空(0.0635점), 抵挡(0.0635점), 军输(0.0635점), 200多(0.0635점), 金泰荣(0.0635점), 柳明桓(0.0635점), 防御战(0.0635점), 敌(0.0534점), 围堵(0.0529점), 反渗透(0.0527점), 调查团(0.0527점), 警戒舰(0.0527점), 探寻(0.0464점), 供给(0.0464점), 沉没(0.0464점), 空对地(0.0464점)

2010Q4분기 중요 키워드 (문서 수: 8개):
马伦(0.197점), 浓缩(0.0995점), 铀(0.0995점), 迈克尔(0.0863점), 前原诚司(0.0653점), 曹颖(0.0

In [15]:

# Show the distinct quarter labels and their count
print(quarters, Q)

['2006-2009Q' '2010Q1' '2010Q3' '2010Q4' '2011Q1' '2011Q3' '2011Q4'
 '2012Q1' '2012Q2' '2012Q3' '2012Q4' '2013Q1' '2013Q2' '2013Q3' '2013Q4'
 '2014Q1' '2014Q2' '2014Q3' '2014Q4' '2015Q1' '2015Q2' '2015Q3' '2015Q4'
 '2016Q1' '2016Q2' '2016Q3' '2016Q4' '2017Q1' '2017Q2' '2017Q3' '2017Q4'
 '2018Q1' '2018Q2' '2018Q3' '2018Q4' '2019Q1' '2019Q2' '2019Q3' '2019Q4'
 '2020Q1' '2020Q3' '2020Q4' '2021Q1' '2021Q2' '2021Q3' '2021Q4' '2022Q1'
 '2022Q2' '2022Q3' '2022Q4' '2023Q1' '2023Q2' '2023Q3' '2023Q4' '2024Q1'
 '2024Q2' '2024Q3' '2024Q4' '2025Q1'] 59


In [27]:
# Number of articles per quarter, in the same quarter order as tf_df.
# (pandas is already imported in the first cell. The rows are built without reusing the
# name `data`, which holds the raw JSON loaded at the top of the notebook.)
quarter_doc_counts = df['input_quarter'].value_counts()
count_rows = [
    [quarter, int(quarter_doc_counts.get(quarter, 0))]
    for quarter in tf_df['input_quarter'].unique()
]

# Build the summary table (columns: quarter label, document count)
count_df = pd.DataFrame(count_rows, columns=['분기', '문서 수'])

# Save as CSV (currently disabled)
# count_df.to_csv('quarterly_document_counts.csv', index=False, encoding='utf-8-sig')


In [28]:
# Display the per-quarter document counts
count_df

Unnamed: 0,분기,문서 수
0,2006-2009Q,14
1,2010Q1,1
2,2010Q3,2
3,2010Q4,8
4,2011Q1,6
5,2011Q3,1
6,2011Q4,2
7,2012Q1,6
8,2012Q2,17
9,2012Q3,10
