In [None]:
'''
20_kw_extract_yake.ipynb

Purpose: extract keywords with YAKE from documents that have no author
keywords but do have an abstract/title, then build keyword rankings.

Input: data/interim/papers_clean.parquet

Output: data/interim/papers_kw_enriched.parquet, outputs/tables/kw_rank_*.csv

Language handling: extract Korean and English separately -> merge the
rankings (including normalization)

Caution: extracted keywords are stored in columns separate from the
"original keywords".

NOTE(review): the cells below read papers.parquet from BASE and write
kw_extract_group_A.parquet, group_A_*_yake.csv and rank_*.csv; only abstracts
are fed to YAKE and the two rankings are kept per language without merging or
normalization. Confirm which description is intended and align the two.
'''

In [8]:
# Mount Google Drive at /content/drive (Colab-only); the project paths used
# later in this notebook live under this mount point.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
# Build group A (no keywords + has an abstract)
# Run YAKE extraction separately on Korean and English abstracts
# Build an overall ranking per language + save to Drive

In [10]:
!pip -q install yake

In [11]:
import os
import pandas as pd
import numpy as np
from collections import Counter
import yake

In [12]:
# Project folder on the mounted Drive, and the input parquet stored inside it.
BASE = "/content/drive/MyDrive/SSU_Datathon2025"
PARQ = BASE + "/papers.parquet"

In [13]:
# Load the paper table from the configured parquet file.
papers = pd.read_parquet(PARQ)

# Resolve the abstract column for each language. Two spellings are accepted
# per language; the first one present wins and None means neither exists.
abst_kr = next((name for name in ("ABST_KR", "KR_ABST") if name in papers.columns), None)
abst_en = next((name for name in ("ABST_EN", "EN_ABST") if name in papers.columns), None)

# At least one abstract column is required for the rest of the notebook.
if not (abst_kr or abst_en):
    raise ValueError("초록 컬럼(ABST_KR/ABST_EN 또는 KR_ABST/EN_ABST)을 찾지 못함")

In [14]:
# Treat empty / whitespace-only strings as missing (keywords, titles, abstracts).
blank_pattern = r"^\s*$"
text_columns = [
    name
    for name in ("KYWD", "NODE_TTLE", "NODE_TTLE_EN", abst_kr, abst_en)
    if name and name in papers.columns  # skip unresolved (None) and absent columns
]
for name in text_columns:
    papers[name] = papers[name].replace(blank_pattern, np.nan, regex=True)

In [15]:
# Build group A: rows without keywords that have at least one abstract.
has_kw = papers["KYWD"].notna()

# Start from whichever abstract column is available (Korean preferred) and
# OR in the English one when both languages are present.
first_abstract_col = abst_kr if abst_kr else abst_en
has_abs = papers[first_abstract_col].notna()
if abst_kr and abst_en:
    has_abs = has_abs | papers[abst_en].notna()

grp_A = papers.loc[~has_kw & has_abs].copy()
print("Group A (no KYWD, has abstract):", len(grp_A))

Group A (no KYWD, has abstract): 1322


In [16]:
# Split group A by abstract language; a language without a resolved column
# gets an empty frame that keeps group A's columns.
if abst_kr:
    df_kr = grp_A.loc[grp_A[abst_kr].notna()].copy()
else:
    df_kr = grp_A.iloc[0:0].copy()

if abst_en:
    df_en = grp_A.loc[grp_A[abst_en].notna()].copy()
else:
    df_en = grp_A.iloc[0:0].copy()

print("KR abstracts in A:", len(df_kr))
print("EN abstracts in A:", len(df_en))

KR abstracts in A: 1278
EN abstracts in A: 300


In [17]:
# YAKE extractors for Korean and English; both share every setting except the language code.
# NOTE(review): confirm the installed YAKE ships a Korean stopword list for
# lan="ko" - the Korean ranking contains function-word phrases, which suggests
# stopwords may not be filtered.
_yake_shared_kwargs = dict(n=3, top=10, dedupLim=0.9, dedupFunc="seqm", windowsSize=1)
yake_kr = yake.KeywordExtractor(lan="ko", **_yake_shared_kwargs)
yake_en = yake.KeywordExtractor(lan="en", **_yake_shared_kwargs)

def extract_yake(extractor, text):
    """Return the keyword strings that ``extractor`` finds in ``text``.

    Args:
        extractor: Object exposing ``extract_keywords(text)`` that returns
            ``(keyword, score)`` pairs, e.g. a ``yake.KeywordExtractor``.
        text: Document text. Non-string or blank input yields no keywords.

    Returns:
        list: Keywords in the order the extractor returned them, or ``[]``
        when the input is unusable or extraction fails.
    """
    import warnings

    if not isinstance(text, str) or not text.strip():
        return []
    try:
        scored = extractor.extract_keywords(text)  # [(kw, score), ...]
        return [kw for kw, _score in scored]
    except Exception as exc:
        # Best effort: one bad document must not abort the whole batch, but the
        # failure is surfaced instead of being silently swallowed.
        warnings.warn(
            f"YAKE extraction failed ({type(exc).__name__}: {exc}); returning no keywords",
            RuntimeWarning,
            stacklevel=2,
        )
        return []

In [18]:
# Per-document keyword extraction: for each language that has rows, store the
# keyword list and a comma-joined string version next to the abstract.
for frame, text_col, extractor, out_col in (
    (df_kr, abst_kr, yake_kr, "yake_kw_kr"),
    (df_en, abst_en, yake_en, "yake_kw_en"),
):
    if len(frame) == 0:
        continue  # no abstracts in this language; leave the frame untouched
    abstracts = frame[text_col].astype(str)
    frame[out_col] = abstracts.apply(lambda doc: extract_yake(extractor, doc))
    frame[out_col + "_str"] = frame[out_col].apply(lambda kws: ", ".join(kws))

In [19]:
# Overall keyword ranking per language (by frequency).
def _frequency_rank(frame, kw_col, limit=300):
    """Count keywords across all rows of ``frame[kw_col]`` and return the top ``limit``.

    An empty ``frame`` yields an empty table with the same two columns.
    """
    if len(frame) == 0:
        return pd.DataFrame(columns=["keyword", "count"])
    tally = Counter()
    for doc_keywords in frame[kw_col]:
        tally.update(doc_keywords)
    return pd.DataFrame(tally.most_common(limit), columns=["keyword", "count"])


rank_kr = _frequency_rank(df_kr, "yake_kw_kr")
rank_en = _frequency_rank(df_en, "yake_kw_en")

# Show the top 30 of each ranking, Korean first.
for ranking in (rank_kr, rank_en):
    display(ranking.head(30))

Unnamed: 0,keyword,count
0,디지털,14
1,Large Language Model,10
2,본고에서는,9
3,ICT,9
4,ITU-T,8
5,정보통신기술( ICT,7
6,에너지,7
7,Internet of Things,7
8,국제전기통신연합( ITU,7
9,연구가 활발히 진행되고,7


Unnamed: 0,keyword,count
0,concise and factual,16
1,abstract,16
2,factual abstract,16
3,required,16
4,concise,16
5,factual,16
6,briefly the background,16
7,purpose and methods,16
8,results and conclusions,16
9,abstract is required,16


In [20]:
# Save group A, the per-document extractions and the per-language rankings
# into the project folder on Drive.
out_A_path = f"{BASE}/kw_extract_group_A.parquet"
out_kr_doc = f"{BASE}/group_A_kr_yake.csv"
out_en_doc = f"{BASE}/group_A_en_yake.csv"
out_kr_rank = f"{BASE}/rank_kr.csv"
out_en_rank = f"{BASE}/rank_en.csv"

written_paths = []

grp_A.to_parquet(out_A_path, index=False)
written_paths.append(out_A_path)

# Per-document CSVs are written only for languages that had abstracts.
for frame, path in ((df_kr, out_kr_doc), (df_en, out_en_doc)):
    if len(frame) > 0:
        frame.to_csv(path, index=False, encoding="utf-8-sig")
        written_paths.append(path)

# Rankings are always written, even when they contain no rows.
for ranking, path in ((rank_kr, out_kr_rank), (rank_en, out_en_rank)):
    ranking.to_csv(path, index=False, encoding="utf-8-sig")
    written_paths.append(path)

# Report only the files that were actually written, so a skipped CSV is not
# announced as saved.
for path in written_paths:
    print("saved:", path)

saved: /content/drive/MyDrive/SSU_Datathon2025/kw_extract_group_A.parquet
saved: /content/drive/MyDrive/SSU_Datathon2025/group_A_kr_yake.csv
saved: /content/drive/MyDrive/SSU_Datathon2025/group_A_en_yake.csv
saved: /content/drive/MyDrive/SSU_Datathon2025/rank_kr.csv
saved: /content/drive/MyDrive/SSU_Datathon2025/rank_en.csv


In [23]:
# 작업 공간 이동
%cd /content
!ls

# 공개 레포 클론
!git clone https://github.com/kkhhmm3103/SSU_Datathon2025.git
%cd /content/SSU_Datathon2025

# 현재 브랜치 확인
!git branch


/content
drive  sample_data
Cloning into 'SSU_Datathon2025'...
/content/SSU_Datathon2025


In [24]:
# Set the global git author name and e-mail for this runtime.
# NOTE(review): a personal e-mail address is hardcoded and saved with the
# notebook - consider reading it from an environment variable or a prompt.
!git config --global user.name "kkhhmm3103"
!git config --global user.email "kkhhmm23@gmail.com"

In [25]:
# Enter the cloned repository and list its contents, hidden entries included.
%cd /content/SSU_Datathon2025
!ls -la

/content/SSU_Datathon2025
total 12
drwxr-xr-x 3 root root 4096 Jan  6 15:42 .
drwxr-xr-x 1 root root 4096 Jan  6 15:42 ..
drwxr-xr-x 7 root root 4096 Jan  6 15:42 .git


In [28]:
%%bash
# Write the repository's .gitignore so parquet/csv/json files, notebook
# checkpoints and .DS_Store files are not tracked.
cd /content/SSU_Datathon2025

# The quoted 'EOF' delimiter makes the shell write the heredoc body literally.
cat > .gitignore << 'EOF'
*.parquet
*.csv
*.json
.ipynb_checkpoints/
.DS_Store
EOF

In [29]:
%cd /content/SSU_Datathon2025
!git status
!git add notebooks .gitignore
!git commit -m "Add Colab notebooks"

/content/SSU_Datathon2025
On branch main

No commits yet

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31m.gitignore[m

nothing added to commit but untracked files present (use "git add" to track)
fatal: pathspec 'notebooks' did not match any files
On branch main

Initial commit

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31m.gitignore[m

nothing added to commit but untracked files present (use "git add" to track)
