<a href="https://colab.research.google.com/github/kkhhmm3103/SSU_Datathon2025/blob/main/%E1%84%8C%E1%85%A5%E1%86%AB%E1%84%8E%E1%85%A5%E1%84%85%E1%85%B5_%E1%84%8C%E1%85%A6%E1%84%8E%E1%85%AE%E1%86%AF%E1%84%8B%E1%85%AD%E1%86%BC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import unicodedata
import itertools
import pandas as pd
from collections import Counter, defaultdict
from typing import Union

In [None]:
# Load the keyword dataset; the "utf-8-sig" codec also accepts a leading BOM.
df = pd.read_csv("keyword_bert_df.csv", encoding="utf-8-sig")

# Quick look at the size and the available columns of the loaded frame.
print(df.shape)
print(df.columns)

In [None]:
MAP_CSV = "keyword_map.csv"    # columns: raw, normalized
DROP_CSV = "keyword_drop.csv"  # column: raw


def load_keyword_maps(map_csv=MAP_CSV, drop_csv=DROP_CSV):
    """Read the keyword mapping CSV and the keyword drop-list CSV.

    Rows with a missing value in a required column are discarded and the
    remaining values are converted to ``str`` and stripped. Keys are NOT
    lower-cased or otherwise normalized here; callers are expected to do
    that after the normalization helpers are defined.

    Args:
        map_csv: Path of a CSV with ``raw`` and ``normalized`` columns.
        drop_csv: Path of a CSV with a ``raw`` column.

    Returns:
        ``(map_dict, drop_set)`` where ``map_dict`` maps each stripped
        ``raw`` value to its stripped ``normalized`` value and ``drop_set``
        is the set of stripped ``raw`` values from the drop list.
    """

    def _read_stripped(path, required_cols):
        # Shared loading recipe: read, drop incomplete rows, strip the text.
        frame = pd.read_csv(path, encoding="utf-8-sig")
        frame = frame.dropna(subset=required_cols).copy()
        for col in required_cols:
            frame[col] = frame[col].astype(str).str.strip()
        return frame

    # --- mapping table ---
    mapping = _read_stripped(map_csv, ["raw", "normalized"])
    map_dict = dict(zip(mapping["raw"], mapping["normalized"]))

    # --- drop list ---
    drops = _read_stripped(drop_csv, ["raw"])
    drop_set = set(drops["raw"].tolist())

    return map_dict, drop_set

# Preprocessing block: settings for keyword normalization / standardization.

# Keywords filtered out of the per-document keyword lists.
GENERIC_KW = {"연구", "기술"}
# Normalized keywords removed only on an exact match.
REMOVE_EXACT_NORM = {"technology", "연구", "기술"}

# ------------------------------------------------------------
# Policy for connector symbols inside a token (hyphen, slash, ...)
# - True : keep them, so U-net, BERT-base, 5G/6G stay intact
# - False: turn connectors into a space, splitting the token (U-net -> U net)
# ------------------------------------------------------------
KEEP_INNER_CONNECTORS = True

BRACKET_PATTERNS = [
    r"\([^)]*\)",
    r"\[[^\]]*\]",
    r"\{[^}]*\}",
    r"\<[^>]*\>",
]
MULTISPACE_RE = re.compile(r"\s+")
HANGUL_RE = re.compile(r"[가-힣]")

# With the "keep" policy only the middle-dot / bullet separators become
# spaces; otherwise every connector symbol does.
CONNECTORS_RE = re.compile(
    r"[·•]+" if KEEP_INNER_CONNECTORS else r"[_\-–—/\\·•]+"
)

# 공통 유틸
def split_keywords(x, sep=","):
    """Split a delimited keyword string into a list of stripped keywords.

    Args:
        x: The delimited keyword string. Anything that is not a ``str``
            (e.g. ``None`` or NaN) or is blank yields an empty list.
        sep: Delimiter between keywords.

    Returns:
        The non-empty keywords in their original order, each stripped.
    """
    if isinstance(x, str) and x.strip():
        pieces = (part.strip() for part in x.split(sep))
        return [piece for piece in pieces if piece]
    return []

def normalize_kw_basic(x: str) -> str:
    """Return *x* as a lower-cased string with whitespace runs collapsed.

    The value is converted with ``str()`` first, so non-string input is
    normalized via its string form. Leading and trailing whitespace is
    removed and every internal whitespace run becomes a single space.
    """
    lowered = str(x).strip().lower()
    return " ".join(lowered.split())

def remove_parenthetical(x: str) -> str:
    """Remove every ``(...)`` group, plus the whitespace just before it.

    Example: ``"deep learning(딥러닝)"`` -> ``"deep learning"``.
    The input is converted with ``str()`` and the result is stripped.
    """
    text = str(x)
    without_parens = re.sub(r"\s*\([^)]*\)", "", text)
    return without_parens.strip()

def basic_cleanup(s: str) -> str:
    """Apply Unicode and whitespace cleanup to a keyword string.

    ``None`` is treated as the empty string. The text is NFKC-normalized,
    every match of ``CONNECTORS_RE`` is replaced by a space (which symbols
    that covers depends on ``KEEP_INNER_CONNECTORS``), whitespace runs are
    collapsed, and surrounding whitespace / ``;`` / ``|`` / ``,`` characters
    are stripped.
    """
    text = str(s) if s is not None else ""
    text = unicodedata.normalize("NFKC", text).strip()
    text = CONNECTORS_RE.sub(" ", text)  # behaviour follows the connector policy
    collapsed = MULTISPACE_RE.sub(" ", text)
    return collapsed.strip(" \t\r\n;|,")

# Load the CSV mappings and bring their keys to the same normalized form
# that map_soft() uses for its lookups.
MAP_DICT_RAW, DROP_SET_RAW = load_keyword_maps()

def _norm_map_key(k: str) -> str:
    """Normalize a CSV key with the same recipe map_soft() applies to its input."""
    return normalize_kw_basic(remove_parenthetical(k))

# Normalize the raw keys stored in the CSVs as well.
# A key that normalizes to "" (for example a raw value consisting only of a
# parenthetical) is skipped: kept as MAP_DICT[""], it would match every
# keyword that collapses to "" in map_soft() and map it to a real keyword.
MAP_DICT = {
    key: normalize_kw_basic(v)
    for key, v in ((_norm_map_key(k), v) for k, v in MAP_DICT_RAW.items())
    if key
}
DROP_SET = {key for key in map(_norm_map_key, DROP_SET_RAW) if key}

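# ------------------------------------------------------------
# Two CSV-backed mapping helpers
# - strict: looks the input up after strip() only (no case folding, no
#           parenthetical removal); MAP_DICT keys are stored normalized, so
#           this only matches input that is already in normalized form
# - soft  : removes parentheticals, lower-cases and collapses whitespace
#           before the lookup
# ------------------------------------------------------------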
def map_strict(x: str) -> str:
    """Map *x* through ``MAP_DICT`` using only ``strip()`` on the key.

    Non-string input is returned unchanged. When the stripped key is not in
    ``MAP_DICT`` the stripped key itself is returned.

    NOTE(review): ``MAP_DICT`` keys are lower-cased with parentheticals
    removed, so a key that is only stripped matches solely when the input
    is already in that form — confirm this is the intended "strict" mode.
    """
    if isinstance(x, str):
        stripped = x.strip()
        return MAP_DICT.get(stripped, stripped)
    return x

def map_soft(x: str) -> str:
    """Standardize a keyword through the CSV drop list and mapping table.

    The lookup key is *x* with parentheticals removed, lower-cased and with
    whitespace collapsed. Non-string input is returned unchanged.

    Returns:
        ``""`` when the key, or the value it maps to, is in ``DROP_SET``;
        otherwise the normalized mapped value (the key itself when
        ``MAP_DICT`` has no entry for it).
    """
    if not isinstance(x, str):
        return x

    lookup_key = normalize_kw_basic(remove_parenthetical(x))

    # The drop list takes precedence over the mapping table.
    if lookup_key in DROP_SET:
        return ""

    mapped = normalize_kw_basic(MAP_DICT.get(lookup_key, lookup_key))

    # A keyword whose mapping target is itself on the drop list is removed too.
    return "" if mapped in DROP_SET else mapped

def unify_exact_ko_en(x: str) -> str:
    """Return the standardized form of *x*; this simply delegates to ``map_soft``."""
    unified = map_soft(x)
    return unified

def extract_parenthetical_preference(raw: str) -> str:
    """Pick one representative from an ``outside(inside)`` keyword.

    - ``English(Korean)``: the English part outside the parentheses wins.
    - ``Korean(English)``: the English part inside the parentheses wins.
    - Anything else: the outside part is kept when it is non-empty,
      otherwise the inside part.

    A string that does not end with a parenthetical is returned after
    ``basic_cleanup`` only.
    """
    cleaned = basic_cleanup(raw)
    match = re.match(r"^(.*?)\((.*?)\)\s*$", cleaned)
    if match is None:
        return cleaned

    outside, inside = (basic_cleanup(part) for part in match.groups())

    outside_has_ko = HANGUL_RE.search(outside) is not None
    inside_has_ko = HANGUL_RE.search(inside) is not None

    # English(Korean) -> keep the English text outside the parentheses.
    if inside_has_ko and not outside_has_ko:
        return outside

    # Korean(English) -> keep the English text inside the parentheses.
    if outside_has_ko and not inside_has_ko:
        return inside

    # Same script on both sides: prefer the outside part.
    return outside or inside

def normalize_english_key(s: str) -> str:
    """Build a comparison key for an English keyword.

    The text is cleaned with ``basic_cleanup``, lower-cased and has its
    whitespace collapsed; ``analyses`` / ``modelling`` are rewritten to
    ``analysis`` / ``modeling``; finally one trailing ``s`` is removed from
    keys longer than five characters unless the key ends in ``ss``,
    ``sis``, ``us`` or ``is``.

    NOTE(review): the trailing-``s`` rule also shortens non-plural words
    such as those ending in "-ics" or "-ies" — confirm that is acceptable
    for the generated keys.
    """
    key = basic_cleanup(s).lower()
    key = MULTISPACE_RE.sub(" ", key).strip()

    for variant, canonical in (("analyses", "analysis"), ("modelling", "modeling")):
        key = key.replace(variant, canonical)

    # Endings for which a final "s" must not be stripped.
    protected_endings = ("ss", "sis", "us", "is")
    strippable = (
        key.endswith("s")
        and len(key) > 5
        and not key.endswith(protected_endings)
    )
    return key[:-1] if strippable else key

def canon_key(s: str) -> str:
    """Compute the key that decides whether two keywords share a group.

    - resolves an ``outside(inside)`` pair to a single representative,
      removes remaining bracketed spans and cleans up whitespace
    - keys without Hangul go through ``normalize_english_key``; keys with
      Hangul go through ``normalize_kw_basic``
    - the result is finally standardized with the CSV dictionary via
      ``map_soft``
    """
    text = extract_parenthetical_preference(s)

    for pattern in BRACKET_PATTERNS:
        text = re.sub(pattern, " ", text)

    text = basic_cleanup(text)

    if HANGUL_RE.search(text):
        key = normalize_kw_basic(text)
    else:
        key = normalize_english_key(text)

    # CSV-based standardization, then one more basic normalization pass.
    return normalize_kw_basic(map_soft(key))

def representative_of_group(items):
    """Choose the representative raw keyword of a group.

    Args:
        items: ``(raw_keyword, count)`` pairs belonging to one group.

    Returns:
        The raw keyword ranked highest by, in order: contains Hangul,
        contains no parenthesis, higher count. On a full tie the earliest
        pair in *items* wins.
    """

    def priority(entry):
        keyword, count = entry
        contains_hangul = HANGUL_RE.search(keyword) is not None
        paren_free = not ("(" in keyword or ")" in keyword)
        return (contains_hangul, paren_free, count)

    best_keyword, _ = max(items, key=priority)
    return best_keyword

def preprocess_and_aggregate_docs(df: pd.DataFrame) -> pd.DataFrame:
    """Explode, standardize and re-aggregate keywords per document.

    Steps:
    - split ``KYWD`` on commas and explode to one keyword per row
    - basic normalization followed by CSV-based standardization (map_soft)
    - remove keywords emptied by the drop list and those in ``GENERIC_KW``
    - derive ``year`` / ``month`` from ``PBSH`` when that column exists
    - group by ``NODE_ID`` and collect the sorted unique keywords

    Args:
        df: Frame with at least ``KYWD`` and ``NODE_ID`` columns. The title,
            class and abstract columns are optional.

    Returns:
        One row per ``NODE_ID`` with the columns ``NODE_ID``, ``NODE_TTLE``,
        ``NODE_TTLE_EN``, ``NODE_CLSS_01``, ``NODE_CLSS_02``, ``ABST_KR``,
        ``ABST_EN``, ``year``, ``month``, ``keywords`` and ``keywords_raw``.
        Optional columns missing from *df* are filled with ``None``.

    Raises:
        KeyError: If ``KYWD`` or ``NODE_ID`` is missing from *df*.
    """
    optional_cols = [
        "NODE_TTLE",
        "NODE_TTLE_EN",
        "NODE_CLSS_01",
        "NODE_CLSS_02",
        "ABST_KR",
        "ABST_EN",
    ]

    df_kw = df.dropna(subset=["KYWD"]).copy()
    df_kw["KYWD"] = df_kw["KYWD"].astype(str).str.split(",")
    df_kw["KYWD"] = df_kw["KYWD"].apply(lambda xs: [k.strip() for k in xs if k and k.strip()])

    # Rows whose keyword list ended up empty explode to NaN and are dropped.
    df_exp = df_kw.explode("KYWD").dropna(subset=["KYWD"]).copy()

    df_exp["kw_norm"] = df_exp["KYWD"].apply(normalize_kw_basic)
    df_exp["kw_std"] = df_exp["kw_norm"].apply(unify_exact_ko_en)

    # The drop list turns keywords into empty strings; remove those rows.
    df_exp = df_exp[df_exp["kw_std"].astype(str).str.strip().ne("")].copy()

    df_exp = df_exp[~df_exp["kw_std"].isin(GENERIC_KW)].copy()

    if "PBSH" in df_exp.columns:
        # Takes the first four characters as the year and the next two as the
        # month — assumes PBSH starts with YYYYMM; TODO confirm the format.
        df_exp["PBSH_str"] = df_exp["PBSH"].astype(str)
        df_exp["year"] = df_exp["PBSH_str"].str[:4]
        df_exp["month"] = df_exp["PBSH_str"].str[4:6]
    else:
        df_exp["year"] = None
        df_exp["month"] = None

    def unique_sorted(values):
        return sorted({k for k in values if str(k).strip()})

    # Named aggregation raises KeyError for a column that does not exist, so
    # only the optional columns that are present are aggregated here; the
    # missing ones are added as None afterwards.
    agg_spec = {col: (col, "first") for col in optional_cols if col in df_exp.columns}
    agg_spec["year"] = ("year", "first")
    agg_spec["month"] = ("month", "first")
    agg_spec["keywords"] = ("kw_std", unique_sorted)
    agg_spec["keywords_raw"] = ("KYWD", unique_sorted)

    doc_df = df_exp.groupby("NODE_ID", as_index=False).agg(**agg_spec)

    for col in optional_cols:
        if col not in doc_df.columns:
            doc_df[col] = None

    ordered_cols = ["NODE_ID", *optional_cols, "year", "month", "keywords", "keywords_raw"]
    return doc_df[ordered_cols]


# 파이프라인(빈도표 → 자동매핑 → 정규화 적용 → Gephi)
def build_keyword_pipeline(
    in_data: Union[str, pd.DataFrame],
    sep: str = ",",
    out_freq_raw: str = "keyword_frequency.csv",
    min_count_for_mapping: int = 2,
    out_map: str = "keyword_mapping_auto.csv",
    out_merged_freq: str = "keyword_frequency_merged.csv",
    save_normalized_dataset: bool = True,
    out_data_norm: str = "dataset_kw_normalized.csv",
    top_n: int = 3000,
    min_edge_w: int = 4,
    max_kw_per_doc: int = 12,
    out_nodes: str = "gephi_nodes.csv",
    out_edges: str = "gephi_edges.csv",
):
    """Run the keyword pipeline: frequency table, auto-mapping, normalization, Gephi export.

    Stages (each one writes a CSV, encoded as utf-8-sig):
    1. Count raw keywords from the ``KYWD`` column -> *out_freq_raw*.
    2. Group keywords with count >= *min_count_for_mapping* by ``canon_key``
       and map every raw form to the group's standardized representative
       -> *out_map* and *out_merged_freq*.
    3. Apply that mapping, then ``map_soft``, to every row, adding the
       ``KW_LIST_RAW``, ``KW_LIST_NORM`` and ``KYWD_NORM`` columns
       -> *out_data_norm* (only when *save_normalized_dataset* is true).
    4. Build a co-occurrence network over the *top_n* most frequent
       normalized keywords -> *out_nodes* and *out_edges*.

    Args:
        in_data: A DataFrame (copied, not modified) or the path of a CSV file.
            It must contain a ``KYWD`` column.
        sep: Delimiter between keywords inside ``KYWD``.
        out_freq_raw: Output path of the raw frequency table.
        min_count_for_mapping: Minimum raw count for a keyword to take part
            in the automatic mapping.
        out_map: Output path of the raw -> normalized mapping.
        out_merged_freq: Output path of the merged frequency table.
        save_normalized_dataset: Whether to write the normalized dataset.
        out_data_norm: Output path of the normalized dataset.
        top_n: Number of most frequent normalized keywords used as nodes.
        min_edge_w: Minimum co-occurrence count for an edge to be kept.
        max_kw_per_doc: Maximum number of keywords per document that
            contribute to edges.
        out_nodes: Output path of the Gephi node table.
        out_edges: Output path of the Gephi edge table.

    Returns:
        A dict with the frames ``freq_raw_df``, ``map_df``, ``merged_df``,
        ``df_norm``, ``node_df`` and ``edge_df``.
    """
    # Input handling: DataFrame vs CSV path.
    if isinstance(in_data, pd.DataFrame):
        df_base = in_data.copy()
    else:
        df_base = pd.read_csv(in_data, encoding="utf-8-sig")

    # --- 1) frequency table of the raw spellings ---
    df_raw = df_base[df_base["KYWD"].notna()].copy()
    df_raw["KW_LIST_RAW"] = df_raw["KYWD"].apply(lambda x: split_keywords(x, sep=sep))

    all_keywords_raw = [k for kws in df_raw["KW_LIST_RAW"] for k in kws]
    kw_freq_raw = Counter(all_keywords_raw)

    freq_raw_df = (
        pd.DataFrame(kw_freq_raw.items(), columns=["keyword", "count"])
        .sort_values("count", ascending=False)
        .reset_index(drop=True)
    )
    freq_raw_df.to_csv(out_freq_raw, index=False, encoding="utf-8-sig")
    print("[PIPE] saved raw freq:", out_freq_raw, "/ unique:", len(freq_raw_df))

    # --- 2) automatic mapping (count >= min_count_for_mapping) ---
    freq_df = freq_raw_df[freq_raw_df["count"] >= min_count_for_mapping].copy()
    print(f"[PIPE] mapping targets (count>={min_count_for_mapping}):", len(freq_df))

    groups = defaultdict(list)
    for kw, cnt in zip(freq_df["keyword"], freq_df["count"]):
        ck = canon_key(kw)  # includes the CSV-based standardization
        groups[ck].append((kw, int(cnt)))

    rows = []
    merged = []
    for items in groups.values():
        rep_raw = representative_of_group(items)
        rep_pref = extract_parenthetical_preference(rep_raw)

        # canon_key() already applies the CSV-based standardization.
        rep_std = canon_key(rep_pref)
        rep_std = normalize_kw_basic(rep_std)

        total = sum(c for _, c in items)
        merged.append((rep_std, total, len(items)))

        rows.extend((raw, rep_std) for raw, _ in items)

    map_df = pd.DataFrame(rows, columns=["raw", "normalized"]).drop_duplicates()
    map_df.to_csv(out_map, index=False, encoding="utf-8-sig")

    # Different groups can end up with the same representative, so the
    # merged table is summed once more per normalized keyword.
    merged_df = (
        pd.DataFrame(merged, columns=["normalized", "count", "variants"])
        .groupby("normalized", as_index=False)
        .agg({"count": "sum", "variants": "sum"})
        .sort_values("count", ascending=False)
    )
    merged_df.to_csv(out_merged_freq, index=False, encoding="utf-8-sig")

    print("[PIPE] saved map:", out_map, "/ rows:", len(map_df))
    print("[PIPE] saved merged freq:", out_merged_freq, "/ unique normalized:", len(merged_df))

    # --- 3) apply the mapping to the dataset ---
    map_dict = dict(zip(map_df["raw"], map_df["normalized"]))

    def apply_mapping_list(kw_list):
        out = [map_dict.get(k, k) for k in kw_list]
        out = [k for k in out if isinstance(k, str) and k.strip()]

        # Apply the dictionary (CSV) based standardization as well.
        out = [map_soft(k) for k in out]
        out = [k for k in out if isinstance(k, str) and k.strip()]

        # Remove only keywords that match exactly.
        out = [k for k in out if normalize_kw_basic(k) not in REMOVE_EXACT_NORM]

        return sorted(set(out))

    df_norm = df_base.copy()
    df_norm["KW_LIST_RAW"] = df_norm["KYWD"].apply(lambda x: split_keywords(x, sep=sep))
    df_norm["KW_LIST_NORM"] = df_norm["KW_LIST_RAW"].apply(apply_mapping_list)
    df_norm["KYWD_NORM"] = df_norm["KW_LIST_NORM"].apply(lambda xs: ", ".join(xs))

    if save_normalized_dataset:
        df_norm.to_csv(out_data_norm, index=False, encoding="utf-8-sig")
        print("[PIPE] saved normalized dataset:", out_data_norm)

    # --- 4) Gephi network ---
    all_kws_norm = [k for kws in df_norm["KW_LIST_NORM"] for k in kws]
    freq_norm = Counter(all_kws_norm)

    top_keywords = [k for k, _ in freq_norm.most_common(top_n)]
    top_set = set(top_keywords)

    print("[PIPE] unique keywords (norm):", len(freq_norm))
    print("[PIPE] top_n nodes:", len(top_set))

    edge_counter = Counter()
    for kw_list in df_norm["KW_LIST_NORM"]:
        in_top = {k for k in kw_list if k in top_set}
        # Rank by frequency (descending), then alphabetically. Sorting a set
        # by frequency alone leaves ties in hash order, which made the
        # max_kw_per_doc cut-off (and so the edges) vary between runs.
        kws = sorted(in_top, key=lambda k: (-freq_norm[k], k))[:max_kw_per_doc]
        if len(kws) < 2:
            continue

        for a, b in itertools.combinations(sorted(kws), 2):
            edge_counter[(a, b)] += 1

    edges = [(a, b, w) for (a, b), w in edge_counter.items() if w >= min_edge_w]
    edge_df = (
        pd.DataFrame(edges, columns=["Source", "Target", "Weight"])
        .sort_values("Weight", ascending=False)
    )
    edge_df.to_csv(out_edges, index=False, encoding="utf-8-sig")

    # Build the node table from the ordered most_common() list (not the set)
    # and sort stably so that equally frequent nodes keep a reproducible order.
    node_df = (
        pd.DataFrame([(k, freq_norm[k]) for k in top_keywords], columns=["Id", "Frequency"])
        .sort_values("Frequency", ascending=False, kind="mergesort")
    )
    node_df.to_csv(out_nodes, index=False, encoding="utf-8-sig")

    print("[PIPE] saved edges:", out_edges, "/ edges:", len(edge_df))
    print("[PIPE] saved nodes:", out_nodes, "/ nodes:", len(node_df))

    return {
        "freq_raw_df": freq_raw_df,
        "map_df": map_df,
        "merged_df": merged_df,
        "df_norm": df_norm,
        "node_df": node_df,
        "edge_df": edge_df,
    }

# Run the full pipeline on the DataFrame loaded earlier in this notebook,
# with explicit output file names for every artifact.
# NOTE(review): out_data_norm is "keyword_bert_df.csv", the same file name the
# dataset was read from, so the normalized dataset (with the added
# KW_LIST_RAW / KW_LIST_NORM / KYWD_NORM columns) overwrites the input file
# on disk — confirm this is intended.
results = build_keyword_pipeline(
    in_data=df,
    sep=",",
    out_freq_raw="keyword_frequency333.csv",
    min_count_for_mapping=2,
    out_map="keyword_mapping_auto_countge2_v2333.csv",
    out_merged_freq="keyword_frequency_merged_countge2_v2333.csv",
    save_normalized_dataset=True,
    out_data_norm="keyword_bert_df.csv",
    top_n=3000,
    min_edge_w=4,
    max_kw_per_doc=12,
    out_nodes="gephi_nodes_topN333.csv",
    out_edges="gephi_edges_topN333.csv",
)

✅ [PIPE] saved raw freq: keyword_frequency333.csv / unique: 178391
✅ [PIPE] mapping targets (count>=2): 31935
✅ [PIPE] saved map: keyword_mapping_auto_countge2_v2333.csv / rows: 31935
✅ [PIPE] saved merged freq: keyword_frequency_merged_countge2_v2333.csv / unique normalized: 25242
✅ [PIPE] saved normalized dataset: keyword_bert_df.csv
✅ [PIPE] unique keywords (norm): 150645
✅ [PIPE] top_n nodes: 3000
✅ [PIPE] saved edges: gephi_edges_topN333.csv / edges: 2268
✅ [PIPE] saved nodes: gephi_nodes_topN333.csv / nodes: 3000
===== DONE =====
