In [1]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Put the parent of the current working directory on sys.path (only once),
# ahead of the `modules.preprocess` import that follows.
src_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(src_dir) == 0:
    sys.path.append(src_dir)
from modules.preprocess import *

In [9]:
# Read the model and maker reference tables from the shared data folder.
model_df = pd.read_csv("../../data/250611_SUN_MODEL_STD.csv")
maker_df = pd.read_csv("../../data/250611_SUN_MAKER_STD.csv")

In [27]:
# Display the model table loaded above as this cell's output.
model_df

Unnamed: 0,modelId,modelSTDName,makerId,modelAliasList,categoryCD
0,MO1040322,20-124059-000,MA287,20-124059-000,PARTS_PC
1,MO1040323,20-124096-000,MA287,20-124096-000,PARTS_PC
2,MO1040326,23-234002-000R,MA287,23-234002-000R,PARTS_RF
3,MO1040327,23-234007-000R,MA287,23-234007-000R,PARTS_MC
4,MO1040328,23-234011-000R,MA287,23-234011-000R,PARTS_MC
...,...,...,...,...,...
398577,MO5904,SC-410,MA2490,SC-410,PKG_ETC
398578,MO5933,YS24 Dual,MA2574,YS24 Dual,CHIP_MOUNTER
398579,MO5934,YS24 Single,MA2574,YS24 Single,CHIP_MOUNTER
398580,MO6392,CYW100,MA362,CYW100,MEDICAL


In [33]:
def extract_candidates(alias_str):
    """Split a comma-separated alias string into unique, trimmed aliases.

    The literal ``"Co., Ltd."`` contains a comma that must not act as a
    separator, so it is swapped for a placeholder before splitting and put
    back into every alias afterwards.

    Args:
        alias_str: Comma-separated aliases. Any non-string value (e.g. NaN or
            None) yields an empty list.

    Returns:
        list[str]: Aliases in first-seen order, without blanks or duplicates.
    """
    if not isinstance(alias_str, str):
        return []

    # NUL is not whitespace (so str.strip keeps it) and is not expected to
    # appear in alias text, which makes it a safe stand-in for the protected
    # "Co., Ltd." token while splitting on commas.
    placeholder = "\x00"
    protected = alias_str.replace("Co., Ltd.", placeholder)

    candidates = []
    seen = set()
    for part in protected.split(","):
        # Restore the token wherever it occurs, not only when it is the
        # whole alias, so "Samsung Co., Ltd." keeps its comma.
        alias = part.strip().replace(placeholder, "Co., Ltd.")
        if alias and alias not in seen:
            seen.add(alias)
            candidates.append(alias)
    return candidates


def alias_info(df, explode_column="modelAlias", save_path=None, save=False):
    """Expand an alias-list column into one row per alias.

    Each value of ``explode_column`` is parsed with ``extract_candidates`` and
    the frame is exploded so every alias gets its own row. Rows that end up
    without an alias are dropped and every column is cast to ``str``. The
    exploded column ends up as the last column of the result.

    Args:
        df: Source frame; it is copied, not modified.
        explode_column: Name of the column holding the comma-separated aliases.
        save_path: Directory for the CSV export; only used when ``save`` is true.
        save: When true, write ``<explode_column>_alias.csv`` into ``save_path``
            (the directory is created if needed).

    Returns:
        pandas.DataFrame: The exploded, string-typed frame.
    """
    working = df.copy()
    working["new_alias"] = working[explode_column].apply(extract_candidates)

    # One row per alias, then let the exploded values take over the original name.
    working = (
        working.explode("new_alias", ignore_index=True)
        .drop(columns=[explode_column])
        .rename(columns={"new_alias": explode_column})
    )
    working[explode_column] = working[explode_column].str.strip()

    # Empty candidate lists explode to NaN; those rows carry no alias.
    result = working.dropna(subset=[explode_column]).astype(str)

    if save:
        os.makedirs(save_path, exist_ok=True)
        target = os.path.join(save_path, f"{explode_column}_alias.csv")
        result.to_csv(target, index=False)
    return result


In [39]:
from rapidfuzz import fuzz
import gradio as gr
import pandas as pd
import json
import os
import re

# Paths to the source CSV tables and to the JSON files that hold accepted aliases
MODEL_CSV_PATH = "../../data/250611_SUN_MODEL_STD.csv"
MAKER_CSV_PATH = "../../data/250611_SUN_MAKER_STD.csv"
MODEL_RESULT_JSON_PATH = "../model_alias_results.json"
MAKER_RESULT_JSON_PATH = "../maker_alias_results.json"

# Make sure both result files exist; a missing one starts as an empty JSON object
for path in (MODEL_RESULT_JSON_PATH, MAKER_RESULT_JSON_PATH):
    if os.path.exists(path):
        continue
    with open(path, 'w') as f:
        json.dump({}, f)

# Load the tables and replace missing values with empty strings
model_df = pd.read_csv(MODEL_CSV_PATH).fillna("")
# The row labelled 0 is removed from the maker table
# (reason not visible in this notebook — TODO confirm)
maker_df = pd.read_csv(MAKER_CSV_PATH).drop(0).fillna("")

# Load the results saved by earlier runs
with open(MODEL_RESULT_JSON_PATH, 'r') as f:
    saved_results = json.load(f)
with open(MAKER_RESULT_JSON_PATH, 'r') as f:
    saved_maker_results = json.load(f)

# Pre-filter model aliases: aliases scoring >= 80 against the standard name are
# stored directly (without duplicates); the rest are queued for manual review.
model_pending_rows = []
for i, row in model_df.iterrows():
    model_id = row['modelId']
    std_name = str(row['modelSTDName'])
    alias_str = str(row['modelAliasList'])
    candidates = extract_candidates(alias_str)

    valid_candidates = []
    for alias in candidates:
        score = fuzz.token_sort_ratio(std_name, alias)
        if score < 80:
            # Not similar enough to accept automatically -> needs a human decision
            valid_candidates.append(alias)
            continue
        accepted = saved_results.setdefault(model_id, [])
        if alias not in accepted:
            accepted.append(alias)

    if valid_candidates:
        row['modelCandidates'] = valid_candidates
        model_pending_rows.append(row)

# Persist the automatically accepted aliases
with open(MODEL_RESULT_JSON_PATH, 'w') as f:
    json.dump(saved_results, f, indent=2)

pending_df = pd.DataFrame(model_pending_rows).reset_index(drop=True)

# Pre-filter maker aliases: aliases scoring >= 80 against the standard name are
# stored directly (without duplicates); the rest are queued for manual review.
maker_pending_rows = []
for i, row in maker_df.iterrows():
    maker_id = row['makerID']
    std_name = str(row['makerSTDName'])
    alias_str = str(row['makerAliasList'])
    candidates = extract_candidates(alias_str)

    valid_candidates = []
    for alias in candidates:
        score = fuzz.token_sort_ratio(std_name, alias)
        if score < 80:
            # Not similar enough to accept automatically -> needs a human decision
            valid_candidates.append(alias)
            continue
        accepted = saved_maker_results.setdefault(maker_id, [])
        if alias not in accepted:
            accepted.append(alias)

    if valid_candidates:
        row['makerCandidates'] = valid_candidates
        maker_pending_rows.append(row)

# Persist the automatically accepted aliases
with open(MAKER_RESULT_JSON_PATH, 'w') as f:
    json.dump(saved_maker_results, f, indent=2)

maker_pending_df = pd.DataFrame(maker_pending_rows).reset_index(drop=True)

# State: row cursors into pending_df / maker_pending_df, advanced by the review callbacks
index = maker_index = 0


def load_next():
    """Return the review state for the next pending model that has candidates.

    Starts at the module-level ``index`` cursor and advances it past rows
    whose candidate list is empty.

    Returns:
        tuple: ``(question, std_name, candidates, model_id, related,
        alias_index, history, output_text)``. When no pending row is left the
        question is the completion message and the other slots are empty.
    """
    global index
    while index < len(pending_df):
        current = pending_df.iloc[index]
        std_name = current["modelSTDName"]
        model_id = current["modelId"]
        candidates = current.get("modelCandidates", [])
        # Anything that is not already a list is re-parsed from its string form.
        if not isinstance(candidates, list):
            candidates = extract_candidates(str(candidates))
        if candidates:
            question = f"{std_name}와 연관된 단어인가요? → {candidates[0]}"
            return question, std_name, candidates, model_id, [], 0, [], ""
        index += 1  # nothing to review on this row; move to the next one
    return "모든 모델이 검토되었습니다!", "", [], "", [], 0, [], ""


def go_next_model():
    """Move the model cursor forward by one row and load that row's review state.

    Returns:
        tuple: Whatever ``load_next()`` returns after ``index`` was incremented.
    """
    global index
    index = index + 1
    return load_next()

def check_relation(choice, std_name, candidates, model_id, related, alias_index, history):
    """Record the reviewer's verdict on the current model alias and advance.

    Args:
        choice: ``"연관 있음"`` accepts the alias; any other value rejects it.
        std_name: Standard model name shown in the question.
        candidates: Aliases under review for this model.
        model_id: Key under which accepted aliases are stored.
        related: Aliases accepted so far for this model (mutated in place).
        alias_index: Position of the alias being judged.
        history: List of ``(alias_index, alias, accepted)`` tuples used by undo
            (mutated in place).

    Returns:
        tuple: ``(question, std_name, candidates, model_id, related,
        alias_index, history, output_text)`` for the next alias, or the result
        of ``load_next()`` once the last alias of this model has been judged.
    """
    global index

    # Nothing left to judge (e.g. a button pressed after every model was
    # reviewed): hand back the loader's state instead of raising IndexError.
    if alias_index >= len(candidates):
        return load_next()

    word = candidates[alias_index]
    accepted = choice == "연관 있음"
    if accepted:
        related.append(word)
    history.append((alias_index, word, accepted))

    alias_index += 1
    if alias_index >= len(candidates):
        # Aliases already stored for this model that were not part of this
        # review (the automatically accepted ones) must survive; only the
        # decisions about the reviewed candidates are replaced.
        reviewed = set(candidates)
        preserved = [a for a in saved_results.get(model_id, []) if a not in reviewed]
        saved_results[model_id] = preserved + related
        with open(MODEL_RESULT_JSON_PATH, 'w') as f:
            json.dump(saved_results, f, indent=2)
        index += 1
        return load_next()

    question = f"{std_name}와 연관된 단어인가요? → {candidates[alias_index]}"
    return question, std_name, candidates, model_id, related, alias_index, history, "\n".join(related)

def undo(std_name, candidates, model_id, related, alias_index, history):
    """Take back the most recent verdict for the current model, if there is one.

    Args:
        std_name: Standard model name shown in the question.
        candidates: Aliases under review for this model.
        model_id: Identifier of the model (passed through unchanged).
        related: Aliases accepted so far (mutated in place).
        alias_index: Position of the alias currently shown.
        history: List of ``(alias_index, alias, accepted)`` tuples (mutated in place).

    Returns:
        tuple: ``(question, std_name, candidates, model_id, related,
        alias_index, history, output_text)`` positioned on the alias whose
        verdict was withdrawn, or on the current alias when history is empty.
    """
    if history:
        alias_index, word, was_related = history.pop()
        if was_related and word in related:
            related.remove(word)

    if alias_index >= len(candidates):
        # No alias to show (e.g. undo pressed after every model was reviewed);
        # indexing candidates here would raise IndexError.
        question = "모든 모델이 검토되었습니다!"
    else:
        question = f"{std_name}와 연관된 단어인가요? → {candidates[alias_index]}"
    return question, std_name, candidates, model_id, related, alias_index, history, "\n".join(related)

# ----------- Maker review logic ----------- #
def load_next_maker():
    """Return the review state for the next pending maker that has candidates.

    Starts at the module-level ``maker_index`` cursor and advances it past
    rows whose candidate list is empty.

    Returns:
        tuple: ``(question, std_name, candidates, maker_id, related,
        alias_index, history, output_text)``. When no pending row is left the
        question is the completion message and the other slots are empty.
    """
    global maker_index
    while maker_index < len(maker_pending_df):
        current = maker_pending_df.iloc[maker_index]
        std_name = current["makerSTDName"]
        maker_id = current["makerID"]
        candidates = current.get("makerCandidates", [])
        # Anything that is not already a list is re-parsed from its string form.
        if not isinstance(candidates, list):
            candidates = extract_candidates(str(candidates))
        if candidates:
            question = f"{std_name}의 alias로 '{candidates[0]}'를 저장할까요?"
            return question, std_name, candidates, maker_id, [], 0, [], ""
        maker_index += 1  # nothing to review on this row; move to the next one
    return "모든 메이커가 검토되었습니다!", "", [], "", [], 0, [], ""


def go_next_maker():
    """Move the maker cursor forward by one row and load that row's review state.

    Returns:
        tuple: Whatever ``load_next_maker()`` returns after ``maker_index``
        was incremented.
    """
    global maker_index
    maker_index = maker_index + 1
    return load_next_maker()

def check_relation_maker(choice, std_name, candidates, maker_id, related, alias_index, history):
    """Record the reviewer's verdict on the current maker alias and advance.

    Args:
        choice: ``"저장"`` accepts the alias; any other value rejects it.
        std_name: Standard maker name shown in the question.
        candidates: Aliases under review for this maker.
        maker_id: Key under which accepted aliases are stored.
        related: Aliases accepted so far for this maker (mutated in place).
        alias_index: Position of the alias being judged.
        history: List of ``(alias_index, alias, accepted)`` tuples used by undo
            (mutated in place).

    Returns:
        tuple: ``(question, std_name, candidates, maker_id, related,
        alias_index, history, output_text)`` for the next alias, or the result
        of ``load_next_maker()`` once the last alias of this maker has been judged.
    """
    global maker_index

    # Nothing left to judge (e.g. a button pressed after every maker was
    # reviewed): hand back the loader's state instead of raising IndexError.
    if alias_index >= len(candidates):
        return load_next_maker()

    word = candidates[alias_index]
    accepted = choice == "저장"
    if accepted:
        related.append(word)
    history.append((alias_index, word, accepted))

    alias_index += 1
    if alias_index >= len(candidates):
        # Aliases already stored for this maker that were not part of this
        # review (the automatically accepted ones) must survive; only the
        # decisions about the reviewed candidates are replaced.
        reviewed = set(candidates)
        preserved = [a for a in saved_maker_results.get(maker_id, []) if a not in reviewed]
        saved_maker_results[maker_id] = preserved + related
        with open(MAKER_RESULT_JSON_PATH, 'w') as f:
            json.dump(saved_maker_results, f, indent=2)
        maker_index += 1
        return load_next_maker()

    # Same wording as the first question produced by load_next_maker
    # (the previous text had the typo "저장할가요").
    question = f"{std_name}의 alias로 '{candidates[alias_index]}'를 저장할까요?"
    return question, std_name, candidates, maker_id, related, alias_index, history, "\n".join(related)

def undo_maker(std_name, candidates, maker_id, related, alias_index, history):
    """Take back the most recent verdict for the current maker, if there is one.

    Args:
        std_name: Standard maker name shown in the question.
        candidates: Aliases under review for this maker.
        maker_id: Identifier of the maker (passed through unchanged).
        related: Aliases accepted so far (mutated in place).
        alias_index: Position of the alias currently shown.
        history: List of ``(alias_index, alias, accepted)`` tuples (mutated in place).

    Returns:
        tuple: ``(question, std_name, candidates, maker_id, related,
        alias_index, history, output_text)`` positioned on the alias whose
        verdict was withdrawn, or on the current alias when history is empty.
    """
    if history:
        alias_index, word, was_related = history.pop()
        if was_related and word in related:
            related.remove(word)

    if alias_index >= len(candidates):
        # No alias to show (e.g. undo pressed after every maker was reviewed);
        # indexing candidates here would raise IndexError.
        question = "모든 메이커가 검토되었습니다!"
    else:
        # Same wording as the first question produced by load_next_maker
        # (the previous text had the typo "저장할가요").
        question = f"{std_name}의 alias로 '{candidates[alias_index]}'를 저장할까요?"
    return question, std_name, candidates, maker_id, related, alias_index, history, "\n".join(related)

# ----------- Gradio interface ----------- #
initial_model_values = load_next()
initial_maker_values = load_next_maker()

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.Tab("모델 Alias 검증"):
            gr.Markdown("# 🔍 Model Alias 검증 시스템")
            question = gr.Textbox(value=initial_model_values[0], label="질문", interactive=False)
            std_name = gr.State(initial_model_values[1])
            candidates = gr.State(initial_model_values[2])
            model_id = gr.State(initial_model_values[3])
            related = gr.State(initial_model_values[4])
            alias_index = gr.State(initial_model_values[5])
            history = gr.State(initial_model_values[6])
            output = gr.Textbox(value=initial_model_values[7], label="선택된 alias", lines=6)

            with gr.Row():
                yes_btn = gr.Button("✅ 연관 있음")
                no_btn = gr.Button("❌ 연관 없음")
                undo_btn = gr.Button("⬅️ 되돌리기")
            start_btn = gr.Button("▶️ 다음 모델 시작")

            # Every callback receives the six state values and refreshes the
            # question, the same six state values and the output box.
            model_state = [std_name, candidates, model_id, related, alias_index, history]
            model_outputs = [question, *model_state, output]

            start_btn.click(fn=go_next_model, outputs=model_outputs)
            yes_btn.click(
                fn=lambda s, c, m, r, a, h: check_relation("연관 있음", s, c, m, r, a, h),
                inputs=model_state,
                outputs=model_outputs,
            )
            no_btn.click(
                fn=lambda s, c, m, r, a, h: check_relation("연관 없음", s, c, m, r, a, h),
                inputs=model_state,
                outputs=model_outputs,
            )
            undo_btn.click(fn=undo, inputs=model_state, outputs=model_outputs)

        with gr.Tab("메이커 Alias 검증"):
            gr.Markdown("# 🏭 Maker Alias 검증 시스템")
            question_m = gr.Textbox(value=initial_maker_values[0], label="질문", interactive=False)
            std_name_m = gr.State(initial_maker_values[1])
            candidates_m = gr.State(initial_maker_values[2])
            maker_id = gr.State(initial_maker_values[3])
            related_m = gr.State(initial_maker_values[4])
            alias_index_m = gr.State(initial_maker_values[5])
            history_m = gr.State(initial_maker_values[6])
            output_m = gr.Textbox(value=initial_maker_values[7], label="선택된 alias", lines=6)

            with gr.Row():
                yes_btn_m = gr.Button("💾 저장")
                no_btn_m = gr.Button("❌ 제외")
                undo_btn_m = gr.Button("⬅️ 되돌리기")
            start_btn_m = gr.Button("▶️ 다음 메이커 시작")

            # Same wiring as the model tab, using the maker components.
            maker_state = [std_name_m, candidates_m, maker_id, related_m, alias_index_m, history_m]
            maker_outputs = [question_m, *maker_state, output_m]

            start_btn_m.click(fn=go_next_maker, outputs=maker_outputs)
            yes_btn_m.click(
                fn=lambda s, c, m, r, a, h: check_relation_maker("저장", s, c, m, r, a, h),
                inputs=maker_state,
                outputs=maker_outputs,
            )
            no_btn_m.click(
                fn=lambda s, c, m, r, a, h: check_relation_maker("제외", s, c, m, r, a, h),
                inputs=maker_state,
                outputs=maker_outputs,
            )
            undo_btn_m.click(fn=undo_maker, inputs=maker_state, outputs=maker_outputs)

# Launch the app; share=True also requests a public gradio.live URL
demo.launch(share=True)


* Running on local URL:  http://127.0.0.1:7865
* Running on public URL: https://dc5289294642c2e342.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


