In [None]:
import sys
import os
import gradio as gr
import pandas as pd
from rapidfuzz import process, fuzz
src_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
if src_dir not in sys.path:
    sys.path.append(src_dir)
from modules.preprocess import *

# Load the reference table of known aliases. Missing cells become "" so the
# string handling below never sees NaN.
# NOTE(review): uploaded queries are passed through normalize_query before
# matching, but these reference aliases are used exactly as stored in the CSV
# — confirm the CSV is already normalized.
reference_df = pd.read_csv("../../data/test_dataset.csv").fillna("")

# One entry per modelID: (model_id, unique model aliases, unique maker aliases).
reference_tuples = []

for model_id, group in reference_df.groupby("modelID"):
    # NaN was already replaced by "" above, so dropna() can no longer remove
    # missing aliases. Blank model aliases are filtered explicitly: a blank
    # alias combined with a maker is just the maker name, which would let a
    # model match on the maker alone.
    alias_list = [
        alias
        for alias in group["modelAlias"].unique().tolist()
        if str(alias).strip()
    ]
    # Blank maker aliases are kept on purpose: the matcher builds one candidate
    # per (model alias, maker alias) pair, so an empty maker list would remove
    # the model from maker-qualified searches entirely.
    maker_list = group["makerAlias"].unique().tolist()
    reference_tuples.append((model_id, alias_list, maker_list))



# Similarity matching function
def fuzzy_match_model_id_fast(model_alias, maker_alias, reference_tuples, top_k=5, score_cutoff=70):
    """Return fuzzy matches for a model/maker query as ``(model_id, score)`` pairs.

    The query string is ``"<model_alias> <maker_alias>"``. Every reference
    model alias becomes a candidate string; when a maker is supplied, each
    model alias is combined with each of that model's maker aliases instead.
    Candidates are scored with ``fuzz.token_sort_ratio`` via ``process.extract``.

    Args:
        model_alias: Model name to look up. ``None`` is treated as blank.
        maker_alias: Maker name. ``None`` or a blank string means only model
            aliases are compared.
        reference_tuples: Iterable of ``(model_id, model_aliases, maker_aliases)``.
        top_k: Maximum number of candidate matches to return.
        score_cutoff: Minimum score a candidate needs to be returned
            (defaults to 70, the value that used to be hard-coded).

    Returns:
        A list of ``(model_id, score)`` tuples in the order produced by
        ``process.extract``. The same ``model_id`` can appear more than once
        when several of its aliases match; callers aggregate per model.
        An empty list is returned for a blank query or an empty reference.
    """
    model_text = "" if model_alias is None else str(model_alias).strip()
    maker_text = "" if maker_alias is None else str(maker_alias).strip()
    query = f"{model_text} {maker_text}".strip()

    # A blank query carries no information; it could only ever "match" blank
    # reference aliases, so report no matches instead.
    if not query:
        return []

    # Decide once whether candidates include the maker (loop-invariant).
    use_maker = bool(maker_text)

    all_choices = []
    all_ids = []  # all_ids[i] is the model id that all_choices[i] belongs to

    for model_id, model_list, maker_list in reference_tuples:
        for model in model_list:
            if use_maker:
                for maker in maker_list:
                    all_choices.append(f"{model} {maker}")
                    all_ids.append(model_id)
            else:
                all_choices.append(model)
                all_ids.append(model_id)

    if not all_choices:
        return []

    matches = process.extract(
        query,
        all_choices,
        scorer=fuzz.token_sort_ratio,
        limit=top_k,
        score_cutoff=score_cutoff,
    )

    # Each match is (choice, score, index); map the index back to its model id.
    return [(all_ids[idx], score) for (_, score, idx) in matches]



  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def predict_from_excel(file):
    """Predict the most likely modelID for every row of an uploaded Excel file.

    The workbook must contain ``MODEL_NAME`` and ``MAKER_NAME`` columns. Both
    are blank-filled and passed through ``normalize_query``, then each row is
    matched against the module-level ``reference_tuples`` with
    ``fuzzy_match_model_id_fast``.

    Args:
        file: Path (or any object ``pd.read_excel`` accepts) of the workbook.

    Returns:
        A DataFrame with one row per input row and the columns
        ``input_MODEL_NAME``, ``input_MAKER_NAME``, ``predicted_modelID``,
        ``score``, ``predicted_others_modelID`` and ``others_score``. Rows
        without any match have ``None`` / empty lists in the prediction
        columns. If the file cannot be read or a required column is missing,
        a one-row DataFrame with a single ``error`` column is returned instead.
    """
    # Fixed output schema, so a workbook with zero data rows still yields a
    # table with the expected headers instead of a column-less frame.
    result_columns = [
        "input_MODEL_NAME",
        "input_MAKER_NAME",
        "predicted_modelID",
        "score",
        "predicted_others_modelID",
        "others_score",
    ]

    try:
        test_df = pd.read_excel(file)
    except Exception as e:
        return pd.DataFrame([{"error": f"파일을 불러올 수 없습니다: {e}"}])

    # Check that the required columns exist
    if not {"MODEL_NAME", "MAKER_NAME"}.issubset(test_df.columns):
        return pd.DataFrame([{"error": "엑셀 파일에 'MODEL_NAME', 'MAKER_NAME' 컬럼이 존재해야 합니다."}])

    # Only the two columns that are actually used get blank-filled and normalized.
    model_names = test_df["MODEL_NAME"].fillna("").apply(normalize_query)
    maker_names = test_df["MAKER_NAME"].fillna("").apply(normalize_query)

    results = []

    for model_name, maker_name in zip(model_names, maker_names):
        top_preds = fuzzy_match_model_id_fast(model_name, maker_name, reference_tuples)

        # Keep the best score per modelID, highest score first
        top_df = pd.DataFrame(top_preds, columns=["modelID", "score"])
        top_df = top_df.groupby("modelID", as_index=False).agg({"score": "max"}).sort_values("score", ascending=False)

        # Start from the "no match" record and fill it in when there are matches.
        record = {
            "input_MODEL_NAME": model_name,
            "input_MAKER_NAME": maker_name,
            "predicted_modelID": None,
            "score": None,
            "predicted_others_modelID": [],
            "others_score": [],
        }

        if not top_df.empty:
            best_row = top_df.iloc[0]
            other_rows = top_df.iloc[1:]
            record["predicted_modelID"] = best_row["modelID"]
            record["score"] = best_row["score"]
            record["predicted_others_modelID"] = other_rows["modelID"].tolist()
            record["others_score"] = other_rows["score"].tolist()

        results.append(record)

    return pd.DataFrame(results, columns=result_columns)




# 📌 Build the Gradio UI: one Excel upload in, one results table out.
iface = gr.Interface(
    title="🔍 ModelID Matcher",
    description="업로드한 엑셀의 MODEL_NAME과 MAKER_NAME을 기반으로 유사한 modelID를 추정합니다.",
    fn=predict_from_excel,
    inputs=gr.File(
        label="엑셀 파일 업로드 (MODEL_NAME, MAKER_NAME 포함)",
        type="filepath",
    ),
    outputs="dataframe",
)

# 📌 Launch the app. share=True requests a public share link in addition to
# the local URL, so the matcher becomes reachable from outside this machine.
iface.launch(share=True)

* Running on local URL:  http://127.0.0.1:7861
* Running on public URL: https://caef80c7228079c6c8.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [3]:
# Ad-hoc check of the matcher for a single model name (no maker given).
model_query = "XNIII-725PCG"
maker_query = ""
category_query = ""

top_preds = fuzzy_match_model_id_fast(
    normalize_query(model_query),
    normalize_query(maker_query),
    reference_tuples,
)

# Collapse the matches to the best score per modelID, highest score first.
(
    pd.DataFrame(top_preds, columns=["modelID", "score"])
    .groupby("modelID", as_index=False)
    .agg({"score": "max"})
    .sort_values("score", ascending=False)
)


Unnamed: 0,modelID,score
0,MO6845,100.0


In [85]:
# Show every reference row recorded for one modelID.
queryID = "MO14258"
reference_df.loc[reference_df["modelID"].eq(queryID)]

Unnamed: 0,modelID,category,modelAlias,makerAlias


In [80]:
# Load the model master workbook in this cell: model_df is otherwise only
# defined by a cell further down, so on a fresh kernel (Restart & Run All)
# this lookup would fail with a NameError.
model_df = pd.read_excel("../../data/250224 model_STD.xlsx")
model_df[model_df["modelID"] == queryID]

Unnamed: 0,modelID,modelSTDName,MakerID,modelAlias,category


In [79]:
# Read the model master workbook into model_df.
model_df = pd.read_excel(io="../../data/250224 model_STD.xlsx")
