In [2]:
!pip install transformers
!pip install sentence-transformers
!pip install torch

Collecting transformers
  Downloading transformers-4.55.0-py3-none-any.whl.metadata (39 kB)
Collecting filelock (from transformers)
  Using cached filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.34.3-py3-none-any.whl.metadata (14 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading PyYAML-6.0.2-cp313-cp313-win_amd64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.7.34-cp313-cp313-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.4-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.6.1-cp38-abi3-win_amd64.whl.metadata (4.1 kB)
Collecting tqdm>=4.27 (from transformers)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub<1.0,>=0.34.0->transformers)
  


[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting sentence-transformers
  Downloading sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.8.0-cp313-cp313-win_amd64.whl.metadata (30 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.7.1-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.16.1-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting sympy>=1.13.3 (from torch>=1.11.0->sentence-transformers)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch>=1.11.0->sentence-transformers)
  Using cached networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch>=1.11.0->sentence-transformers)
  Using cached jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting setuptools (from torch>=1.11.0->sentence-transformers)
  Using cached setuptools-80.9.0-py3-none-any.whl.metadata (6.6 kB)
Collec


[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

# 모델 로드
MODEL_NAME = "BM-K/KoSimCSE-roberta-multitask"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# 인코딩 함수 정의
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state[:, 0]  # [CLS] 토큰 임베딩
        embeddings = F.normalize(embeddings, p=2, dim=1)  # 정규화
    return embeddings

# 유사도 측정 함수
def cosine_similarity(text1, text2):
    emb1 = get_embedding(text1)
    emb2 = get_embedding(text2)
    sim = F.cosine_similarity(emb1, emb2).item()
    return sim

# 테스트
text_a = "수입자의 지급 지연으로 인한 사고"
text_b = "수입자가 기한 내 대금을 송금하지 않음"

similarity_score = cosine_similarity(text_a, text_b)
print(f"✅ 유사도 점수: {similarity_score:.4f}")


  from .autonotebook import tqdm as notebook_tqdm


✅ 유사도 점수: 0.7443


In [2]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import pandas as pd
import numpy as np

# 1. 모델 로드
#MODEL_NAME = "BM-K/KoSimCSE-roberta"
MODEL_NAME = "insurance-korean-bert"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# 2. 텍스트 → 임베딩 벡터 함수
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        emb = outputs.last_hidden_state[:, 0]  # [CLS] 토큰
        emb = F.normalize(emb, p=2, dim=1)
    return emb

# 3. 상품분류그룹 리스트 (예시)
group_names = [
    "의류 및 직물류", "무기 및 유기화학제품", "수출 품목 그룹 정보 미제공", "전기 및 전자제품", "기계류", 
    "금속, 비금속류", "고무, 가죽", "운송장비 및 부품", "기타", "목재와 펄프, 지물류",
    "농수산물, 식료품", "정밀기기, 시계, 악기, 무기류", "도기, 유리, 보석류", 
    "완구, 운동용구, 잡품", "소금, 광물류", "제조설비", "기타산업설비", "해외엔지니어링 활동", "통신설비"
]

     


# 4. 모든 그룹명 → 벡터화
embeddings = []
for name in group_names:
    embeddings.append(get_embedding(name))

# 5. pairwise cosine similarity 계산
n = len(group_names)
sim_matrix = np.zeros((n, n))

for i in range(n):
    for j in range(n):
        sim = F.cosine_similarity(embeddings[i], embeddings[j]).item()
        sim_matrix[i, j] = sim

# 6. DataFrame으로 정리
sim_df = pd.DataFrame(sim_matrix, index=group_names, columns=group_names)


OSError: insurance-korean-bert is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=<your_token>`

In [11]:
# 결과 보기
sim_df.round(3)

Unnamed: 0,의류 및 직물류,무기 및 유기화학제품,수출 품목 그룹 정보 미제공,전기 및 전자제품,기계류,"금속, 비금속류","고무, 가죽",운송장비 및 부품,기타,"목재와 펄프, 지물류","농수산물, 식료품","정밀기기, 시계, 악기, 무기류","도기, 유리, 보석류","완구, 운동용구, 잡품","소금, 광물류",제조설비,기타산업설비,해외엔지니어링 활동,통신설비
의류 및 직물류,1.0,0.207,0.232,0.295,0.411,0.343,0.35,0.478,0.319,0.539,0.359,0.381,0.448,0.435,0.266,0.373,0.369,0.242,0.149
무기 및 유기화학제품,0.207,1.0,0.218,0.287,0.28,0.507,0.265,0.35,0.272,0.385,0.385,0.422,0.352,0.314,0.476,0.437,0.394,0.221,0.249
수출 품목 그룹 정보 미제공,0.232,0.218,1.0,0.123,0.187,0.267,0.085,0.251,0.082,0.21,0.324,0.148,0.192,0.171,0.249,0.199,0.147,0.258,0.164
전기 및 전자제품,0.295,0.287,0.123,1.0,0.546,0.342,0.161,0.463,0.349,0.32,0.262,0.507,0.395,0.363,0.255,0.487,0.527,0.293,0.619
기계류,0.411,0.28,0.187,0.546,1.0,0.437,0.249,0.73,0.382,0.392,0.288,0.552,0.411,0.433,0.239,0.693,0.631,0.457,0.53
"금속, 비금속류",0.343,0.507,0.267,0.342,0.437,1.0,0.336,0.456,0.329,0.446,0.353,0.5,0.5,0.403,0.568,0.382,0.437,0.307,0.282
"고무, 가죽",0.35,0.265,0.085,0.161,0.249,0.336,1.0,0.36,0.252,0.317,0.191,0.285,0.282,0.349,0.22,0.273,0.269,0.202,0.208
운송장비 및 부품,0.478,0.35,0.251,0.463,0.73,0.456,0.36,1.0,0.356,0.463,0.423,0.58,0.462,0.558,0.331,0.596,0.587,0.445,0.547
기타,0.319,0.272,0.082,0.349,0.382,0.329,0.252,0.356,1.0,0.396,0.225,0.455,0.357,0.361,0.218,0.264,0.694,0.207,0.215
"목재와 펄프, 지물류",0.539,0.385,0.21,0.32,0.392,0.446,0.317,0.463,0.396,1.0,0.443,0.377,0.53,0.427,0.528,0.378,0.422,0.362,0.144


In [1]:
import sys
sys.path.append(r'C:\Users\wq240\Project\case')

# 평가 시스템 로드
from KoSimCSE.new.evaluation_system import InsuranceEvaluationSystem

# 검색기(유사도 모델) 로드 - 파일 경로 직접 로드
from importlib.machinery import SourceFileLoader
Improved = SourceFileLoader('improved', r'C:\Users\wq240\Project\case\KoSimCSE\new\improved_insurance_system.py').load_module()

# 평가 파이프라인 준비
eval_sys = InsuranceEvaluationSystem('data/design.csv', preserve_labels=True, min_support_for_test=2)
eval_sys.load_and_prepare_data()
eval_sys.create_train_valid_test_split()
eval_sys.prepare_features_for_modeling()

# 검색기 인스턴스 (Streamlit 없는 환경에서도 동작)
sim = Improved.ImprovedInsuranceSystem()

# 속도 빠르게(선택): 임베딩 비활성화 → 텍스트 폴백(Jaccard) 사용
# sim.get_text_embeddings = lambda texts, batch_size=4: None

# 평가 실행
results = eval_sys.evaluate_similarity_system(sim, sample_size=200)

ModuleNotFoundError: No module named 'streamlit'

In [2]:
# 0) 경로
import sys, types, contextlib
sys.path.append(r'C:\Users\wq240\Project\case')

# 1) Streamlit 스텁(코어만 임포트하기 위함)
st = types.SimpleNamespace(
    cache_resource=lambda f: f,
    cache_data=lambda f: f,
    spinner=lambda *a, **k: contextlib.nullcontext(),
    markdown=lambda *a, **k: None,
    success=lambda *a, **k: None,
    warning=lambda *a, **k: None,
    error=lambda *a, **k: None,
    write=lambda *a, **k: None,
    subheader=lambda *a, **k: None,
    columns=lambda *a, **k: [types.SimpleNamespace()],
    plotly_chart=lambda *a, **k: None,
    set_page_config=lambda *a, **k: None,
)
mod = types.ModuleType('streamlit'); mod.__dict__.update(st.__dict__)
sys.modules['streamlit'] = mod

# 2) 검색기(코어) 임포트
from importlib.machinery import SourceFileLoader
Improved = SourceFileLoader('improved', r'C:\Users\wq240\Project\case\KoSimCSE\new\improved_insurance_system.py').load_module()
sim = Improved.ImprovedInsuranceSystem()

# 임베딩을 끄고 빠르게 확인하고 싶으면(폴백 Jaccard 사용):
# sim.get_text_embeddings = lambda texts, batch_size=4: None

# 3) 평가 파이프라인 (라벨 보존 + 희소 test 제외)
from KoSimCSE.new.evaluation_system import InsuranceEvaluationSystem
eval_sys = InsuranceEvaluationSystem('data/design.csv', preserve_labels=True, min_support_for_test=2)

eval_sys.load_and_prepare_data()
eval_sys.create_train_valid_test_split()
eval_sys.prepare_features_for_modeling()

# 4) 평가 실행 (판정구분/판정사유 각각 지표 출력)
results = eval_sys.evaluate_similarity_system(sim, sample_size=200)

# 5) 핵심 수치 요약 확인
print('판정구분 정확도:', results['judgment']['accuracy'])
print('판정사유 정확도:', results['reason']['accuracy'])
print('판정구분 balanced acc:', results['judgment']['balanced_accuracy'])
print('판정사유 balanced acc:', results['reason']['balanced_accuracy'])

ModuleNotFoundError: No module named 'plotly'