In [1]:
!pip install transformers
!pip install torch
!pip install pandas
!pip install numpy
!pip install annoy
!pip install tqdm


Collecting annoy
  Downloading annoy-1.17.3.tar.gz (647 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.3-cp310-cp310-linux_x86_64.whl size=550737 sha256=5d2e2b79b9187cda42f3632000f1ee05f4c967150b3e7ecc00e4452f26ca701e
  Stored in directory: /root/.cache/pip/wheels/64/8a/da/f714bcf46c5efdcfcac0559e63370c21abe961c48e3992465a
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.3


In [2]:
import pandas as pd
import logging
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from annoy import AnnoyIndex
from tqdm import tqdm
import os
import torch.nn.functional as F


# Setup logging
logging.basicConfig(level=logging.INFO)


In [3]:
# Constants
# Hugging Face checkpoint name.
# NOTE(review): MULTIEmbedding.__init__ repeats this string as its own default
# instead of reading this constant — keep the two in sync.
MODEL_NAME = "intfloat/multilingual-e5-large"
# Passed to the tokenizer as max_length (with truncation=True) in MULTIEmbedding.get_embeddings.
MAX_LENGTH = 512
# Default n_trees for AnnoyIndexBuilder; handed to AnnoyIndex.build().
N_TREES = 100
# Default file that AnnoyIndexBuilder saves the index to and loads it from.
ANN_FILE = '/content/drive/MyDrive/훈련전용/1207/mteann3.ann'
# Default file used by save_embeddings()/load_embeddings() for the cached embedding array.
EMBEDDINGS_FILE = '/content/drive/MyDrive/훈련전용/1207/mteann3.npy'


In [4]:
def average_pool(last_hidden_states, attention_mask):
    """Mean-pool token vectors along dim 1, ignoring positions masked out with 0.

    ``attention_mask`` is broadcast over the last dimension of
    ``last_hidden_states`` and used as per-token weights. The divisor is
    clamped to at least 1e-9, so a row whose mask is all zeros yields a zero
    vector instead of dividing by zero.
    """
    weights = attention_mask[..., None].expand_as(last_hidden_states).float()
    weighted_total = (last_hidden_states * weights).sum(dim=1)
    token_counts = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_total / token_counts

In [5]:
class MULTIEmbedding:
    """Text-embedding wrapper around a Hugging Face encoder checkpoint.

    The tokenizer and model are loaded once at construction time;
    ``get_embeddings`` turns a sequence of texts into mean-pooled,
    L2-normalised vectors.

    NOTE(review): the E5 model card recommends prefixing every input with
    "query: " or "passage: ". This class embeds the texts exactly as given —
    confirm whether callers are expected to add the prefix themselves.
    """

    def __init__(self, model_name="intfloat/multilingual-e5-large"):
        """Load the tokenizer and model for ``model_name``.

        Raises:
            RuntimeError: if the tokenizer or the model could not be loaded.
        """
        self.model_name = model_name
        self.tokenizer, self.model = self.load_model()
        if self.model is None or self.tokenizer is None:
            raise RuntimeError("Model loading failed.")

    def load_model(self):
        """Load the tokenizer and model named by ``self.model_name``.

        Returns:
            ``(tokenizer, model)`` on success, or ``(None, None)`` if loading
            raised; the error is logged rather than propagated.
        """
        try:
            tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            model = AutoModel.from_pretrained(self.model_name)
            logging.info("MULTIE5 model loaded successfully.")
            return tokenizer, model
        except Exception as e:
            logging.error(f"Error loading MULTIE5 model: {e}")
            return None, None

    def get_embeddings(self, docs, batch_size=10):
        """Embed ``docs`` in batches and return one row per document.

        Args:
            docs: sliceable sequence of texts (e.g. a list of str).
            batch_size: number of texts tokenized and encoded per forward pass.

        Returns:
            A 2-D NumPy array with ``len(docs)`` rows of L2-normalised,
            mean-pooled embeddings. For empty ``docs`` the result has shape
            ``(0, hidden_size)`` so callers can still read ``.shape[1]``.

        Raises:
            ValueError: if ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer.")

        if len(docs) == 0:
            # Keep the result 2-D even when there is nothing to embed; fall back
            # to width 0 if the model does not expose config.hidden_size.
            hidden_size = getattr(getattr(self.model, "config", None), "hidden_size", 0)
            return np.empty((0, hidden_size), dtype=np.float32)

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = self.model.to(device)
        batch_arrays = []
        logging.info("Starting embedding generation.")

        # tqdm reads the batch count from the range itself; the previous explicit
        # total (len(docs)//batch_size + 1) was one too many whenever len(docs)
        # was an exact multiple of batch_size.
        for start in tqdm(range(0, len(docs), batch_size), desc="Generating embeddings"):
            batch = docs[start:start + batch_size]
            inputs = self.tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=MAX_LENGTH)
            inputs = {k: v.to(device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = self.model(**inputs)
                pooled = average_pool(outputs.last_hidden_state, inputs['attention_mask'])
                pooled = F.normalize(pooled, p=2, dim=1)  # unit-length rows
            batch_arrays.append(pooled.cpu().numpy())

        logging.info("Embedding generation complete.")
        return np.concatenate(batch_arrays, axis=0)



In [6]:
class AnnoyIndexBuilder:
    """Creates, persists and reloads an Annoy index that uses the 'angular' metric."""

    def __init__(self, embedding_dim, n_trees=N_TREES, ann_file=ANN_FILE):
        """Remember the vector width, the tree count and the index file path."""
        self.embedding_dim, self.n_trees, self.ann_file = embedding_dim, n_trees, ann_file

    def build_and_save(self, embeddings):
        """Add every row of ``embeddings`` to a new index, build it, save it, return it.

        Item ids are the row positions (0, 1, 2, ...).

        Raises:
            ValueError: if ``embeddings`` is None or has length 0.
        """
        nothing_to_index = embeddings is None or len(embeddings) == 0
        if nothing_to_index:
            raise ValueError("No embeddings provided.")

        index = AnnoyIndex(self.embedding_dim, 'angular')
        logging.info("Building Annoy index.")

        item_id = 0
        for vector in tqdm(embeddings, desc="Building Annoy Index"):
            index.add_item(item_id, vector)
            item_id += 1

        index.build(self.n_trees)
        index.save(self.ann_file)
        logging.info("Annoy index built and saved.")
        return index

    def load(self):
        """Open the index stored at ``self.ann_file`` and return it.

        Raises:
            IOError: if ``AnnoyIndex.load`` reports failure (returns a falsy value).
        """
        index = AnnoyIndex(self.embedding_dim, 'angular')
        loaded_ok = index.load(self.ann_file)
        if loaded_ok:
            logging.info("Annoy index loaded.")
            return index
        raise IOError(f"Could not load Annoy index from {self.ann_file}")



In [7]:
def save_embeddings(embeddings, filename=EMBEDDINGS_FILE):
    """Write ``embeddings`` to ``filename`` with ``np.save``.

    Best-effort: a failure is logged at ERROR level and not re-raised, so the
    caller has to check for the file if it needs to know the outcome.
    """
    try:
        np.save(file=filename, arr=embeddings)
        logging.info("Embeddings saved.")
    except Exception as save_error:
        logging.error(f"Error saving embeddings: {save_error}")


def load_embeddings(filename=EMBEDDINGS_FILE):
    """Return the array stored at ``filename``, or None when the file is missing.

    A missing file is reported through ``logging.error`` rather than raised.
    """
    if not os.path.exists(filename):
        logging.error(f"Embeddings file {filename} not found.")
        return None
    return np.load(filename)

def query_index(query, embedding_model, annoy_index, top_n=5):
    """Embed ``query`` and return the ids of its ``top_n`` nearest index items.

    The query is wrapped in a one-element list for ``get_embeddings`` and the
    first (only) resulting vector is passed to ``get_nns_by_vector``.
    """
    query_vectors = embedding_model.get_embeddings([query])
    return annoy_index.get_nns_by_vector(query_vectors[0], top_n)



In [18]:
def _read_text_file(file_path):
    """Return the lines of ``file_path`` with surrounding whitespace stripped.

    Tries utf-8, utf-16 and ISO-8859-1 in that order and moves on to the next
    encoding when decoding fails.

    Raises:
        ValueError: if no encoding in the list could decode the file.
    """
    for encoding in ['utf-8', 'utf-16', 'ISO-8859-1']:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return [line.strip() for line in file]
        except UnicodeError:
            # UnicodeError, not just UnicodeDecodeError: the utf-16 codec raises a
            # plain UnicodeError ("UTF-16 stream does not start with BOM") on
            # BOM-less input, which used to escape this handler and abort the
            # run before the ISO-8859-1 fallback was ever tried.
            continue
    # NOTE(review): ISO-8859-1 accepts any byte sequence, so this is effectively
    # unreachable and a legacy Korean encoding (e.g. cp949) would be read as
    # mojibake — confirm the expected encodings of the input files.
    raise ValueError(f"Failed to open file {file_path} with common encodings.")


def main():
    """Build (or reuse) the embedding cache and Annoy index, then answer one query.

    Steps:
      1. Load the embedding model.
      2. Read every line of the configured text files as one document.
      3. Reuse the cached embeddings / index only when their item count matches
         the number of documents; otherwise regenerate them.
      4. Prompt for a query and a result count, and print the nearest documents.

    Raises:
        ValueError: if the input files yield no documents, or the result count
            typed by the user is not an integer.
        FileNotFoundError: if the embeddings file is missing after saving.
    """
    # Initialize MULTIE5 model for embeddings
    embedding_model = MULTIEmbedding()

    # Paths to your TXT files
    txt_file_paths = ["/content/drive/MyDrive/훈련전용/병원csv데이터/질환정리1212.txt",
                      "/content/drive/MyDrive/훈련전용/병원csv데이터/아산병원데이터.txt",
                      "/content/drive/MyDrive/훈련전용/병원csv데이터/대통합데이터.txt"]

    # Each line of each file is one document; Annoy item ids index into this list.
    docs = []
    for txt_file_path in txt_file_paths:
        docs.extend(_read_text_file(txt_file_path))
    if not docs:
        raise ValueError("No documents were read from the input files.")

    # Generate or load embeddings. A cache whose row count differs from the
    # document count was built from different input, so its row numbers no
    # longer line up with `docs`; discard it instead of returning wrong text.
    embeddings = load_embeddings() if os.path.exists(EMBEDDINGS_FILE) else None
    if embeddings is not None and len(embeddings) != len(docs):
        logging.warning(
            f"Cached embeddings hold {len(embeddings)} rows but {len(docs)} documents were read; regenerating."
        )
        embeddings = None
    if embeddings is None:
        embeddings = embedding_model.get_embeddings(docs)
        save_embeddings(embeddings)

        # save_embeddings only logs failures, so verify the file is there.
        if not os.path.exists(EMBEDDINGS_FILE):
            raise FileNotFoundError("Failed to save embeddings file.")

    # Build or load Annoy index, applying the same staleness check.
    annoy_builder = AnnoyIndexBuilder(embedding_dim=embeddings.shape[1])
    annoy_index = None
    if os.path.exists(ANN_FILE):
        annoy_index = annoy_builder.load()
        if annoy_index.get_n_items() != len(docs):
            logging.warning(
                f"Cached Annoy index holds {annoy_index.get_n_items()} items but {len(docs)} documents were read; rebuilding."
            )
            # Release the loaded file before it is overwritten by the rebuild.
            annoy_index.unload()
            annoy_index = None
    if annoy_index is None:
        annoy_index = annoy_builder.build_and_save(embeddings)

    # Querying
    query = input("Enter your query text: ")
    top_n = int(input("Enter number of top results to fetch: "))
    nearest_neighbors = query_index(query, embedding_model, annoy_index, top_n)

    # Item ids are 0-based positions in `docs`; they are shown 1-based.
    for nn in nearest_neighbors:
        print(f"Document {nn+1}: {docs[nn]}")

if __name__ == "__main__":
    main()

Enter your query text: 보챔,발진,식욕부진,반점
Enter number of top results to fetch: 5


Generating embeddings: 100%|██████████| 1/1 [00:00<00:00, 29.76it/s]


Document 835: 발열, 빈맥, 저혈압, 심계항진, 발한
Document 3785: 발열, 불안감, 불쾌감
Document 2938: 돌발적인 두통, 구토, 반신불수
Document 5729: 홍역(Measles),감염내과,"열,기침,림프 부종,식욕부진,피부소양감,콧물,코플릭 반점","디프테리아,백일해",rubeola,전신
Document 5895: 홍역(Measles),감염내과,"코플릭 반점,림프 부종,기침,열,식욕부진,콧물,피부소양감","백일해,디프테리아",rubeola,피부


In [9]:
!pip install kobert-transformers


Collecting kobert-transformers
  Downloading kobert_transformers-0.5.1-py3-none-any.whl (12 kB)
Collecting sentencepiece>=0.1.91 (from kobert-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentencepiece, kobert-transformers
Successfully installed kobert-transformers-0.5.1 sentencepiece-0.1.99


In [20]:

from transformers import BertModel, BertTokenizer

# KoBERT model checkpoint (Hugging Face Hub repository id)
kobert_model_checkpoint = "monologg/kobert"

# Initialize the tokenizer and model directly from the Hugging Face model repository.
# NOTE(review): the preceding cell installs kobert-transformers, but nothing from
# that package is imported here — the generic BertTokenizer is used instead.
# Confirm that BertTokenizer tokenizes this checkpoint's vocabulary as intended.
# These assignments bind the module-level names `tokenizer` and `model`.
tokenizer = BertTokenizer.from_pretrained(kobert_model_checkpoint)
model = BertModel.from_pretrained(kobert_model_checkpoint)

import torch
from sklearn.metrics.pairwise import cosine_similarity

def get_embeddings(text, tokenizer, model):
    """Return attention-masked mean-pooled embeddings for ``text`` as a NumPy array.

    ``text`` is tokenized with padding and truncation (max_length=512), run
    through ``model`` without gradient tracking, and the last hidden states
    are averaged over the positions whose attention mask is non-zero.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Mean pooling: weight each token by its mask value, then divide by the
    # (clamped) number of unmasked tokens.
    weights = encoded['attention_mask'].unsqueeze(-1).expand(hidden_states.size()).float()
    pooled = (hidden_states * weights).sum(1) / weights.sum(1).clamp(min=1e-9)
    return pooled.cpu().numpy()

def compute_similarity(doc, query, tokenizer, model):
    """Return the cosine similarity between the embeddings of ``doc`` and ``query``.

    Each text is embedded on its own as a one-element batch; the single entry
    of the resulting 1x1 similarity matrix is returned.
    """
    doc_vector, query_vector = (
        get_embeddings([piece], tokenizer, model) for piece in (doc, query)
    )
    similarity_matrix = cosine_similarity(doc_vector, query_vector)
    return similarity_matrix[0][0]

# Example usage: compare one comma-separated document string with a free-text
# query using the KoBERT tokenizer/model loaded above, then print the score
# returned by compute_similarity.
doc = "요로감염,배뇨곤란,긴박뇨,빈뇨,지연뇨,배뇨장애,혈뇨,야간뇨,잔뇨감"
query = "오줌이 곤란하다, 밤에 화장실을 가야한다"
similarity_score = compute_similarity(doc, query, tokenizer, model)
print(f"Similarity: {similarity_score}")







Similarity: 0.6967868208885193
