In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import re


import warnings
warnings.filterwarnings('ignore')

## Словарь

In [2]:
concept_df = pd.read_csv('концепты.tsv', sep='\t')
concept_df.head()

Unnamed: 0,CUI,CONCEPT,SNOMED_CODE
0,C0000765,"Excessive body weight gain,Excessive weight gain",224994002
1,C0701811,"Poor short-term memory,Poor short-term memory",247592009
2,C0002170,"Alopecia,Loss of hair",278040002
3,C0549622,"Sexual Dysfunction,Sexual disorder",231532002
4,C0027497,"Nausea,Nausea",422587007


In [3]:
len(concept_df)

544

In [4]:
concept_df['CUI'] = concept_df['CUI'].apply(lambda x: re.findall(r'C[0-9]{7}', x))
concept_df.head()

Unnamed: 0,CUI,CONCEPT,SNOMED_CODE
0,[C0000765],"Excessive body weight gain,Excessive weight gain",224994002
1,[C0701811],"Poor short-term memory,Poor short-term memory",247592009
2,[C0002170],"Alopecia,Loss of hair",278040002
3,[C0549622],"Sexual Dysfunction,Sexual disorder",231532002
4,[C0027497],"Nausea,Nausea",422587007


In [5]:
concept_df['CUI'] = concept_df['CUI'].apply(lambda x: ''.join(x))
concept_df['проверка'] = concept_df['CUI'].apply(lambda x: 'yes' if len(x) == 8 else 'no')

concept_df.head()

Unnamed: 0,CUI,CONCEPT,SNOMED_CODE,проверка
0,C0000765,"Excessive body weight gain,Excessive weight gain",224994002,yes
1,C0701811,"Poor short-term memory,Poor short-term memory",247592009,yes
2,C0002170,"Alopecia,Loss of hair",278040002,yes
3,C0549622,"Sexual Dysfunction,Sexual disorder",231532002,yes
4,C0027497,"Nausea,Nausea",422587007,yes


In [6]:
concept_df['проверка'].value_counts()

yes    544
Name: проверка, dtype: int64

там до этого был один "no" из-за кода который начинался с маленькой "с"

In [7]:
concept_df['CUI'].value_counts()

C0231403    3
C0557386    2
C0392171    2
C0233408    2
C0151786    2
           ..
C0003862    1
C0427195    1
C0426584    1
C0232461    1
C0587054    1
Name: CUI, Length: 529, dtype: int64

529 vs 544 <br> есть повторы кодов

In [8]:
concept_df = concept_df[['CUI', 'CONCEPT']].drop_duplicates(subset=['CUI'])
len(concept_df)

529

In [9]:
concept_df = concept_df.reset_index(drop=True)

In [10]:
def tune_for_row(name_df, name_row):
    name_df[name_row] = name_df[name_row].str.lower()
    name_df[name_row] = name_df[name_row].apply(lambda x: re.sub(',', ' ', x))
    name_df[name_row] = name_df[name_row].str.split()
    name_df[name_row] = name_df[name_row].apply(lambda x: [item for item in x if str(item) not in stopwords.words('english')])
    name_df[name_row] = name_df[name_row].apply(lambda x: set(x))
    name_df[name_row] = name_df[name_row].apply(lambda x: ' '.join(x))

tune_for_row(concept_df, 'CONCEPT')

concept_df.head()

Unnamed: 0,CUI,CONCEPT
0,C0000765,excessive gain body weight
1,C0701811,poor short-term memory
2,C0002170,loss alopecia hair
3,C0549622,sexual disorder dysfunction
4,C0027497,nausea


## BERT

*Принцип: извлечение эмбеддингов по косинусовому сходству*

In [11]:
from transformers import BertTokenizer, BertModel
import torch

# Загрузка pubmed BERT модели
tokenizer = BertTokenizer.from_pretrained('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext')
model_B = BertModel.from_pretrained('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext')

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
def get_bert_embedding(text):
    
    inputs = tokenizer(text, return_tensors="pt", max_length=128, truncation=True)
    
    with torch.no_grad():
        outputs = model_B(**inputs)
        embedding = outputs.last_hidden_state.mean(dim=1)
        
    return embedding

# Apply the get_bert_embedding function to each concept text in concept_df
concept_df['Embedding'] = concept_df['CONCEPT'].apply(get_bert_embedding)

In [13]:
def extract_top_codes(text, model, tokenizer, concept_df):
    # Encode the input text using the BERT tokenizer and model
    encoded_text = get_bert_embedding(text)
    
    similarity_scores = []
    
    for idx, row in concept_df.iterrows():
        code_embedding = row['Embedding']
        
        # Resize code_embedding to match the dimension of encoded_text
        code_embedding_resized = code_embedding.view(-1)
        
        # Calculate the similarity score using BERT embeddings
        similarity_score = np.dot(encoded_text, code_embedding_resized) / (np.linalg.norm(encoded_text) * np.linalg.norm(code_embedding_resized))
        similarity_scores.append((row['CUI'], similarity_score.item()))
    
    sorted_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    top_codes = [code for code, score in sorted_scores[:3]]

    return top_codes

In [14]:
with open('побочные_эффекты.txt', 'r') as file:
    complaints = file.readlines()    
    
del complaints[0]

In [15]:
%%time 
# Создание dataframe для хранения id жалобы и всех кодов в строчке текста
result_df = pd.DataFrame(columns=['id', 'Code'])


for complaint in complaints:
    
    pre_id, pre_text = complaint.split('\t')
    
    complaint_id = int(pre_id)
    
    complaint_text = re.sub('\n', '', pre_text)
    complaint_text = complaint_text.lower()
    complaint_text = complaint_text.split()
    complaint_text = [item for item in complaint_text if item not in stopwords.words('english')]
    complaint_text = ' '.join(complaint_text)

    
#     if complaint_id % 200 == 0:
#         print(complaint_id)
        
    codes = extract_top_codes(complaint_text, model_B, tokenizer, concept_df)
    
    result_df = result_df.append({'id': complaint_id, 'Code': ','.join(codes)}, ignore_index=True)

CPU times: total: 17min 39s
Wall time: 4min 19s


In [16]:
result_df.head(20)

Unnamed: 0,id,Code
0,1,"C0011124,C0042023,C0013378"
1,2,"C2981158,C0011124,C0021177"
2,3,"C0011124,C0042023,C0021177"
3,4,"C0036902,C0011124,C0013378"
4,5,"C0235283,C0812387,C0563717"
5,6,"C0011124,C0042023,C0424528"
6,7,"C0016382,C0011124,C0423636"
7,8,"C0577477,C0011124,C0042023"
8,9,"C0554978,C0563158,C0424537"
9,10,"C0577477,C0425082,C0582140"


In [24]:
# NB (для себя) Вариант с логикой кластеров

# from sklearn.cluster import KMeans 
    
# def extract_cui_with_clustering(complaint_text):
#     # Токенизация и кодирование текста
#     input_ids = tokenizer.encode(complaint_text, return_tensors='tf')

#     # Получение BERT эмбеддингов
#     outputs = model(input_ids)
#     embeddings = outputs.last_hidden_state

#     # Применение кластеризации с KMeans
#     kmeans = KMeans(n_clusters=529, random_state=0)
#     cluster_labels = kmeans.fit_predict(embeddings)

#     cui_codes = ['CUI' + str(label) for label in cluster_labels]
    
#     return cui_codes

## Сохраняем результаты

### Вариант только с айди жалобы и кодами (формат str)

In [17]:
cols = ['id', 'Code']
adr = result_df[cols]
adr.head()

Unnamed: 0,id,Code
0,1,"C0011124,C0042023,C0013378"
1,2,"C2981158,C0011124,C0021177"
2,3,"C0011124,C0042023,C0021177"
3,4,"C0036902,C0011124,C0013378"
4,5,"C0235283,C0812387,C0563717"


In [18]:
adr.to_csv('tychina_3.csv')

### Вариант как в домашнем задании

In [21]:
adr['One_Code'] = adr['Code'].apply(lambda x: x[0:8]) 
adr.head()

Unnamed: 0,id,Code,One_Code
0,1,"C0011124,C0042023,C0013378",C0011124
1,2,"C2981158,C0011124,C0021177",C2981158
2,3,"C0011124,C0042023,C0021177",C0011124
3,4,"C0036902,C0011124,C0013378",C0036902
4,5,"C0235283,C0812387,C0563717",C0235283


In [22]:
adr['One_Code'].value_counts()

C0812387    283
C0011124    281
C0016382    203
C0042023    194
C2981158     71
           ... 
C0549622      1
C2939147      1
C0234019      1
C0554978      1
C4074950      1
Name: One_Code, Length: 119, dtype: int64

Топ 5:
1. Ощущение зябкости
2. Снижение либидо
3. Покраснение лица
4. Учащение мочеиспускания
5. Снижение либидо

In [23]:
cols = ['id', 'One_Code']
adr_2 = adr[cols]

for el in range(1, 5):
    col_name = 'level_' + str(el)
    adr_2[col_name] = 0

adr_2['level_5'] = 1
adr_2.head()

Unnamed: 0,id,One_Code,level_1,level_2,level_3,level_4,level_5
0,1,C0011124,0,0,0,0,1
1,2,C2981158,0,0,0,0,1
2,3,C0011124,0,0,0,0,1
3,4,C0036902,0,0,0,0,1
4,5,C0235283,0,0,0,0,1


In [24]:
adr.to_csv('tychina_3_2.csv') # не убирала индексы!!