In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import re
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder


import warnings
warnings.filterwarnings('ignore')

## Словарь

In [2]:
concept_df = pd.read_csv('концепты.tsv', sep='\t')
concept_df.head()

Unnamed: 0,CUI,CONCEPT,SNOMED_CODE
0,C0000765,"Excessive body weight gain,Excessive weight gain",224994002
1,C0701811,"Poor short-term memory,Poor short-term memory",247592009
2,C0002170,"Alopecia,Loss of hair",278040002
3,C0549622,"Sexual Dysfunction,Sexual disorder",231532002
4,C0027497,"Nausea,Nausea",422587007


In [3]:
len(concept_df)

544

In [4]:
concept_df['CUI'] = concept_df['CUI'].apply(lambda x: re.findall(r'C[0-9]{7}', x))
concept_df.head()

Unnamed: 0,CUI,CONCEPT,SNOMED_CODE
0,[C0000765],"Excessive body weight gain,Excessive weight gain",224994002
1,[C0701811],"Poor short-term memory,Poor short-term memory",247592009
2,[C0002170],"Alopecia,Loss of hair",278040002
3,[C0549622],"Sexual Dysfunction,Sexual disorder",231532002
4,[C0027497],"Nausea,Nausea",422587007


In [5]:
concept_df['CUI'] = concept_df['CUI'].apply(lambda x: ''.join(x))
concept_df['проверка'] = concept_df['CUI'].apply(lambda x: 'yes' if len(x) == 8 else 'no')

concept_df.head()

Unnamed: 0,CUI,CONCEPT,SNOMED_CODE,проверка
0,C0000765,"Excessive body weight gain,Excessive weight gain",224994002,yes
1,C0701811,"Poor short-term memory,Poor short-term memory",247592009,yes
2,C0002170,"Alopecia,Loss of hair",278040002,yes
3,C0549622,"Sexual Dysfunction,Sexual disorder",231532002,yes
4,C0027497,"Nausea,Nausea",422587007,yes


In [6]:
concept_df['проверка'].value_counts()

yes    544
Name: проверка, dtype: int64

там до этого был один "no" из-за кода который начинался с маленькой "с"

In [7]:
concept_df['CUI'].value_counts()

C0231403    3
C0557386    2
C0392171    2
C0233408    2
C0151786    2
           ..
C0003862    1
C0427195    1
C0426584    1
C0232461    1
C0587054    1
Name: CUI, Length: 529, dtype: int64

529 vs 544 <br> есть повторы кодов

In [8]:
concept_df = concept_df[['CUI', 'CONCEPT']].drop_duplicates(subset=['CUI'])
len(concept_df)

529

In [9]:
def tune_for_row(name_df, name_row):
    name_df[name_row] = name_df[name_row].str.lower()
    name_df[name_row] = name_df[name_row].apply(lambda x: re.sub(',', ' ', x))
    name_df[name_row] = name_df[name_row].str.split()
    name_df[name_row] = name_df[name_row].apply(lambda x: [item for item in x if str(item) not in stopwords.words('english')])
    name_df[name_row] = name_df[name_row].apply(lambda x: set(x))
    name_df[name_row] = name_df[name_row].apply(lambda x: ' '.join(x))

tune_for_row(concept_df, 'CONCEPT')

concept_df.head()

Unnamed: 0,CUI,CONCEPT
0,C0000765,weight gain excessive body
1,C0701811,memory short-term poor
2,C0002170,alopecia loss hair
3,C0549622,disorder dysfunction sexual
4,C0027497,nausea


## Сами жалобы

In [10]:
pd.set_option('display.max_colwidth', None)
complaints_df = pd.read_csv('побочные_эффекты.txt', delimiter='\t')
complaints_df.head()

Unnamed: 0,id,Text
0,1,"extreme weight gain, short-term memory loss, hair loss."
1,2,COMPLETELY DESTROYED SEXUALLY FUNCTIONING .
2,3,"Just TWO tablets of Lexapro 10mg completely destroyed my sexual functioning, probably for life."
3,4,It's called PSSD: post-SSRI sexual dysfunction.
4,5,"And there is a chance that it will give you PSSD, which as the name suggests persists even after you stop taking the drug, Just google 'PSSD' and you'll see what I mean, So please: NEVER take this drug, not even one tablet."


In [11]:
def tune_for_row(name_df, name_row):
    name_df[name_row] = complaints_df[name_row].str.lower()
    name_df[name_row] = complaints_df[name_row].str.split()
    name_df[name_row] = complaints_df[name_row].apply(lambda x: [item for item in x if str(item) not in stopwords.words('english')])
    name_df[name_row] = name_df[name_row].apply(lambda x: set(x))
    name_df[name_row] = complaints_df[name_row].apply(lambda x: ' '.join(x))

In [12]:
tune_for_row(complaints_df, 'Text')
complaints_df.head()

Unnamed: 0,id,Text
0,1,"gain, loss, weight short-term hair memory loss. extreme"
1,2,completely destroyed . sexually functioning
2,3,"two sexual lexapro life. tablets completely probably functioning, destroyed 10mg"
3,4,pssd: sexual dysfunction. called post-ssri
4,5,"pssd, name one suggests chance taking tablet. google give 'pssd' even please: take never stop mean, persists see drug,"


In [13]:
len(complaints_df)

2150

2150 потому что в "побочные_эффекты" были пропущены некоторые айдишники жалоб (например: 1182, 1184)

## Модель итд

In [14]:
train_df = concept_df
train_df.rename(columns={'CONCEPT': 'TEXT'}, inplace=True)

train_df.head()

Unnamed: 0,CUI,TEXT
0,C0000765,weight gain excessive body
1,C0701811,memory short-term poor
2,C0002170,alopecia loss hair
3,C0549622,disorder dysfunction sexual
4,C0027497,nausea


### Последний шаг с моделью и предсказаниями

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

In [34]:
# Преобразование меток классов 'CUI' в числовой формат
label_encoder = LabelEncoder()
train_df['Code'] = label_encoder.fit_transform(train_df['CUI'])

# Обучение модели на train_df
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(train_df['TEXT'])
y_train = train_df['Code']
model = RandomForestClassifier()
model.fit(X_train, y_train)

RandomForestClassifier()

In [35]:
# Предсказание кодов жалоб в complaints_df
X_complaints = tfidf_vectorizer.transform(complaints_df['Text'])
predicted_codes = model.predict(X_complaints)
predicted_labels = label_encoder.inverse_transform(predicted_codes)

complaints_df['Code'] = predicted_labels

complaints_df.head()

Unnamed: 0,id,Text,Code
0,1,"gain, loss, weight short-term hair memory loss. extreme",C0701811
1,2,completely destroyed . sexually functioning,C0425736
2,3,"two sexual lexapro life. tablets completely probably functioning, destroyed 10mg",C0549622
3,4,pssd: sexual dysfunction. called post-ssri,C0549622
4,5,"pssd, name one suggests chance taking tablet. google give 'pssd' even please: take never stop mean, persists see drug,",C0029944


pssd не распознает, не знает, что это аббревиатура к sexual dysfunction

In [36]:
complaints_df['Code'].value_counts()

C0037199    194
C0234019     70
C0392171     70
C0043094     67
C0027497     64
           ... 
C0241633      1
C1821572      1
C0586742      1
C0522173      1
C0028643      1
Name: Code, Length: 309, dtype: int64

## Сохраняем результаты

### Вариант только с айди жалобы и кодами (формат str)

In [37]:
cols = ['id', 'Code']
adr = complaints_df[cols]
adr.head()

Unnamed: 0,id,Code
0,1,C0701811
1,2,C0425736
2,3,C0549622
3,4,C0549622
4,5,C0029944


In [38]:
adr.to_csv('tychina_3.csv')

### Вариант как в домашнем задании

In [39]:
for el in range(1, 5):
    col_name = 'level_' + str(el)
    adr[col_name] = 0

adr['level_5'] = 1
adr.head()

Unnamed: 0,id,Code,level_1,level_2,level_3,level_4,level_5
0,1,C0701811,0,0,0,0,1
1,2,C0425736,0,0,0,0,1
2,3,C0549622,0,0,0,0,1
3,4,C0549622,0,0,0,0,1
4,5,C0029944,0,0,0,0,1


In [41]:
adr.to_csv('tychina_3_2.csv') # не убирала индексы!!

**Но! Тут только один код, можно попробовать мультиклассовую классификацию (чтоб, например, под gain loss hair memory loss. weight short-term extreme все-таки три кода выдавала модель)**

In [28]:
from transformers import BertTokenizer, BertForTokenClassification, AdamW
import torch
from transformers import pipeline


# Load pre-trained BERT model and tokenizer
model_name = 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForTokenClassification.from_pretrained(model_name, num_labels=529)

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForToken

In [29]:
cui_mappings = train_df.set_index('CUI')['TEXT'].to_dict()

# 2й вариант
# cui_mapping = dict(zip(train_df['TEXT'], train_df['CUI']))

# Добавление мэппинга в модель BERT
model.bert_cui_mapping = cui_mappings

In [30]:
# Define a function to extract CUIs from text using the NER pipeline
def extract_cuis(text):
    ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)
    results = ner_pipeline(text)
    cuis = []
    
    for result in results:
        if result['entity'] == 'CUI':
            cuis.append(result['word'])
            
    return cuis

In [31]:
complaints_df['Code'] = complaints_df['Text'].apply(lambda x: extract_cuis(x))

complaints_df.head()

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unnamed: 0,id,Text,Code
0,1,"gain, loss, weight short-term hair memory loss. extreme",[]
1,2,completely destroyed . sexually functioning,[]
2,3,"two sexual lexapro life. tablets completely probably functioning, destroyed 10mg",[]
3,4,pssd: sexual dysfunction. called post-ssri,[]
4,5,"pssd, name one suggests chance taking tablet. google give 'pssd' even please: take never stop mean, persists see drug,",[]


In [33]:
complaints_df['Code'].value_counts()

[]    2150
Name: Code, dtype: int64