In [1]:
import pandas as pd
import time as time

import numpy as np
from collections import Counter

In [2]:
import matplotlib.pyplot as plt #visualisation
import seaborn as sns

In [3]:
import tensorflow as tf

In [4]:
pd.set_option('display.max_colwidth', None)
pd.set_option("display.max_rows", None, "display.max_columns", None)

## Load Dataset

In [5]:
s = time.time()
labelled = pd.read_csv('data/mature_labelled.csv', index_col=0, dtype='string')
e = time.time()
print("Multicore Loading Time = {}".format(e-s))

print(len(labelled))

Multicore Loading Time = 1.5009870529174805
34179


In [6]:
labelled[labelled.isnull().all(1)]

Unnamed: 0,pmid,doi,title,abstract,article_date,pubmed_date,article_type,lang,journal,journal_short,journal_country,authors,author_affils,keywords,mesh_terms,references_pmids,feature,include,mature


In [7]:
len(labelled)

34179

In [8]:
selected = labelled[['pmid', 'feature']].copy()

In [9]:
selected = selected.rename(columns={"feature":"text"})
selected.info()

<class 'pandas.core.frame.DataFrame'>
Index: 34179 entries, 1 to 172538
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   pmid    34179 non-null  string
 1   text    34179 non-null  string
dtypes: string(2)
memory usage: 801.1+ KB


## Pre-process Text

In [10]:
groups = selected.fillna("") #handle NaN values to allow regex over all cells

groups_1 = groups.applymap(lambda x:x.lower() if type(x) == str else x) #reduce all to lowercase

groups_2 = groups_1.replace(r"[\([{})\]]", "", regex=True) #remove brackets

groups_3 = groups_2.replace("' ", "'", regex=True) #remove quote+space in front of word

groups_4 = groups_3.replace("""[\.'!?]""", "", regex=True) #remove punctuation

groups = groups_4.replace('"', "", regex=True) #remove double quote

## Tag Algorithms

In [11]:
algo = groups[['text']].copy()

In [12]:
######################
## CLASSES
######################
# NEURAL NETWORK / nn
# SUPPORT VECTOR MACHINE / svm
# STANDARD REGRESSIONS /reg
# DECISION TREES / dt
# DISCRIMINANT ANALYSIS / da
# NAIVE BAYES / nb
# K-NEAREST NEIGHBOUR / knn
# 
# TRANSFER LEARNING / tl
# FEDERATED LEARNING / fl
# UNSUPERVISED LEARNING / unsup

In [13]:
## NEURAL NETWORK

## text
text = ['neural net', 'deep learning', 'convolutional', 'back propagation', 'lstm', ' cnn']

algo['nn_text'] = np.where(groups['text'].str.contains("neural net"), "1", "0")

for x in text:
    algo['nn_text'] = np.where(groups['text'].str.contains(x), "1", algo['nn_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(algo['nn_text']))

text counts:
Counter({'0': 20276, '1': 13903})


In [14]:
## SUPPORT VECTOR MACHINE

## text
text = ['vector machine', 'support vector', 'svm', 'vector regression']

algo['svm_text'] = np.where(groups['text'].str.contains("support vector machine"), "1", "0")

for x in text:
    algo['svm_text'] = np.where(groups['text'].str.contains(x), "1", algo['svm_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(algo['svm_text']))

text counts:
Counter({'0': 29687, '1': 4492})


In [15]:
## MULTIVARIABLE REGRESSION

## text
text = ['logistic regression', 'linear regression', 'multivariable regression', 'multivariate regression',
       'simple regression', 'univariate logistic', 'multivariate linear', 'multivariable linear', 'linear model', 'logistic model',
        'glm', 'regularized regression', 'ridge regression', 'sparse regression', 'stepwise regression', 'kernel regression',
       'process regression']

algo['reg_text'] = np.where(groups['text'].str.contains("univariate regression"), "1", "0")

for x in text:
    algo['reg_text'] = np.where(groups['text'].str.contains(x), "1", algo['reg_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(algo['reg_text']))

text counts:
Counter({'0': 32171, '1': 2008})


In [16]:
## DECISION TREE

## text
text = ['regression tree', 'random forest', 'ensemble tree', 'adaboost', 'xgboost', 'gradient boost']

algo['dt_text'] = np.where(groups['text'].str.contains("decision tree"), "1", "0")

for x in text:
    algo['dt_text'] = np.where(groups['text'].str.contains(x), "1", algo['dt_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(algo['dt_text']))

text counts:
Counter({'0': 31365, '1': 2814})


In [17]:
## DISCRIMINANT ANALYSIS

## text
text = ['discriminant analysis', 'linear discriminant', 'linear discrimination']

algo['da_text'] = np.where(groups['text'].str.contains("discrimination analysis"), "1", "0")

for x in text:
    algo['da_text'] = np.where(groups['text'].str.contains(x), "1", algo['da_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(algo['da_text']))

text counts:
Counter({'0': 33575, '1': 604})


In [18]:
## NAIVE BAYES

## text
text = ['probabilistic classif']

algo['nb_text'] = np.where(groups['text'].str.contains("naive bayes"), "1", "0")

for x in text:
    algo['nb_text'] = np.where(groups['text'].str.contains(x), "1", algo['nb_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(algo['nb_text']))

text counts:
Counter({'0': 33965, '1': 214})


In [19]:
## TRANSFER LEARNING

## text
algo['tl_text'] = np.where(groups['text'].str.contains("transfer learning"), "1", "0")

##output
print('text counts:')
print(Counter(algo['tl_text']))

text counts:
Counter({'0': 33372, '1': 807})


In [20]:
## FEDERATED LEARNING

## text
algo['fl_text'] = np.where(groups['text'].str.contains("federated learning"), "1", "0")

##output
print('text counts:')
print(Counter(algo['fl_text']))

text counts:
Counter({'0': 34163, '1': 16})


In [21]:
## K-NEAREST NEIGHBOUR

## text
algo['knn_text'] = np.where(groups['text'].str.contains("k-nearest"), "1", "0")
algo['knn_text'] = np.where(groups['text'].str.contains("k nearest neighbour"), "1", algo['knn_text'])

##output
print('text counts:')
print(Counter(algo['knn_text']))

text counts:
Counter({'0': 33629, '1': 550})


In [22]:
## UNSUPERVISED LEARNING

## text
text = ['k-means', 'means cluster', 'hierarchical cluster', 'unsupervised learning', 'unsupervised algorithm',
       'unsupervised model', 'unsupervised method', 'latent class analysis']

algo['unsup_text'] = np.where(groups['text'].str.contains("clustering algorithm"), "1", "0")

for x in text:
    algo['unsup_text'] = np.where(groups['text'].str.contains(x), "1", algo['unsup_text']) #if yes then 1, if no, keep current
    
##output
print('text counts:')
print(Counter(algo['unsup_text']))

text counts:
Counter({'0': 33576, '1': 603})


In [23]:
## COMBINE
labelled['algo_neural_net'] = np.where(algo['nn_text'].str.contains("1"), "1", "0")

labelled['algo_support_vector'] = np.where(algo['svm_text'].str.contains("1"), "1", "0")

labelled['algo_regression'] = np.where(algo['reg_text'].str.contains("1"), "1", "0")

labelled['algo_decision_tree'] = np.where(algo['dt_text'].str.contains("1"), "1", "0")

labelled['algo_discriminant'] = np.where(algo['da_text'].str.contains("1"), "1", "0")

labelled['algo_naive_bayes'] = np.where(algo['nb_text'].str.contains("1"), "1", "0")

labelled['algo_transfer'] = np.where(algo['tl_text'].str.contains("1"), "1", "0")

labelled['algo_federated'] = np.where(algo['fl_text'].str.contains("1"), "1", "0")

labelled['algo_k_nearest'] = np.where(algo['knn_text'].str.contains("1"), "1", "0")

labelled['algo_unsupervised'] = np.where(algo['unsup_text'].str.contains("1"), "1", "0")

#algo.to_csv('output/algo_tagged.csv')

In [24]:
labelled.head(5)

Unnamed: 0,pmid,doi,title,abstract,article_date,pubmed_date,article_type,lang,journal,journal_short,journal_country,authors,author_affils,keywords,mesh_terms,references_pmids,feature,include,mature,algo_neural_net,algo_support_vector,algo_regression,algo_decision_tree,algo_discriminant,algo_naive_bayes,algo_transfer,algo_federated,algo_k_nearest,algo_unsupervised
1,34688173,10.1016/j.compbiomed.2021.104924,A convolutional neural network trained with dermoscopic images of psoriasis performed on par with 230 dermatologists.,Psoriasis is a common chronic inflammatory skin disease that causes physical and psychological burden to patients. A Convolutional Neural Network (CNN) focused on dermoscopic images would substantially aid the classification and increase the accuracy of diagnosis of psoriasis.,2021-10-06,2021-10-24,Journal Article,eng,Computers in biology and medicine,Comput Biol Med,United States,"['Yang Yiguang', 'Wang Juncheng', 'Xie Fengying', 'Liu Jie', 'Shu Chang', 'Wang Yukun', 'Zheng Yushan', 'Zhang Haopeng']","['Image Processing Center, School of Astronautics, Beihang University, Beijing, 100191, China; Beijing Advanced Innovation Center for Biomedical Engineering, Beihang University, Beijing, 100191, China.', 'Department of Dermatology, State Key Laboratory of Complex Severe and Rare Diseases, Peking Union Medical College Hospital, Chinese Academy of Medical Science and Peking Union Medical College, National Clinical Research Center for Dermatologic and Immunologic Diseases, Beijing, 100730, China.', 'Image Processing Center, School of Astronautics, Beihang University, Beijing, 100191, China; Beijing Advanced Innovation Center for Biomedical Engineering, Beihang University, Beijing, 100191, China. Electronic address: xfy_73@buaa.edu.cn.', 'Department of Dermatology, State Key Laboratory of Complex Severe and Rare Diseases, Peking Union Medical College Hospital, Chinese Academy of Medical Science and Peking Union Medical College, National Clinical Research Center for Dermatologic and Immunologic Diseases, Beijing, 100730, China. Electronic address: Liujie04672@pumch.cn.', 'Department of Dermatology, State Key Laboratory of Complex Severe and Rare Diseases, Peking Union Medical College Hospital, Chinese Academy of Medical Science and Peking Union Medical College, National Clinical Research Center for Dermatologic and Immunologic Diseases, Beijing, 100730, China.', 'Department of Dermatology, State Key Laboratory of Complex Severe and Rare Diseases, Peking Union Medical College Hospital, Chinese Academy of Medical Science and Peking Union Medical College, National Clinical Research Center for Dermatologic and Immunologic Diseases, Beijing, 100730, China.', 'Image Processing Center, School of Astronautics, Beihang University, Beijing, 100191, China; Beijing Advanced Innovation Center for Biomedical Engineering, Beihang University, Beijing, 100191, China.', 'Image Processing Center, School of Astronautics, Beihang University, Beijing, 100191, China; Beijing Advanced Innovation Center for Biomedical Engineering, Beihang University, Beijing, 100191, China.']","['Convolutional neural networks', 'Deep-learning', 'Dermoscopic images', 'Papulosquamous skin diseases', 'Psoriasis']",,,A convolutional neural network trained with dermoscopic images of psoriasis performed on par with 230 dermatologists. Psoriasis is a common chronic inflammatory skin disease that causes physical and psychological burden to patients. A Convolutional Neural Network (CNN) focused on dermoscopic images would substantially aid the classification and increase the accuracy of diagnosis of psoriasis.,1.0,1.0,1,0,0,0,0,0,0,0,0,0
2,34688172,10.1016/j.compbiomed.2021.104927,A large margin piecewise linear classifier with fusion of deep features in the diagnosis of COVID-19.,"The world has experienced epidemics of coronavirus infections several times over the last two decades. Recent studies have shown that using medical imaging techniques can be useful in developing an automatic computer-aided diagnosis system to detect pandemic diseases with high accuracy at an early stage. In this study, a large margin piecewise linear classifier was developed to diagnose COVID-19 compared to a wide range of viral pneumonia, including SARS and MERS, using chest x-ray images. In the proposed method, a preprocessing pipeline was employed. Moreover, deep pre- and post-rectified linear unit (ReLU) features were extracted using the well-known VGG-Net19, which was fine-tuned to optimize transfer learning. Afterward, the canonical correlation analysis was performed for feature fusion, and fused deep features were passed into the LMPL classifier. The introduced method reached the highest performance in comparison with related state-of-the-art methods for two different schemes (normal, COVID-19, and typical viral pneumonia) and (COVID-19, SARS, and MERS pneumonia) with 99.39% and 98.86% classification accuracy, respectively.",2021-10-11,2021-10-24,Journal Article,eng,Computers in biology and medicine,Comput Biol Med,United States,"['Azouji Neda', 'Sami Ashkan', 'Taheri Mohammad', 'Müller Henning']","['Department of Computer Science and Engineering and IT, School of Electrical and Computer Engineering, Shiraz University, Shiraz, Iran. Electronic address: azouji@shirazu.ac.ir.', 'Department of Computer Science and Engineering and IT, School of Electrical and Computer Engineering, Shiraz University, Shiraz, Iran. Electronic address: sami@shirazu.ac.ir.', 'Department of Computer Science and Engineering and IT, School of Electrical and Computer Engineering, Shiraz University, Shiraz, Iran. Electronic address: motaheri@shirazu.ac.ir.', 'Department of Business Information Systems University of Applied Sciences Western Switzerland, Sierre (HES SO), Switzerland. Electronic address: henning.mueller@hevs.ch.']","['COVID-19', 'Computer-aided diagnosis (CAD)', 'Deep feature extraction', 'Large margin classifier', 'MERS', 'SARS', 'X-ray']",,,"A large margin piecewise linear classifier with fusion of deep features in the diagnosis of COVID-19. The world has experienced epidemics of coronavirus infections several times over the last two decades. Recent studies have shown that using medical imaging techniques can be useful in developing an automatic computer-aided diagnosis system to detect pandemic diseases with high accuracy at an early stage. In this study, a large margin piecewise linear classifier was developed to diagnose COVID-19 compared to a wide range of viral pneumonia, including SARS and MERS, using chest x-ray images. In the proposed method, a preprocessing pipeline was employed. Moreover, deep pre- and post-rectified linear unit (ReLU) features were extracted using the well-known VGG-Net19, which was fine-tuned to optimize transfer learning. Afterward, the canonical correlation analysis was performed for feature fusion, and fused deep features were passed into the LMPL classifier. The introduced method reached the highest performance in comparison with related state-of-the-art methods for two different schemes (normal, COVID-19, and typical viral pneumonia) and (COVID-19, SARS, and MERS pneumonia) with 99.39% and 98.86% classification accuracy, respectively.",1.0,0.0,0,0,0,0,0,0,1,0,0,0
8,34687858,10.1016/j.neuroimage.2021.118652,Causal Decoding of Individual Cortical Excitability States.,"Brain responsiveness to stimulation fluctuates with rapidly shifting cortical excitability state, as reflected by oscillations in the electroencephalogram (EEG). For example, the amplitude of motor-evoked potentials (MEPs) elicited by transcranial magnetic stimulation (TMS) of motor cortex changes from trial to trial. To date, individual estimation of the cortical processes leading to this excitability fluctuation has not been possible. Here, we propose a data-driven method to derive individually optimized EEG classifiers in healthy humans using a supervised learning approach that relates pre-TMS EEG activity dynamics to MEP amplitude. Our approach enables considering multiple brain regions and frequency bands, without defining them a priori, whose compound phase-pattern information determines the excitability. The individualized classifier leads to an increased classification accuracy of cortical excitability states from 57% to 67% when compared to μ-oscillation phase extracted by standard fixed spatial filters. Results show that, for the used TMS protocol, excitability fluctuates predominantly in the μ-oscillation range, and relevant cortical areas cluster around the stimulated motor cortex, but between subjects there is variability in relevant power spectra, phases, and cortical regions. This novel decoding method allows causal investigation of the cortical excitability state, which is critical also for individualizing therapeutic brain stimulation.",2021-10-20,2021-10-24,Journal Article,eng,NeuroImage,Neuroimage,United States,"['Metsomaa J', 'Belardinelli P', 'Ermolova M', 'Ziemann U', 'Zrenner C']","['Department of Neurology & Stroke, University of Tübingen, Tübingen, Germany; Hertie Institute for Clinical Brain Research, University of Tübingen.', 'Department of Neurology & Stroke, University of Tübingen, Tübingen, Germany; Hertie Institute for Clinical Brain Research, University of Tübingen; CIMeC, Center for Mind-Brain Sciences, University of Trento, Italy.', 'Department of Neurology & Stroke, University of Tübingen, Tübingen, Germany; Hertie Institute for Clinical Brain Research, University of Tübingen.', 'Department of Neurology & Stroke, University of Tübingen, Tübingen, Germany; Hertie Institute for Clinical Brain Research, University of Tübingen. Electronic address: ulf.ziemann@uni-tuebingen.de.', 'Department of Neurology & Stroke, University of Tübingen, Tübingen, Germany; Hertie Institute for Clinical Brain Research, University of Tübingen; Temerty Centre for Therapeutic Brain Intervention, Centre for Addiction and Mental Health, and Department of Psychiatry, University of Toronto, Toronto, ON, Canada.']","['EEG', 'TMS', 'brain state', 'classification', 'excitability', 'machine learning']",,,"Causal Decoding of Individual Cortical Excitability States. Brain responsiveness to stimulation fluctuates with rapidly shifting cortical excitability state, as reflected by oscillations in the electroencephalogram (EEG). For example, the amplitude of motor-evoked potentials (MEPs) elicited by transcranial magnetic stimulation (TMS) of motor cortex changes from trial to trial. To date, individual estimation of the cortical processes leading to this excitability fluctuation has not been possible. Here, we propose a data-driven method to derive individually optimized EEG classifiers in healthy humans using a supervised learning approach that relates pre-TMS EEG activity dynamics to MEP amplitude. Our approach enables considering multiple brain regions and frequency bands, without defining them a priori, whose compound phase-pattern information determines the excitability. The individualized classifier leads to an increased classification accuracy of cortical excitability states from 57% to 67% when compared to μ-oscillation phase extracted by standard fixed spatial filters. Results show that, for the used TMS protocol, excitability fluctuates predominantly in the μ-oscillation range, and relevant cortical areas cluster around the stimulated motor cortex, but between subjects there is variability in relevant power spectra, phases, and cortical regions. This novel decoding method allows causal investigation of the cortical excitability state, which is critical also for individualizing therapeutic brain stimulation.",1.0,0.0,0,0,0,0,0,0,0,0,0,0
9,34687853,10.1016/j.mri.2021.10.024,Radiomic machine learning for pretreatment assessment of prognostic risk factors for endometrial cancer and its effects on radiologists' decisions of deep myometrial invasion.,To evaluate radiomic machine learning (ML) classifiers based on multiparametric magnetic resonance images (MRI) in pretreatment assessment of endometrial cancer (EC) risk factors and to examine effects on radiologists' interpretation of deep myometrial invasion (dMI).,2021-10-20,2021-10-24,Journal Article,eng,Magnetic resonance imaging,Magn Reson Imaging,Netherlands,"['Otani Satoshi', 'Himoto Yuki', 'Nishio Mizuho', 'Fujimoto Koji', 'Moribata Yusaku', 'Yakami Masahiro', 'Kurata Yasuhisa', 'Hamanishi Junzo', 'Ueda Akihiko', 'Minamiguchi Sachiko', 'Mandai Masaki', 'Kido Aki']","['Department of Diagnostic Imaging and Nuclear Medicine, Graduate School of Medicine, Kyoto University, Kyoto 606-8507, Japan.', 'Department of Diagnostic Radiology and Nuclear Medicine, Kyoto University Hospital, Kyoto 606-8507, Japan. Electronic address: yhimoto@kuhp.kyoto-u.ac.jp.', 'Department of Diagnostic Imaging and Nuclear Medicine, Graduate School of Medicine, Kyoto University, Kyoto 606-8507, Japan.', 'Department of Real World Data Research and Developmentx, Graduate School of Medicine, Kyoto University, Kyoto 606-8507, Japan.', 'Department of Diagnostic Radiology and Nuclear Medicine, Kyoto University Hospital, Kyoto 606-8507, Japan; Preemptive Medicine and Lifestyle-related Disease Research Center, Kyoto University Hospital, Kyoto 606-8507, Japan.', 'Preemptive Medicine and Lifestyle-related Disease Research Center, Kyoto University Hospital, Kyoto 606-8507, Japan.', 'Department of Diagnostic Radiology and Nuclear Medicine, Kyoto University Hospital, Kyoto 606-8507, Japan.', 'Department of Gynecology and Obstetrics, Kyoto University, Kyoto 606-8507, Japan.', 'Department of Gynecology and Obstetrics, Kyoto University, Kyoto 606-8507, Japan.', 'Department of Diagnostic Pathology, Kyoto University, Kyoto 606-8507, Japan.', 'Department of Gynecology and Obstetrics, Kyoto University, Kyoto 606-8507, Japan.', 'Department of Diagnostic Radiology and Nuclear Medicine, Kyoto University Hospital, Kyoto 606-8507, Japan.']","['Endometrial cancer', 'Radiomic machine learning']",,,Radiomic machine learning for pretreatment assessment of prognostic risk factors for endometrial cancer and its effects on radiologists' decisions of deep myometrial invasion. To evaluate radiomic machine learning (ML) classifiers based on multiparametric magnetic resonance images (MRI) in pretreatment assessment of endometrial cancer (EC) risk factors and to examine effects on radiologists' interpretation of deep myometrial invasion (dMI).,1.0,1.0,0,0,0,0,0,0,0,0,0,0
10,34687850,10.1016/j.mri.2021.10.023,MRI-based machine learning for determining quantitative and qualitative characteristics affecting the survival of glioblastoma multiforme.,Our current study aims to consider the image biomarkers extracted from the MRI images for exploring their effects on glioblastoma multiforme (GBM) patients' survival. Determining its biomarker helps better manage the disease and evaluate treatments. It has been proven that imaging features could be used as a biomarker. The purpose of this study is to investigate the features in MRI and clinical features as the biomarker association of survival of GBM.,2021-10-20,2021-10-24,Journal Article,eng,Magnetic resonance imaging,Magn Reson Imaging,Netherlands,"['Jajroudi Mahdie', 'Enferadi Milad', 'Homayoun Amir Azar', 'Reiazi Reza']","['Pharmaceutical Research Center, Mashhad University of Medical Sciences, Mashhad, Iran. Electronic address: Jajroudimh991@mums.ac.ir.', 'Research Center for Nuclear Medicine, Shariati Hospital, Tehran University of Medical Sciences, Tehran, Iran.', 'Sina Trauma Research Center, Tehran University of Medical Sciences, Tehran, Iran.', 'Radiation Medicine Program, Princess Margaret Cancer Centre, University Health Network, Toronto, Ontario, Canada. Electronic address: reza.reiazi@uhnresearch.ca.']","['Biomarker', 'Clinical features', 'Glioblastoma multiforme', 'MRI features', 'Machine learning']",,,MRI-based machine learning for determining quantitative and qualitative characteristics affecting the survival of glioblastoma multiforme. Our current study aims to consider the image biomarkers extracted from the MRI images for exploring their effects on glioblastoma multiforme (GBM) patients' survival. Determining its biomarker helps better manage the disease and evaluate treatments. It has been proven that imaging features could be used as a biomarker. The purpose of this study is to investigate the features in MRI and clinical features as the biomarker association of survival of GBM.,1.0,0.0,0,0,0,0,0,0,0,0,0,0


## Tag Features

In [25]:
feat = groups[['text']].copy()

In [26]:
######################
## CLASSES
######################
# BIO_MARKER / bio
# GENOMIC / gene
# IMAGING / imaging
    ### XR / xr
    ### CT / ct
    ### MRI / mri
# ECHO / echo
# US / us
# MAMMOGRAM / mamm
# OCT / oct
# EEG / eeg
# ECG / ecg
# EMG / emg
# DERMASCOPY / derm
# CELLULAR_PATH / histo
# ENDOSCOPY / endo
#
# NATURAL_LANGUAGE / nlp
# EHR RECORDS / ehr
#
# WEARABLE_SENSORS / sensor
# SMARTPHONE / phone
# PATIENT REPORTED / prom
# DIGITAL STETH / sound

In [27]:
## XR

## text
text = ['xr', 'x-ray', 'radiograph']

feat['xr_text'] = np.where(groups['text'].str.contains("cxr"), "1", "0")

for x in text:
    feat['xr_text'] = np.where(groups['text'].str.contains(x), "1", feat['xr_text']) #if yes then 1, if no, keep current
    
##output
print('text counts:')
print(Counter(feat['xr_text']))

text counts:
Counter({'0': 32828, '1': 1351})


In [28]:
## CT

## text
text = ['computed tomograph', 'axial tomograph', 'ct scan', 'ct image', 'ct slice', ' ct ', ' ct-',
       'tomography scan', 'computerised tomograph', 'computerized tomograph', 'assisted tomograph']

feat['ct_text'] = np.where(groups['text'].str.contains("cat scan"), "1", "0")

for x in text:
    feat['ct_text'] = np.where(groups['text'].str.contains(x), "1", feat['ct_text']) #if yes then 1, if no, keep current

##exclude

feat['ct_text'] = np.where(groups['text'].str.contains("optical coherence"), "0", feat['ct_text']) #exclude oct
feat['ct_text'] = np.where(groups['text'].str.contains("coherence tomograph"), "0", feat['ct_text']) #exclude oct
        
##output
print('text counts:')
print(Counter(feat['ct_text']))

text counts:
Counter({'0': 31198, '1': 2981})


In [29]:
## MRI

## text
text = ['magnetic resonance']

feat['mri_text'] = np.where(groups['text'].str.contains(" mri"), "1", "0")

for x in text:
    feat['mri_text'] = np.where(groups['text'].str.contains(x), "1", feat['mri_text'] ) #if yes then 1, if no, keep current

##output
print('text counts:')
print(Counter(feat['mri_text']))

text counts:
Counter({'0': 30726, '1': 3453})


In [30]:
## ECHO

## text
text = ['echo-cardio', 'echokardio', 'cardiac echo']

feat['echo_text'] = np.where(groups['text'].str.contains("echocardio"), "1", "0")

for x in text:
    feat['echo_text'] = np.where(groups['text'].str.contains(x), "1", feat['echo_text']) #if yes then 1, if no, keep current

##output
print('text counts:')
print(Counter(feat['echo_text']))

text counts:
Counter({'0': 33988, '1': 191})


In [31]:
## US

## text
text = ['sonography', 'ultra-sound', 'ultrasonograph', 'doppler']

feat['us_text'] = np.where(groups['text'].str.contains("ultrasound"), "1", "0")

for x in text:
    feat['us_text'] = np.where(groups['text'].str.contains(x), "1", feat['us_text']) #if yes then 1, if no, keep current

##output
print('text counts:')
print(Counter(feat['us_text']))

text counts:
Counter({'0': 33003, '1': 1176})


In [32]:
## ECG

## text
text = [' ecg', ' ekg', 'electrokardio', 'electro-cardio', 'holter monitor', 'cardiac monitor']

feat['ecg_text'] = np.where(groups['text'].str.contains("electrocardio"), "1", "0")

for x in text:
    feat['ecg_text'] = np.where(groups['text'].str.contains(x), "1", feat['ecg_text']) #if yes then 1, if no, keep current

##output
print('text counts:')
print(Counter(feat['ecg_text']))

text counts:
Counter({'0': 33072, '1': 1107})


In [33]:
## EEG

## text
text = [' eeg']

feat['eeg_text'] = np.where(groups['text'].str.contains("electroenc"), "1", "0")

for x in text:
    feat['eeg_text'] = np.where(groups['text'].str.contains(x), "1", feat['eeg_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(feat['eeg_text']))

text counts:
Counter({'0': 32309, '1': 1870})


In [34]:
## EMG

## text
text = ['myoelectric', 'electro-myo']

feat['emg_text'] = np.where(groups['text'].str.contains("electromyo"), "1", "0")

for x in text:
    feat['emg_text'] = np.where(groups['text'].str.contains(x), "1", feat['emg_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(feat['emg_text']))

text counts:
Counter({'0': 33656, '1': 523})


In [35]:
feat[feat['emg_text']=="1"].sample(5)

Unnamed: 0,text,xr_text,ct_text,mri_text,echo_text,us_text,ecg_text,eeg_text,emg_text
74683,"deep learning for electromyographic hand gesture signal classification using transfer learning in recent years, deep learning algorithms have become increasingly more prominent for their unparalleled ability to automatically learn discriminant features from large amounts of data however, within the field of electromyography-based gesture recognition, deep learning algorithms are seldom employed as they require an unreasonable amount of effort from a single person, to generate tens of thousands of examples this papers hypothesis is that general, informative features can be learned from the large amounts of data generated by aggregating the signals of multiple users, thus reducing the recording burden while enhancing gesture recognition consequently, this paper proposes applying transfer learning on aggregated data from multiple users while leveraging the capacity of deep learning algorithms to learn discriminant features from large datasets two datasets comprised 19 and 17 able-bodied participants, respectively the first one is employed for pre-training, were recorded for this work, using the myo armband a third myo armband dataset was taken from the ninapro database and is comprised ten able-bodied participants three different deep learning networks employing three different modalities as input raw emg, spectrograms, and continuous wavelet transform cwt are tested on the second and third dataset the proposed transfer learning scheme is shown to systematically and significantly enhance the performance for all three networks on the two datasets, achieving an offline accuracy of 9831% for 7 gestures over 17 participants for the cwt-based convnet and 6898% for 18 gestures over 10 participants for the raw emg-based convnet finally, a use-case study employing eight able-bodied participants suggests that real-time feedback allows users to adapt their muscle activation strategy which reduces the degradation in accuracy normally experienced over time",0,0,0,0,0,0,0,1
30529,"feature extraction of surface electromyography using wavelet weighted permutation entropy for hand movement recognition the feature extraction of surface electromyography semg signals has been an important aspect of myoelectric prosthesis control to improve the practicability of myoelectric prosthetic hands, we proposed a feature extraction method for semg signals that uses wavelet weighted permutation entropy wwpe first, wavelet transform was used to decompose and preprocess semg signals collected from the relevant muscles of the upper limbs to obtain the wavelet sub-bands in each frequency segment then, the weighted permutation entropies wpes of the wavelet sub-bands were extracted to construct wwpe feature set lastly, the wwpe feature set was used as input to a support vector machine svm classifier and a backpropagation neural network bpnn classifier to recognize seven hand movements experimental results show that the proposed method exhibits remarkable recognition accuracy that is superior to those of single sub-band feature set and commonly used time-domain feature set the maximum recognition accuracy rate is 100% for hand movements, and the average recognition accuracy rates of svm and bpnn are 100% and 98%, respectively",0,0,0,0,0,0,0,1
36064,"estimating knee joint load using acoustic emissions during ambulation quantifying joint load in activities of daily life could lead to improvements in mobility for numerous people; however, current methods for assessing joint load are unsuitable for ubiquitous settings the aim of this study is to demonstrate that joint acoustic emissions contain information to estimate this internal joint load in a potentially wearable implementation eleven healthy, able-bodied individuals performed ambulation tasks under varying speed, incline, and loading conditions while joint acoustic emissions and essential gait measures-electromyography, ground reaction forces, and motion capture trajectories-were collected the gait measures were synthesized using a neuromuscular model to estimate internal joint contact force which was the target variable for subject-specific machine learning models xgboost trained based on spectral, temporal, cepstral, and amplitude-based features of the joint acoustic emissions the model using joint acoustic emissions significantly outperformed p < 005 the best estimate without the sounds, the subject-specific average load mae = 031 ± 012 bw, for both seen mae = 008 ± 001 bw and unseen mae = 021 ± 005 bw conditions this demonstrates that joint acoustic emissions contain information that correlates to internal joint contact force and that information is consistent such that unique cases can be estimated",0,0,0,0,0,0,0,1
13659,"evaluation of three machine learning algorithms for the automatic classification of emg patterns in gait disorders gait disorders are common in neurodegenerative diseases and distinguishing between seemingly similar kinematic patterns associated with different pathological entities is a challenge even for the experienced clinician ultimately, muscle activity underlies the generation of kinematic patterns therefore, one possible way to address this problem may be to differentiate gait disorders by analyzing intrinsic features of muscle activations patterns here, we examined whether it is possible to differentiate electromyography emg gait patterns of healthy subjects and patients with different gait disorders using machine learning techniques nineteen healthy volunteers 9 male, 10 female, age 282 ± 62 years and 18 patients with gait disorders 10 male, 8 female, age 662 ± 147 years resulting from different neurological diseases walked down a hallway 10 times at a convenient pace while their muscle activity was recorded via surface emg electrodes attached to 5 muscles of each leg 10 channels in total gait disorders were classified as predominantly hypokinetic <i>n</i> = 12 or ataxic <i>n</i> = 6 gait by two experienced raters based on video recordings three different classification methods convolutional neural network-cnn, support vector machine-svm, k-nearest neighbors-knn were used to automatically classify emg patterns according to the underlying gait disorder and differentiate patients and healthy participants using a leave-one-out approach for training and evaluating the classifiers, the automatic classification of normal and abnormal emg patterns during gait 2 classes: healthy and patient was possible with a high degree of accuracy using cnn accuracy 919%, but not svm accuracy 676% or knn accuracy 487% for classification of hypokinetic vs ataxic vs normal gait 3 classes best results were again obtained for cnn accuracy 838% while svm and knn performed worse accuracy svm 514%, knn 324% these results suggest that machine learning methods are useful for distinguishing individuals with gait disorders from healthy controls and may help classification with respect to the underlying disorder even when classifiers are trained on comparably small cohorts in our study, cnn achieved higher accuracy than svm and knn and may constitute a promising method for further investigation",0,0,0,0,0,0,0,1
147165,"support vector machine-based classification scheme for myoelectric control applied to upper limb this paper proposes and evaluates the application of support vector machine svm to classify upper limb motions using myoelectric signals it explores the optimum configuration of svm-based myoelectric control, by suggesting an advantageous data segmentation technique, feature set, model selection approach for svm, and postprocessing methods this work presents a method to adjust svm parameters before classification, and examines overlapped segmentation and majority voting as two techniques to improve controller performance a svm, as the core of classification in myoelectric control, is compared with two commonly used classifiers: linear discriminant analysis lda and multilayer perceptron mlp neural networks it demonstrates exceptional accuracy, robust performance, and low computational load the entropy of the output of the classifier is also examined as an online index to evaluate the correctness of classification; this can be used by online training for long-term myoelectric control operations",0,0,0,0,0,0,0,1


In [36]:
## CELLULAR PATHOLOGY

## text
text = ['histopath', 'histology', 'histochem', 'immunohist', 'cytolog', 'cytochem', 'cellular path', 'microscopy',
       'smear', 'cytometry', 'hematoxylin', 'specimens', 'stain', 'tissue sample', 'tissue section', 'brushing']

feat['histo_text'] = np.where(groups['text'].str.contains("histologic"), "1", "0")

for x in text:
    feat['histo_text'] = np.where(groups['text'].str.contains(x), "1", feat['histo_text']) #if yes then 1, if no, keep current
    
## output
print('text counts:')
print(Counter(feat['histo_text']))

text counts:
Counter({'0': 32009, '1': 2170})


In [37]:
## OCT / retinal

## text
text = ['coherence tomog', ' oct ', 'retinal photo', 'retinal imag', 'retinal tomograph',
        'laser ophth', 'fundus imag', 'fundus phot', 'fundal imag', 'fundal phot']

feat['oct_text'] = np.where(groups['text'].str.contains("optical coherence"), "1", "0")

for x in text:
    feat['oct_text'] = np.where(groups['text'].str.contains(x), "1", feat['oct_text']) #if yes then 1, if no, keep current

##output    
print('text counts:')
print(Counter(feat['oct_text']))

text counts:
Counter({'0': 33245, '1': 934})


In [38]:
feat[feat['oct_text']=='1'].sample(20)

Unnamed: 0,text,xr_text,ct_text,mri_text,echo_text,us_text,ecg_text,eeg_text,emg_text,histo_text,oct_text
166783,"comparing neural networks and linear discriminant functions for glaucoma detection using confocal scanning laser ophthalmoscopy of the optic disc to determine whether neural network techniques can improve differentiation between glaucomatous and nonglaucomatous eyes, using the optic disc topography parameters of the heidelberg retina tomograph hrt; heidelberg engineering, heidelberg, germany",0,0,0,0,0,0,0,0,0,1
51496,"deep learning segmentation for optical coherence tomography measurements of the lower tear meniscus the tear meniscus contains most of the tear fluid and therefore is a good indicator for the state of the tear film previously, we used a custom-built optical coherence tomography oct system to study the lower tear meniscus by automatically segmenting the image data with a thresholding-based segmentation algorithm tbsa in this report, we investigate whether the results of this image segmentation algorithm are suitable to train a neural network in order to obtain similar or better segmentation results with shorter processing times considering the class imbalance problem, we compare two approaches, one directly segmenting the tear meniscus dsa, the other first localizing the region of interest and then segmenting within the higher resolution image section lsa a total of 6658 images labeled by the tbsa were used to train deep convolutional neural networks with supervised learning five-fold cross-validation reveals a sensitivity of 9636% and 9643%, a specificity of 9998% and 9986% and a jaccard index of 9324% and 9316% for the dsa and lsa, respectively average segmentation times are up to 228 times faster than the tbsa additionally, we report the behavior of the dsa and lsa in cases challenging for the tbsa and further test the applicability to measurements acquired with a commercially available oct system the application of deep learning for the segmentation of the tear meniscus provides a powerful tool for the assessment of the tear film, supporting studies for the investigation of the pathophysiology of dry eye-related diseases",0,0,0,0,0,0,0,0,0,1
36645,"probability distribution guided optic disc and cup segmentation from fundus images in this paper, we proposed and validated a probability distribution guided network for segmenting optic disc od and optic cup oc from fundus images uncertainty is inevitable in deep learning, as induced by different sensors, insufficient samples, and inaccurate labeling since the input data and the corresponding ground truth label may be inaccurate, they may actually follow some potential distribution in this study, a variational autoencoder vae based network was proposed to estimate the joint distribution of the input image and the corresponding segmentation both the ground truth segmentation and the predicted segmentation, making the segmentation network learn not only pixel-wise information but also semantic probability distribution moreover, we designed a building block, namely the dilated inception block dib, for a better generalization of the model and a more effective extraction of multi-scale features the proposed method was compared to several existing state-of-the-art methods superior segmentation performance has been observed over two datasets origa and refuge, with the mean dice overlap coefficients being 9657% and 9581% for od and 8846% and 8891% for oc",0,0,0,0,0,0,0,0,0,1
33977,"joint optic disc and cup segmentation based on residual multi-scale fully convolutional neural network glaucoma is the leading cause of irreversible blindness, but its early symptoms are not obvious and are easily overlooked, so early screening for glaucoma is particularly important the cup to disc ratio is an important indicator for clinical glaucoma screening, and accurate segmentation of the optic cup and disc is the key to calculating the cup to disc ratio in this paper, a full convolutional neural network with residual multi-scale convolution module was proposed for the optic cup and disc segmentation first, the fundus image was contrast enhanced and polar transformation was introduced subsequently, w-net was used as the backbone network, which replaced the standard convolution unit with the residual multi-scale full convolution module, the input port was added to the image pyramid to construct the multi-scale input, and the side output layer was used as the early classifier to generate the local prediction output finally, a new multi-tag loss function was proposed to guide network segmentation the mean intersection over union of the optic cup and disc segmentation in the refuge dataset was 0904 0 and 0955 3 respectively, and the overlapping error was 0178 0 and 0066 5 respectively the results show that this method not only realizes the joint segmentation of cup and disc, but also improves the segmentation accuracy effectively, which could be helpful for the promotion of large-scale early glaucoma screening",0,0,0,0,0,0,0,0,0,1
76356,from machine to machine: an oct-trained deep learning algorithm for objective quantification of glaucomatous damage in fundus photographs previous approaches using deep learning dl algorithms to classify glaucomatous damage on fundus photographs have been limited by the requirement for human labeling of a reference training set we propose a new approach using quantitative spectral-domain sd oct data to train a dl algorithm to quantify glaucomatous structural damage on optic disc photographs,0,0,0,0,0,0,0,0,0,1
90544,"an ensemble deep learning based approach for red lesion detection in fundus images diabetic retinopathy dr is one of the leading causes of preventable blindness in the world its earliest sign are red lesions, a general term that groups both microaneurysms mas and hemorrhages hes in daily clinical practice, these lesions are manually detected by physicians using fundus photographs however, this task is tedious and time consuming, and requires an intensive effort due to the small size of the lesions and their lack of contrast computer-assisted diagnosis of dr based on red lesion detection is being actively explored due to its improvement effects both in clinicians consistency and accuracy moreover, it provides comprehensive feedback that is easy to assess by the physicians several methods for detecting red lesions have been proposed in the literature, most of them based on characterizing lesion candidates using hand crafted features, and classifying them into true or false positive detections deep learning based approaches, by contrast, are scarce in this domain due to the high expense of annotating the lesions manually",0,0,0,0,0,0,0,0,0,1
6202,"region of interest-based predictive algorithm for subretinal hemorrhage detection using faster r-cnn macular edema me is an essential sort of macular issue caused due to the storing of fluid underneath the macula age-related macular degeneration amd and diabetic macular edema dme are the two customary visual contaminations that can lead to fragmentary or complete vision loss this paper proposes a deep learning-based predictive algorithm that can be used to detect the presence of a subretinal hemorrhage region convolutional neural network r-cnn and faster r-cnn are used to develop the predictive algorithm that can improve the classification accuracy this method initially detects the presence of subretinal hemorrhage, and it then segments the region of interest roi by a semantic segmentation process the segmented roi is applied to a predictive algorithm which is derived from the fast region convolutional neural network algorithm, that can categorize the subretinal hemorrhage as responsive or non-responsive the dataset, provided by a medical institution, comprised of optical coherence tomography oct images of both pre- and post-treatment images, was used for training the proposed faster region convolutional neural network faster r-cnn we also used the kaggle dataset for performance comparison with the traditional methods that are derived from the convolutional neural network cnn algorithm the evaluation results using the kaggle dataset and the hospital images provide an average sensitivity, selectivity, and accuracy of 853%, 8964%, and 9348% respectively further, the proposed method provides a time complexity in testing as 264s, which is less than the traditional schemes like cnn, r-cnn, and fast r-cnn",0,0,0,0,0,0,0,0,0,1
29448,"classification of retinal images based on convolutional neural network automatic detection of maculopathy disease is a very important step to achieve high-accuracy results for the early discovery of the disease to help ophthalmologists to treat patients manual detection of diabetic maculopathy needs much effort and time from ophthalmologists detection of exudates from retinal images is applied for the maculopathy disease diagnosis the first proposed framework in this paper for retinal image classification begins with fuzzy preprocessing in order to improve the original image to enhance the contrast between the objects and the background after that, image segmentation is performed through binarization of the image to extract both blood vessels and the optic disc and then remove them from the original image a gradient process is performed on the retinal image after this removal process for discrimination between normal and abnormal cases histogram of the gradients is estimated, and consequently the cumulative histogram of gradients is obtained and compared with a threshold cumulative histogram at certain bins to determine the threshold cumulative histogram, cumulative histograms of images with exudates and images without exudates are obtained and averaged for each type, and the threshold cumulative histogram is set as the average of both cumulative histograms certain histogram bins are selected and thresholded according to the estimated threshold cumulative histogram, and the results are used for retinal image classification in the second framework in this paper, a convolutional neural network cnn is utilized to classify normal and abnormal cases",0,0,0,0,0,0,0,0,0,1
25415,automatic prediction of treatment outcomes in patients with diabetic macular edema using ensemble machine learning this study aimed to predict the treatment outcomes in patients with diabetic macular edema dme after 3 monthly anti-vascular endothelial growth factor vegf injections using machine learning ml based on pretreatment optical coherence tomography oct images and clinical variables,0,0,0,0,0,0,0,0,0,1
55944,"deep learning based sub-retinal fluid segmentation in central serous chorioretinopathy optical coherence tomography scans development of an automated sub-retinal fluid segmentation technique from optical coherence tomography oct scans is faced with challenges such as noise and motion artifacts present in oct images, variation in size, shape and location of fluid pockets within the retina the ability of a fully convolutional neural network to automatically learn significant low level features to differentiate subtle spatial variations makes it suitable for retinal fluid segmentation task hence, a fully convolutional neural network has been proposed in this work for the automatic segmentation of sub-retinal fluid in oct scans of central serous chorioretinopathy csc pathology the proposed method has been evaluated on a dataset of 15 oct volumes and an average dice rate, precision and recall of 091, 093 and 089 respectively has been achieved over the test set",0,0,0,0,0,0,0,0,0,1


In [39]:
## MAMMOGRAM

## text
feat['mamm_text'] = np.where(groups['text'].str.contains("mammog"), "1", "0")

##output
print('text counts:')
print(Counter(feat['mamm_text']))

text counts:
Counter({'0': 33611, '1': 568})


In [40]:
## FIBREOPTIC ENDOSCOPY

## text
text = ['colonoscop', 'endoscop', 'bronchoscop', 'fiberoptic', 'fiber-optic', 'fiberscop', 'fibrescop',
       'cystoscop', 'enteroscop', 'hysteroscop']

feat['endo_text'] = np.where(groups['text'].str.contains('endoscopy'), "1", "0")

for x in text:
    feat['endo_text'] = np.where(groups['text'].str.contains(x), "1", feat['endo_text']) #if yes then 1, if no, keep current

##output
print('text counts:')
print(Counter(feat['endo_text']))

text counts:
Counter({'0': 33576, '1': 603})


In [41]:
## DERMATOLOGY IMAGES

## text
feat['derm_text'] = np.where(groups['text'].str.contains("dermoscop"), "1", "0")
feat['derm_text'] = np.where(groups['text'].str.contains("dermascop"), "1", feat['derm_text'])
feat['derm_text'] = np.where((groups['text'].str.contains("image")) &
                             (groups['text'].str.contains("skin cancer")) , "1", feat['derm_text'])
feat['derm_text'] = np.where((groups['text'].str.contains("photo")) &
                             (groups['text'].str.contains("skin cancer")) , "1", feat['derm_text'])
feat['derm_text'] = np.where((groups['text'].str.contains("image")) &
                             (groups['text'].str.contains("dermat")) , "1", feat['derm_text'])
feat['derm_text'] = np.where((groups['text'].str.contains("photo")) &
                             (groups['text'].str.contains("dermat")) , "1", feat['derm_text'])
feat['derm_text'] = np.where((groups['text'].str.contains("image")) &
                             (groups['text'].str.contains("melanoma")) , "1", feat['derm_text'])
feat['derm_text'] = np.where((groups['text'].str.contains("photo")) &
                             (groups['text'].str.contains("melanoma")) , "1", feat['derm_text'])
feat['derm_text'] = np.where((groups['text'].str.contains("image")) &
                             (groups['text'].str.contains("skin lesion")) , "1", feat['derm_text'])
feat['derm_text'] = np.where((groups['text'].str.contains("photo")) &
                             (groups['text'].str.contains("skin lesion")) , "1", feat['derm_text'])
feat['derm_text'] = np.where((groups['text'].str.contains("image")) &
                             (groups['text'].str.contains("rash")) , "1", feat['derm_text'])
feat['derm_text'] = np.where((groups['text'].str.contains("photo")) &
                             (groups['text'].str.contains("rash")) , "1", feat['derm_text'])

feat['derm_text'] = np.where(groups['text'].str.contains("histo"), "0", feat['derm_text']) # exclude histological studies
feat['derm_text'] = np.where(groups['text'].str.contains("microsc"), "0", feat['derm_text']) # exclude microscopy

##output
print('text counts:')
print(Counter(feat['derm_text']))

text counts:
Counter({'0': 33876, '1': 303})


In [42]:
feat[feat['derm_text']=='1'].sample(20)

Unnamed: 0,text,xr_text,ct_text,mri_text,echo_text,us_text,ecg_text,eeg_text,emg_text,histo_text,oct_text,mamm_text,endo_text,derm_text
161554,automatic lesion boundary detection in dermoscopy images using gradient vector flow snakes malignant melanoma has a good prognosis if treated early dermoscopy images of pigmented lesions are most commonly taken at x 10 magnification under lighting at a low angle of incidence while the skin is immersed in oil under a glass plate accurate skin lesion segmentation from the background skin is important because some of the features anticipated to be used for diagnosis deal with shape of the lesion and others deal with the color of the lesion compared with the color of the surrounding skin,0,0,0,0,0,0,0,0,0,0,0,0,1
132026,"computer-aided pattern classification system for dermoscopy images computer-aided pattern classification of melanoma and other pigmented skin lesions is one of the most important tasks for clinical diagnosis to differentiate between benign and malignant lesions, the extraction of color, architectural order, symmetry of pattern and homogeneity cash is a challenging task",0,0,0,0,0,0,0,0,0,0,0,0,1
136351,"skin lesion segmentation using an improved snake model accurate identification of lesion borders is an important task in the analysis of dermoscopy images since the extraction of skin lesion borders provides important cues for accurate diagnosis snakes have been used for segmenting a variety of medical imagery including dermoscopy, however, due to the compromise of internal and external energy forces they can lead to under- or over-segmentation problems in this paper, we introduce a mean shift based gradient vector flow gvf snake algorithm that drives the internal/external energies towards the correct direction the proposed segmentation method incorporates a mean shift operation within the standard gvf cost function experimental results on a large set of diverse dermoscopy images demonstrate that the presented method accurately determines skin lesion borders in dermoscopy images",0,0,0,0,0,0,0,0,0,0,0,0,1
83254,"learning to detect blue-white structures in dermoscopy images with weak supervision we propose a novel approach to identify one of the most significant dermoscopic criteria in the diagnosis of cutaneous melanoma: the blue-white structure bws in this paper, we achieve this goal in a multiple instance learning mil framework using only image-level labels indicating whether the feature is present or not to this aim, each image is represented as a bag of nonoverlapping regions, where each region may or may not be identified as an instance of bws a probabilistic graphical model is trained in mil fashion to predict the bag image labels as output, we predict the classification label for the image ie, the presence or absence of bws in each image and we also localize the feature in the image experiments are conducted on a challenging dataset with results outperforming state-of-the-art techniques, with bws detection besting competing methods in terms of performance this study provides an improvement on the scope of modeling for computerized image analysis of skin lesions in particular, it propounds a framework for identification of dermoscopic local features from weakly labeled data",0,0,0,0,0,0,0,0,0,0,0,0,1
85384,dense deconvolution net: multi path fusion and dense deconvolution for high resolution skin lesion segmentation dermoscopy imaging has been a routine examination approach for skin lesion diagnosis accurate segmentation is the first step for automatic dermoscopy image assessment,0,0,0,0,0,0,0,0,0,0,0,0,1
73042,"a convolutional neural network trained with dermoscopic images performed on par with 145 dermatologists in a clinical melanoma image classification task recent studies have demonstrated the use of convolutional neural networks cnns to classify images of melanoma with accuracies comparable to those achieved by board-certified dermatologists however, the performance of a cnn exclusively trained with dermoscopic images in a clinical image classification task in direct competition with a large number of dermatologists has not been measured to date this study compares the performance of a convolutional neuronal network trained with dermoscopic images exclusively for identifying melanoma in clinical photographs with the manual grading of the same images by dermatologists",0,0,0,0,0,0,0,0,0,0,0,0,1
27979,"robustness of convolutional neural networks in recognition of pigmented skin lesions a basic requirement for artificial intelligence ai-based image analysis systems, which are to be integrated into clinical practice, is a high robustness minor changes in how those images are acquired, for example, during routine skin cancer screening, should not change the diagnosis of such assistance systems",0,0,0,0,0,0,0,0,0,0,0,0,1
3162,"non-melanoma skin cancer diagnosis: a comparison between dermoscopic and smartphone images by unified visual and sonification deep learning algorithms non-melanoma skin cancer nmsc is the most frequent keratinocyte-origin skin tumor it is confirmed that dermoscopy of nmsc confers a diagnostic advantage as compared to visual face-to-face assessment covid-19 restrictions diagnostics by telemedicine photos, which are analogous to visual inspection, displaced part of in-person visits this study evaluated by a dual convolutional neural network cnn performance metrics in dermoscopic di versus smartphone-captured images si and tested if artificial intelligence narrows the proclaimed gap in diagnostic accuracy",0,0,0,0,0,0,0,0,0,0,0,0,1
54350,multiple skin lesions diagnostics via integrated deep convolutional networks for segmentation and classification computer automated diagnosis of various skin lesions through medical dermoscopy images remains a challenging task,0,0,0,0,0,0,0,0,0,0,0,0,1
5207,"data augmentation using adversarial image-to-image translation for the segmentation of mobile-acquired dermatological images dermoscopic images allow the detailed examination of subsurface characteristics of the skin, which led to creating several substantial databases of diverse skin lesions however, the dermoscope is not an easily accessible tool in some regions a less expensive alternative could be acquiring medium resolution clinical macroscopic images of skin lesions however, the limited volume of macroscopic images available, especially mobile-acquired, hinders developing a clinical mobile-based deep learning approach in this work, we present a technique to efficiently utilize the sizable number of dermoscopic images to improve the segmentation capacity of macroscopic skin lesion images a cycle-consistent adversarial network is used to translate the image between the two distinct domains created by the different image acquisition devices a visual inspection was performed on several databases for qualitative evaluation of the results, based on the disappearance and appearance of intrinsic dermoscopic and macroscopic features moreover, the fréchet inception distance was used as a quantitative metric the quantitative segmentation results are demonstrated on the available macroscopic segmentation databases, smartskins and dermofit image library, yielding test set thresholded jaccard index of 8513% and 7430% these results establish a new state-of-the-art performance in the smartskins database",0,0,0,0,0,0,0,0,0,0,0,0,1


In [43]:
## GENOMIC

## text
text = ['candidate gene', 'prognostic gene', ' gene ', ' genes ', ' dna ', ' rna ']

feat['gene_text'] = np.where(groups['text'].str.contains('genomic'), "1", "0")

for x in text:
    feat['gene_text'] = np.where(groups['text'].str.contains(x), "1", feat['gene_text']) #if yes then 1, if no, keep current

##output
print('text counts:')
print(Counter(feat['gene_text']))

text counts:
Counter({'0': 32439, '1': 1740})


In [44]:
feat[feat['gene_text']=='1'].sample(20)

Unnamed: 0,text,xr_text,ct_text,mri_text,echo_text,us_text,ecg_text,eeg_text,emg_text,histo_text,oct_text,mamm_text,endo_text,derm_text,gene_text
26797,"prediction and interpretation of cancer survival using graph convolution neural networks the survival rate of cancer has increased significantly during the past two decades for breast, prostate, testicular, and colon cancer, while the brain and pancreatic cancers have a much lower median survival rate that has not improved much over the last forty years this has imposed the challenge of finding gene markers for early cancer detection and treatment strategies different methods including regression-based cox-ph, artificial neural networks, and recently deep learning algorithms have been proposed to predict the survival rate for cancers we established in this work a novel graph convolution neural network gcnn approach called surv_gcnn to predict the survival rate for 13 different cancer types using the tcga dataset for each cancer type, 6 surv_gcnn models with graphs generated by correlation analysis, genemania database, and correlation + genemania were trained with and without clinical data to predict the risk score rs the performance of the 6 surv_gcnn models was compared with two other existing models, cox-ph and cox-nnet the results showed that cox-ph has the worst performance among 8 tested models across the 13 cancer types while surv_gcnn models with clinical data reported the best overall performance, outperforming other competing models in 7 out of 13 cancer types including blca, brca, coad, lusc, sarc, stad, and ucec a novel network-based interpretation of surv_gcnn was also proposed to identify potential gene markers for breast cancer the signatures learned by the nodes in the hidden layer of surv_gcnn were identified and were linked to potential gene markers by network modularization the identified gene markers for breast cancer have been compared to a total of 213 gene markers from three widely cited lists for breast cancer survival analysis about 57% of gene markers obtained by surv_gcnn with correlation + genemania graph either overlap or directly interact with the 213 genes, confirming the effectiveness of the identified markers by surv_gcnn",0,0,0,0,0,0,0,0,0,0,0,0,0,1
29326,a genomic-pathologic annotated risk model to predict recurrence in early-stage lung adenocarcinoma recommendations for adjuvant therapy after surgical resection of lung adenocarcinoma luad are based solely on tnm classification but are agnostic to genomic and high-risk clinicopathologic factors creation of a prediction model that integrates tumor genomic and clinicopathologic factors may better identify patients at risk for recurrence,0,0,0,0,0,0,0,0,0,0,0,0,0,1
114138,"a new 25d representation for lymph node detection using random sets of deep convolutional neural network observations automated lymph node ln detection is an important clinical diagnostic task but very challenging due to the low contrast of surrounding structures in computed tomography ct and to their varying sizes, poses, shapes and sparsely distributed locations state-of-the-art studies show the performance range of 529% sensitivity at 31 false-positives per volume fp/vol, or 609% at 61 fp/vol for mediastinal ln, by one-shot boosting on 3d haar features in this paper, we first operate a preliminary candidate generation stage, towards -100% sensitivity at the cost of high fp levels -40 per patient, to harvest volumes of interest voi our 25d approach consequently decomposes any 3d voi by resampling 2d reformatted orthogonal views n times, via scale, random translations, and rotations with respect to the voi centroid coordinates these random views are then used to train a deep convolutional neural network cnn classifier in testing, the cnn is employed to assign ln probabilities for all n random views that can be simply averaged as a set to compute the final classification probability per voi we validate the approach on two datasets: 90 ct volumes with 388 mediastinal lns and 86 patients with 595 abdominal lns we achieve sensitivities of 70%/83% at 3 fp/vol and 84%/90% at 6 fp/vol in mediastinum and abdomen respectively, which drastically improves over the previous state-of-the-art work",0,1,0,0,0,0,0,0,0,0,0,0,0,1
51360,"transfer learning with convolutional neural networks for cancer survival prediction using gene-expression data precision medicine in oncology aims at obtaining data from heterogeneous sources to have a precise estimation of a given patients state and prognosis with the purpose of advancing to personalized medicine framework, accurate diagnoses allow prescription of more effective treatments adapted to the specificities of each individual case in the last years, next-generation sequencing has impelled cancer research by providing physicians with an overwhelming amount of gene-expression data from rna-seq high-throughput platforms in this scenario, data mining and machine learning techniques have widely contribute to gene-expression data analysis by supplying computational models to supporting decision-making on real-world data nevertheless, existing public gene-expression databases are characterized by the unfavorable imbalance between the huge number of genes in the order of tenths of thousands and the small number of samples in the order of a few hundreds available despite diverse feature selection and extraction strategies have been traditionally applied to surpass derived over-fitting issues, the efficacy of standard machine learning pipelines is far from being satisfactory for the prediction of relevant clinical outcomes like follow-up end-points or patients survival using the public pan-cancer dataset, in this study we pre-train convolutional neural network architectures for survival prediction on a subset composed of thousands of gene-expression samples from thirty-one tumor types the resulting architectures are subsequently fine-tuned to predict lung cancer progression-free interval the application of convolutional networks to gene-expression data has many limitations, derived from the unstructured nature of these data in this work we propose a methodology to rearrange rna-seq data by transforming rna-seq samples into gene-expression images, from which convolutional networks can extract high-level features as an additional objective, we investigate whether leveraging the information extracted from other tumor-type samples contributes to the extraction of high-level features that improve lung cancer progression prediction, compared to other machine learning approaches",0,0,0,0,0,0,0,0,0,0,0,0,0,1
44111,"comparison of unsupervised machine-learning methods to identify metabolomic signatures in patients with localized breast cancer genomics and transcriptomics have led to the widely-used molecular classification of breast cancer bc however, heterogeneous biological behaviors persist within breast cancer subtypes metabolomics is a rapidly-expanding field of study dedicated to cellular metabolisms affected by the environment the aim of this study was to compare metabolomic signatures of bc obtained by 5 different unsupervised machine learning ml methods fifty-two consecutive patients with bc with an indication for adjuvant chemotherapy between 2013 and 2016 were retrospectively included we performed metabolomic profiling of tumor resection samples using liquid chromatography-mass spectrometry here, four hundred and forty-nine identified metabolites were selected for further analysis clusters obtained using 5 unsupervised ml methods pca k-means, sparse k-means, spectral clustering, simlr and k-sparse were compared in terms of clinical and biological characteristics with an optimal partitioning parameter k = 3, the five methods identified three prognosis groups of patients favorable, intermediate, unfavorable with different clinical and biological profiles simlr and k-sparse methods were the most effective techniques in terms of clustering <i>in-silico</i> survival analysis revealed a significant difference for 5-year predicted os between the 3 clusters further pathway analysis using the 449 selected metabolites showed significant differences in amino acid and glucose metabolism between bc histologic subtypes our results provide proof-of-concept for the use of unsupervised ml metabolomics enabling stratification and personalized management of bc patients the design of novel computational methods incorporating ml and bioinformatics techniques should make available tools particularly suited to improving the outcome of cancer treatment and reducing cancer-related mortalities",0,0,0,0,0,0,0,0,1,0,0,0,0,1
77046,"laplacian regularized low-rank representation for cancer samples clustering cancer samples clustering based on biomolecular data has been becoming an important tool for cancer classification the recognition of cancer types is of great importance for cancer treatment in this paper, in order to improve the accuracy of cancer recognition, we propose to use laplacian regularized low-rank representation llrr to cluster the cancer samples based on genomic data in llrr method, the high-dimensional genomic data are approximately treated as samples extracted from a combination of several low-rank subspaces the purpose of llrr method is to seek the lowest-rank representation matrix based on a dictionary because a laplacian regularization based on manifold is introduced into llrr, compared to the low-rank representation lrr method, besides capturing the global geometric structure, llrr can capture the intrinsic local structure of high-dimensional observation data well and what is more, in llrr, the original data themselves are selected as a dictionary, so the lowest-rank representation is actually a similar expression between the samples therefore, corresponding to the low-rank representation matrix, the samples with high similarity are considered to come from the same subspace and are grouped into a class the experiment results on real genomic data illustrate that llrr method, compared with lrr and mllrr, is more robust to noise and has a better ability to learn the inherent subspace structure of data, and achieves remarkable performance in the clustering of cancer samples",0,0,0,0,0,0,0,0,0,0,0,0,0,1
27449,updates in using a molecular classifier to identify usual interstitial pneumonia in conventional transbronchial lung biopsy samples <b>a molecular classifier using a machine-learning algorithm based on genomic data could provide an objective method to aid clinicians and multidisciplinary teams to establish the diagnosis of ipf in less-invasive transbronchial lung biopsy samples</b> https://bitly/2qldwim,0,0,0,0,0,0,0,0,0,0,0,0,0,1
163389,"tree-structured supervised learning and the genetics of hypertension this paper is about an algorithm, flextree, for general supervised learning it extends the binary tree-structured approach classification and regression trees, cart although it differs greatly in its selection and combination of predictors it is particularly applicable to assessing interactions: gene by gene and gene by environment as they bear on complex disease one model for predisposition to complex disease involves many genes of them, most are pure noise; each of the values that is not the prevalent genotype for the minority of genes that contribute to the signal carries a score scores add individuals with scores above an unknown threshold are predisposed to the disease for the additive score problem and simulated data, flextree has cross-validated risk better than many cutting-edge technologies to which it was compared when small fractions of candidate genes carry the signal for the model where only a precise list of aberrant genotypes is predisposing, there is not a systematic pattern of absolute superiority; however, overall, flextree seems better than the other technologies we tried the algorithm on data from 563 chinese women, 206 hypotensive, 357 hypertensive, with information on ethnicity, menopausal status, insulin-resistant status, and 21 loci flextree and logic regression appear better than the others in terms of bayes risk however, the differences are not significant in the usual statistical sense",0,0,0,0,0,0,0,0,0,0,0,0,0,1
20388,"gated graph attention network for cancer prediction with its increasing incidence, cancer has become one of the main causes of worldwide mortality in this work, we mainly propose a novel attention-based neural network model named gated graph attention network ggat for cancer prediction, where a gating mechanism gm is introduced to work with the attention mechanism am, to break through the previous works limitation of 1-hop neighbourhood reasoning in this way, our ggat is capable of fully mining the potential correlation between related samples, helping for improving the cancer prediction accuracy additionally, to simplify the datasets, we propose a hybrid feature selection algorithm to strictly select gene features, which significantly reduces training time without affecting prediction accuracy to the best of our knowledge, our proposed ggat achieves the state-of-the-art results in cancer prediction task on lihc, luad, kirc compared to other traditional machine learning methods and neural network models, and improves the accuracy by 1% to 2% on cora dataset, compared to the state-of-the-art graph neural network methods",0,0,0,0,0,0,0,0,0,0,0,0,0,1
49609,"a novel microrna signature for pathological grading in lung adenocarcinoma based on tcga and geo data lung adenocarcinoma luad is one of the most common types of lung cancer and its poor prognosis largely depends on the tumor pathological stage critical roles of micrornas mirnas have been reported in the tumorigenesis and progression of lung cancer however, whether the differential expression pattern of mirnas could be used to distinguish early‑stage stage i from mid‑late‑stage stages ii‑iv luad tumors is still unclear in this study, clinical information and mirna expression profiles of patients with luad were downloaded from the cancer genome atlas tcga and gene expression omnibus databases tcga‑luad n=470 dataset was used for model training and validation, and the gse62182 n=94 and gse83527 n=36 datasets were used as external independent test datasets the diagnostic model was created through mirna feature selection followed by svm classifier and was confirmed by 5‑fold cross‑validation a receiver operating characteristic curve was calculated to evaluate the accuracy and robustness of the model using the dx score and libsvm tool, a 16‑mirna signature that could distinguish luad pathological stages was identified the area under the curve rates were 062 95% confidence interval ci: 056‑067, 066 95% ci: 054‑076 and 063 95% ci: 043‑082 in tcga‑luad internal validation dataset, the gse62182 external validation dataset, and the gse83527 external validation dataset, respectively kyoto encyclopedia of genes and genomes and gene ontology enrichment analyses suggested that the target genes of the 16‑mirna signature were mainly involved in metabolic pathways the present findings demonstrate that a 16‑mirna signature could serve as a promising diagnostic biomarker for pathological staging in luad",0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [45]:
## PROTEINOMICS/BIOMARKERS

## text
text = ['proteinomic', 'immunoglob', 'cytokine', 'biomarker', 'tumor marker', 'tumour marker', 'inflammatory marker',
       'peptide', 'interferon', 'laboratory test', 'blood test']

feat['bio_text'] = np.where(groups['text'].str.contains('serum marker'), "1", "0")

for x in text:
    feat['bio_text'] = np.where(groups['text'].str.contains(x), "1", feat['bio_text']) #if yes then 1, if no, keep current

##output
print('text counts:')
print(Counter(feat['bio_text']))

text counts:
Counter({'0': 31589, '1': 2590})


In [46]:
## NATURAL LANGUAGE PROCESSING

## text
feat['nlp_text'] = np.where(groups['text'].str.contains("natural language"), "1", "0")

## output
print('text counts:')
print(Counter(feat['nlp_text']))

text counts:
Counter({'0': 33668, '1': 511})


In [47]:
## EHR RECORDS

## text
text = ['electronic health', 'health record', 'electronic record', 'patient record', 'medical record',
        'care record', 'patient registry', 'research registr', 'clinical note', 'patient note', 'patient data',
        'care data', 'care note', 'medical data', 'clinical data', 'hospital data', 'hospital note', 'admission note',
        'physiological data', 'observational data', 'patient features', 'patient observations', 'patient history',
        'medical history', 'care history']

feat['ehr_text'] = np.where(groups['text'].str.contains('snomed'), "1", "0")

for x in text:
    feat['ehr_text'] = np.where(groups['text'].str.contains(x), "1", feat['ehr_text']) #if yes then 1, if no, keep current

##output
print('text counts:')
print(Counter(feat['ehr_text']))

text counts:
Counter({'0': 31583, '1': 2596})


In [48]:
## WEARABLE_SENSORS

## text
text = ['wearable sensor', 'smartwatch', 'internet of thing', 'sensor device', 'smart sensor', 'fitbit', 'fitness band',
       'activity tracker', 'fitness tracker']

feat['sensor_text'] = np.where(groups['text'].str.contains('smart watch'), "1", "0")

for x in text:
    feat['sensor_text'] = np.where(groups['text'].str.contains(x), "1", feat['sensor_text']) #if yes then 1, if no, keep current

##output
print('text counts:')
print(Counter(feat['sensor_text']))

text counts:
Counter({'0': 33731, '1': 448})


In [49]:
## PROM

## text
feat['prom_text'] = np.where(groups['text'].str.contains("patient reported outcome"), "1", "0")
feat['prom_text'] = np.where(groups['text'].str.contains("patient-reported outcome"), "1", feat['prom_text'])

##output
print('text counts:')
print(Counter(feat['prom_text']))

text counts:
Counter({'0': 34139, '1': 40})


In [50]:
## SMARTPHONE

## text
feat['phone_text'] = np.where(groups['text'].str.contains("smartphone"), "1", "0")
feat['phone_text'] = np.where(groups['text'].str.contains("iphone"), "1", feat['phone_text'])

##output
print('text counts:')
print(Counter(feat['phone_text']))

text counts:
Counter({'0': 33877, '1': 302})


In [51]:
#### DIGITAL STETH / sound

## text
text = ['heart sound', 'heart murmur', 'breath sound', 'auscultat', 'phonocardio', 'digital steth']

feat['sound_text'] = np.where(groups['text'].str.contains('electronic steth'), "1", "0")

for x in text:
    feat['sound_text'] = np.where(groups['text'].str.contains(x), "1", feat['sound_text']) #if yes then 1, if no, keep current

feat['sound_text'] = np.where((groups['text'].str.contains("heart")) &
                             (groups['text'].str.contains("stethoscope")) , "1", feat['sound_text'])
feat['sound_text'] = np.where((groups['text'].str.contains("valve")) &
                             (groups['text'].str.contains("stethoscope")) , "1", feat['sound_text'])
feat['sound_text'] = np.where((groups['text'].str.contains("murmur")) &
                             (groups['text'].str.contains("stethoscope")) , "1", feat['sound_text'])
feat['sound_text'] = np.where((groups['text'].str.contains("lung")) &
                             (groups['text'].str.contains("stethoscope")) , "1", feat['sound_text'])
feat['sound_text'] = np.where((groups['text'].str.contains("resp")) &
                             (groups['text'].str.contains("stethoscope")) , "1", feat['sound_text'])
feat['sound_text'] = np.where((groups['text'].str.contains("breath")) &
                             (groups['text'].str.contains("stethoscope")) , "1", feat['sound_text'])

## output
print('text counts:')
print(Counter(feat['sound_text']))

text counts:
Counter({'0': 34000, '1': 179})


In [52]:
feat[feat['sound_text']=='1'].sample(20)

Unnamed: 0,text,xr_text,ct_text,mri_text,echo_text,us_text,ecg_text,eeg_text,emg_text,histo_text,oct_text,mamm_text,endo_text,derm_text,gene_text,bio_text,nlp_text,ehr_text,sensor_text,prom_text,phone_text,sound_text
83646,"structural risk evaluation of a deep neural network and a markov model in extracting medical information from phonocardiography this paper presents a method for exploring structural risk of any artificial intelligence-based method in bioinformatics, the a-test method this method provides a way to not only quantitate the structural risk associated with a classification method, but provides a graphical representation to compare the learning capacity of different classification methods two different methods, deep time growing neural network dtgnn and hidden markov model hmm, are selected as two classification methods for comparison time series of heart sound signals are employed as the case study where the classifiers are trained to learn the disease-related changes results showed that the dtgnn offers a superior performance both in terms of the capacity and the structural risk the a-test method can be especially employed in comparing the learning methods with small data size",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
76809,"artificial intelligence-assisted auscultation of heart murmurs: validation by virtual clinical trial artificial intelligence ai has potential to improve the accuracy of screening for valvular and congenital heart disease by auscultation however, despite recent advances in signal processing and classification algorithms focused on heart sounds, clinical acceptance of this technology has been limited, in part due to lack of objective performance data we hypothesized that a heart murmur detection algorithm could be quantitatively and objectively evaluated by virtual clinical trial all cases from the johns hopkins cardiac auscultatory recording database card with either a pathologic murmur, an innocent murmur or no murmur were selected the test algorithm, developed independently of card, analyzed each recording using an automated batch processing protocol 3180 heart sound recordings from 603 outpatient visits were selected from card algorithm estimation of heart rate was similar to gold standard sensitivity and specificity for detection of pathologic cases were 93% ci 90-95% and 81% ci 75-85%, respectively, with accuracy 88% ci 85-91% performance varied according to algorithm certainty measure, age of patient, heart rate, murmur intensity, location of recording on the chest and pathologic diagnosis this is the first reported comprehensive and objective evaluation of an ai-based murmur detection algorithm to our knowledge the test algorithm performed well in this virtual clinical trial this strategy can be used to efficiently compare performance of other algorithms against the same dataset and improve understanding of the potential clinical usefulness of ai-assisted auscultation",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
134904,"comparative classification of thrombotic formations on bileaflet mechanical heart valves by phonographic analysis haemodynamic performance of bileaflet mechanical heart valves can be severely affected by the formation of thrombotic deposits hence, early detection of thrombi is fundamental for a prompt diagnosis and adequate therapy this article aims at designing a novel diagnostic and prognostic tool able to detect valvular thrombosis at early stages of formation, ie, before the appearance of critical symptoms in patients who can be effectively treated by pharmacological therapy, preventing re-operation this approach relies on the acquisition of the acoustic signals produced by mechanical heart valves in the closing phase; the corresponding power spectra are then analysed by means of artificial neural networks trained to identify the presence of thrombi and classify their occurrence five commercial bileaflet mechanical heart valves were investigated in vitro in a sheffield pulse duplicator; for each valve six functional conditions were considered, each corresponding to a risk class for patients one normofunctioning and five thrombosed: they have been simulated by placing artificial deposits of increasing weight and different shape on the valve leaflet and on the annular housing; the case of one completely blocked leaflet was also investigated these six functional conditions represent risk classes: they were examined under various hydrodynamic regimes the acoustic signals produced by the valves were acquired by means of a phonocardiographic apparatus, then analysed and classified the ability to detect and classify thrombotic formations on mechanical valve leaflet would allow ranking patients by assigning them to one of the six risk classes, helping clinicians in establish adequate therapeutic approaches",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
133324,"non-invasive algorithm for bowel motility estimation using a back-propagation neural network model of bowel sounds radiological scoring methods such as colon transit time ctt have been widely used for the assessment of bowel motility however, these radiograph-based methods need cumbersome radiological instruments and their frequent exposure to radiation therefore, a non-invasive estimation algorithm of bowel motility, based on a back-propagation neural network bpnn model of bowel sounds bs obtained by an auscultation, was devised",1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
7728,"detection of subclinical rheumatic heart disease in children using a deep learning algorithm on digital stethoscope: a study protocol rheumatic heart diseases rhds contribute significant morbidity and mortality globally to reduce the burden of rhd, timely initiation of secondary prophylaxis is important the objectives of this study are to determine the frequency of subclinical rhd and to train a deep learning dl algorithm using waveform data from the digital auscultatory stethoscope das in predicting subclinical rhd",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
59081,"towards classifying non-segmented heart sound records using instantaneous frequency based features heart sound and its recorded signal which is known as phonocardiograph pcg are one of the most important biosignals that can be used to diagnose cardiac diseases alongside electrocardiogram ecg over the past few years, the use of pcg signals has become more widespread and researchers pay their attention to it and aim to provide an automated heart sound analysis and classification system that supports medical professionals in their decision in this paper, a new method for heart sound features extraction for the classification of non-segmented signals using instantaneous frequency was proposed the method has two major phases: the first phase is to estimate the instantaneous frequency of the recorded signal; the second phase is to extract a set of eleven features from the estimated instantaneous frequency the method was tested into two different datasets, one for binary classification normal and abnormal and the other for multi-classification five classes to ensure the robustness of the extracted features the overall accuracy, sensitivity, specificity, and precision for binary classification and multi-classification were all above 95% using both random forest and knn classifiers",0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
160364,"automatic wheeze detection based on auditory modelling automatic wheeze detection has several potential benefits compared with reliance on human auscultation: it is experience independent, an automated historical record can easily be kept, and it allows quantification of wheeze severity previous attempts to detect wheezes automatically have had partial success but have not been reliable enough to become widely accepted as a useful tool in this paper an improved algorithm for automatic wheeze detection based on auditory modelling is developed, called the frequency- and duration-dependent threshold algorithm the mean frequency and duration of each wheeze component are obtained automatically the detected wheezes are marked on a spectrogram in the new algorithm, the concept of a frequency- and duration-dependent threshold for wheeze detection is introduced another departure from previous work is that the threshold is based not on global power but on power corresponding to a particular frequency range the algorithm has been tested on 36 subjects, 11 of whom exhibited characteristics of wheeze the results show a marked improvement in the accuracy of wheeze detection when compared with previous algorithms",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
39299,"a preliminary investigation of whether hrca signals can differentiate between swallows from healthy people and swallows from people with neurodegenerative diseases high-resolution cervical auscultation hrca is an emerging method for non-invasively assessing swallowing by using acoustic signals from a contact microphone, vibratory signals from an accelerometer, and advanced signal processing and machine learning techniques hrca has differentiated between safe and unsafe swallows, predicted components of the modified barium swallow impairment profile, and predicted kinematic events of swallowing such as hyoid bone displacement, laryngeal vestibular closure, and upper esophageal sphincter opening with a high degree of accuracy however, hrca has not been used to characterize swallow function in specific patient populations this study investigated the ability of hrca to differentiate between swallows from healthy people and people with neurodegenerative diseases we hypothesized that hrca would differentiate between swallows from healthy people and people with neurodegenerative diseases with a high degree of accuracy we analyzed 170 swallows from 20 patients with neurodegenerative diseases and 170 swallows from 51 healthy age-matched adults who underwent concurrent video fluoroscopy with non-invasive neck sensors we used a linear mixed model and several supervised machine learning classifiers that use hrca signal features and a leave-one-out procedure to differentiate between swallows twenty-two hrca signal features were statistically significant p < 005 for predicting whether swallows were from healthy people or from patients with neurodegenerative diseases using the hrca signal features alone, logistic regression and decision trees classified swallows between the two groups with 99% accuracy, 100% sensitivity, and 99% specificity this provides preliminary research evidence that hrca can differentiate swallow function between healthy and patient populations",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
78084,"deep unsupervised representation learning for abnormal heart sound classification given the world-wide prevalence of heart disease, the robust and automatic detection of abnormal heart sounds could have profound effects on patient care and outcomes in this regard, a comparison of conventional and state-of-theart deep learning based computer audition paradigms for the audio classification task of normal, mild abnormalities, and moderate/severe abnormalities as present in phonocardiogram recordings, is presented herein in particular, we explore the suitability of deep feature representations as learnt by sequence to sequence autoencoders based on the audeep toolkit key results, gained on the new heart sounds shenzhen corpus, indicate that a fused combination of deep unsupervised features is well suited to the three-way classification problem, achieving our highest unweighted average recall of 479% on the test partition",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
70210,"classifying heart sounds using images of motifs, mfcc and temporal features cardiovascular disease is the leading cause of death in the world, and its early detection is a key to improving long-term health outcomes the auscultation of the heart is still an important method in the medical process because it is very simple and cheap to detect possible heart anomalies at an early stage, an automatic method enabling cardiac health low-cost screening for the general population would be highly valuable by analyzing the phonocardiogram signals, it is possible to perform cardiac diagnosis and find possible anomalies at an early-term therefore, the development of intelligent and automated analysis tools of the phonocardiogram is very relevant in this work, we use simultaneously collected electrocardiograms and phonocardiograms from the physionet challenge database with the main objective of determining whether a phonocardiogram corresponds to a normal or abnormal physiological state our main contribution is the methodological combination of time domain features and frequency domain features of phonocardiogram signals to improve cardiac disease automatic classification this novel approach is developed using both features first, the phonocardiogram signals are segmented with an algorithm based on a logistic regression hidden semi-markov model, which uses electrocardiogram signals as a reference then, two groups of features from the time and frequency domain are extracted from the phonocardiogram segments one group is based on motifs and the other on mel-frequency cepstral coefficients after that, we combine these features into a two-dimensional time-frequency heat map representation lastly, a binary classifier is applied to both groups of features to learn a model that discriminates between normal and abnormal phonocardiogram signals in the experiments, three classification algorithms are used: support vector machines, convolutional neural network, and random forest the best results are achieved when both time and mel-frequency cepstral coefficients features are considered using a support vector machines with a radial kernel",0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [53]:
## COMBINE
labelled['feat_xr'] = np.where(feat['xr_text'].str.contains("1"), "1", "0")

labelled['feat_ct'] = np.where(feat['ct_text'].str.contains("1"), "1", "0")

labelled['feat_mri'] = np.where(feat['mri_text'].str.contains("1"), "1", "0")

labelled['feat_eeg'] = np.where(feat['eeg_text'].str.contains("1"), "1", "0")

labelled['feat_ecg'] = np.where(feat['ecg_text'].str.contains("1"), "1", "0")

labelled['feat_emg'] = np.where(feat['emg_text'].str.contains("1"), "1", "0")

labelled['feat_us'] = np.where(feat['us_text'].str.contains("1"), "1", "0")

labelled['feat_echo'] = np.where(feat['echo_text'].str.contains("1"), "1", "0")

labelled['feat_histo'] = np.where(feat['histo_text'].str.contains("1"), "1", "0")

labelled['feat_oct'] = np.where(feat['oct_text'].str.contains("1"), "1", "0")

labelled['feat_mamm'] = np.where(feat['mamm_text'].str.contains("1"), "1", "0")

labelled['feat_endoscop'] = np.where(feat['endo_text'].str.contains("1"), "1", "0")

labelled['feat_derm'] = np.where(feat['derm_text'].str.contains("1"), "1", "0")

labelled['feat_gene'] = np.where(feat['gene_text'].str.contains("1"), "1", "0")

labelled['feat_bio'] = np.where(feat['bio_text'].str.contains("1"), "1", "0")

labelled['feat_nlp'] = np.where(feat['nlp_text'].str.contains("1"), "1", "0")

labelled['feat_ehr'] = np.where(feat['ehr_text'].str.contains("1"), "1", "0")

labelled['feat_sensor'] = np.where(feat['sensor_text'].str.contains("1"), "1", "0")

labelled['feat_phone'] = np.where(feat['phone_text'].str.contains("1"), "1", "0")

labelled['feat_prom'] = np.where(feat['prom_text'].str.contains("1"), "1", "0")

labelled['feat_sound'] = np.where(feat['sound_text'].str.contains("1"), "1", "0")

#feat.to_csv('output/feat_tagged.csv')

## Tag Specialties / Use-Cases

In [54]:
######################
## CLASS TAGS - by mesh for disease type
######################


######################
## CLASS TAGS - by specialty, not mutually exclusive
######################
## INTENSIVE CARE MEDICINE / icu

## EMERGENCY MEDICINE / ed

## INFECTIONS [C01] / id
    #### SEPSIS / sepsis
    #### COVID-19 / cov19
    #### MALARIA / malaria
    #### HIV / hiv
    #### TB / tb
    #### TROPICAL DISEASE / tropic
    
## DERMATOLOGY [C17] / derm
    ####SKIN CANCERS / dermca

## NEOPLASMS [C04] / onc
    #### RADIOTHERAPY / rx
    #### LUNG / lungca
    #### NEURO / neuroca
    #### GI / gica
    #### HPB / hepca
    #### GYNAE / gynonc
    #### PROSTATE / prosca
    #### RENAL / renalca
    #### HAEM / haemonc
    
## BREAST / breast (<- almost entirely onc)
    #### BREAST CA / breastca
    
## PSYCHIATRY / psych
    #### SUICIDE / suicide
    
## MUSCULOSKELETAL [C05] / msk
    #### FRACTURE / frac

## CONNECTIVE TISSUE [C17] / rheum

## GASTROINTESTINAL [C06] / gi

## HEPATOLOGY & BILIARY [C06] / hep

## RESPIRATORY [C08] / resp
    #### PNEUMONIA / pneum
    #### OBSTRUCTIVE SLEEP / osa
    #### PULMONARY EMBOLISM / pe
    
## NERVOUS SYSTEM [C10] / neuro
    #### STROKE / cva
    #### SEIZURE / epilep
    #### DEMENTIA / alzh

## CARDIOVASCULAR [C14] / cvs
    #### ISCHAEMIC HEART DISEASE / ihd
    #### CARDIAC FAILURE / hf
    #### ARRHYTHMIA / arrhyt
    
## ENDOCRINE [C19] (no dm) / endo

## DIABETES / dm
    #### INSULIN / insulin
    #### RETINOPATHY / retina
        
## OPHTHALMOLOGY [C11] / eye

## HAEMATOLOGIC [C15] / haem

## GYNAE/OBSTETRIC [C13] / obs

## NEPHROLOGY [C12] / renal
    #### ACUTE & CHRONIC KIDNEY / ackd
    
## PAEDIATRICS / paeds

## STOMATOGNATHIC [C07] / dental

## AUDIOLOGY [C09] / ent

## PUBLIC HEALTH / pubh

########exclude?############# 

## ALCOHOL & SUBSTANCES [C25] / etoh
## WOUNDS AND INJURIES [C26] -> TRAUMA
## ENVIRONMENTAL [C21] / env


######################
## SPECIAL
######################
## BCI
## CONTROL
#### PROSTHESIS CONTROL
#### WHEELCHAIR CONTROL



## vitals monitoring / deterioration
## trauma?
## sleep
## pulmonary embolism

In [55]:
spec = groups[['text']].copy()

In [56]:
## INTENSIVE CARE MEDICINE / icu

## text
text = ['intensive care', 'critical care', 'mechanical ventilation', 'invasive ventilation', 'ventilator', 'pressure ventilation', 
       'acute respiratory distress syndrome', 'organ failure', 'tracheal intubation', 'vasopressor', 'inotrope',
       'hemofiltration', 'membrane oxygenation', 'ecmo', ' ett ', 'layngoscope', 'endotracheal tube']

spec['icu_text'] = np.where(groups['text'].str.contains('intensive therapy unit'), "1", "0")

for x in text:
    spec['icu_text'] = np.where(groups['text'].str.contains(x), "1", spec['icu_text']) #if yes then 1, if no, keep current

##output    
print('text counts:')
print(Counter(spec['icu_text']))

text counts:
Counter({'0': 33431, '1': 748})


In [57]:
## EMERGENCY MEDICINE / ed

## text
text = ['emergency department', 'emergency room', 'emergency physician', 'emergency doctor', 'emergency medicine',
       'emergency care', 'accident and emergency', 'a&e', 'accident & emergency', 'prehospital', 'pre-hospital',
       'casualty room', 'emergency ward']

spec['ed_text'] = np.where(groups['text'].str.contains('casualty department'), "1", "0")

for x in text:
    spec['ed_text'] = np.where(groups['text'].str.contains(x), "1", spec['ed_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['ed_text']))

text counts:
Counter({'0': 33768, '1': 411})


In [58]:
## INFECTIONS / id + bacteriology/virology/parasitology

## text
text = ['bacter', 'microbiol', 'sepsis', 'septic', 'toxic shock', 'microbe', 'tuberculosis',
       'cholera', 'shigella', 'bubonic', 'plague', 'anthrax', 'gonorrhea', 'syphilis', 'diphtheria', 'legionell',
       'leptospirosis', 'listeriosis', 'tetanus', 'pertussis', 'staph', 'strep', 'escherichia', 'leprosy', 
        'mycobacter', 'blood culture',
       
       'fungus', 'fungal', 'fungaemia', 'fungemia', 'candida', 'aspergill',
       
       'virolog', 'virus', 'viral', 'virulen', 'influenza', 'hepatitis', 'herpes', 'varicella',
       'measles', 'covid', 'sars-cov', 'coronavirus', 'severe acute respiratory syndrome', 'yellow fever', 'dengue',
       'rabies', 'zika', 'ebola', 'polio', 'hemorrhagic fever', 'haemorrhagic fever', 'rabies',
       
       'transmitted disease', 'sexually transmit', 'sexual transmis',
       
       'lyme', 'malaria', 'falciparum', 'anopheles', 'parasit', 'helminth', 'protozoa', 
        'leishmaniasis', 'trypanosom', 'chagas', 'schistosomiasis', 'filariasis', 'toxoplasm' 'tropical disease',
       
       ' hiv ', 'human immunodeficiency virus', 'acquired immune deficiency syndrome']

spec['id_text'] = np.where(groups['text'].str.contains('infectio'), "1", "0")

for x in text:
    spec['id_text'] = np.where(groups['text'].str.contains(x), "1", spec['id_text']) #if yes then 1, if no, keep current
    
## output
print('text counts:')
print(Counter(spec['id_text']))


text counts:
Counter({'0': 31031, '1': 3148})


In [59]:
#### SEPSIS / sepsis

## text
text = ['sepsis', 'septic', 'bacteraem', 'bacterem', 'toxic shock syndrome', 'pyaemia']

spec['sepsis_text'] = np.where(groups['text'].str.contains('pyemia'), "1", "0")

for x in text:
    spec['sepsis_text'] = np.where(groups['text'].str.contains(x), "1", spec['sepsis_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['sepsis_text']))

text counts:
Counter({'0': 33928, '1': 251})


In [60]:
#### COVID-19 / cov19

## text
text = ['sars-cov', 'coronavirus disease 2019', 'novel coronavirus', 'coronavirus disease 19', 'sars cov']

spec['cov19_text'] = np.where(groups['text'].str.contains('covid'), "1", "0")

for x in text:
    spec['cov19_text'] = np.where(groups['text'].str.contains(x), "1", spec['cov19_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['cov19_text']))

text counts:
Counter({'0': 32994, '1': 1185})


In [61]:
#### HIV / hiv

## text
text = ['human immunodeficiency virus', 'acquired immune deficiency syndrome', ' aids ']

spec['hiv_text'] = np.where(groups['text'].str.contains(' hiv '), "1", "0")

for x in text:
    spec['hiv_text'] = np.where(groups['text'].str.contains(x), "1", spec['hiv_text']) #if yes then 1, if no, keep current

    
## output
print('text counts:')
print(Counter(spec['hiv_text']))

text counts:
Counter({'0': 33996, '1': 183})


In [62]:
#### TUBERCULOSIS / tb

## text
text = ['tuberculosis', 'mycobacterium tuberc']

spec['tb_text'] = np.where(groups['text'].str.contains('tubercu'), "1", "0")

for x in text:
    spec['tb_text'] = np.where(groups['text'].str.contains(x), "1", spec['tb_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['tb_text']))

text counts:
Counter({'0': 34011, '1': 168})


In [63]:
#### TROPICAL DISEASE / tropic

## text
text = ['malaria', 'falciparum', 'anopheles', 'parasit', 'helminth', 'protozoa', 
        'leishmaniasis', 'trypanosom', 'chagas', 'schistosomiasis', 'filariasis', 'toxoplasm',
       'yellow fever', 'dengue', 'rabies', 'cholera', 'zika', 'ebola', 'hemorrhagic fever', 'haemorrhagic fever',
        'tropical disease', 'tropical medicine', 'filariasis']

spec['tropic_text'] = np.where(groups['text'].str.contains('falciparum'), "1", "0")

for x in text:
    spec['tropic_text'] = np.where(groups['text'].str.contains(x), "1", spec['tropic_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['tropic_text']))

text counts:
Counter({'0': 34053, '1': 126})


In [64]:
#### MALARIA / malaria

## text
text = ['malaria', 'anopheles']

spec['malaria_text'] = np.where(groups['text'].str.contains('falciparum'), "1", "0")

for x in text:
    spec['malaria_text'] = np.where(groups['text'].str.contains(x), "1", spec['malaria_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['malaria_text']))

text counts:
Counter({'0': 34120, '1': 59})


In [65]:
## DERMATOLOGY / derm

## text
text = ['dermato', 'dermatitis', 'erythema', 'cutaneous', 'eczema', 'psoriasis', 'rosacea', 'vitiligo', 'urticaria',
       'pruritus', 'impetigo', 'pemphigoid', 'pityriasis', 'melanoma', 'basal cell ca', 'merkel cell',
       'skin cancer', 'skin lesion', 'skin rash', 'nevus', 'naevus', 'dermal cancer', 'dermal lesion']

spec['derm_text'] = np.where(groups['text'].str.contains('emollient'), "1", "0")

for x in text:
    spec['derm_text'] = np.where(groups['text'].str.contains(x), "1", spec['derm_text']) #if yes then 1, if no, keep current

spec['derm_text'] = np.where((groups['text'].str.contains("skin")) &
                             (groups['text'].str.contains("squamous cell")) , "1", spec['derm_text'])
spec['derm_text'] = np.where((groups['text'].str.contains("dermal")) &
                             (groups['text'].str.contains("squamous cell")) , "1", spec['derm_text'])
spec['derm_text'] = np.where((groups['text'].str.contains("skin")) &
                             (groups['text'].str.contains(" scc ")) , "1", spec['derm_text'])
spec['derm_text'] = np.where((groups['text'].str.contains("dermal")) &
                             (groups['text'].str.contains(" scc ")) , "1", spec['derm_text'])
                             
## output
print('text counts:')
print(Counter(spec['derm_text']))

text counts:
Counter({'0': 33397, '1': 782})


In [66]:
#### SKIN CANCERS / dermca

## text
text = ['melanoma', 'melanocytic', 'casal cell ca', 'skin cancer', 'dysplastic nevus', 'dysplastic naevus',
       'merkel cell', 'atypical nevus', 'atypical naevus']

spec['dermca_text'] = np.where(groups['text'].str.contains('skin cancer'), "1", "0")

for x in text:
    spec['dermca_text'] = np.where(groups['text'].str.contains(x), "1", spec['dermca_text']) #if yes then 1, if no, keep current

spec['dermca_text'] = np.where((groups['text'].str.contains("skin")) &
                             (groups['text'].str.contains("squamous cell")) , "1", spec['dermca_text'])
spec['dermca_text'] = np.where((groups['text'].str.contains("dermal")) &
                             (groups['text'].str.contains("squamous cell")) , "1", spec['dermca_text'])
spec['dermca_text'] = np.where((groups['text'].str.contains("skin")) &
                             (groups['text'].str.contains(" scc ")) , "1", spec['dermca_text'])
spec['dermca_text'] = np.where((groups['text'].str.contains("dermal")) &
                             (groups['text'].str.contains(" scc ")) , "1", spec['dermca_text'])
                          
## output
print('text counts:')
print(Counter(spec['dermca_text']))

text counts:
Counter({'0': 33787, '1': 392})


In [67]:
## ONCOLOGY / onc

## text
text = ['radiotherapy', 'radiation therapy', 'mammog', 'breast ca', 'breast tum', 'invasive lobular carcinoma', 
        ' dcis ', 'ductal carcinoma in situ', 'lung cancer', 'lung malignancy', 'lung carcinoma', 'lung nodule',
        'pulmonary nodule', 'mesothelioma', 'nsclc',
       'neuroonc', 'neuro onc', 'neuro-onc', 'brain cancer', 'brain tumor', 'brain tumour', 'brain malignancy',
       'glioma', 'glioblastoma', 'astrocytoma', 'pituitary adenoma', 'acoustic neuroma', 'meningioma',
       'cns lymphoma', 'oligodendroglioma', 'meningeal cancer', 'meningeal carcinomatosis',
       'melanoma', 'melanocytic', 'casal cell ca', 'skin cancer', 'dysplastic nevus', 'dysplastic naevus',
       'merkel cell', 'atypical nevus', 'atypical naevus',
       'gi cancer', 'gastrointestinal cancer', 'colon cancer', 'colon carcinoma', 'colon polyp', 'colon adeno', 'colon tumo',
       'colonic cancer', 'colonic carcinoma', 'colonic adeno', 'colonic polyp', 'colonic tumo', 'colonic neoplasm',
        'rectal cancer', 'rectal carcinoma', 'rectal polyp', 'rectal tumo', 'rectal neoplasm', 'bowel cancer', 'bowel neoplasm',
       'bowel tumo', 'stomach cancer', 'gastric cancer', 'gastric carcinoma', 'gastric neoplasm', 'gastric tumo',
       'esophageal cancer', 'esophageal tumo', 'esophageal neoplasm',
       'hepatocellular cancer', 'hepatocellular carcinoma', 'hepatic cancer', 'hepatic carcinoma', 'hepatic tumo',
       'hepatic neoplasm', 'liver cancer', 'liver carcinoma', 'liver tumo', 'cholangioca', 'pancreatic cancer',
       'pancreatic neoplasm', 'pancreatic tumo', 'biliary cancer', 'bile duct cancer',
       'prostate cancer', 'prostate specific antigen', 'prostate carcinoma', 'prostate neoplasm', 'prostate tumo',
       'prostate adeno', 'prostatic cancer', 'prostatic neoplasm', 'prostatic tumo', 'prostatic adeno', 'prostatectomy',
       ' psa ', 'kidney cancer', 'kidney tumo', 'renal cell carcinoma', 'renal call cancer', 'renal tumo', 'renal cancer',
       'wilms tumo', 'bladder cancer', 'bladder carcinoma', 'transitional cell ca', 'urothelial cancer', 'urothelial carcinoma',
        'gynecologic cancer', 'gynecological cancer', 'gynaecologic cancer', 'gynaecological cancer', 'ovarian cancer',
       'ovarian carcinoma', 'uterine cancer', 'uterine carcinoma', 'cervical cancer', 'cervical carcinoma', 'colposcop',
       'haematological cancer', 'hematological cancer', 'haematological malig', 'hematological malig', 'myelodysplas',
       'myeloprolif', 'lymphoprolif', 'leukaemoa', 'leukemia', 'myelofibro', 'thrombocythemia', 'polycythemia vera',
       'polycythemia rubra vera', 'thrombocythaemia', 'polycythaemia vera', 'polycythaemia rubra vera', 'lymphoma',
       'myeloma', ' gvhd', 'stem cell transpl', 'bone marrow aspirate']

spec['onc_text'] = np.where(groups['text'].str.contains('metasta'), "1", "0")

for x in text:
    spec['onc_text'] = np.where(groups['text'].str.contains(x), "1", spec['onc_text']) #if yes then 1, if no, keep current


## output
print('text counts:')
print(Counter(spec['onc_text']))

text counts:
Counter({'0': 26595, '1': 7584})


In [68]:
#### RADIOTHERAPY / rx

## text
spec['rx_text'] = np.where(groups['text'].str.contains("radiotherapy"), "1", "0")
spec['rx_text'] = np.where(groups['text'].str.contains("radiation therapy"), "1", "0")

##output
print('text counts:')
print(Counter(spec['rx_text']))

text counts:
Counter({'0': 33920, '1': 259})


In [69]:
#### BREAST / breast

## text
text = ['mammog', 'breast ca', 'breast tum', 'invasive lobular carcinoma', ' dcis ', 'ductal carcinoma in situ']

spec['breast_text'] = np.where(groups['text'].str.contains(' breast '), "1", "0")

for x in text:
    spec['breast_text'] = np.where(groups['text'].str.contains(x), "1", spec['breast_text']) #if yes then 1, if no, keep current

    
## output
print('text counts:')
print(Counter(spec['breast_text']))

text counts:
Counter({'0': 32173, '1': 2006})


In [70]:
#### BREAST CANCER / breastca

## text
text = ['mammog', 'breast ca', 'breast tum', 'invasive lobular carcinoma', ' dcis ', 'ductal carcinoma in situ']

spec['breastca_text'] = np.where(groups['text'].str.contains('breast cancer'), "1", "0")

for x in text:
    spec['breastca_text'] = np.where(groups['text'].str.contains(x), "1", spec['breastca_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['breastca_text']))

text counts:
Counter({'0': 32422, '1': 1757})


In [71]:
#### LUNG CA / lungca

## text
text = ['lung cancer', 'lung malignancy', 'lung carcinoma', 'lung nodule', 'pulmonary nodule', 'mesothelioma', 'nsclc']

spec['lungca_text'] = np.where(groups['text'].str.contains('lung cancer'), "1", "0")

for x in text:
    spec['lungca_text'] = np.where(groups['text'].str.contains(x), "1", spec['lungca_text']) #if yes then 1, if no, keep current

spec['lungca_text'] = np.where((groups['text'].str.contains("lung")) &
                             (groups['text'].str.contains("adenoca")) , "1", spec['lungca_text'])
spec['lungca_text'] = np.where((groups['text'].str.contains("lung")) &
                             (groups['text'].str.contains("small cell")) , "1", spec['lungca_text'])
spec['lungca_text'] = np.where((groups['text'].str.contains("lung")) &
                             (groups['text'].str.contains("squamous")) , "1", spec['lungca_text'])
spec['lungca_text'] = np.where((groups['text'].str.contains("lung")) &
                             (groups['text'].str.contains("small-cell")) , "1", spec['lungca_text'])

## output
print('text counts:')
print(Counter(spec['lungca_text']))

text counts:
Counter({'0': 33001, '1': 1178})


In [72]:
#### NEURO ONC / neuroca

## text
text = ['neuroonc', 'neuro onc', 'neuro-onc', 'brain cancer', 'brain tumor', 'brain tumour', 'brain malignancy',
       'glioma', 'glioblastoma', 'astrocytoma', 'pituitary adenoma', 'acoustic neuroma', 'meningioma',
       'cns lymphoma', 'oligodendroglioma', 'meningeal cancer', 'meningeal carcinomatosis']

spec['brainca_text'] = np.where(groups['text'].str.contains('brain cancer'), "1", "0")

for x in text:
    spec['brainca_text'] = np.where(groups['text'].str.contains(x), "1", spec['brainca_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['brainca_text']))

text counts:
Counter({'0': 33275, '1': 904})


In [73]:
#### GI ONC / gica

## text
text = ['gi cancer', 'gastrointestinal cancer', 'colon cancer', 'colon carcinoma', 'colon polyp', 'colon adeno', 'colon tumo',
       'colonic cancer', 'colonic carcinoma', 'colonic adeno', 'colonic polyp', 'colonic tumo', 'colonic neoplasm',
        'rectal cancer', 'rectal carcinoma', 'rectal polyp', 'rectal tumo', 'rectal neoplasm', 'bowel cancer', 'bowel neoplasm',
       'bowel tumo', 'stomach cancer', 'gastric cancer', 'gastric carcinoma', 'gastric neoplasm', 'gastric tumo',
       'esophageal cancer', 'esophageal tumo', 'esophageal neoplasm']

spec['gica_text'] = np.where(groups['text'].str.contains('luminal cancer'), "1", "0")

for x in text:
    spec['gica_text'] = np.where(groups['text'].str.contains(x), "1", spec['gica_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['gica_text']))

text counts:
Counter({'0': 33331, '1': 848})


In [74]:
#### HPB ONC / hepca

## text
text = ['hepatocellular cancer', 'hepatocellular carcinoma', 'hepatic cancer', 'hepatic carcinoma', 'hepatic tumo',
       'hepatic neoplasm', 'liver cancer', 'liver carcinoma', 'liver tumo', 'cholangioca', 'pancreatic cancer',
       'pancreatic neoplasm', 'pancreatic tumo', 'biliary cancer', 'bile duct cancer']

spec['hepca_text'] = np.where(groups['text'].str.contains('cancer of the pancreas'), "1", "0")

for x in text:
    spec['hepca_text'] = np.where(groups['text'].str.contains(x), "1", spec['hepca_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['hepca_text']))

text counts:
Counter({'0': 33682, '1': 497})


In [75]:
#### PROSTATE ONC / prosca

## text
text = ['prostate cancer', 'prostate specific antigen', 'prostate carcinoma', 'prostate neoplasm', 'prostate tumo',
       'prostate adeno', 'prostatic cancer', 'prostatic neoplasm', 'prostatic tumo', 'prostatic adeno', 'prostatectomy',
       ' psa ']

spec['prosca_text'] = np.where(groups['text'].str.contains('prostatectomy'), "1", "0")

for x in text:
    spec['prosca_text'] = np.where(groups['text'].str.contains(x), "1", spec['prosca_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['prosca_text']))

text counts:
Counter({'0': 33506, '1': 673})


In [76]:
#### RENAL & BLADDER / renalca

## text
text = ['kidney cancer', 'kidney tumo', 'renal cell carcinoma', 'renal call cancer', 'renal tumo', 'renal cancer',
       'wilms tumo', 'bladder cancer', 'bladder carcinoma', 'transitional cell ca', 'urothelial cancer', 'urothelial carcinoma']

spec['renalca_text'] = np.where(groups['text'].str.contains('renal carcinoma'), "1", "0")

for x in text:
    spec['renalca_text'] = np.where(groups['text'].str.contains(x), "1", spec['renalca_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['renalca_text']))

text counts:
Counter({'0': 33920, '1': 259})


In [77]:
#### GYNAE / gynonc

## text
text = ['gynecologic cancer', 'gynecological cancer', 'gynaecologic cancer', 'gynaecological cancer', 'ovarian cancer',
       'ovarian carcinoma', 'uterine cancer', 'uterine carcinoma', 'cervical cancer', 'cervical carcinoma', 'colposcop',
       'endometrial cancer']

spec['gynonc_text'] = np.where(groups['text'].str.contains('pap smear'), "1", "0")

for x in text:
    spec['gynonc_text'] = np.where(groups['text'].str.contains(x), "1", spec['gynonc_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['gynonc_text']))

text counts:
Counter({'0': 33810, '1': 369})


In [78]:
#### HAEM / haemonc

## text
text = ['haematological cancer', 'hematological cancer', 'haematological malig', 'hematological malig', 'myelodysplas',
       'myeloprolif', 'lymphoprolif', 'leukaemia', 'leukemia', 'myelofibro', 'thrombocythemia', 'polycythemia vera',
       'polycythemia rubra vera', 'thrombocythaemia', 'polycythaemia vera', 'polycythaemia rubra vera', 'lymphoma',
       'myeloma', ' gvhd', 'stem cell transpl', 'bone marrow aspirate']

spec['haemonc_text'] = np.where(groups['text'].str.contains('bone marrow biopsy'), "1", "0")

for x in text:
    spec['haemonc_text'] = np.where(groups['text'].str.contains(x), "1", spec['haemonc_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['haemonc_text']))

text counts:
Counter({'0': 33775, '1': 404})


In [79]:
## PSYCHIATRY / psych

## text
text = ['psych', 'schizo', 'depressive disorder', 'anxiety disorder', 'stress disorder', 'suicide', 'suicidal', 'mood disorder',
        'self harm', 'self-harm', 'self injury', 'self-injury',
        'mental disorder', 'hyperactivity disorder', 'hyperactive disorder', 'psychological distress', 'bipolar', 
       'addiction disorder', 'autism', 'autistic']

spec['psych_text'] = np.where(groups['text'].str.contains('mental health'), "1", "0")

for x in text:
    spec['psych_text'] = np.where(groups['text'].str.contains(x), "1", spec['psych_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['psych_text']))

text counts:
Counter({'0': 32061, '1': 2118})


In [80]:
## SUICIDE / suicide

## text
text = ['suicide', 'suicidal', 'self harm', 'self-harm', 'self injury', 'self-injury', 'depressive disorder']

spec['suicide_text'] = np.where(groups['text'].str.contains('low mood'), "1", "0")

for x in text:
    spec['suicide_text'] = np.where(groups['text'].str.contains(x), "1", spec['suicide_text']) #if yes then 1, if no, keep current

spec['suicide_text'] = np.where((groups['text'].str.contains("psych")) &
                             (groups['text'].str.contains("depression")) , "1", spec['suicide_text'])
spec['suicide_text'] = np.where((groups['text'].str.contains("mental")) &
                             (groups['text'].str.contains("depression")) , "1", spec['suicide_text'])

## output
print('text counts:')
print(Counter(spec['suicide_text']))

text counts:
Counter({'0': 33660, '1': 519})


In [81]:
## MUSCULOSKELETAL / msk

## text
text = ['musculoskeletal', 'bone disease', 'bone cyst', 'chondritis', 'fasciitis', 'ankylos', 'osteoarth', 'orthoped',
       'orthopaed', 'bursitis', 'synovitis', 'congenital hip', 'joint instability', 'joint stability', 'myositis',
       'polymyalgia', 'fibromyalgia', ' gout', 'tendinopath', 'arthro', 'ligament', 'fracture', 'hip surgery',
       'hip replacement', 'acetabul', 'cruciate', 'joint space', 'dysplatic hip', 'hip dysplas', 'vertebral', 'discectomy',
       'lumbar spine', 'thoracic spine', 'cervical spine', 'whole spine', 'osteoporosis', 'bone mineral density']

spec['msk_text'] = np.where(groups['text'].str.contains('broken bone'), "1", "0")

for x in text:
    spec['msk_text'] = np.where(groups['text'].str.contains(x), "1", spec['msk_text']) #if yes then 1, if no, keep current
         
## output
print('text counts:')
print(Counter(spec['msk_text']))

text counts:
Counter({'0': 33302, '1': 877})


In [82]:
#### FRACTURE / frac

##text
spec['frac_text'] = np.where(groups['text'].str.contains("fracture"), "1", "0")

print('text counts:')
print(Counter(spec['frac_text']))

text counts:
Counter({'0': 33942, '1': 237})


In [83]:
## CONNECTIVE TISSUE [C17] / rheum

## text
text = ['rheumatoid', 'scleroderma', 'wegener', 'polyangiitis', 'churg-strauss', 'lupus', 'connective tissue disease',
        'mixed connective tissue', 'polymyositis', 'dermatomyositis', 'sjogren', 'vasculitis', 'vasculitide', 'marfan',
       'ehlers-danlos', 'osteogenesis imperfecta']

spec['rheum_text'] = np.where(groups['text'].str.contains('rheumatolog'), "1", "0")

for x in text:
    spec['rheum_text'] = np.where(groups['text'].str.contains(x), "1", spec['rheum_text']) #if yes then 1, if no, keep current
         
## output
print('text counts:')
print(Counter(spec['rheum_text']))

text counts:
Counter({'0': 34015, '1': 164})


In [84]:
## LUMINAL GI / gi

## text
text = ['gastro', 'gastri', 'intestin', 'duoden', 'colonic', 'colonoscop', 'colitis', 'rectal', 'ileus', 'ileitis',
       'crohn', 'esophag', 'proctitis', 'proctolog', 'bowel disease', 'bowel cancer', 'bowel neoplasm' ,'bowel tumo',
       'celiac', 'coeliac', 'diverticulitis', 'diverticulosis', 'stomach', 'small bowel', 'large bowel']

spec['gi_text'] = np.where(groups['text'].str.contains('gi tract'), "1", "0")

for x in text:
    spec['gi_text'] = np.where(groups['text'].str.contains(x), "1", spec['gi_text']) #if yes then 1, if no, keep current
         
## output
print('text counts:')
print(Counter(spec['gi_text']))


text counts:
Counter({'0': 32628, '1': 1551})


In [85]:
## HEPATOLOGY (and pancreatobiliary) / hep

## text
text = ['hepato', 'hepati', 'cholang', 'gallbladder', 'gall bladder', 'biliary' , 'pancreas', 'pancreat', 'wilson disease',
       'wilsons disease', 'liver fibrosis' ,'liver cirrhosis', 'nafld', 'hemochromatosis', 'haemochromatosis']

spec['hep_text'] = np.where(groups['text'].str.contains(' liver '), "1", "0")

for x in text:
    spec['hep_text'] = np.where(groups['text'].str.contains(x), "1", spec['hep_text']) #if yes then 1, if no, keep current
         
## output
print('text counts:')
print(Counter(spec['hep_text']))


text counts:
Counter({'0': 32964, '1': 1215})


In [86]:
## RESPIRATORY / resp

## text
text = ['respiratory', 'pneumonia', 'lung cancer', 'lung disease', 'lung nodule', 'pulmonary', 'asthma', 'obstructive sleep ap',
       'copd', 'pleura', 'mesothelioma', 'lung fibrosis', 'lung adeno', 'nsclc', 'interstitial lung', 'occupational lung', 'tuberculosis',
       'bronch']

spec['resp_text'] = np.where(groups['text'].str.contains(' lung '), "1", "0")

for x in text:
    spec['resp_text'] = np.where(groups['text'].str.contains(x), "1", spec['resp_text']) #if yes then 1, if no, keep current
         
## output
print('text counts:')
print(Counter(spec['resp_text']))

text counts:
Counter({'0': 30722, '1': 3457})


In [87]:
#### PNEUMONIA / pneum

## text
text = ['respiratory infection', 'pulmonary infection', 'pneumonia', 'alveolar consolidation', 'lung consolidation', 'lung infection',
       'pulmonary consolidation']

spec['pneum_text'] = np.where(groups['text'].str.contains('lower respiratory tract infection'), "1", "0")

for x in text:
    spec['pneum_text'] = np.where(groups['text'].str.contains(x), "1", spec['pneum_text']) #if yes then 1, if no, keep current
         
## output
print('text counts:')
print(Counter(spec['pneum_text']))

text counts:
Counter({'0': 33598, '1': 581})


In [88]:
#### OBSTRUCTIVE SLEEP / osa

## text
text = ['obstructive sleep ap', 'sleep apnoea']

spec['osa_text'] = np.where(groups['text'].str.contains('sleep apnea'), "1", "0")

for x in text:
    spec['osa_text'] = np.where(groups['text'].str.contains(x), "1", spec['osa_text']) #if yes then 1, if no, keep current
         
## output
print('text counts:')
print(Counter(spec['osa_text']))

text counts:
Counter({'0': 33909, '1': 270})


In [89]:
#### PULMONARY EMBOLISM / pe

## text
text = ['saddle embol', 'pulmonary angiogr']

spec['pe_text'] = np.where(groups['text'].str.contains('pulmonary embol'), "1", "0")

for x in text:
    spec['pe_text'] = np.where(groups['text'].str.contains(x), "1", spec['pe_text']) #if yes then 1, if no, keep current
         
## output
print('text counts:')
print(Counter(spec['pe_text']))

text counts:
Counter({'0': 34126, '1': 53})


In [90]:
#### PUBLIC HEALTH / pubh

## text
spec['pubh_text'] = np.where(groups['text'].str.contains("public health"), "1", "0")
spec['pubh_text'] = np.where(groups['text'].str.contains("population health"), "1", spec['pubh_text'])
spec['pubh_text'] = np.where(groups['text'].str.contains("health protection"), "1", spec['pubh_text'])

print('text counts:')
print(Counter(spec['pubh_text']))

text counts:
Counter({'0': 33865, '1': 314})


In [91]:
## NERVOUS SYSTEM / neuro

## text
text = ['neuro', 'brain', 'nervous system', 'multiple sclerosis', 'amyotrophic', 'motor neuron disease',
       'dementia', 'cognitive impairment', 'alzheimer', 'epilepsy', 'parkinson', 'dyskinesia', 'cerebellar', 'cerebral',
       'guillain', 'myelin', 'migraine', 'headache', 'meningeal', 'meningitis', 'encephalitis', 'ischemic stroke', 'ischaemic stroke',
       'hemorrhagic stroke', 'haemorrhagic stroke', 'embolic stroke', 'thrombotic stroke', 'myasthenia', 'movement disorder',
       'subdural', 'extradural', 'arachnoid', 'glioma', 'astrocytoma', 'glioblast', ' mci ', 'cerebrovascular']

spec['neuro_text'] = np.where(groups['text'].str.contains('white matter'), "1", "0")

for x in text:
    spec['neuro_text'] = np.where(groups['text'].str.contains(x), "1", spec['neuro_text']) #if yes then 1, if no, keep current
         
## output
print('text counts:')
print(Counter(spec['neuro_text']))

text counts:
Counter({'0': 26409, '1': 7770})


In [92]:
#### STROKE/bleed / cva

## text
text = ['cerebrovascular', 'ischemic stroke', 'ischaemic stroke', 'hemorrhagic stroke', 'haemorrhagic stroke', 
        'embolic stroke', 'thrombotic stroke', 'subarachnoid hemorrhage', 'subarachnoid haemorrhage', 'cerebral artery stroke',
       'cerebral artery infarct', 'malignant middle cerebral', 'malignant mca']

spec['cva_text'] = np.where(groups['text'].str.contains(' ich '), "1", "0")

for x in text:
    spec['cva_text'] = np.where(groups['text'].str.contains(x), "1", spec['cva_text']) #if yes then 1, if no, keep current


spec['cva_text'] = np.where((groups['text'].str.contains("brain")) &
                             (groups['text'].str.contains("infarct")) , "1", spec['cva_text'])
spec['cva_text'] = np.where((groups['text'].str.contains("cereb")) &
                             (groups['text'].str.contains("infarct")) , "1", spec['cva_text'])
spec['cva_text'] = np.where((groups['text'].str.contains("brain")) &
                             (groups['text'].str.contains("stroke")) , "1", spec['cva_text'])
spec['cva_text'] = np.where((groups['text'].str.contains("cereb")) &
                             (groups['text'].str.contains("stroke")) , "1", spec['cva_text'])    
spec['cva_text'] = np.where((groups['text'].str.contains("brain")) &
                             (groups['text'].str.contains("vessel occlusion")) , "1", spec['cva_text'])
spec['cva_text'] = np.where((groups['text'].str.contains("cereb")) &
                             (groups['text'].str.contains("vessel occlusion")) , "1", spec['cva_text'])   
spec['cva_text'] = np.where((groups['text'].str.contains("brain")) &
                             (groups['text'].str.contains("bleed")) , "1", spec['cva_text'])
spec['cva_text'] = np.where((groups['text'].str.contains("cereb")) &
                             (groups['text'].str.contains("bleed")) , "1", spec['cva_text'])   
spec['cva_text'] = np.where((groups['text'].str.contains("brain")) &
                             (groups['text'].str.contains("haemorrhage")) , "1", spec['cva_text'])
spec['cva_text'] = np.where((groups['text'].str.contains("cereb")) &
                             (groups['text'].str.contains("haemorrhage")) , "1", spec['cva_text'])
spec['cva_text'] = np.where((groups['text'].str.contains("cranial")) &
                             (groups['text'].str.contains("haemorrhage")) , "1", spec['cva_text'])
spec['cva_text'] = np.where((groups['text'].str.contains("brain")) &
                             (groups['text'].str.contains("hemorrhage")) , "1", spec['cva_text'])
spec['cva_text'] = np.where((groups['text'].str.contains("cereb")) &
                             (groups['text'].str.contains("hemorrhage")) , "1", spec['cva_text'])   
spec['cva_text'] = np.where((groups['text'].str.contains("cranial")) &
                             (groups['text'].str.contains("hemorrhage")) , "1", spec['cva_text'])
## output
print('text counts:')
print(Counter(spec['cva_text']))

text counts:
Counter({'0': 33523, '1': 656})


In [93]:
#### EPILEPSY / epilep

## text
spec['epilep_text'] = np.where(groups['text'].str.contains("epilep"), "1", "0")
spec['epilep_text'] = np.where(groups['text'].str.contains("seizure"), "1", spec['epilep_text'])

print('text counts:')
print(Counter(spec['epilep_text']))

text counts:
Counter({'0': 33357, '1': 822})


In [94]:
#### DEMENTIA / alzh

## text
text = ['dementia', 'cognitive impairment', 'alzheimer', 'cognitive dysfunction', 'cognitive decline', 'lewy body',
       'huntington', 'progressive supranuclear', 'corticobasal degen']

spec['alzh_text'] = np.where(groups['text'].str.contains(' mci '), "1", "0")

for x in text:
    spec['alzh_text'] = np.where(groups['text'].str.contains(x), "1", spec['alzh_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['alzh_text']))

text counts:
Counter({'0': 32660, '1': 1519})


In [95]:
## CARDIOVASCULAR / cvs

## text
text = ['cardiac', 'cardiovascular', 'cardial', 'cardiol', 'carditis', 'cardium', 'atherosclerosis', 'coronary', 'heart disease',
       'cardiomegaly', 'cardiomyopathy', 'valve disease', 'mitral', 'tricuspid', 'pulmonary valve', 'aortic', 'atrial', 'heart failure',
       'ventricular failure', 'right heart', 'left heart', 'cor pulm', 'hypertension', 'vascular disease', 'arrhythmia', 
       'vena cava', 'venous insuff', 'echocard', 'electrocard', 'sinus node', 'sinoatrial node', ' ecg', ' ekg', 'ventricular tachy', 'ventricular fibrillation',
       'ischemic heart', 'ischaemic heart']

spec['cvs_text'] = np.where(groups['text'].str.contains('cardiac'), "1", "0")

for x in text:
    spec['cvs_text'] = np.where(groups['text'].str.contains(x), "1", spec['cvs_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['cvs_text']))


text counts:
Counter({'0': 30439, '1': 3740})


In [96]:
#### ISCHAEMIC HEART DISEASE / ihd

## text
text = ['coronary', 'cardiac risk', 'cardiovascular risk', 'cardiac stent',
       'ischemic heart', 'ischaemic heart', 'cardial infarction']

spec['ihd_text'] = np.where(groups['text'].str.contains('heart attack'), "1", "0")

for x in text:
    spec['ihd_text'] = np.where(groups['text'].str.contains(x), "1", spec['ihd_text']) #if yes then 1, if no, keep current
    

spec['ihd_text'] = np.where((groups['text'].str.contains("cardia")) &
                             (groups['text'].str.contains("ischemi")) , "1", spec['ihd_text'])
spec['ihd_text'] = np.where((groups['text'].str.contains("cardia")) &
                             (groups['text'].str.contains("ischaemi")) , "1", spec['ihd_text'])    
spec['ihd_text'] = np.where((groups['text'].str.contains("cardia")) &
                             (groups['text'].str.contains("infarction")) , "1", spec['ihd_text'])
spec['ihd_text'] = np.where((groups['text'].str.contains("heart")) &
                             (groups['text'].str.contains("infarction")) , "1", spec['ihd_text'])
spec['ihd_text'] = np.where((groups['text'].str.contains("cardia")) &
                             (groups['text'].str.contains("vessel occlusion")) , "1", spec['ihd_text'])
spec['ihd_text'] = np.where((groups['text'].str.contains("heart")) &
                             (groups['text'].str.contains("vessel occlusion")) , "1", spec['ihd_text'])
spec['ihd_text'] = np.where((groups['text'].str.contains("cardiac")) &
                             (groups['text'].str.contains("angio")) , "1", spec['ihd_text'])
spec['ihd_text'] = np.where((groups['text'].str.contains("heart")) &
                             (groups['text'].str.contains("angio")) , "1", spec['ihd_text'])
spec['ihd_text'] = np.where((groups['text'].str.contains("cardiac")) &
                             (groups['text'].str.contains("atherosclero")) , "1", spec['ihd_text'])
spec['ihd_text'] = np.where((groups['text'].str.contains("heart")) &
                             (groups['text'].str.contains("atherosclero")) , "1", spec['ihd_text'])

## output
print('text counts:')
print(Counter(spec['ihd_text']))

text counts:
Counter({'0': 33229, '1': 950})


In [97]:
spec[spec['ihd_text']=='1'].sample(20)

Unnamed: 0,text,icu_text,ed_text,id_text,sepsis_text,cov19_text,hiv_text,tb_text,tropic_text,malaria_text,derm_text,dermca_text,onc_text,rx_text,breast_text,breastca_text,lungca_text,brainca_text,gica_text,hepca_text,prosca_text,renalca_text,gynonc_text,haemonc_text,psych_text,suicide_text,msk_text,frac_text,rheum_text,gi_text,hep_text,resp_text,pneum_text,osa_text,pe_text,pubh_text,neuro_text,cva_text,epilep_text,alzh_text,cvs_text,ihd_text
60082,"evaluation of risk prediction models of atrial fibrillation from the multi-ethnic study of atherosclerosis mesa atrial fibrillation af is prevalent and strongly associated with higher cardiovascular disease cvd risk machine learning is increasingly used to identify novel predictors of cvd risk, but prediction improvements beyond established risk scores are uncertain we evaluated improvements in predicting 5-year af risk when adding novel candidate variables identified by machine learning to the charge-af enriched score, which includes age, race/ethnicity, height, weight, systolic and diastolic blood pressure, current smoking, use of antihypertensive medication, diabetes, and nt-probnp we included 3,534 participants mean age, 613 years; 520% female with complete data from the prospective multi-ethnic study of atherosclerosis incident af was defined based on study electrocardiograms and hospital discharge diagnosis icd-9 codes, supplemented by medicare claims prediction performance was evaluated using cox regression and a parsimonious model was selected using lasso within 5 years of baseline, 124 participants had incident af compared with the charge-af enriched model c-statistic, 0804, variables identified by machine learning, including biomarkers, cardiac magnetic resonance imaging variables, electrocardiogram variables, and subclinical cvd variables, did not significantly improve prediction a 23-item score derived by machine learning achieved a c-statistic of 0806, whereas a parsimonious model including the clinical risk factors age, weight, current smoking, nt-probnp, coronary artery calcium score, and cardiac troponin-t achieved a c-statistic of 0802 this analysis confirms that the charge-af enriched model and a parsimonious 6-item model performed similarly to a more extensive model derived by machine learning in conclusion, these simple models remain the gold standard for risk prediction of af, although addition of the coronary artery calcium score should be considered",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
158310,"selection of patients for myocardial perfusion scintigraphy based on fuzzy sets theory applied to clinical-epidemiological data and treadmill test results coronary artery disease cad is a worldwide leading cause of death the standard method for evaluating critical partial occlusions is coronary arteriography, a catheterization technique which is invasive, time consuming, and costly there are noninvasive approaches for the early detection of cad the basis for the noninvasive diagnosis of cad has been laid in a sequential analysis of the risk factors, and the results of the treadmill test and myocardial perfusion scintigraphy mps many investigators have demonstrated that the diagnostic applications of mps are appropriate for patients who have an intermediate likelihood of disease although this information is useful, it is only partially utilized in clinical practice due to the difficulty to properly classify the patients since the seminal work of lotfi zadeh, fuzzy logic has been applied in numerous areas in the present study, we proposed and tested a model to select patients for mps based on fuzzy sets theory a group of 1053 patients was used to develop the model and another group of 1045 patients was used to test it receiver operating characteristic curves were used to compare the performance of the fuzzy model against expert physician opinions, and showed that the performance of the fuzzy model was equal or superior to that of the physicians therefore, we conclude that the fuzzy model could be a useful tool to assist the general practitioner in the selection of patients for mps",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
103279,electrocardiographic diagnosis of st segment elevation myocardial infarction: an evaluation of three automated interpretation algorithms to assess the validity of three different computerized electrocardiogram ecg interpretation algorithms in correctly identifying stemi patients in the prehospital environment who require emergent cardiac intervention,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
62798,"machine learning versus traditional risk stratification methods in acute coronary syndrome: a pooled randomized clinical trial analysis traditional statistical models allow population based inferences and comparisons machine learning ml explores datasets to develop algorithms that do not assume linear relationships between variables and outcomes and that may account for higher order interactions to make individualized outcome predictions to evaluate the performance of machine learning models compared to traditional risk stratification methods for the prediction of major adverse cardiovascular events mace and bleeding in patients with acute coronary syndrome acs that are treated with antithrombotic therapy data on 24,178 acs patients were pooled from four randomized controlled trials the super learner ensemble algorithm selected weights for 23 machine learning models and was compared to traditional models the efficacy endpoint was a composite of cardiovascular death, myocardial infarction, or stroke the safety endpoint was a composite of timi major and minor bleeding or bleeding requiring medical attention for the mace outcome, the super learner model produced a higher c-statistic 0734 than logistic regression 0714, the timi risk score 0489, and a new cardiovascular risk score developed in the dataset 0644 for the bleeding outcome, the super learner demonstrated a similar c-statistic as the logistic regression model 0670 vs 0671 the machine learning risk estimates were highly calibrated with observed efficacy and bleeding outcomes hosmer-lemeshow p value = 0692 and 0970, respectively the super learner algorithm was highly calibrated on both efficacy and safety outcomes and produced the highest c-statistic for prediction of mace compared to traditional risk stratification methods this analysis demonstrates a contemporary application of machine learning to guide patient-level antithrombotic therapy treatment decisionsclinical trial registration atlas acs-2 timi 46: https://clinicaltrialsgov/ct2/show/nct00402597 unique identifier: nct00402597 atlas acs-2 timi 51: https://clinicaltrialsgov/ct2/show/nct00809965 unique identifier: nct00809965 gemini acs-1: https://clinicaltrialsgov/ct2/show/nct02293395 unique identifier: nct02293395 pioneer-af pci: https://clinicaltrialsgov/ct2/show/nct01830543 unique identifier: nct01830543",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
70462,"predicting coronary artery disease: a comparison between two data mining algorithms cardiovascular diseases cads are the first leading cause of death across the world world health organization has estimated that morality rate caused by heart diseases will mount to 23 million cases by 2030 hence, the use of data mining algorithms could be useful in predicting coronary artery diseases therefore, the present study aimed to compare the positive predictive value ppv of cad using artificial neural network ann and svm algorithms and their distinction in terms of predicting cad in the selected hospitals",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
96346,automatic determination of cardiovascular risk by ct attenuation correction maps in rb-82 pet/ct we investigated fully automatic coronary artery calcium cac scoring and cardiovascular disease cvd risk categorization from ct attenuation correction ctac acquired at rest and stress during cardiac pet/ct and compared it with manual annotations in ctac and with dedicated calcium scoring ct csct,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
36752,"convolution pyramid network: a classification network on coronary artery angiogram images with the development of convolutional neural network, the classification on ordinary natural images has made remarkable progress by using single feature maps however, it is difficult to always produce good results on coronary artery angiograms because there is a lot of photographing noise and small class gaps between the classification targets on angiograms in this paper, we propose a new network to enhance the richness and relevance of features in the training process by using multiple convolutions with different kernel sizes, which can improve the final classification result our network has a strong generalization ability, that is, it can perform a variety of classification tasks on angiograms better compared with some state-of-the-art image classification networks, the classification recall increases by 305% and precision increases by 191% in the best results of our network",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
88735,"application of stacked convolutional and long short-term memory network for accurate identification of cad ecg signals coronary artery disease cad is the most common cause of heart disease globally this is because there is no symptom exhibited in its initial phase until the disease progresses to an advanced stage the electrocardiogram ecg is a widely accessible diagnostic tool to diagnose cad that captures abnormal activity of the heart however, it lacks diagnostic sensitivity one reason is that, it is very challenging to visually interpret the ecg signal due to its very low amplitude hence, identification of abnormal ecg morphology by clinicians may be prone to error thus, it is essential to develop a software which can provide an automated and objective interpretation of the ecg signal this paper proposes the implementation of long short-term memory lstm network with convolutional neural network cnn to automatically diagnose cad ecg signals accurately our proposed deep learning model is able to detect cad ecg signals with a diagnostic accuracy of 9985% with blindfold strategy the developed prototype model is ready to be tested with an appropriate huge database before the clinical usage",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
97442,"computer aided decision making for heart disease detection using hybrid neural network-genetic algorithm cardiovascular disease is one of the most rampant causes of death around the world and was deemed as a major illness in middle and old ages coronary artery disease, in particular, is a widespread cardiovascular malady entailing high mortality rates angiography is, more often than not, regarded as the best method for the diagnosis of coronary artery disease; on the other hand, it is associated with high costs and major side effects much research has, therefore, been conducted using machine learning and data mining so as to seek alternative modalities accordingly, we herein propose a highly accurate hybrid method for the diagnosis of coronary artery disease as a matter of fact, the proposed method is able to increase the performance of neural network by approximately 10% through enhancing its initial weights using genetic algorithm which suggests better weights for neural network making use of such methodology, we achieved accuracy, sensitivity and specificity rates of 9385%, 97% and 92% respectively, on z-alizadeh sani dataset",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
156758,"localizing calcifications in cardiac ct data sets using a new vessel segmentation approach the new generation of multislice computed tomography ct scanners allows for the acquisition of high-resolution images of the heart based on that image data, the heart can be analyzed in a noninvasive way-improving the diagnosis of cardiovascular malfunctions on one hand, and the planning of an eventually necessary intervention on the other one important parameter for the evaluation of the severity of a coronary artery disease is the number and localization of calcifications hard plaques this work presents a method for localizing these calcifications by employing a newly developed vessel segmentation approach this extraction technique has been developed for, and tested with, contrast-enhanced ct data sets of the heart the algorithm provides enough information to compute the vessel diameter along the extracted segment an approach for automatically detecting calcified regions that combines diameter information and gray value analysis is presented in addition, specially adapted methods for the visualization of these analysis results are described",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1


In [98]:
#### HEART FAILURE or VENTRICULAR FUNCTION / hf

## text
text = ['heart failure', 'cardiac failure', 'ejection fraction', 'ventricular dysfunction', 'cardiac dysfunction']

spec['hf_text'] = np.where(groups['text'].str.contains(' lvf '), "1", "0")

for x in text:
    spec['hf_text'] = np.where(groups['text'].str.contains(x), "1", spec['hf_text']) #if yes then 1, if no, keep current
    
spec['hf_text'] = np.where((groups['text'].str.contains("left ventric")) &
                             (groups['text'].str.contains("function")) , "1", spec['hf_text'])
spec['hf_text'] = np.where((groups['text'].str.contains("right ventric")) &
                             (groups['text'].str.contains("function")) , "1", spec['hf_text'])
spec['hf_text'] = np.where((groups['text'].str.contains("left ventric")) &
                             (groups['text'].str.contains("failure")) , "1", spec['hf_text'])
spec['hf_text'] = np.where((groups['text'].str.contains("right ventric")) &
                             (groups['text'].str.contains("failure")) , "1", spec['hf_text'])
spec['hf_text'] = np.where((groups['text'].str.contains("diastolic")) &
                             (groups['text'].str.contains("failure")) , "1", spec['hf_text'])
spec['hf_text'] = np.where((groups['text'].str.contains("diastolic")) &
                             (groups['text'].str.contains("function")) , "1", spec['hf_text'])
spec['hf_text'] = np.where((groups['text'].str.contains("systolic")) &
                             (groups['text'].str.contains("function")) , "1", spec['hf_text'])
spec['hf_text'] = np.where((groups['text'].str.contains("systolic")) &
                             (groups['text'].str.contains("failure")) , "1", spec['hf_text'])
##output
print('text counts:')
print(Counter(spec['hf_text']))

text counts:
Counter({'0': 33731, '1': 448})


In [99]:
spec[spec['hf_text']=='1'].sample(20)

Unnamed: 0,text,icu_text,ed_text,id_text,sepsis_text,cov19_text,hiv_text,tb_text,tropic_text,malaria_text,derm_text,dermca_text,onc_text,rx_text,breast_text,breastca_text,lungca_text,brainca_text,gica_text,hepca_text,prosca_text,renalca_text,gynonc_text,haemonc_text,psych_text,suicide_text,msk_text,frac_text,rheum_text,gi_text,hep_text,resp_text,pneum_text,osa_text,pe_text,pubh_text,neuro_text,cva_text,epilep_text,alzh_text,cvs_text,ihd_text,hf_text
5065,a machine-learning-based method to predict adverse events in patients with dilated cardiomyopathy and severely reduced ejection fractions patients with dilated cardiomyopathy dcm and severely reduced left ventricular ejection fractions lvefs are at very high risks of experiencing adverse cardiac events a machine learning ml method could enable more effective risk stratification for these high-risk patients by incorporating various types of data the aim of this study was to build an ml model to predict adverse events including all-cause deaths and heart transplantation in dcm patients with severely impaired lv systolic function,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
15425,deep-learning models for the echocardiographic assessment of diastolic dysfunction the authors explored a deep neural network deepnn model that integrates multidimensional echocardiographic data to identify distinct patient subgroups with heart failure with preserved ejection fraction hfpef,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
80680,"direct delineation of myocardial infarction without contrast agents using a joint motion feature learning architecture changes in mechanical properties of myocardium caused by a infarction can lead to kinematic abnormalities this phenomenon has inspired us to develop this work for delineation of myocardial infarction area directly from non-contrast agents cardiac mr imaging sequences the main contribution of this work is to develop a new joint motion feature learning architecture to efficiently establish direct correspondences between motion features and tissue properties this architecture consists of three seamless connected function layers: the heart localization layers can automatically crop the region of interest roi sequences involving the left ventricle from the cardiac mr imaging sequences; the motion feature extraction layers, using long short-term memory-recurrent neural networks, a builds patch-based motion features through local intensity changes between fixed-size patch sequences cropped from image sequences, and b uses optical flow techniques to build image-based features through global intensity changes between adjacent images to describe the motion of each pixel; the fully connected discriminative layers can combine two types of motion features together in each pixel and then build the correspondences between motion features and tissue identities that is, infarct or not in each pixel we validated the performance of our framework in 165 cine cardiac mr imaging datasets by comparing to the ground truths manually segmented from delayed gadolinium-enhanced mr cardiac images by two radiologists with more than 10 years of experience our experimental results show that our proposed method has a high and stable accuracy pixel-level: 9503% and consistency kappa statistic: 091; dice: 8987%; rmse: 072 mm; hausdorff distance: 591 mm compared to manual delineation results overall, the advantage of our framework is that it can determine the tissue identity in each pixel from its motion pattern captured by normal cine cardiac mr images, which makes it an attractive tool for the clinical diagnosis of infarction",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1
11969,"deep learning-based automated left ventricular ejection fraction assessment using 2-d echocardiography deep learning dl has been applied for automatic left ventricle lv ejection fraction ef measurement, but the diagnostic performance was rarely evaluated for various phenotypes of heart disease this study aims to evaluate a new dl algorithm for automated lvef measurement using two-dimensional echocardiography 2de images collected from three centers the impact of three ultrasound machines and three phenotypes of heart diseases on the automatic lvef measurement was evaluated using 36890 frames of 2de from 340 patients, we developed a dl algorithm based on u-net dps-net and the biplane simpsons method was applied for lvef calculation results showed a high performance in lv segmentation and lvef measurement across phenotypes and echo systems by using dps-net good performance was obtained for lv segmentation when dps-net was tested on the camus data set dice coefficient of 0932 and 0928 for ed and es better performance of lv segmentation in study-wise evaluation was observed by comparing the dps-net v2 to the echonet-dynamic algorithm <i>p</i> = 0008 dps-net was associated with high correlations and good agreements for the lvef measurement high diagnostic performance was obtained that the area under receiver operator characteristic curve was 0974, 0948, 0968, and 0972 for normal hearts and disease phenotypes including atrial fibrillation, hypertrophic cardiomyopathy, dilated cardiomyopathy, respectively high performance was obtained by using dps-net in lv detection and lvef measurement for heart failure with several phenotypes high performance was observed in a large-scale dataset, suggesting that the dps-net was highly adaptive across different echocardiographic systems<b>new & noteworthy</b> a new strategy of feature extraction and fusion could enhance the accuracy of automatic lvef assessment based on multiview 2-d echocardiographic sequences high diagnostic performance for the determination of heart failure was obtained by using dps-net in cases with different phenotypes of heart diseases high performance for left ventricle segmentation was obtained by using dps-net, suggesting the potential for a wider range of application in the interpretation of 2de images",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
78478,"deep learning analysis of left ventricular myocardium in ct angiographic intermediate-degree coronary stenosis improves the diagnostic accuracy for identification of functionally significant stenosis to evaluate the added value of deep learning dl analysis of the left ventricular myocardium lvm in resting coronary ct angiography ccta over determination of coronary degree of stenosis ds, for identification of patients with functionally significant coronary artery stenosis",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1
126156,"machine learning techniques for arterial pressure waveform analysis the arterial pressure waveform apw can provide essential information about arterial wall integrity and arterial stiffness most of apw analysis frameworks individually process each hemodynamic parameter and do not evaluate inter-dependencies in the overall pulse morphology the key contribution of this work is the use of machine learning algorithms to deal with vectorized features extracted from apw with this purpose, we follow a five-step evaluation methodology: 1 a custom-designed, non-invasive, electromechanical device was used in the data collection from 50 subjects; 2 the acquired position and amplitude of onset, systolic peak sp, point of inflection pi and dicrotic wave dw were used for the computation of some morphological attributes; 3 pre-processing work on the datasets was performed in order to reduce the number of input features and increase the model accuracy by selecting the most relevant ones; 4 classification of the dataset was carried out using four different machine learning algorithms: random forest, bayesnet probabilistic, j48 decision tree and ripper rule-based induction; and 5 we evaluate the trained models, using the majority-voting system, comparatively to the respective calculated augmentation index aix classification algorithms have been proved to be efficient, in particular random forest has shown good accuracy 9695% and high area under the curve auc of a receiver operating characteristic roc curve 0961 finally, during validation tests, a correlation between high risk labels, retrieved from the multi-parametric approach, and positive aix values was verified this approach gives allowance for designing new hemodynamic morphology vectors and techniques for multiple apw analysis, thus improving the arterial pulse understanding, especially when compared to traditional single-parameter analysis, where the failure in one parameter measurement component, such as pi, can jeopardize the whole evaluation",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
37851,a comparison of artificial intelligence-based algorithms for the identification of patients with depressed right ventricular function from 2-dimentional echocardiography parameters and clinical features recognizing low right ventricular rv function from 2-dimentiontial echocardiography 2d-echo is challenging when parameters are contradictory we aim to develop a model to predict low rv function integrating the various 2d-echo parameters in reference to cardiac magnetic resonance cmr-the gold standard,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
138429,"a neuro-fuzzy decision support system for the diagnosis of heart failure a neuro-fuzzy decision support system is proposed for the diagnosis of heart failure the system comprises; knowledge base database, neural networks and fuzzy logic of both the quantitative and qualitative knowledge of the diagnosis of heart failure, neuro-fuzzy inference engine and decision support engine the neural networks employ a multi-layers perception back propagation learning process while the fuzzy logic uses the root sum square inference procedure the neuro-fuzzy inference engine uses a weighted average of the premise and consequent parameters with the fuzzy rules serving as the nodes and the fuzzy sets representing the weights of the nodes the decision support engine carries out the cognitive and emotional filtering of the objective and subjective feelings of the medical practitioner an experimental study of the decision support system was carried out using cases of some patients from three hospitals in nigeria with the assistance of their medical personnel who collected patientsdata over a period of six months the results of the study show that the neuro-fuzzy system provides a highly reliable diagnosis, while the emotional and cognitive filters further refine the diagnosis results by taking care of the contextual elements of medical diagnosis",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1
86419,"machine learning methods improve prognostication, identify clinically distinct phenotypes, and detect heterogeneity in response to therapy in a large cohort of heart failure patients whereas heart failure hf is a complex clinical syndrome, conventional approaches to its management have treated it as a singular disease, leading to inadequate patient care and inefficient clinical trials we hypothesized that applying advanced analytics to a large cohort of hf patients would improve prognostication of outcomes, identify distinct patient phenotypes, and detect heterogeneity in treatment response",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
9626,prediction of 1-year mortality after heart transplantation using machine learning approaches: a single-center study from china heart transplantation htx remains the gold-standard treatment for end-stage heart failure the aim of this study was to establish a risk-prediction model for assessing prognosis of htx using machine-learning approach,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1


In [100]:
#### ARRHYTHMIA / arrhyt

## text
text = ['sinus node', 'sinoatrial', 'atrial tachy', 'atrial flutter', 'accessory pathway', 'long qt', 'holter',
        'pacemaker', 'ventricular tachy', 'atrial fibrill', 'ventricular fibrill', 'supraventricular tachy',
        'cardiover', 'defibrillat', 'heart block', 'degree block', 'av block', 'ventricular block', ' p-wave', ' p wave', 'pr interval',
       'p-r interval', 'pr-interval', 'corrected qt', ' qtc ', ' qrs complex ', 'brugada', 'short qt', 'qt syndrome', 'long qt']

spec['arrhyt_text'] = np.where(groups['text'].str.contains('arrhythmi'), "1", "0")

for x in text:
    spec['arrhyt_text'] = np.where(groups['text'].str.contains(x), "1", spec['arrhyt_text']) #if yes then 1, if no, keep current

spec['arrhyt_text'] = np.where((groups['text'].str.contains("heart")) &
                             (groups['text'].str.contains("ablation")) , "1", spec['arrhyt_text'])
spec['arrhyt_text'] = np.where((groups['text'].str.contains("cardiac")) &
                             (groups['text'].str.contains("ablation")) , "1", spec['arrhyt_text'])
spec['arrhyt_text'] = np.where((groups['text'].str.contains("heart")) &
                             (groups['text'].str.contains("bradycardia")) , "1", spec['arrhyt_text'])
spec['arrhyt_text'] = np.where((groups['text'].str.contains("cardiac")) &
                             (groups['text'].str.contains("bradycardia")) , "1", spec['arrhyt_text'])
spec['arrhyt_text'] = np.where((groups['text'].str.contains("heart")) &
                             (groups['text'].str.contains("electrophys")) , "1", spec['arrhyt_text'])
spec['arrhyt_text'] = np.where((groups['text'].str.contains("cardiac")) &
                             (groups['text'].str.contains("electrophys")) , "1", spec['arrhyt_text'])
spec['arrhyt_text'] = np.where((groups['text'].str.contains("heart")) &
                             (groups['text'].str.contains("rhythm")) , "1", spec['arrhyt_text'])
spec['arrhyt_text'] = np.where((groups['text'].str.contains("cardiac")) &
                             (groups['text'].str.contains("rhythm")) , "1", spec['arrhyt_text'])


## outputs
print('text counts:')
print(Counter(spec['arrhyt_text']))

text counts:
Counter({'0': 33397, '1': 782})


In [101]:
spec[spec['arrhyt_text']=='1'].sample(20)

Unnamed: 0,text,icu_text,ed_text,id_text,sepsis_text,cov19_text,hiv_text,tb_text,tropic_text,malaria_text,derm_text,dermca_text,onc_text,rx_text,breast_text,breastca_text,lungca_text,brainca_text,gica_text,hepca_text,prosca_text,renalca_text,gynonc_text,haemonc_text,psych_text,suicide_text,msk_text,frac_text,rheum_text,gi_text,hep_text,resp_text,pneum_text,osa_text,pe_text,pubh_text,neuro_text,cva_text,epilep_text,alzh_text,cvs_text,ihd_text,hf_text,arrhyt_text
38123,"classification of normal sinus rhythm, abnormal arrhythmia and congestive heart failure ecg signals using lstm and hybrid cnn-svm deep neural networks effective monitoring of heart patients according to heart signals can save a huge amount of life in the last decade, the classification and prediction of heart diseases according to ecg signals has gained great importance for patients and doctors in this paper, the deep learning architecture with high accuracy and popularity has been proposed in recent years for the classification of normal sinus rhythm, nsr abnormal arrhythmia arr and congestive heart failure chf ecg signals the proposed architecture is based on hybrid alexnet-svm support vector machine 96 arrhythmia, 30 chf, 36 nsr signals are available in a total of 192 ecg signals in order to demonstrate the classification performance of deep learning architectures, arr, chr and nsr signals are firstly classified by svm, knn algorithm, achieving 6875% and 6563% accuracy the signals are then classified in their raw form with lstm long short time memory with 9067% accuracy by obtaining the spectrograms of the signals, hybrid alexnet-svm algorithm is applied to the images and 9677% accuracy is obtained the results show that with the proposed deep learning architecture, it classifies ecg signals with higher accuracy than conventional machine learning classifiers",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1
3865,"application of a time-series deep learning model to predict cardiac dysrhythmias in electronic health records cardiac dysrhythmias cd affect millions of americans in the united states us, and are associated with considerable morbidity and mortality new strategies to combat this growing problem are urgently needed",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
17870,"detecting digoxin toxicity by artificial intelligence-assisted electrocardiography although digoxin is important in heart rate control, the utilization of digoxin is declining due to its narrow therapeutic window misdiagnosis or delayed diagnosis of digoxin toxicity is common due to the lack of awareness and the time-consuming laboratory work that is involved electrocardiography ecg may be able to detect potential digoxin toxicity based on characteristic presentations our study attempted to develop a deep learning model to detect digoxin toxicity based on ecg manifestations this study included 61 ecgs from patients with digoxin toxicity and 177,066 ecgs from patients in the emergency room from november 2011 to february 2019 the deep learning algorithm was trained using approximately 80% of ecgs the other 20% of ecgs were used to validate the performance of the artificial intelligence ai system and to conduct a human-machine competition area under the receiver operating characteristic curve auc, sensitivity, and specificity were used to evaluate the performance of ecg interpretation between humans and our deep learning system the aucs of our deep learning system for identifying digoxin toxicity were 0912 and 0929 in the validation cohort and the human-machine competition, respectively, which reached 846% of sensitivity and 946% of specificity interestingly, the deep learning system using only lead i auc = 0960 was not worse than using complete 12 leads 0912 stratified analysis showed that our deep learning system was more applicable to patients with heart failure hf and without atrial fibrillation af than those without hf and with af our ecg-based deep learning system provides a high-accuracy, economical, rapid, and accessible way to detect digoxin toxicity, which can be applied as a promising decision supportive system for diagnosing digoxin toxicity in clinical practice",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1
29065,"an iot and fog computing-based monitoring system for cardiovascular patients with automatic ecg classification using deep neural networks telemedicine and all types of monitoring systems have proven to be a useful and low-cost tool with a high level of applicability in cardiology the objective of this work is to present an iot-based monitoring system for cardiovascular patients the system sends the ecg signal to a fog layer service by using the lora communication protocol also, it includes an ai algorithm based on deep learning for the detection of atrial fibrillation and other heart rhythms the automatic detection of arrhythmias can be complementary to the diagnosis made by the physician, achieving a better clinical vision that improves therapeutic decision making the performance of the proposed system is evaluated on a dataset of 8528 short single-lead ecg records using two merge mobilenet networks that classify data with an accuracy of 90% for atrial fibrillation",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
144156,"finding features for real-time premature ventricular contraction detection using a fuzzy neural network system fuzzy neural networks fnns have been successfully applied to generate predictive rules for medical or diagnostic data this brief presents an approach to detect premature ventricular contractions pvcs using the neural network with weighted fuzzy membership functions newfms the newfm classifies normal and pvc beats by the trained bounded sum of weighted fuzzy membership functions bswfms using wavelet transformed coefficients from the mit-bih pvc database the eight generalized coefficients, locally related to the time signal, are extracted by the nonoverlap area distribution measurement method the eight generalized coefficients are used for the three pvc data sets with reliable accuracy rates of 9980%, 9921%, and 9878%, respectively, which means that the selected features are less dependent on the data sets it is shown that the locations of the eight features are not only around the qrs complex that represents ventricular depolarization in the electrocardiogram ecg containing a q wave, an r wave, and an s wave, but also the qr segment from the q wave to the r wave has more discriminate information than the rs segment from the r wave to the s wave the bswfms of the eight features trained by newfm are shown visually, which makes the features explicitly interpretable since each bswfm combines multiple weighted fuzzy membership functions into one using the bounded sum, the eight small-sized bswfms can realize real-time pvc detection in a mobile environment",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
127910,"dynamic time warping and machine learning for signal quality assessment of pulsatile signals in this work, we describe a beat-by-beat method for assessing the clinical utility of pulsatile waveforms, primarily recorded from cardiovascular blood volume or pressure changes, concentrating on the photoplethysmogram ppg physiological blood flow is nonstationary, with pulses changing in height, width and morphology due to changes in heart rate, cardiac output, sensor type and hardware or software pre-processing requirements moreover, considerable inter-individual and sensor-location variability exists simple template matching methods are therefore inappropriate, and a patient-specific adaptive initialization is therefore required we introduce dynamic time warping to stretch each beat to match a running template and combine it with several other features related to signal quality, including correlation and the percentage of the beat that appeared to be clipped the features were then presented to a multi-layer perceptron neural network to learn the relationships between the parameters in the presence of good- and bad-quality pulses an expert-labeled database of 1055 segments of ppg, each 6 s long, recorded from 104 separate critical care admissions during both normal and verified arrhythmic events, was used to train and test our algorithms an accuracy of 975% on the training set and 952% on test set was found the algorithm could be deployed as a stand-alone signal quality assessment algorithm for vetting the clinical utility of ppg traces or any similar quasi-periodic signal",1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
104702,"medical decision support system for diagnosis of heart arrhythmia using dwt and random forests classifier in this study, random forests rf classifier is proposed for ecg heartbeat signal classification in diagnosis of heart arrhythmia discrete wavelet transform dwt is used to decompose ecg signals into different successive frequency bands a set of different statistical features were extracted from the obtained frequency bands to denote the distribution of wavelet coefficients this study shows that rf classifier achieves superior performances compared to other decision tree methods using 10-fold cross-validation for the ecg datasets and the obtained results suggest that further significant improvements in terms of classification accuracy can be accomplished by the proposed classification system accurate ecg signal classification is the major requirement for detection of all arrhythmia types performances of the proposed system have been evaluated on two different databases, namely mit-bih database and st -petersburg institute of cardiological technics 12-lead arrhythmia database for mit-bih database, rf classifier yielded an overall accuracy 9933 % against 9844 and 9867 % for the c45 and cart classifiers, respectively for st -petersburg institute of cardiological technics 12-lead arrhythmia database, rf classifier yielded an overall accuracy 9995 % against 9980 % for both c45 and cart classifiers, respectively the combined model with multiscale principal component analysis mspca de-noising, discrete wavelet transform dwt and rf classifier also achieves better performance with the area under the receiver operating characteristic roc curve auc and f-measure equal to 0999 and 0993 for mit-bih database and 1 and 0999 for and st -petersburg institute of cardiological technics 12-lead arrhythmia database, respectively obtained results demonstrate that the proposed system has capacity for reliable classification of ecg signals, and to assist the clinicians for making an accurate diagnosis of cardiovascular disorders cvds",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
12029,"premature beats detection based on a novel convolutional neural network <i>objective</i>automatic detection of premature beats on long electrocardiogram ecg recordings is of great significance for clinical diagnosis in this paper, we propose a novel deep learning model, the ecgdet, to detect premature beats, including premature ventricular contractions pvcs and supraventricular premature beats spbs on single-lead long-term ecgs<i>approach</i>the ecgdet is proposed based on a convolutional neural network and squeeze-and-excitation network it outputs the probabilities that the ecg samples belong to a premature contraction non-max suppression was used to select the most appropriate locations for the premature beats the ecgdet was trained and tested on the mit-bih arrhythmia database mitdb using a five-fold cross-validation approach a novel loss calculation method was introduced in the model training process then it was tuned and further tested on the china physiological signal challenge 2020 database cpscdb<i>main results</i>the results showed that the average f1 value of pvc detection was 926%, while that of spb detection was 722% on mitdb the ecgdet bagged the 2nd place for pvc detection and ranked 7th place of spb detection in the china physiological signal challenge 2020<i>significance</i>the proposed ecgdet can automatically detect premature heartbeats without manually extracting the features this technique can be used for long-term ecg signal analysis and has potential for clinical applications",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
3533,"deep learning methods for screening patientss-icd implantation eligibility subcutaneous implantable cardioverter-defibrillators s-icds are used for prevention of sudden cardiac death triggered by ventricular arrhythmias t wave over sensing twos is an inherent risk with s-icds which can lead to inappropriate shocks a major predictor of twos is a high t:r ratio the ratio between the amplitudes of the t and r waves currently, patientselectrocardiograms ecgs are screened over 10 s to measure the t:r ratio to determine the patientseligibility for s-icd implantation due to temporal variations in the t:r ratio, 10 s is not a long enough window to reliably determine the normal values of a patients t:r ratio in this paper, we develop a convolutional neural network cnn based model utilising phase space reconstruction matrices to predict t:r ratios from 10-second ecg segments without explicitly locating the r or t waves, thus avoiding the issue of twos this tool can be used to automatically screen patients over a much longer period and provide an in-depth description of the behavior of the t:r ratio over that period the tool can also enable much more reliable and descriptive screenings to better assess patientseligibility for s-icd implantation",0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
172090,"ann-based qrs-complex analysis of ecg reliable detection the qrs complex in either a normal or an abnormal ecg and its analysis is the first and foremost task in almost every ecg signal analysis system aimed at the diagnostic interpretation of ecg conventionally, detection of the qrs complex is accomplished using a rule-based/algorithmic approach this work, uses the learn and generalize approach of an artificial neural network ann for the detection of qrs complexes in either a normal or an abnormal ecg this is followed by the analysis of the qrs complex to designate and measure the morphological components within the qrs complex in all 12 standard leads an ann has been developed to detect the qrs complex in ecg and trained, with the help of back propagation algorithm, on more than a hundred ecgs selected from the cse data set-3 the trained ann was tested on all the recordings of the cse data set-3 and the sensitivity has been found to be 9911% subsequent to the identification of the qrs complex, an analysis of this complex and measurement of peak amplitudes of the component waves is done the results are validated using the cse multilead measurement results both the qrs detection and the qrs analysis software developed in c-language have been successfully implemented on a pc-at the results are found to be in agreement with visual measurements carried out by medical experts",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1


In [102]:
## ENDOCRINE [C19] - no dm / endo

## text
text = ['acromegaly', 'adrenal', 'addisons', 'conns syn', 'cushings synd', 'cushings disease', 'thyroid', 'graves disease',
       'hashimoto', 'polycystic ovary', 'prolactin', 'pituitar', 'androgen', 'testosterone', 'gonadism', 'gonadal']

spec['endo_text'] = np.where(groups['text'].str.contains('endocrin'), "1", "0")

for x in text:
    spec['endo_text'] = np.where(groups['text'].str.contains(x), "1", spec['endo_text']) #if yes then 1, if no, keep current


## outputs
print('text counts:')
print(Counter(spec['endo_text']))

text counts:
Counter({'0': 33695, '1': 484})


In [103]:
#### DIABETES - all / dm

## text
text = ['diabet', 'mellitus', 'hypoglycemia', 'hypoglycaemi', 'hyperglycemi', 'hyperglycaemi', 'insulin', 'glucagon',
        'islet cell'
       ]

spec['dm_text'] = np.where(groups['text'].str.contains('diabetes'), "1", "0")

for x in text:
    spec['dm_text'] = np.where(groups['text'].str.contains(x), "1", spec['dm_text']) #if yes then 1, if no, keep current

spec['dm_text'] = np.where(groups['text'].str.contains("insipidus"), "0", spec['dm_text'])
spec['dm_text'] = np.where(groups['text'].str.contains('growth factor'), "0", spec['dm_text'])
spec['dm_text'] = np.where(groups['text'].str.contains(' igf'), "1", spec['dm_text'])

## output
print('text counts:')
print(Counter(spec['dm_text']))

text counts:
Counter({'0': 32897, '1': 1282})


In [104]:
#### DIABETES - insulin / insulin

spec['insulin_text'] = np.where(groups['text'].str.contains('insulin'), "1", "0")


spec['insulin_text'] = np.where(groups['text'].str.contains('growth factor'), "0", spec['insulin_text'])
spec['insulin_text'] = np.where(groups['text'].str.contains(' igf'), "0", spec['insulin_text'])

## output
print('text counts:')
print(Counter(spec['insulin_text']))

text counts:
Counter({'0': 34015, '1': 164})


In [105]:
spec[spec['insulin_text']=='1'].sample(20)

Unnamed: 0,text,icu_text,ed_text,id_text,sepsis_text,cov19_text,hiv_text,tb_text,tropic_text,malaria_text,derm_text,dermca_text,onc_text,rx_text,breast_text,breastca_text,lungca_text,brainca_text,gica_text,hepca_text,prosca_text,renalca_text,gynonc_text,haemonc_text,psych_text,suicide_text,msk_text,frac_text,rheum_text,gi_text,hep_text,resp_text,pneum_text,osa_text,pe_text,pubh_text,neuro_text,cva_text,epilep_text,alzh_text,cvs_text,ihd_text,hf_text,arrhyt_text,endo_text,dm_text,insulin_text
67296,"predictive models for diabetic retinopathy from non-image teleretinal screening data <b>introduction:</b> timely diabetic retinopathy detection remains a problem in medically underserved settings in the us; diabetic patients in these locales have limited access to eye specialists teleretinal screening programs have been introduced to address this problem <b>methods:</b> using data on ethnicity, gender, age, hemoglobin a1c, insulin dependence, time since last eye examination, subjective diabetes control, and years with diabetes from 27,116 diabetic patients participating in a los angeles county teleretinal screening program, we compared different machine learning methods for predicting retinopathy the dataset exhibited a class imbalance <b>results:</b> six classifiers learned on the data were predictive of retinopathy the best model had an auc of 0754, sensitivity of 58% and specificity of 80% <b>discussion:</b> successfully detecting retinopathy from diabetic patientsroutinely collected clinical data could help clinicians in medically underserved areas identify unscreened diabetic patients who are at risk of developing retinopathy this work is a step towards that goal",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
34025,"obesity in qatar: a case-control study on the identification of associated risk factors obesity is an emerging public health problem in the western world as well as in the gulf region qatar, a tiny wealthy county, is among the top-ranked obese countries with a high obesity rate among its population compared to qatars severity of this health crisis, only a limited number of studies focused on the systematic identification of potential risk factors using multimodal datasets this study aims to develop machine learning ml models to distinguish healthy from obese individuals and reveal potential risk factors associated with obesity in qatar we designed a case-control study focused on 500 qatari subjects, comprising 250 obese and 250 healthy individuals- the later forming the control group we obtained the most extensive collection of clinical measurements for the qatari population from the qatar biobank qbb repertoire, including i physio-clinical biomarkers, ii spirometry, iii vicorder, iv dxa scan composition, and v dxa scan densitometry readings we developed several machine learning ml models to distinguish healthy from obese individuals and applied multiple feature selection techniques to identify potential risk factors associated with obesity the proposed ml model achieved over 90% accuracy, thereby outperforming the existing state of the art models the outcome from the ablation study on multimodal clinical datasets revealed physio-clinical measurements as the most influential risk factors in distinguishing healthy versus obese subjects furthermore, multiple feature ranking techniques confirmed known obesity risk factors c-peptide, insulin, albumin, uric acid and identified potential risk factors linked to obesity-related comorbidities such as diabetes eg, hba1c, glucose, liver function eg, alkaline phosphatase, gamma-glutamyl transferase, lipid profile eg, triglyceride, low density lipoprotein cholesterol, high density lipoprotein cholesterol, etc most of the dxa measurements eg, bone area, bone mineral composition, bone mineral density, etc were significantly <i>p</i>-value < 005 higher in the obese group overall, the net effect of hypothesized protective factors of obesity on bone mass seems to have surpassed the hypothesized harmful factors all the identified factors warrant further investigation in a clinical setup to understand their role in obesity",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1
150524,"detection of hypoglycemic episodes in children with type 1 diabetes using an optimal bayesian neural network algorithm hypoglycemia or low blood glucose is a common and serious side effect of insulin therapy in patients with diabetes hypomon is a non-invasive monitor that measures some physiological parameters continuously to provide detection of hypoglycemic episodes in type 1 diabetes mellitus patients t1dm based on heart rate, corrected qt interval of the ecg signal and skin impedance, a bayesian neural network detection algorithm has been developed to recognize the presence of hypoglycemic episodes from a clinical study of 25 children with t1dm, associated with hypoglycemic episodes, their heart rates increased 1152+/-0157 vs 1035+/-0108, p<00001, their corrected qt intervals increased 1088+/-0086 vs 1020+/-0062, p<00001 and their skin impedances reduced significantly 0679+/-0195 vs 0837+/-0203, p<00001 the overall data were organized into a training set 14 cases and a test set 14 cases randomly selected using an optimal bayesian neural network with 11 hidden nodes, and an algorithm developed from the training set, a sensitivity of 08346 and specificity of 06388 were achieved for the test set",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1
122952,"a glucose model based on support vector regression for the prediction of hypoglycemic events under free-living conditions the prevention of hypoglycemic events is of paramount importance in the daily management of insulin-treated diabetes the use of short-term prediction algorithms of the subcutaneous sc glucose concentration may contribute significantly toward this direction the literature suggests that, although the recent glucose profile is a prominent predictor of hypoglycemia, the overall patients context greatly impacts its accurate estimation the objective of this study is to evaluate the performance of a support vector for regression svr sc glucose method on hypoglycemia prediction",0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
150510,"controlling blood glucose levels in diabetics by neural network predictor in this study we develop a system that uses some variables such as, level of exercise, stress, food intake, injected insulin and blood glucose level in previous intervals, as input and accurately predicts the blood glucose level in the next interval the system is split up to make separate prediction of blood glucose level in the morning, afternoon, evening and night, using data from one patient covering a period of 77 days we have used rbf neural network, and compared our result with mlp neural network that was implemented by the others the assessment of the analysis resulted in a root mean square error of 004+/-00004 mmol/l",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
64728,"minimizing postprandial hypoglycemia in type 1 diabetes patients using multiple insulin injections and capillary blood glucose self-monitoring with machine learning techniques diabetic patients treated with intensive insulin therapies require a tight glycemic control and may benefit from advanced tools to predict blood glucose bg concentration levels and hypo/hyperglycemia events prediction systems using machine learning techniques have mainly focused on applications for sensor augmented pump sap therapy in contrast, insulin bolus calculators that rely on bg prediction for multiple daily insulin mdi injections for patients under self-monitoring blood glucose smbg are scarce because of insufficient data sources and limited prediction capability of forecasting models",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
24524,"the use of machine learning techniques to determine the predictive value of inflammatory biomarkers in the development of type 2 diabetes mellitus <b><i>background:</i></b> certain inflammatory biomarkers, such as interleukin-6, interleukin-1, c-reactive protein crp, and fibrinogen, are prototypical acute-phase parameters that can also be predictors of cardiovascular disease however, this inflammatory response can also be linked to the development of type 2 diabetes mellitus t2dm <b><i>methods:</i></b> we performed a cross-sectional, retrospective study of hypertensive patients in an outpatient setting demographic, clinical, and laboratory parameters, such as the homeostatic model assessment of insulin resistance homa-ir, crp, and fibrinogen, were recorded the outcome was progression to overt t2dm over the 12-year observation period <b><i>results:</i></b> a total of 3,472 hypertensive patients were screened, but 1,576 individuals without t2dm were ultimately included in the analyses patients with elevated fibrinogen, crp, and insulin resistance had a significantly greater incidence of progression to t2dm during follow-up, 199 patients progressed to t2dm multivariate logistic regression analyses showed that body mass index odds ratio or 104, 95% confidence interval ci: 101-107, homa-ir or 113, 95% ci: 108-116, age or 105, 95% ci: 103-107, logcrp or 137, 95% ci: 114-155, and fibrinogen or 144, 95% ci: 123-166 were the most important predictors of progression to t2dm the area under the receiver operating characteristic curve auc of this model was 076 using machine learning methods, we built a model that included homa-ir, fibrinogen, and logcrp that was more accurate than the logistic regression model, with an auc of 09 <b><i>conclusion:</i></b> our results suggest that inflammatory biomarkers and homa-ir have a strong prognostic value in predicting progression to t2dm machine learning methods can provide more accurate results to better understand the implications of these features in terms of progression to t2dm a successful therapeutic approach based on these features can avoid progression to t2dm and thus improve long-term survival",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1
172023,"simulation studies on neural predictive control of glucose using the subcutaneous route a novel strategy for closed-loop control of glucose using subcutaneous sc tissue glucose measurement and sc infusion of monomeric insulin analogues was developed and evaluated in a simulation study the proposed control strategy is an amalgamation of a neural network and nonlinear model predictive control npc technique a radial basis function neural network was used for off-line system identification of nonlinear auto regressive model with exogenous inputs narx model of the glucoregulatory system the explicit narx model obtained from the off-line identification procedure was then used to predict the effects of future control actions numerical studies were carried out using a comprehensive model of glucose regulation the system identification procedure enabled construction of a parsimonious network from the stimulated data, and consequently, design of a controller using multiple-step-ahead predictions of the previously identified model according to the simulation results, stable control is achievable in the presence of large noise levels and for unknown or variable physiological or technical time delays in conclusion, the simulation results suggest that closed-loop control of glucose will be achievable using sc glucose measurement and sc insulin administration however, the control limitations due to the sc insulin administration makes additional action of the patient at meal time necessary",0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
55279,"a hybrid approach for modeling type 2 diabetes mellitus progression type 2 diabetes mellitus t2dm is a chronic, progressive metabolic disorder characterized by hyperglycemia resulting from abnormalities in insulin secretion, insulin action, or both it is associated with an increased risk of developing vascular complication of micro as well as macro nature because of its inconspicuous and heterogeneous character, the management of t2dm is very complex modeling physiological processes over time demonstrating the patients evolving health condition is imperative to comprehending the patients current status of health, projecting its likely dynamics and assessing the requisite care and treatment measures in future hidden markov model hmm is an effective approach for such prognostic modeling however, the nature of the clinical setting, together with the format of the electronic medical records emrs data, in particular the sparse and irregularly sampled clinical data which is well understood to present significant challenges, has confounded standard hmm in the present study, we proposed an approximation technique based on newtons divided difference method nddm as a component with hmm to determine the risk of developing diabetes in an individual over different time horizons using irregular and sparsely sampled emrs data the proposed method is capable of exploiting available sequences of clinical measurements obtained from a longitudinal sample of patients for effective imputation and improved prediction performance furthermore, results demonstrated that the discrimination capability of our proposed method, in prognosticating diabetes risk, is superior to the standard hmm",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
171982,"neural predictive controller for insulin delivery using the subcutaneous route a neural predictive controller for closed-loop control of glucose using subcutaneous sc tissue glucose measurement and sc infusion of monomeric insulin analogs was developed and evaluated in a simulation study the proposed control strategy is based on off-line system identification using neural networks nns and nonlinear model predictive controller design the system identification framework combines the concept of nonlinear autoregressive model with exogenous inputs narx system representation, regularization approach for constructing radial basis function nns, and validation methods for nonlinear systems numerical studies on system identification and closed-loop control of glucose were carried out using a comprehensive model of glucose regulation and a pharmacokinetic model for the absorption of monomeric insulin analogs from the sc depot the system identification procedure enabled construction of a parsimonious network from the simulated data, and consequently, design of a controller using multiple-step-ahead predictions of the previously identified model according to the simulation results, stable control is achievable in the presence of large noise levels, for unknown or variable time delays as well as for slow time variations of the controlled process however, the control limitations due to the sc insulin administration makes additional action from the patient at meal time necessary",0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1


In [106]:
#### DM RETINOPATHIES / retina

## text
spec['retina_text'] = np.where(groups['text'].str.contains('diabetic retin'), "1", "0")

spec['retina_text'] = np.where((groups['text'].str.contains("diabet")) &
                             (groups['text'].str.contains("retina")) , "1", spec['retina_text'])
spec['retina_text'] = np.where((groups['text'].str.contains("diabet")) &
                             (groups['text'].str.contains("retino")) , "1", spec['retina_text'])
spec['retina_text'] = np.where((groups['text'].str.contains("diabet")) &
                             (groups['text'].str.contains("eye")) , "1", spec['retina_text'])

print('text counts:')
print(Counter(spec['retina_text']))

text counts:
Counter({'0': 33794, '1': 385})


In [107]:
spec[spec['retina_text']=='1'].sample(20)

Unnamed: 0,text,icu_text,ed_text,id_text,sepsis_text,cov19_text,hiv_text,tb_text,tropic_text,malaria_text,derm_text,dermca_text,onc_text,rx_text,breast_text,breastca_text,lungca_text,brainca_text,gica_text,hepca_text,prosca_text,renalca_text,gynonc_text,haemonc_text,psych_text,suicide_text,msk_text,frac_text,rheum_text,gi_text,hep_text,resp_text,pneum_text,osa_text,pe_text,pubh_text,neuro_text,cva_text,epilep_text,alzh_text,cvs_text,ihd_text,hf_text,arrhyt_text,endo_text,dm_text,insulin_text,retina_text
54347,"automatic detection of rare pathologies in fundus photographs using few-shot learning in the last decades, large datasets of fundus photographs have been collected in diabetic retinopathy dr screening networks through deep learning, these datasets were used to train automatic detectors for dr and a few other frequent pathologies, with the goal to automate screening one challenge limits the adoption of such systems so far: automatic detectors ignore rare conditions that ophthalmologists currently detect, such as papilledema or anterior ischemic optic neuropathy the reason is that standard deep learning requires too many examples of these conditions however, this limitation can be addressed with few-shot learning, a machine learning paradigm where a classifier has to generalize to a new category not seen in training, given only a few examples of this category this paper presents a new few-shot learning framework that extends convolutional neural networks cnns, trained for frequent conditions, with an unsupervised probabilistic model for rare condition detection it is based on the observation that cnns often perceive photographs containing the same anomalies as similar, even though these cnns were trained to detect unrelated conditions this observation was based on the t-sne visualization tool, which we decided to incorporate in our probabilistic model experiments on a dataset of 164,660 screening examinations from the ophdiat screening network show that 37 conditions, out of 41, can be detected with an area under the roc curve auc greater than 08 average auc: 0938 in particular, this framework significantly outperforms other frameworks for detecting rare conditions, including multitask learning, transfer learning and siamese networks, another few-shot learning solution we expect these richer predictions to trigger the adoption of automated eye pathology screening, which will revolutionize clinical practice in ophthalmology",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1
23793,"precise higher-order reflectivity and morphology models for early diagnosis of diabetic retinopathy using oct images this study proposes a novel computer assisted diagnostic cad system for early diagnosis of diabetic retinopathy dr using optical coherence tomography oct b-scans the cad system is based on fusing novel oct markers that describe both the morphology/anatomy and the reflectivity of retinal layers to improve dr diagnosis this system separates retinal layers automatically using a segmentation approach based on an adaptive appearance and their prior shape information high-order morphological and novel reflectivity markers are extracted from individual segmented layers namely, the morphological markers are layer thickness and tortuosity while the reflectivity markers are the 1st-order reflectivity of the layer in addition to local and global high-order reflectivity based on markov-gibbs random field mgrf and gray-level co-occurrence matrix glcm, respectively the extracted image-derived markers are represented using cumulative distribution function cdf descriptors the constructed cdfs are then described using their statistical measures, ie, the 10th through 90th percentiles with a 10% increment for individual layer classification, each extracted descriptor of a given layer is fed to a support vector machine svm classifier with a linear kernel the results of the four classifiers are then fused using a backpropagation neural network bnn to diagnose each retinal layer for global subject diagnosis, classification outputs probabilities of the twelve layers are fused using another bnn to make the final diagnosis of the b-scan this system is validated and tested on 130 patients, with two scans for both eyes ie 260 oct images, with a balanced number of normal and dr subjects using different validation metrics: 2-folds, 4-folds, 10-folds, and leave-one-subject-out loso cross-validation approaches the performance of the proposed system was evaluated using sensitivity, specificity, f1-score, and accuracy metrics the systems performance after the fusion of these different markers showed better performance compared with individual markers and other machine learning fusion methods namely, it achieved formula: see text, formula: see text, formula: see text, and formula: see text, respectively, using the loso cross-validation technique the reported results, based on the integration of morphology and reflectivity markers and by using state-of-the-art machine learning classifications, demonstrate the ability of the proposed system to diagnose the dr early",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
74208,diagnostic accuracy of a device for the automated detection of diabetic retinopathy in a primary care setting to determine the diagnostic accuracy in a real-world primary care setting of a deep learning-enhanced device for automated detection of diabetic retinopathy dr,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
137086,"active learning for an efficient training strategy of computer-aided diagnosis systems: application to diabetic retinopathy screening the performance of computer-aided diagnosis cad systems can be highly influenced by the training strategy cad systems are traditionally trained using available labeled data, extracted from a specific data distribution or from public databases due to the wide variability of medical data, these databases might not be representative enough when the cad system is applied to data extracted from a different clinical setting, diminishing the performance or requiring more labeled samples in order to get better data generalization in this work, we propose the incorporation of an active learning approach in the training phase of cad systems for reducing the number of required training samples while maximizing the system performance the benefit of this approach has been evaluated using a specific cad system for diabetic retinopathy screening the results show that 1 using a training set obtained from a different data source results in a considerable reduction of the cad performance; and 2 using active learning the selected training set can be reduced from 1000 to 200 samples while maintaining an area under the receiver operating characteristic curve of 0856",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
69238,"topological data analysis of high resolution diabetic retinopathy images diabetic retinopathy is a complication of diabetes that produces changes in the blood vessel structure in the retina, which can cause severe vision problems and even blindness in this paper, we demonstrate that by identifying topological features in very high resolution retinal images, we can construct a classifier that discriminates between healthy patients and those with diabetic retinopathy using summary statistics of these features topological data analysis identifies the features as connected components and holes in the images and describes the extent to which they persist across the image these features are encoded in persistence diagrams, summaries of which can be used to discrimate between diabetic and healthy patients the method has the potential to be an effective automated screening tool, with high sensitivity and specificity",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
67277,"microaneurysms segmentation with a u-net based on recurrent residual convolutional neural network microaneurysms mas play an important role in the diagnosis of clinical diabetic retinopathy at the early stage annotation of mas manually by experts is laborious and so it is essential to develop automatic segmentation methods automatic ma segmentation remains a challenging task mainly due to the low local contrast of the image and the small size of mas a deep learning-based method called u-net has become one of the most popular methods for the medical image segmentation task we propose an architecture for u-net, named deep recurrent u-net dru-net, obtained by combining the deep residual model and recurrent convolutional operations into u-net in the ma segmentation task, dru-net can accumulate effective features much better than the typical u-net the proposed method is evaluated on two publicly available datasets: e-ophtha and idrid our results show that the proposed dru-net achieves the best performance with 09999 accuracy value and 09943 area under curve auc value on the e-ophtha dataset and on the idrid dataset, it has achieved 0987 auc value to our knowledge, this is the first result of segmenting mas on this dataset compared with other methods, such as u-net, fcnn, and resu-net, our architecture dru-net achieves state-of-the-art performance",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
64727,"classification of diabetes-related retinal diseases using a deep learning approach in optical coherence tomography spectral domain optical coherence tomography sd-oct is a volumetric imaging technique that allows measuring patterns between layers such as small amounts of fluid since 2012, automatic medical image analysis performance has steadily increased through the use of deep learning models that automatically learn relevant features for specific tasks, instead of designing visual features manually nevertheless, providing insights and interpretation of the predictions made by the model is still a challenge this paper describes a deep learning model able to detect medically interpretable information in relevant images from a volume to classify diabetes-related retinal diseases",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
14171,"analysis and comparison of two artificial intelligence diabetic retinopathy screening algorithms in a pilot study: idx-dr and retinalyze the prevalence of diabetic retinopathy dr is expected to increase this will put an increasing strain on health care resources recently, artificial intelligence-based, autonomous dr screening systems have been developed a direct comparison between different systems is often difficult and only two such comparisons have been published so far as different screening solutions are now available commercially, with more in the pipeline, choosing a system is not a simple matter based on the images gathered in a local dr screening program we performed a retrospective comparison of idx-dr and retinalyze",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
67936,"supervised machine learning based multi-task artificial intelligence classification of retinopathies artificial intelligence ai classification holds promise as a novel and affordable screening tool for clinical management of ocular diseases rural and underserved areas, which suffer from lack of access to experienced ophthalmologists may particularly benefit from this technology quantitative optical coherence tomography angiography octa imaging provides excellent capability to identify subtle vascular distortions, which are useful for classifying retinovascular diseases however, application of ai for differentiation and classification of multiple eye diseases is not yet established in this study, we demonstrate supervised machine learning based multi-task octa classification we sought 1 to differentiate normal from diseased ocular conditions, 2 to differentiate different ocular disease conditions from each other, and 3 to stage the severity of each ocular condition quantitative octa features, including blood vessel tortuosity bvt, blood vascular caliber bvc, vessel perimeter index vpi, blood vessel density bvd, foveal avascular zone faz area faz-a, and faz contour irregularity faz-ci were fully automatically extracted from the octa images a stepwise backward elimination approach was employed to identify sensitive octa features and optimal-feature-combinations for the multi-task classification for proof-of-concept demonstration, diabetic retinopathy dr and sickle cell retinopathy scr were used to validate the supervised machine leaning classifier the presented ai classification methodology is applicable and can be readily extended to other ocular diseases, holding promise to enable a mass-screening platform for clinical deployment and telemedicine",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1
105801,"computer-assisted identification of proliferative diabetic retinopathy in color retinal images advanced proliferative stage of diabetic retinopathy dr is indicated by the growth of thin, fragile and highly unregulated vessels, neovascularization nv in order to identify proliferative diabetic retinopathy pdr, our approach models the micro-pattern of local variations using texture based analysis and quantifies the structural changes in vessel patterns in localized patches, to map them to the confidence score of being neovascular using supervised learning framework rule-based criteria on patch-level neovascularity scores in an image is used for the decision of absence or presence of pdr evaluated using 3 datasets, our method achieves 96% sensitivity and 926% specificity for localizing nv image-level identification of pdr achieves high sensitivity of 9672% at 796% specificity and high specificity of 9650% at 7322% sensitivity our approach could have potential application in dr grading where it can localize nve regions and identify pdr images for immediate intervention",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1


In [108]:
## OPHTHALMOLOGY [C11] / eye

## text
text = ['ophth', 'retina', 'retino', 'retinitis', 'eye disease', 'uveitis', 'iritis', 'conjunctiv', 'cornea', 'blephar',
       'optic nerve', 'optic atrophy', 'optic disk', 'optic disc', 'optic neuropathy', 'choroid', 'blindness', 'macular',
       'strabismus', 'ocular', 'glaucoma']

spec['eye_text'] = np.where(groups['text'].str.contains('eye disease'), "1", "0")

for x in text:
    spec['eye_text'] = np.where(groups['text'].str.contains(x), "1", spec['eye_text']) #if yes then 1, if no, keep current

spec['eye_text'] = np.where((groups['text'].str.contains("eye")) &
                             (groups['text'].str.contains("optic")) , "1", spec['eye_text'])
spec['eye_text'] = np.where((groups['text'].str.contains("eye")) &
                             (groups['text'].str.contains("fundus")) , "1", spec['eye_text'])
spec['eye_text'] = np.where((groups['text'].str.contains("eye")) &
                             (groups['text'].str.contains("fundal")) , "1", spec['eye_text'])
    
## output
print('text counts:')
print(Counter(spec['eye_text']))

text counts:
Counter({'0': 32751, '1': 1428})


In [109]:
spec[spec['eye_text']=='1'].sample(5)

Unnamed: 0,text,icu_text,ed_text,id_text,sepsis_text,cov19_text,hiv_text,tb_text,tropic_text,malaria_text,derm_text,dermca_text,onc_text,rx_text,breast_text,breastca_text,lungca_text,brainca_text,gica_text,hepca_text,prosca_text,renalca_text,gynonc_text,haemonc_text,psych_text,suicide_text,msk_text,frac_text,rheum_text,gi_text,hep_text,resp_text,pneum_text,osa_text,pe_text,pubh_text,neuro_text,cva_text,epilep_text,alzh_text,cvs_text,ihd_text,hf_text,arrhyt_text,endo_text,dm_text,insulin_text,retina_text,eye_text
105801,"computer-assisted identification of proliferative diabetic retinopathy in color retinal images advanced proliferative stage of diabetic retinopathy dr is indicated by the growth of thin, fragile and highly unregulated vessels, neovascularization nv in order to identify proliferative diabetic retinopathy pdr, our approach models the micro-pattern of local variations using texture based analysis and quantifies the structural changes in vessel patterns in localized patches, to map them to the confidence score of being neovascular using supervised learning framework rule-based criteria on patch-level neovascularity scores in an image is used for the decision of absence or presence of pdr evaluated using 3 datasets, our method achieves 96% sensitivity and 926% specificity for localizing nv image-level identification of pdr achieves high sensitivity of 9672% at 796% specificity and high specificity of 9650% at 7322% sensitivity our approach could have potential application in dr grading where it can localize nve regions and identify pdr images for immediate intervention",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1
28237,"corneal edema visualization with optical coherence tomography using deep learning: proof of concept optical coherence tomography oct is essential for the diagnosis and follow-up of corneal edema, but assessment can be challenging in minimal or localized edema the objective was to develop and validate a novel automated tool to detect and visualize corneal edema with oct",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
22522,"the bayesian additive regression trees formula for safe machine learning-based intraocular lens predictions <b>purpose:</b> our work introduces a highly accurate, safe, and sufficiently explicable machine-learning artificial intelligence model of intraocular lens power iol translating into better post-surgical outcomes for patients with cataracts we also demonstrate its improved predictive accuracy over previous formulas <b>methods:</b> we collected retrospective eye measurement data on 5,331 eyes from 3,276 patients across multiple centers who received a lens implantation during cataract surgery the dependent measure is the post-operative manifest spherical equivalent error from intended and the independent variables are the patient- and eye-specific characteristics this dataset was split so that one subset was for formula construction and the other for validating our new formula data excluded fellow eyes, so as not to confound the prediction with bilateral eyes <b>results:</b> our formula is three times more precise than reported studies with a median absolute iol error of 0204 diopters d when converted to absolute predictive refraction errors on the cornea, the median error is 0137 d which is close to the iol manufacturer tolerance these estimates are validated out-of-sample and thus are expected to reflect the future performance of our prediction formula, especially since our data were collected from a wide variety of patients, clinics, and manufacturers <b>conclusion:</b> the increased precision of iol power calculations has the potential to optimize patient positive refractive outcomes our model also provides uncertainty plots that can be used in tandem with the clinicians expertise and previous formula output, further enhancing the safety <b>translational relavance:</b> our new machine learning process has the potential to significantly improve patient iol refractive outcomes safely",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
52855,predicting the likelihood of need for future keratoplasty intervention using artificial intelligence to apply artificial intelligence ai for automated identification of corneal condition and prediction of the likelihood of need for future keratoplasty intervention from optical coherence tomography oct-based corneal parameters,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
60425,utility of a public-available artificial intelligence in diagnosis of polypoidal choroidal vasculopathy to investigate the feasibility of training an artificial intelligence ai on a public-available ai platform to diagnose polypoidal choroidal vasculopathy pcv using indocyanine green angiography icga,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [110]:
## HAEMATOLOGIC [C15] / haem

## text
text = ['haematological cancer', 'hematological cancer', 'haematological malig', 'hematological malig', 'myelodysplas',
       'myeloprolif', 'lymphoprolif', 'leukaemia', 'leukemia', 'myelofibro', 'thrombocythemia', 'polycythemia vera',
       'polycythemia rubra vera', 'thrombocythaemia', 'polycythaemia vera', 'polycythaemia rubra vera', 'lymphoma',
       'myeloma', ' gvhd', 'stem cell transpl', 'bone marrow aspirate',
       'haematolog', 'anemia', 'anaemia', 'hemoglobin', 'haemoglobin', 'sickle cell', 'thalassemia', 'thalassaemia',
       'sickle crisis', 'clotting disorder', 'coagulation disorder', 'coagulopathy', 'hemophilia', 'haemophilia',
       'von willebrand', 'disseminated intrasvascular', 'thrombocytopeni', 'hemoly', 'haemoly', 'cryoglob', 'thrombim',
       'bone marrow', 'coagulation']

spec['haem_text'] = np.where(groups['text'].str.contains('hematolog'), "1", "0")

for x in text:
    spec['haem_text'] = np.where(groups['text'].str.contains(x), "1", spec['haem_text']) #if yes then 1, if no, keep current


## output
print('text counts:')
print(Counter(spec['haem_text']))

text counts:
Counter({'0': 33435, '1': 744})


In [111]:
## GYNAE/OBSTETRIC [C13] / obs

## text
text = ['obstetric', 'fetal', 'foetal', 'foetus', 'fetus', 'gestation', 'pregnan', 'endometriosis', 'ovarian', 'gynecolog', 'uterine', 'uterus'
       'cervix', 'pap smear', 'cervical cancer', 'cervical carcinoma', ' vagina ', 'vaginal', 'vaginosis', 'macrosomia', 'colposcop',
       'gynaecolog', 'menopaus', 'eclamp', ' iugr ', 'caesarean', 'endometrial']

spec['obs_text'] = np.where(groups['text'].str.contains('cesarean'), "1", "0")

for x in text:
    spec['obs_text'] = np.where(groups['text'].str.contains(x), "1", spec['obs_text']) #if yes then 1, if no, keep current


## output
print('text counts:')
print(Counter(spec['obs_text']))

text counts:
Counter({'0': 33140, '1': 1039})


In [112]:
## NEPHROLOGY [C12] AND UROLOGY / renal

## text
text = [' renal ', 'kidney', 'hemodialysis', 'haemodialysis', 'hemofilt', 'haemofilt', 'nephro', 'nephrit', 'glomerulus',
       'bladder', 'urethral']

spec['renal_text'] = np.where(groups['text'].str.contains('renovasc'), "1", "0")

for x in text:
    spec['renal_text'] = np.where(groups['text'].str.contains(x), "1", spec['renal_text']) #if yes then 1, if no, keep current


## output
print('text counts:')
print(Counter(spec['renal_text']))

text counts:
Counter({'0': 33427, '1': 752})


In [113]:
spec[spec['renal_text']=='1'].sample(5)

Unnamed: 0,text,icu_text,ed_text,id_text,sepsis_text,cov19_text,hiv_text,tb_text,tropic_text,malaria_text,derm_text,dermca_text,onc_text,rx_text,breast_text,breastca_text,lungca_text,brainca_text,gica_text,hepca_text,prosca_text,renalca_text,gynonc_text,haemonc_text,psych_text,suicide_text,msk_text,frac_text,rheum_text,gi_text,hep_text,resp_text,pneum_text,osa_text,pe_text,pubh_text,neuro_text,cva_text,epilep_text,alzh_text,cvs_text,ihd_text,hf_text,arrhyt_text,endo_text,dm_text,insulin_text,retina_text,eye_text,haem_text,obs_text,renal_text
88378,"a lasso method to identify protein signature predicting post-transplant renal graft survival identifying novel biomarkers to predict renal graft survival is important in post-transplant clinical practice serum creatinine, currently the most popular surrogate biomarker, offers limited information of the underlying allograft profiles it is known to perform unsatisfactorily to predict renal function in this paper, we apply a lasso machine-learning algorithm in the cox proportional hazards model to identify promising proteins that are associated with the hazard of allograft loss after renal transplantation, motivated by a clinical pilot study that collected 47 patients receiving renal transplants at the university of michigan hospital we assess the association of 17 proteins previously identified by cibrik et al 5 with allograft rejection in our regularized cox regression analysis, where the lasso variable selection method is applied to select important proteins that predict the hazard of allograft loss we also develop a post-selection inference to further investigate the statistical significance of the proteins on the hazard of allograft loss, and conclude that two proteins kim-1 and vegf-r2 are important protein markers for risk prediction",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
32516,"coupled mass-spectrometry-based lipidomics machine learning approach for early detection of clear cell renal cell carcinoma a discovery-based lipid profiling study of serum samples from a cohort that included patients with clear cell renal cell carcinoma ccrcc stages i, ii, iii, and iv <i>n</i> = 112 and controls <i>n</i> = 52 was performed using ultraperformance liquid chromatography coupled to quadrupole-time-of-flight mass spectrometry and machine learning techniques multivariate models based on support vector machines and the lasso variable selection method yielded two discriminant lipid panels for ccrcc detection and early diagnosis a 16-lipid panel allowed discriminating ccrcc patients from controls with 957% accuracy in a training set under cross-validation and 771% accuracy in an independent test set a second model trained to discriminate early i and ii from late iii and iv stage ccrcc yielded a panel of 26 compounds that classified stage i patients from an independent test set with 821% accuracy thirteen species, including cholic acid, undecylenic acid, lauric acid, lpc16:0/0:0, and pc18:2/18:2, identified with level 1 exhibited significantly lower levels in samples from ccrcc patients compared to controls moreover, 3α-hydroxy-5α-androstan-17-one 3-sulfate, <i>cis</i>-5-dodecenoic acid, arachidonic acid, <i>cis</i>-13-docosenoic acid, pi16:0/18:1, pc16:0/18:2, and pco-16:0/20:4 contributed to discriminate early from late ccrcc stage patients the results are auspicious for early ccrcc diagnosis after validation of the panels in larger and different cohorts",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
118293,"an active learning approach for rapid characterization of endothelial cells in human tumors currently, no available pathological or molecular measures of tumor angiogenesis predict response to antiangiogenic therapies used in clinical practice recognizing that tumor endothelial cells ec and ec activation and survival signaling are the direct targets of these therapies, we sought to develop an automated platform for quantifying activity of critical signaling pathways and other biological events in ec of patient tumors by histopathology computer image analysis of ec in highly heterogeneous human tumors by a statistical classifier trained using examples selected by human experts performed poorly due to subjectivity and selection bias we hypothesized that the analysis can be optimized by a more active process to aid experts in identifying informative training examples to test this hypothesis, we incorporated a novel active learning al algorithm into farsight image analysis software that aids the expert by seeking out informative examples for the operator to label the resulting farsight-al system identified ec with specificity and sensitivity consistently greater than 09 and outperformed traditional supervised classification algorithms the system modeled individual operator preferences and generated reproducible results using the results of ec classification, we also quantified proliferation ki67 and activity in important signal transduction pathways map kinase, stat3 in immunostained human clear cell renal cell carcinoma and other tumors farsight-al enables characterization of ec in conventionally preserved human tumors in a more automated process suitable for testing and validating in clinical trials the results of our study support a unique opportunity for quantifying angiogenesis in a manner that can now be tested for its ability to identify novel predictive and response biomarkers",0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
50125,"weakly-supervised convolutional neural networks of renal tumor segmentation in abdominal cta images renal cancer is one of the 10 most common cancers in human beings the laparoscopic partial nephrectomy lpn is an effective way to treat renal cancer localization and delineation of the renal tumor from pre-operative ct angiography cta is an important step for lpn surgery planning recently, with the development of the technique of deep learning, deep neural networks can be trained to provide accurate pixel-wise renal tumor segmentation in cta images however, constructing the training dataset with a large amount of pixel-wise annotations is a time-consuming task for the radiologists therefore, weakly-supervised approaches attract more interest in research",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
51684,noninvasive fuhrman grading of clear cell renal cell carcinoma using computed tomography radiomic features and machine learning to identify optimal classification methods for computed tomography ct radiomics-based preoperative prediction of clear cell renal cell carcinoma ccrcc grade,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [114]:
## ACUTE & CHRONIC KIDNEY DISEASE / ACKD

## text
spec['ackd_text'] = np.where(groups['text'].str.contains("acute kidney"), "1", "0")
spec['ackd_text'] = np.where(groups['text'].str.contains("acute renal"), "1", spec['ackd_text'])
spec['ackd_text'] = np.where(groups['text'].str.contains("kidney failure"), "1", spec['ackd_text'])
spec['ackd_text'] = np.where(groups['text'].str.contains("renal failure"), "1", spec['ackd_text'])
spec['ackd_text'] = np.where(groups['text'].str.contains("chronic kidney disease"), "1", spec['ackd_text'])
spec['ackd_text'] = np.where(groups['text'].str.contains("chronic renal disease"), "1", spec['ackd_text'])
spec['ackd_text'] = np.where(groups['text'].str.contains("stage kidney"), "1", spec['ackd_text'])
spec['ackd_text'] = np.where(groups['text'].str.contains("stage renal"), "1", spec['ackd_text'])

print('text counts:')
print(Counter(spec['ackd_text']))

text counts:
Counter({'0': 33917, '1': 262})


In [115]:
## PAEDIATRICS / paeds

## text
text = ['paedia', 'pedia', 'neonate', 'neonatal', 'teenage', 'youth', 'children', 'childhood', 'infant', 
       'newborn', 'baby', 'babies', 'toddler']

spec['paeds_text'] = np.where(groups['text'].str.contains(' child '), "1", "0")

for x in text:
    spec['paeds_text'] = np.where(groups['text'].str.contains(x), "1", spec['paeds_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['paeds_text']))

text counts:
Counter({'0': 32370, '1': 1809})


In [116]:
spec[spec['paeds_text']=='1'].sample(5)

Unnamed: 0,text,icu_text,ed_text,id_text,sepsis_text,cov19_text,hiv_text,tb_text,tropic_text,malaria_text,derm_text,dermca_text,onc_text,rx_text,breast_text,breastca_text,lungca_text,brainca_text,gica_text,hepca_text,prosca_text,renalca_text,gynonc_text,haemonc_text,psych_text,suicide_text,msk_text,frac_text,rheum_text,gi_text,hep_text,resp_text,pneum_text,osa_text,pe_text,pubh_text,neuro_text,cva_text,epilep_text,alzh_text,cvs_text,ihd_text,hf_text,arrhyt_text,endo_text,dm_text,insulin_text,retina_text,eye_text,haem_text,obs_text,renal_text,ackd_text,paeds_text
21566,"machine learning models on adc features to assess brain changes of children with pierre robin sequence in order to evaluate brain changes in young children with pierre robin sequence prs using machine learning based on apparent diffusion coefficient adc features, we retrospectively enrolled a total of 60 cases 42 in the training dataset and 18 in the testing dataset which included 30 prs and 30 controls from the childrens hospital affiliated to the nanjing medical university from january 2017-december 2019 there were 21 and nine prs cases in each dataset, with the remainder belonging to the control group in the same age range a total of 105 adc features were extracted from magnetic resonance imaging mri data features were pruned using least absolute shrinkage and selection operator lasso regression and seven adc features were developed as the optimal signatures for training machine learning models support vector machine svm achieved an area under the receiver operating characteristic curve auc of 099 for the training set and 085 for the testing set the auc of the multivariable logistic regression mlr and the adaboost for the training and validation dataset were 098/084 and 094/069, respectively based on the adc features, the two groups of cases ie, the prs group and the control group could be well-distinguished by the machine learning models, indicating that there is a significant difference in brain development between children with prs and normal controls",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2306,"a simple and robust methylation test for risk stratification of patients with juvenile myelomonocytic leukemia juvenile myelomonocytic leukemia jmml is a rare myelodysplastic/myeloproliferative neoplasm that develops during infancy and early childhood the array-based international consensus definition of dna methylation has recently classified patients with jmml into the following three groups: high methylation hm, intermediate methylation im, and low methylation lm to develop a simple and robust methylation clinical test, 137 patients with jmml have been analyzed using the digital restriction enzyme analysis of methylation dream, which is a next-generation sequencing based methylation analysis unsupervised consensus clustering of the discovery cohort n=99 using the dream data has identified hm and lm subgroups hm_dream, n=35; lm_dream; n=64 of the 98 cases that could be compared with the international consensus classification, 90 cases of hm n=30 and lm n=60 had 100% concordance with the dream clustering results for the remaining eight cases classified as the im group, four cases were classified into the hm_dream group and four cases into the lm_dream group a machine-learning classifier has been successfully constructed using a support vector machine svm, which divided the validation cohort n=38 into hm hm_svm; n=18 and lm lm_svm; n=20 groups patients with the hm_svm profile had a significantly poorer 5-year overall survival rate than those with the lm_svm profile in conclusion, a robust methylation test has been developed using the dream analysis for patients with jmml this simple and straightforward test can be easily incorporated in diagnosis to generate a methylation classification for patients so that they can receive risk-adapted treatment in the context of future clinical trials",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
121207,"automated down syndrome detection using facial photographs down syndrome, the most common single cause of human birth defects, produces alterations in physical growth and mental retardation; its early detection is crucial children with down syndrome generally have distinctive facial characteristics, which brings an opportunity for the computer-aided diagnosis of down syndrome using photographs of patients in this study, we propose a novel strategy based on machine learning techniques to detect down syndrome automatically a modified constrained local model is used to locate facial landmarks then geometric features and texture features based on local binary patterns are extracted around each landmark finally, down syndrome is detected using a variety of classifiers the best performance achieved 946% accuracy, 933% precision and 955% recall by using support vector machine with radial basis function kernel the results indicate that our method could assist in down syndrome screening effectively in a simple, non-invasive way",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4356,"machine learning for detection of correct peripherally inserted central catheter tip position from radiology reports in infants in critically ill infants, the position of a peripherally inserted central catheter picc must be confirmed frequently, as the tip may move from its original position and run the risk of hyperosmolar vascular damage or extravasation into surrounding spaces automated detection of picc tip position holds great promise for alerting bedside clinicians to noncentral piccs",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
67163,"monitoring disease progression with a quantitative severity scale for retinopathy of prematurity using deep learning retinopathy of prematurity rop is a leading cause of childhood blindness worldwide, but clinical diagnosis is subjective and qualitative",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1


In [117]:
## STOMATOGNATHIC, DENTAL [C07]  / dent

## text
text = [' dental', 'dentist', 'dentition', 'teeth', 'tooth', 'canine', 'incisor', 'molars', 'maxilla', 'mandibul', 'mandible',
       'stomatognathic', 'gingiva', 'buccal', 'peridont']

spec['dent_text'] = np.where(groups['text'].str.contains('maxillofacial'), "1", "0")

for x in text:
    spec['dent_text'] = np.where(groups['text'].str.contains(x), "1", spec['dent_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['dent_text']))

text counts:
Counter({'0': 33852, '1': 327})


In [118]:
spec[spec['dent_text']=='1'].sample(5)

Unnamed: 0,text,icu_text,ed_text,id_text,sepsis_text,cov19_text,hiv_text,tb_text,tropic_text,malaria_text,derm_text,dermca_text,onc_text,rx_text,breast_text,breastca_text,lungca_text,brainca_text,gica_text,hepca_text,prosca_text,renalca_text,gynonc_text,haemonc_text,psych_text,suicide_text,msk_text,frac_text,rheum_text,gi_text,hep_text,resp_text,pneum_text,osa_text,pe_text,pubh_text,neuro_text,cva_text,epilep_text,alzh_text,cvs_text,ihd_text,hf_text,arrhyt_text,endo_text,dm_text,insulin_text,retina_text,eye_text,haem_text,obs_text,renal_text,ackd_text,paeds_text,dent_text
1562,"artificial intelligence and infrared thermography as auxiliary tools in the diagnosis of temporomandibular disorder to assess three machine learning ml attribute extraction methods: radiomic, semantic and radiomic-semantic association on temporomandibular disorder tmd detection using infrared thermography it; and to determine which ml classifier, knn, svm and mlp, is the most efficient for this purpose",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
9362,"caries and restoration detection using bitewing film based on transfer learning with cnns caries is a dental disease caused by bacterial infection if the cause of the caries is detected early, the treatment will be relatively easy, which in turn prevents caries from spreading the current common procedure of dentists is to first perform radiographic examination on the patient and mark the lesions manually however, the work of judging lesions and markings requires professional experience and is very time-consuming and repetitive taking advantage of the rapid development of artificial intelligence imaging research and technical methods will help dentists make accurate markings and improve medical treatments it can also shorten the judgment time of professionals in addition to the use of gaussian high-pass filter and otsus threshold image enhancement technology, this research solves the problem that the original cutting technology cannot extract certain single teeth, and it proposes a caries and lesions area analysis model based on convolutional neural networks cnn, which can identify caries and restorations from the bitewing images moreover, it provides dentists with more accurate objective judgment data to achieve the purpose of automatic diagnosis and treatment planning as a technology for assisting precision medicine a standardized database established following a defined set of steps is also proposed in this study there are three main steps to generate the image of a single tooth from a bitewing image, which can increase the accuracy of the analysis model the steps include 1 preprocessing of the dental image to obtain a high-quality binarization, 2 a dental image cropping procedure to obtain individually separated tooth samples, and 3 a dental image masking step which masks the fine broken teeth from the sample and enhances the quality of the training among the current four common neural networks, namely, alexnet, googlenet, vgg19, and resnet50, experimental results show that the proposed alexnet model in this study for restoration and caries judgments has an accuracy as high as 9556% and 9030%, respectively these are promising results that lead to the possibility of developing an automatic judgment method of bitewing film",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
34369,"three-dimensional mandibular motion trajectory-tracking system based on bp neural network the aim of this study was to develop a prototype three-dimensional optical motion capture system based on binocular stereo vision, back-propagation bp neural network and 3d compen-sation method for accurate and real-time recording of mandibular movement a specialized 3d method of compensation to eliminate the involuntary vibration motions by human heart beating and respiration a kind of binocular visual 3d measurement method based on projection line and a calibration method based on bp neural network is proposed to solve the problem of the high complexity of camera calibration process and the low accuracy of 3d measurement the accuracy of the proposed system is systematically evaluated by means of electric platform and clinical trials, and the root-mean-square is 00773 mm finally, comparisons with state-of-the-art methods demonstrate that our system has higher reliability and accuracy meanwhile, the motion trajectory-tracking system is expected to be used in the diagnosis of clinical oral diseases and digital design of restoration",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
143555,"construction of a dental caries prediction model by data mining recently, the distribution of dental caries has been shown to be skewed, and precise prediction models cannot be obtained using all the data we applied a balancing technique to obtain more appropriate and robust models, and compared their accuracy with that of the conventional model the data were obtained from annual oral check-ups for schoolchildren conducted in japan five hundred children were followed from ages 5 to 8, and the three-year follow-up data were used the variables used were salivary levels of mutans streptococci and lactobacilli, 3-min stimulated saliva volume, salivary ph, fluoride usage, and frequency of consumption of sweet snacks and beverages initially, conventional models were constructed by logistic regression analysis, neural network a kind of prediction method, and decision analysis next, the balancing technique was used to construct new models, we randomly sampled the same number of subjects with and without new dental caries by repeated sampling, 10 models were constructed for each method application of the balancing technique resulted in the most robust model, with 073 sensitivity and 077 specificity obtained by c 50 analysis for data with a skewed distribution, the balancing method could be one of the important techniques for obtaining a suitable and robust prediction model for dental caries",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
4494,"development of an exosomal gene signature to detect residual disease in dogs with osteosarcoma using a novel xenograft platform and machine learning osteosarcoma has a guarded prognosis a major hurdle in developing more effective osteosarcoma therapies is the lack of disease-specific biomarkers to predict risk, prognosis, or therapeutic response exosomes are secreted extracellular microvesicles emerging as powerful diagnostic tools however, their clinical application is precluded by challenges in identifying disease-associated cargo from the vastly larger background of normal exosome cargo we developed a method using canine osteosarcoma in mouse xenografts to distinguish tumor-derived from host-response exosomal messenger rnas mrnas the model allows for the identification of canine osteosarcoma-specific gene signatures by rna sequencing and a species-differentiating bioinformatics pipeline an osteosarcoma-associated signature consisting of five gene transcripts ska2, neu1, paf1, psmg2, and nob1 was validated in dogs with spontaneous osteosarcoma by real-time quantitative reverse transcription pcr qrt-pcr, while a machine learning model assigned dogs into healthy or disease groups serum/plasma exosomes were isolated from 53 dogs in distinct clinical groups healthy, osteosarcoma, other bone tumor, or non-neoplastic disease pre-treatment samples from osteosarcoma cases were used as the training set, and a validation set from post-treatment samples was used for testing, classifying as osteosarcoma detected or osteosarcoma-not detected dogs in a validation set whose post-treatment samples were classified as osteosarcoma-not detected had longer remissions, up to 15 months after treatment in conclusion, we identified a gene signature predictive of molecular remissions with potential applications in the early detection and minimal residual disease settings these results provide proof of concept for our discovery platform and its utilization in future studies to inform cancer risk, diagnosis, prognosis, and therapeutic response",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [119]:
## AUDIOLOGY [C09] / audio

## text
text = ['audiology', ' ear disease', 'earache', 'labyrinth', 'otitis', 'otosclerosis', 'cochlear', 'tympanic memb',
       'otoscop', 'acoustic neuroma', 'meniere', 'hearing loss', 'hearing impairment', 'cholesteatoma', 'otoacoustic', 'deafness', ' deaf ',
       'middle ear', 'outer ear', 'inner ear', 'otolog', 'paroxysmal positional vertigo']

spec['audio_text'] = np.where(groups['text'].str.contains('hearing aid'), "1", "0")

for x in text:
    spec['audio_text'] = np.where(groups['text'].str.contains(x), "1", spec['audio_text']) #if yes then 1, if no, keep current

print('text counts:')
print(Counter(spec['audio_text']))

text counts:
Counter({'0': 34020, '1': 159})


In [120]:
spec[spec['audio_text']=='1'].sample(5)

Unnamed: 0,text,icu_text,ed_text,id_text,sepsis_text,cov19_text,hiv_text,tb_text,tropic_text,malaria_text,derm_text,dermca_text,onc_text,rx_text,breast_text,breastca_text,lungca_text,brainca_text,gica_text,hepca_text,prosca_text,renalca_text,gynonc_text,haemonc_text,psych_text,suicide_text,msk_text,frac_text,rheum_text,gi_text,hep_text,resp_text,pneum_text,osa_text,pe_text,pubh_text,neuro_text,cva_text,epilep_text,alzh_text,cvs_text,ihd_text,hf_text,arrhyt_text,endo_text,dm_text,insulin_text,retina_text,eye_text,haem_text,obs_text,renal_text,ackd_text,paeds_text,dent_text,audio_text
9512,"the use of explainable artificial intelligence to explore types of fenestral otosclerosis misdiagnosed when using temporal bone high-resolution computed tomography the purpose of this study was to explore the common characteristics of fenestral otosclerosis os which are misdiagnosed, and develop a deep learning model for the diagnosis of fenestral os based on temporal bone high-resolution computed tomography scans",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
166658,"the application of bionic wavelet transform to speech signal processing in cochlear implants using neural network simulations cochlear implants cis restore partial hearing to people with severe to profound sensorineural deafness; but there is still a marked performance gap in speech recognition between those who have received cochlear implant and people with a normal hearing capability one of the factors that may lead to this performance gap is the inadequate signal processing method used in cis this paper investigates the application of an improved signal-processing method called bionic wavelet transform bwt this method is based upon the auditory model and allows for signal processing comparing the neural network simulations on the same experimental materials processed by wavelet transform wt and bwt, the application of bwt to speech signal processing in ci has a number of advantages, including: improvement in recognition rates for both consonants and vowels, reduction of the number of required channels, reduction of the average stimulation duration for words, and high noise tolerance consonant recognition results in 15 normal hearing subjects show that the bwt produces significantly better performance than the wt t = -436276, p = 000065 the bwt has great potential to reduce the performance gap between ci listeners and people with a normal hearing capability in the future",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
5540,"environmental noise classification with inception-dense blocks for hearing aids hearing aids are increasingly essential for people with hearing loss for this purpose, environmental noise estimation and classification are some of the required technologies however, some noise classifiers utilize multiple audio features, which cause intense computation in addition, such noise classifiers employ inputs of different time lengths, which may affect classification performance thus, this paper proposes a model architecture for noise classification, and performs experiments with three different audio segment time lengths the proposed model attains fewer floating-point operations and parameters by utilizing the log-scaled mel-spectrogram as an input feature the proposed models are evaluated with classification accuracy, computational complexity, trainable parameters, and inference time on the urbansound8k dataset and hans dataset the experimental results showed that the proposed model outperforms other models on two datasets furthermore, compared with other models, the proposed model reduces model complexity and inference time while maintaining classification accuracy as a result, the proposed noise classification for hearing aids offers less computational complexity without compromising performance",0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
103805,"coregistered photoacoustic and ultrasound imaging and classification of ovarian cancer: ex vivo and in vivo studies most ovarian cancers are diagnosed at advanced stages due to the lack of efficacious screening techniques photoacoustic tomography pat has a potential to image tumor angiogenesis and detect early neovascular changes of the ovary we have developed a coregistered pat and ultrasound us prototype system for real-time assessment of ovarian masses features extracted from pat and us angular beams, envelopes, and images were input to a logistic classifier and a support vector machine svm classifier to diagnose ovaries as benign or malignant a total of 25 excised ovaries of 15 patients were studied and the logistic and svm classifiers achieved sensitivities of 704 and 877%, and specificities of 956 and 979%, respectively furthermore, the ovaries of two patients were noninvasively imaged using the pat/us system before surgical excision by using five significant features and the logistic classifier, 12 out of 14 images 86% sensitivity from a malignant ovarian mass and all 17 images 100% specificity from a benign mass were accurately classified; the svm correctly classified 10 out of 14 malignant images 71% sensitivity and all 17 benign images 100% specificity these initial results demonstrate the clinical potential of the pat/us technique for ovarian cancer diagnosis",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
12219,identifying genetic risk variants associated with noise-induced hearing loss based on a novel strategy for evaluating individual susceptibility the overall genetic profile for noise-induced hearing loss nihl remains elusive herein we proposed a novel machine learning ml based strategy to evaluate individual susceptibility to nihl and identify the underlying genetic risk variants based on a subsample of participants with extreme phenotypes,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [121]:
## BRAIN COMPUTER / bci

## text
spec['bci_text'] = np.where(groups['text'].str.contains("brain control"), "1", "0")
spec['bci_text'] = np.where(groups['text'].str.contains("brain computer"), "1", "0")

print('text counts:')
print(Counter(spec['bci_text']))

text counts:
Counter({'0': 34079, '1': 100})


In [122]:
## PROSTHESIS CONTROL / prosth

## text
spec['prosth_text'] = np.where(groups['text'].str.contains("prosthetic"), "1", "0")
spec['prosth_text'] = np.where(groups['text'].str.contains("prosthesis"), "1", spec['prosth_text'])

print('text counts:')
print(Counter(spec['prosth_text']))

text counts:
Counter({'0': 33915, '1': 264})


In [123]:
spec[spec['prosth_text']=='1'].sample(5)

Unnamed: 0,text,icu_text,ed_text,id_text,sepsis_text,cov19_text,hiv_text,tb_text,tropic_text,malaria_text,derm_text,dermca_text,onc_text,rx_text,breast_text,breastca_text,lungca_text,brainca_text,gica_text,hepca_text,prosca_text,renalca_text,gynonc_text,haemonc_text,psych_text,suicide_text,msk_text,frac_text,rheum_text,gi_text,hep_text,resp_text,pneum_text,osa_text,pe_text,pubh_text,neuro_text,cva_text,epilep_text,alzh_text,cvs_text,ihd_text,hf_text,arrhyt_text,endo_text,dm_text,insulin_text,retina_text,eye_text,haem_text,obs_text,renal_text,ackd_text,paeds_text,dent_text,audio_text,bci_text,prosth_text
62466,"applying deep artificial neural network approach to maxillofacial prostheses coloration maxillofacial prosthetic rehabilitation replaces missing structures to recover the function and aesthetics relating to facial defects or injuries deep learning is rapidly expanding with respect to applications in medical fields in this study, we apply the artificial neural network ann-based deep learning approach to coloration support for fabricating maxillofacial prostheses",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
32881,"development of a shoulder disarticulation prosthesis system intuitively controlled with the trunk surface electromyogram we developed an intuitively operational shoulder disarticulation prosthesis system that can be used without long-term training the developed system consisted of four degrees of freedom joints, as well as a user adapting control system based on a machine learning technique and surface electromyogram emg of the trunk we measured the surface emg of the trunk of healthy subjects at multiple points and analyzed through principal component analysis to identify the proper emg measurement portion of the trunk, which was determined to be distributed in the chest and back additionally, evaluation experiments demonstrated the capability of four healthy subjects to grasp and move objects in the horizontal as well as the vertical directions, using our developed system controlled via the emg of the chest and back moreover, we also quantitatively confirmed the ability of a bilateral shoulder disarticulation amputee to complete the evaluation experiment similar to healthy subjects",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
39944,automated detection of periprosthetic joint infections and data elements using natural language processing periprosthetic joint infection pji data elements are contained in both structured and unstructured documents in electronic health records and require manual data collection the goal of this study is to develop a natural language processing nlp algorithm to replicate manual chart review for pji data elements,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
142817,"decoding of individuated finger movements using surface electromyography upper limb prostheses are increasingly resembling the limbs they seek to replace in both form and functionality, including the design and development of multifingered hands and wrists hence, it becomes necessary to control large numbers of degrees of freedom dofs, required for individuated finger movements, preferably using noninvasive signals while existing control paradigms are typically used to drive a single-dof hook-based configurations, dexterous tasks such as individual finger movements would require more elaborate control schemes we show that it is possible to decode individual flexion and extension movements of each finger ten movements with greater than 90% accuracy in a transradial amputee using only noninvasive surface myoelectric signals further, comparison of decoding accuracy from a transradial amputee and able-bodied subjects shows no statistically significant difference p < 005 between these subjects these results are encouraging for the development of real-time control strategies based on the surface myoelectric signal to control dexterous prosthetic hands",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
149948,real-time intelligent pattern recognition algorithm for surface emg signals electromyography emg is the study of muscle function through the inquiry of electrical signals that the muscles emanate emg signals collected from the surface of the skin surface electromyogram: semg can be used in different applications such as recognizing musculoskeletal neural based patterns intercepted for hand prosthesis movements current systems designed for controlling the prosthetic hands either have limited functions or can only be used to perform simple movements or use excessive amount of electrodes in order to achieve acceptable results in an attempt to overcome these problems we have proposed an intelligent system to recognize hand movements and have provided a user assessment routine to evaluate the correctness of executed movements,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [124]:
## ASSISTIVE DEVICE CONTROL / assist

## text
spec['assist_text'] = np.where(groups['text'].str.contains("wheelchair"), "1", "0")
spec['assist_text'] = np.where(groups['text'].str.contains("scooter"), "1", "0")
spec['assist_text'] = np.where(groups['text'].str.contains("mobility device"), "1", "0")
spec['assist_text'] = np.where(groups['text'].str.contains("assistive device"), "1", "0")
spec['assist_text'] = np.where(groups['text'].str.contains("exoskeleton"), "1", "0")

print('text counts:')
print(Counter(spec['assist_text']))

text counts:
Counter({'0': 34109, '1': 70})


In [125]:
spec[spec['assist_text']=='1'].sample(15)

Unnamed: 0,text,icu_text,ed_text,id_text,sepsis_text,cov19_text,hiv_text,tb_text,tropic_text,malaria_text,derm_text,dermca_text,onc_text,rx_text,breast_text,breastca_text,lungca_text,brainca_text,gica_text,hepca_text,prosca_text,renalca_text,gynonc_text,haemonc_text,psych_text,suicide_text,msk_text,frac_text,rheum_text,gi_text,hep_text,resp_text,pneum_text,osa_text,pe_text,pubh_text,neuro_text,cva_text,epilep_text,alzh_text,cvs_text,ihd_text,hf_text,arrhyt_text,endo_text,dm_text,insulin_text,retina_text,eye_text,haem_text,obs_text,renal_text,ackd_text,paeds_text,dent_text,audio_text,bci_text,prosth_text,assist_text
88354,"implementation of a surface electromyography-based upper extremity exoskeleton controller using learning from demonstration upper-extremity exoskeletons have demonstrated potential as augmentative, assistive, and rehabilitative devices typical control of upper-extremity exoskeletons have relied on switches, force/torque sensors, and surface electromyography semg, but these systems are usually reactionary, and/or rely on entirely hand-tuned parameters semg-based systems may be able to provide anticipatory control, since they interface directly with muscle signals, but typically require expert placement of sensors on muscle bodies we present an implementation of an adaptive semg-based exoskeleton controller that learns a mapping between muscle activation and the desired system state during interaction with a user, generating a personalized semg feature classifier to allow for anticipatory control this system is robust to novice placement of semg sensors, as well as subdermal muscle shifts we validate this method with 18 subjects using a thumb exoskeleton to complete a book-placement task this learning-from-demonstration system for exoskeleton control allows for very short training times, as well as the potential for improvement in intent recognition over time, and adaptation to physiological changes in the user, such as those due to fatigue",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
65390,"rnn-based on-line continuous gait phase estimation from shank-mounted imus to control ankle exoskeletons several research groups have developed and studied powered ankle exoskeletons to improve energetics of healthy subjects and the mobility of elderly subjects, or to reduce asymmetry in gaits induced by strokes to achieve optimal effect, the timing of assistive torque has been proved to be of crucial importance previous studies estimated the onset timings mostly by extrapolating the time horizon from past gait events observed with sensors such methods have inherently limited performance when subjects are not walking at steady frequencies to overcome such limitation and allow the use of exoskeletons in various scenarios in a daily life, we propose to estimate the gait phase as a continuous variable progressing over a gait cycle, hence allowing immediate response to frequency changes rather than iteratively correcting it after each cycle our method uses recurrent neural networks to estimate gait phases out of an inertial measurement unit imu every 10 ms by replacing foot sensors with an imu we can obtain rich enough information to estimate gait phase continuously as well as avoid physical damage in sensors from ground impacts our preliminary tests with 2 healthy subjects showed qualitatively positive outcomes regarding the gait phase estimation and the assistive torque control",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
39334,"continuous estimation of knee joint angle based on surface electromyography using a long short-term memory neural network and time-advanced feature continuous joint angle estimation based on a surface electromyography semg signal can be used to improve the man-machine coordination performance of the exoskeleton in this study, we proposed a time-advanced feature and utilized long short-term memory lstm with a root mean square rms feature and its time-advanced feature rmstaf; collectively referred to as rrtaf of semg to estimate the knee joint angle to evaluate the effect of joint angle estimation, we used root mean square error rmse and cross-correlation coefficient <i>ρ</i> between the estimated angle and actual angle we also compared three methods ie, lstm using rms, bpnn back propagation neural network using rrtaf, and bpnn using rms with lstm using rrtaf to highlight its good performance five healthy subjects participated in the experiment and their eight muscle ie, rectus femoris rf, biceps femoris bf, semitendinosus st, gracilis gc, semimembranosus sm, sartorius sr, medial gastrocnemius mg, and tibialis anterior ta semg signals were taken as algorithm inputs moreover, the knee joint angles were used as target values the experimental results showed that, compared with lstm using rms, bpnn using rrtaf, and bpnn using rms, the average rmse values of lstm using rrtaf were respectively reduced by 857%, 4662%, and 6869%, whereas the average <i>ρ</i> values were respectively increased by 031%, 415%, and 1835% the results demonstrated that lstm using rrtaf, which contained the time-advanced feature, had better performance for estimating the knee joint motion",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
55778,"prediction of plantar forces during gait using wearable sensors and deep neural networks<sup></sup> to enable on-time and high-fidelity lower-limb exoskeleton control, it is effective to predict the future human motion from the observed status in this research, we propose a novel method to predict future plantar force during the gait using imu and plantar sensors deep neural networks dnn are used to learn the non-linear relationship between the measured sensor data and the future plantar force data using the trained network, we can predict the plantar force not only during walking but also at the start and end of walking in the experiments, the performance of the proposed method is confirmed for different prediction time",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
96518,"classifying three imaginary states of the same upper extremity using time-domain features brain-computer interface bci allows collaboration between humans and machines it translates the electrical activity of the brain to understandable commands to operate a machine or a device in this study, we propose a method to improve the accuracy of a 3-class bci using electroencephalographic eeg signals this bci discriminates rest against imaginary grasps and elbow movements of the same limb this classification task is challenging because imaginary movements within the same limb have close spatial representations on the motor cortex area the proposed method extracts time-domain features and classifies them using a support vector machine svm with a radial basis kernel function rbf an average accuracy of 742% was obtained when using the proposed method on a dataset collected, prior to this study, from 12 healthy individuals this accuracy was higher than that obtained when other widely used methods, such as common spatial patterns csp, filter bank csp fbcsp, and band power methods, were used on the same dataset these results are encouraging and the proposed method could potentially be used in future applications including bci-driven robotic devices, such as a portable exoskeleton for the arm, to assist individuals with impaired upper extremity functions in performing daily tasks",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
101171,"eeg classification for motor imagery and resting state in bci applications using multi-class adaboost extreme learning machine brain-computer interface bci systems provide an alternative communication and control approach for people with limited motor function therefore, the feature extraction and classification approach should differentiate the relative unusual state of motion intention from a common resting state in this paper, we sought a novel approach for multi-class classification in bci applications we collected electroencephalographic eeg signals registered by electrodes placed over the scalp during left hand motor imagery, right hand motor imagery, and resting state for ten healthy human subjects we proposed using the kolmogorov complexity kc for feature extraction and a multi-class adaboost classifier with extreme learning machine as base classifier for classification, in order to classify the three-class eeg samples an average classification accuracy of 795% was obtained for ten subjects, which greatly outperformed commonly used approaches thus, it is concluded that the proposed method could improve the performance for classification of motor imagery tasks for multi-class samples it could be applied in further studies to generate the control commands to initiate the movement of a robotic exoskeleton or orthosis, which finally facilitates the rehabilitation of disabled people",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
55818,"classification and transfer learning of eeg during a kinesthetic motor imagery task using deep convolutional neural networks the reliable classification of electroencephalography eeg signals is a crucial step towards making eeg-controlled non-invasive neuro-exoskeleton rehabilitation a practical reality eeg signals collected during motor imagery tasks have been proposed to act as a control signal for exoskeleton applications here, a deep convolutional neural network dcnn was optimized to classify a two-class kinesthetic motor imagery eeg dataset, leading to an optimized architecture consisting of four convolutional layers and three fully connected layers transfer learning, or the leveraging of data from past subjects to classify the intentions of a new subject, is important for rehabilitation as it helps to minimize the number of training sessions required from subjects who lack full motor functionality the transfer learning training paradigm investigated through this study utilized region criticality trends to reduce the number of new subject training sessions and increase the classification performance when compared against a single-subject non-transfer-learning classifier",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
49131,"characterization of forearm muscle activation in duchenne muscular dystrophy via high-density electromyography: a case study on the implications for myoelectric control duchenne muscular dystrophy dmd is a genetic disorder that results in progressive muscular degeneration although medical advances increased their life expectancy, dmd individuals are still highly dependent on caregivers hand/wrist function is central for providing independence, and robotic exoskeletons are good candidates for effectively compensating for deteriorating functionality robotic hand exoskeletons require the accurate decoding of motor intention typically via surface electromyography semg traditional low-density semg was used in the past to explore the muscular activations of individuals with dmd; however, it cannot provide high spatial resolution this study characterized, for the first time, the forearm high-density hd electromyograms of three individuals with dmd while performing seven hand/wrist-related tasks and compared them to eight healthy individuals all data available online we looked into the spatial distribution of hd-semg patterns by using principal component analysis pca and also assessed the repeatability and the amplitude distributions of muscle activity additionally, we used a machine learning approach to assess dmd individualspotentials for myocontrol our analysis showed that although participants with dmd were able to repeat similar hd-semg patterns across gestures similarly to healthy participants, a fewer number of electrodes was activated during their gestures compared to the healthy participants additionally, participants with dmd activated their muscles close to maximal contraction level 063 ± 023, whereas healthy participants had lower normalized activations 026 ± 02 lastly, participants with dmd showed on average fewer pcs 3, explaining 90% of the complete gesture space than the healthy 5 however, the ability of the dmd participants to produce repeatable hd-semg patterns was unexpectedly comparable to that of healthy participants, and the same holds true for their offline myocontrol performance, disproving our hypothesis and suggesting a clear potential for the myocontrol of wearable exoskeletons our findings present evidence for the first time on how dmd leads to progressive alterations in hand/wrist motor control in dmd individuals compared to healthy the better understanding of these alterations can lead to further developments for the intuitive and robust myoelectric control of active hand exoskeletons for individuals with dmd",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
60710,"myoelectric control of a soft hand exoskeleton using kinematic synergies soft hand exoskeletons offer a lightweight, low-profile alternative to rigid rehabilitative robotic systems, enabling their use to restore activities of daily living adl in those with hand paresis due to stroke or other conditions the hand exoskeleton with embedded synergies hexoes is a soft cable-driven hand exoskeleton capable of independently actuating and sensing 10 degrees of freedom dof of the hand control of the 10 dof exoskeleton is dimensionally reduced using three manually defined synergies in software corresponding to thumb, index, and 3-finger flexion and extension in this paper, five healthy subjects control hexoes using a neural network which decodes synergy weights from contralateral electromyography emg activity the three synergies are manipulated in real time to grasp and lift 15 adl objects of various sizes and weights the neural networks training and validation mean squared error, object grasp time, and grasp success rate were measured for five healthy subjects the final training error of the neural network was 48 ± 18% averaged across subjects and tasks, with 83 ± 34% validation error the time to reach, grasp, and lift an object was 1115 ± 435 s on average, with an average success rate of 667% across all objects the complete system demonstrates real time use of biosignals and machine learning to allow subjects to operate kinematic synergies to grasp objects using a wearable hand exoskeleton future work and applications are further discussed, including possible design improvements and enrollment of individuals with stroke",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
63793,"sub-optimally solving actuator redundancy in a hybrid neuroprosthetic system with a multi-layer neural network structure functional electrical stimulation fes has recently been proposed as a supplementary torque assist in lower-limb powered exoskeletons for persons with paraplegia in the combined system, also known as a hybrid neuroprosthesis, both fes-assist and the exoskeleton act to generate lower-limb torques to achieve standing and walking functions due to this actuator redundancy, we are motivated to optimally allocate fes-assist and exoskeleton torque based on a performance index that penalizes fes overuse to minimize muscle fatigue while also minimizing regulation or tracking errors traditional optimal control approaches need a system model to optimize; however, it is often difficult to formulate a musculoskeletal model that accurately predicts muscle responses due to fes in this paper, we use a novel identification and control structure that contains a recurrent neural network rnn and several feedforward neural networks fnns the rnn is trained by supervised learning to identify the system dynamics, while the fnns are trained by a reinforcement learning method to provide sub-optimal control actions the output layer of each fnn has its unique activation functions, so that the asymmetric constraint of fes and the symmetric constraint of exoskeleton motor control input can be realized this new structure is experimentally validated on a seated human participant using a single joint hybrid neuroprosthesis",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1


In [126]:
## HOME ACTIVITY / active

## text
spec['activity_text'] = np.where(groups['text'].str.contains("activity monitor"), "1", "0")
spec['activity_text'] = np.where(groups['text'].str.contains("activity detect"), "1", spec['activity_text'])
spec['activity_text'] = np.where(groups['text'].str.contains("activities monitor"), "1", spec['activity_text'])
spec['activity_text'] = np.where(groups['text'].str.contains("activities detect"), "1", spec['activity_text'])
spec['activity_text'] = np.where(groups['text'].str.contains("home environ"), "1", spec['activity_text'])
spec['activity_text'] = np.where(groups['text'].str.contains("fall detect"), "1", spec['activity_text'])
spec['activity_text'] = np.where(groups['text'].str.contains("fall monitor"), "1", spec['activity_text'])
spec['activity_text'] = np.where(groups['text'].str.contains("falls detect"), "1", spec['activity_text'])
spec['activity_text'] = np.where(groups['text'].str.contains("falls monitor"), "1", spec['activity_text'])
spec['activity_text'] = np.where(groups['text'].str.contains("daily activit"), "1", spec['activity_text'])
spec['activity_text'] = np.where(groups['text'].str.contains("activity classif"), "1", spec['activity_text'])
spec['activity_text'] = np.where(groups['text'].str.contains("daily living"), "1", spec['activity_text'])
spec['activity_text'] = np.where(groups['text'].str.contains("fall prevent"), "1", spec['activity_text'])
spec['activity_text'] = np.where(groups['text'].str.contains("falls in home"), "1", spec['activity_text'])
spec['activity_text'] = np.where(groups['text'].str.contains("falls at home"), "1", spec['activity_text'])
spec['activity_text'] = np.where(groups['text'].str.contains("home sensor"), "1", spec['activity_text'])

print('text counts:')
print(Counter(spec['activity_text']))

text counts:
Counter({'0': 33728, '1': 451})


In [127]:
spec[spec['activity_text']=='1'].sample(15)

Unnamed: 0,text,icu_text,ed_text,id_text,sepsis_text,cov19_text,hiv_text,tb_text,tropic_text,malaria_text,derm_text,dermca_text,onc_text,rx_text,breast_text,breastca_text,lungca_text,brainca_text,gica_text,hepca_text,prosca_text,renalca_text,gynonc_text,haemonc_text,psych_text,suicide_text,msk_text,frac_text,rheum_text,gi_text,hep_text,resp_text,pneum_text,osa_text,pe_text,pubh_text,neuro_text,cva_text,epilep_text,alzh_text,cvs_text,ihd_text,hf_text,arrhyt_text,endo_text,dm_text,insulin_text,retina_text,eye_text,haem_text,obs_text,renal_text,ackd_text,paeds_text,dent_text,audio_text,bci_text,prosth_text,assist_text,activity_text
97256,"training classifiers with shadow features for sensor-based human activity recognition in this paper, a novel training/testing process for building/using a classification model based on human activity recognition har is proposed traditionally, har has been accomplished by a classifier that learns the activities of a person by training with skeletal data obtained from a motion sensor, such as microsoft kinect these skeletal data are the spatial coordinates x, y, z of different parts of the human body the numeric information forms time series, temporal records of movement sequences that can be used for training a classifier in addition to the spatial features that describe current positions in the skeletal data, new features called shadow featuresare used to improve the supervised learning efficacy of the classifier shadow features are inferred from the dynamics of body movements, and thereby modelling the underlying momentum of the performed activities they provide extra dimensions of information for characterising activities in the classification process, and thereby significantly improve the classification accuracy two cases of har are tested using a classification model trained with shadow features: one is by using wearable sensor and the other is by a kinect-based remote sensor our experiments can demonstrate the advantages of the new method, which will have an impact on human activity detection research",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
97061,"principal component analysis can decrease neural networks performance for incipient falls detection: a preliminary study with hands and feet accelerations fall-related accidents constitute a major problem for elderly people and a burden to the health-care national system it is therefore important to design devices eg, accelerometers and machine learning algorithms able to recognize incipient falls as quickly and reliably as possible blind source separation bss methods are often used as a preprocessing step before classification, however the effects of bss on classification performance are not well understood the aim of this work is to preliminarily characterize the effect that two methods, namely principal and independent component analysis pca and ica and their combined use have on the performance of a neural network in detecting incipient falls we used the feet and arms 3d kinematics of subjects while managing unexpected perturbations during walking results show that pca needs to be used carefully as depending on the initial dataset, the pca might lump variance together thus impairing the performance of an artificial neural networks ann classifier the use of pca with 85% residual variance threshold significantly decreased the classifier performance, which was restored with a subsequent ica pca + ica the results suggest that bss techniques, though linear, might have an adverse effect on nonlinear classifiers such as ann that might be dependent on the initial dataset redundancy",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
71539,"accelerometer-based human fall detection using convolutional neural networks human falls are a global public health issue resulting in over 373 million severe injuries and 646,000 deaths yearly falls result in direct financial cost to health systems and indirectly to society productivity unsurprisingly, human fall detection and prevention are a major focus of health research in this article, we consider deep learning for fall detection in an iot and fog computing environment we propose a convolutional neural network composed of three convolutional layers, two maxpool, and three fully-connected layers as our deep learning model we evaluate its performance using three open data sets and against extant research our approach for resolving dimensionality and modelling simplicity issues is outlined accuracy, precision, sensitivity, specificity, and the matthews correlation coefficient are used to evaluate performance the best results are achieved when using data augmentation during the training process the paper concludes with a discussion of challenges and future directions for research in this domain",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
110383,"a personalized health-monitoring system for elderly by combining rules and case-based reasoning health-monitoring system for elderly in home environment is a promising solution to provide efficient medical services that increasingly interest by the researchers within this area it is often more challenging when the system is self-served and functioning as personalized provision this paper proposed a personalized self-served health-monitoring system for elderly in home environment by combining general rules with a case-based reasoning approach here, the system generates feedback, recommendation and alarm in a personalized manner based on elderlys medical information and health parameters such as blood pressure, blood glucose, weight, activity, pulse, etc a set of general rules has used to classify individual health parameters the case-based reasoning approach is used to combine all different health parameters, which generates an overall classification of health condition according to the evaluation result considering 323 cases and k=2 ie, top 2 most similar retrieved cases, the sensitivity, specificity and overall accuracy are achieved as 90%, 97% and 96% respectively the preliminary result of the system is acceptable since the feedback; recommendation and alarm messages are personalized and differ from the general messages thus, this approach could be possibly adapted for other situations in personalized elderly monitoring",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
20222,"early detection of freezing of gait during walking using inertial measurement unit and plantar pressure distribution data freezing of gait fog is a sudden and highly disruptive gait dysfunction that appears in mid to late-stage parkinsons disease pd and can lead to falling and injury a system that predicts freezing before it occurs or detects freezing immediately after onset would generate an opportunity for fog prevention or mitigation and thus enhance safe mobility and quality of life this research used accelerometer, gyroscope, and plantar pressure sensors to extract 861 features from walking data collected from 11 people with fog minimum-redundancy maximum-relevance and relief-f feature selection were performed prior to training boosted ensembles of decision trees the binary classification models identified total-fog or no fog states, wherein the total-fog class included data windows from 2 s before the fog onset until the end of the fog episode three feature sets were compared: plantar pressure, inertial measurement unit imu, and both plantar pressure and imu features the plantar-pressure-only model had the greatest sensitivity and the imu-only model had the greatest specificity the best overall model used the combination of plantar pressure and imu features, achieving 764% sensitivity and 862% specificity next, the total-fog class components were evaluated individually ie, pre-fog windows, freeze windows, transition windows between pre-fog and freeze the best model detected windows that contained both pre-fog and fog data with 852% sensitivity, which is equivalent to detecting fog less than 1 s after the freeze began windows of fog data were detected with 934% sensitivity the imu and plantar pressure feature-based model slightly outperformed models that used data from a single sensor type the model achieved early detection by identifying the transition from pre-fog to fog while maintaining excellent fog detection performance 934% sensitivity therefore, if used as part of an intelligent, real-time fog identification and cueing system, even if the pre-fog state were missed, the model would perform well as a freeze detection and cueing system that could improve the mobility and independence of people with pd during their daily activities",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
19961,"deep artificial neural network based on environmental sound data for the generation of a children activity classification model children activity recognition car is a subject for which numerous works have been developed in recent years, most of them focused on monitoring and safety commonly, these works use as data source different types of sensors that can interfere with the natural behavior of children, since these sensors are embedded in their clothes this article proposes the use of environmental sound data for the creation of a children activity classification model, through the development of a deep artificial neural network ann initially, the ann architecture is proposed, specifying its parameters and defining the necessary values for the creation of the classification model the ann is trained and tested in two ways: using a 70-30 approach 70% of the data for training and 30% for testing and with a k-fold cross-validation approach according to the results obtained in the two validation processes 70-30 splitting and k-fold cross validation, the ann with the proposed architecture achieves an accuracy of 9451% and 9419%, respectively, which allows to conclude that the developed model using the ann and its proposed architecture achieves significant accuracy in the children activity classification by analyzing environmental sound",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
79566,"automatic timed up-and-go sub-task segmentation for parkinsons disease patients using video-based activity classification the timed up-and-go tug test has been widely accepted as a standard assessment for measuring the basic functional mobility of patients with parkinsons disease several basic mobility sub-tasks sit, sit-to-stand, walk, turn, walk-back, and sit-back are included in a tug test it has been shown that the time costs of these sub-tasks are useful clinical parameters for the assessment of parkinsons disease several automatic methods have been proposed to segment and time these sub-tasks in a tug test however, these methods usually require either well-controlled environments for the tug video recording or information from special devices, such as wearable inertial sensors, ambient sensors, or depth cameras in this paper, an automatic tug sub-task segmentation method using video-based activity classification is proposed and validated in a study with 24 parkinsons disease patients videos used in this paper are recorded in semi-controlled environments with various backgrounds the state-of-the-art deep learning-base 2-d human pose estimation technologies are used for feature extraction a support vector machine and a long short-term memory network are then used for the activity classification and the subtask segmentation our method can be used to automatically acquire clinical parameters for the assessment of parkinsons disease using tug videos-only, leading to the possibility of remote monitoring of the patientscondition",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
137278,"pattern mining of multichannel semg for tremor classification tremor is defined as the involuntary rhythmic or quasi-rhythmic oscillation of a body part, resulting from alternating or simultaneous contractions of antagonistic muscle groups while tremor may be physiological, those who have disabling pathological tremors find that performing typical activities for daily living to be physically challenging and emotionally draining detecting the presence of tremor and its proper identification are crucial in prescribing the appropriate therapy to lessen its deleterious physical, emotional, psychological, and social impact while diagnosis relies heavily on clinical evaluation, pattern analysis of surface electromyogram semg signals can be a useful diagnostic aid for an objective identification of tremor types using semg system attached to several parts of the patients body while performing several tasks, this research aims to develop a classifier system that automates the process of tremor types recognition finding the optimal model and its corresponding parameters is not a straightforward process the resulting workflow, however, provides valuable information in understanding the interplay and impact of the different features and their parameters to the behavior and performance of the classifier system the resulting model analysis helps identify the necessary locations for the placement of semg electrodes and relevant features that have significant impact in the process of classification these information can help clinicians in streamlining the process of diagnosis without sacrificing its accuracy",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
39775,"mobile sensor based human activity recognition: distinguishing of challenging activities by applying long short-term memory deep learning modified by residual network concept automated recognition of daily human tasks is a novel method for continuous monitoring of the health of elderly people nowadays mobile devices ie smartphone and smartwatch are equipped with a variety of sensors, therefore activity classification algorithms have become as useful, low-cost, and non-invasive diagnostic modality to implement as mobile software the aim of this article is to introduce a new deep learning structure for recognizing challenging ie similar human activities based on signals which have been recorded by sensors mounted on mobile devices in the proposed structure, the residual network concept is engaged as a new substructure inside the main proposed structure this part is responsible to address the problem of accuracy saturation in convolutional neural networks, thanks to its ability in jump over some layers which leads to reducing vanishing gradients effect therefore the accuracy of the classification of several activities is increased by using the proposed structure performance of the proposed method is evaluated on real life recorded signals and is compared with existing techniques in two different scenarios the proposed structure is applied on two well-known human activity datasets that have been prepared in university of fordham the first dataset contains the recorded signals which arise from six different activities including walking, jogging, upstairs, downstairs, sitting, and standing the second dataset also contains walking, jogging, stairs, sitting, standing, eating soup, eating sandwich, and eating chips in the first scenario, the performance of the proposed structures is compared with deep learning schemes the obtained results show that the proposed method may improve the recognition rate at least 5% for the first dataset against its own family alternatives in distinguishing challenging activities ie downstairs and upstairs for the second data set similar improvements is obtained for some challenging activities ie eating sandwich and eating chips these superiorities even reach to at least 28% when the capability of the proposed method in recognizing downstairs and upstairs is compared to its non-family methods for the first dataset increasing the recognition rate of the proposed method for challenging activities ie downstairs and upstairs, eating sandwich and eating chips in parallel with its acceptable performance for other non-challenging activities shows its effectiveness in mobile sensor-based health monitoring systems",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
63856,"neural muscle activation detection: a deep learning approach using surface electromyography the timing of muscles activation which is a key parameter in determining plenty of medical conditions can be greatly assessed by the surface emg signal which inherently carries an immense amount of information many techniques for measuring muscle activity detection exist in the literature however, due to the complex nature of the emg signal as well as the interference from other muscles that is observed during the measurement of the emg signal, the accuracy of these techniques is compromised in this paper, we introduce the neural muscle activation detection nmad framework that detects the muscle activation based on deep learning the main motivation behind using deep learning is to allow the neural network to detect based on the appropriate signal features instead of depending on certain assumptions not only the presented approach significantly improves the accuracy of timing detection, but because of the training nature, it can adapt to operate under different levels of interference and signal-to-noise ratio",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [128]:
## combine

labelled['subspec_icu'] = np.where(spec['icu_text'].str.contains("1"), "1", "0")

labelled['subspec_ed'] = np.where(spec['ed_text'].str.contains("1"), "1", "0")

labelled['spec_paeds'] = np.where(spec['paeds_text'].str.contains("1"), "1", "0")

labelled['spec_dent'] = np.where(spec['dent_text'].str.contains("1"), "1", "0")

labelled['spec_audio'] = np.where(spec['audio_text'].str.contains("1"), "1", "0")

labelled['spec_id'] = np.where(spec['id_text'].str.contains("1"), "1", "0")

labelled['subspec_sepsis'] = np.where(spec['sepsis_text'].str.contains("1"), "1", "0")

labelled['subspec_hiv'] = np.where(spec['hiv_text'].str.contains("1"), "1", "0")

labelled['subspec_cov19'] = np.where(spec['cov19_text'].str.contains("1"), "1", "0")

labelled['subspec_tb'] = np.where(spec['tb_text'].str.contains("1"), "1", "0")

labelled['subspec_malaria'] = np.where(spec['malaria_text'].str.contains("1"), "1", "0")

labelled['subspec_tropic'] = np.where(spec['tropic_text'].str.contains("1"), "1", "0")

labelled['spec_derm'] = np.where(spec['derm_text'].str.contains("1"), "1", "0")

labelled['subspec_dermca'] = np.where(spec['dermca_text'].str.contains("1"), "1", "0")

labelled['spec_onc'] = np.where(spec['onc_text'].str.contains("1"), "1", "0")

labelled['subspec_rx'] = np.where(spec['rx_text'].str.contains("1"), "1", "0")

labelled['subspec_lungca'] = np.where(spec['lungca_text'].str.contains("1"), "1", "0")

labelled['subspec_brainca'] = np.where(spec['brainca_text'].str.contains("1"), "1", "0")

labelled['subspec_gica'] = np.where(spec['gica_text'].str.contains("1"), "1", "0")

labelled['subspec_hepca'] = np.where(spec['hepca_text'].str.contains("1"), "1", "0")

labelled['subspec_prosca'] = np.where(spec['prosca_text'].str.contains("1"), "1", "0")

labelled['subspec_gynonc'] = np.where(spec['gynonc_text'].str.contains("1"), "1", "0")

labelled['subspec_renalca'] = np.where(spec['renalca_text'].str.contains("1"), "1", "0")

labelled['subspec_haemonc'] = np.where(spec['haemonc_text'].str.contains("1"), "1", "0")

labelled['subspec_breast'] = np.where(spec['breast_text'].str.contains("1"), "1", "0")

labelled['subspec_breastca'] = np.where(spec['breastca_text'].str.contains("1"), "1", "0")

labelled['spec_psych'] = np.where(spec['psych_text'].str.contains("1"), "1", "0")

labelled['subspec_suicide'] = np.where(spec['suicide_text'].str.contains("1"), "1", "0")

labelled['spec_msk'] = np.where(spec['msk_text'].str.contains("1"), "1", "0")

labelled['subspec_frac'] = np.where(spec['frac_text'].str.contains("1"), "1", "0")

labelled['spec_rheum'] = np.where(spec['rheum_text'].str.contains("1"), "1", "0")

labelled['spec_gi'] = np.where(spec['gi_text'].str.contains("1"), "1", "0")

labelled['spec_hep'] = np.where(spec['hep_text'].str.contains("1"), "1", "0")

labelled['spec_resp'] = np.where(spec['resp_text'].str.contains("1"), "1", "0")

labelled['subspec_pneum'] = np.where(spec['pneum_text'].str.contains("1"), "1", "0")

labelled['subspec_osa'] = np.where(spec['osa_text'].str.contains("1"), "1", "0")

labelled['subspec_pe'] = np.where(spec['pe_text'].str.contains("1"), "1", "0")

labelled['spec_neuro'] = np.where(spec['neuro_text'].str.contains("1"), "1", "0")

labelled['subspec_epilep'] = np.where(spec['epilep_text'].str.contains("1"), "1", "0")

labelled['subspec_cva'] = np.where(spec['cva_text'].str.contains("1"), "1", "0")

labelled['subspec_alzh'] = np.where(spec['alzh_text'].str.contains("1"), "1", "0")

labelled['spec_cvs'] = np.where(spec['cvs_text'].str.contains("1"), "1", "0")

labelled['subspec_ihd'] = np.where(spec['ihd_text'].str.contains("1"), "1", "0")

labelled['subspec_hf'] = np.where(spec['hf_text'].str.contains("1"), "1", "0")

labelled['subspec_arrhyt'] =  np.where(spec['arrhyt_text'].str.contains("1"), "1", "0")

labelled['spec_endo'] = np.where(spec['endo_text'].str.contains("1"), "1", "0")

labelled['spec_dm'] = np.where(spec['dm_text'].str.contains("1"), "1", "0")

labelled['subspec_insulin'] = np.where(spec['insulin_text'].str.contains("1"), "1", "0")

labelled['spec_eye'] = np.where(spec['eye_text'].str.contains("1"), "1", "0")

labelled['subspec_retina'] = np.where(spec['retina_text'].str.contains("1"), "1", "0")

labelled['spec_haem'] = np.where(spec['haem_text'].str.contains("1"), "1", "0")

labelled['spec_obs'] = np.where(spec['obs_text'].str.contains("1"), "1", "0")

labelled['spec_renal'] = np.where(spec['renal_text'].str.contains("1"), "1", "0")

labelled['subspec_ackd'] = np.where(spec['ackd_text'].str.contains("1"), "1", "0")

labelled['spec_pubh'] = np.where(spec['pubh_text'].str.contains("1"), "1", "0")

labelled['subspec_bci'] = np.where(spec['bci_text'].str.contains("1"), "1", "0")

labelled['subspec_prosth'] = np.where(spec['prosth_text'].str.contains("1"), "1", "0")

labelled['subspec_assist'] = np.where(spec['assist_text'].str.contains("1"), "1", "0")

labelled['subspec_activity'] = np.where(spec['activity_text'].str.contains("1"), "1", "0")

#spec.to_csv('output/spec_tagged.csv')

  labelled['spec_haem'] = np.where(spec['haem_text'].str.contains("1"), "1", "0")
  labelled['spec_obs'] = np.where(spec['obs_text'].str.contains("1"), "1", "0")
  labelled['spec_renal'] = np.where(spec['renal_text'].str.contains("1"), "1", "0")
  labelled['subspec_ackd'] = np.where(spec['ackd_text'].str.contains("1"), "1", "0")
  labelled['spec_pubh'] = np.where(spec['pubh_text'].str.contains("1"), "1", "0")
  labelled['subspec_bci'] = np.where(spec['bci_text'].str.contains("1"), "1", "0")
  labelled['subspec_prosth'] = np.where(spec['prosth_text'].str.contains("1"), "1", "0")
  labelled['subspec_assist'] = np.where(spec['assist_text'].str.contains("1"), "1", "0")
  labelled['subspec_activity'] = np.where(spec['activity_text'].str.contains("1"), "1", "0")


In [129]:
## Why NER?
# non specific e.g. TB could be in the middle of a ward. NER recognises context
# words separate by unspecified distance -> lung and adenocarcinoma
## too many possible specific terms for subconditions e.g. lung adenocarcinoma, NSCLC -> adenocarcinoma of the lung

## Combination of general terms in main text
## NER for specific terms

## What are the most used **use-cases**
## Can we find what the prediction target is?

## Other Tags

In [130]:
#lmic_list = ['afghanistan', 'albania', 'algeria', 'angola', 'antigua', 'barbuda', 'argentina', 'armenia', 'china',
#             'azerbaijan', 'bangladesh', 'belarus', 'belize', 'benin', 'bhutan', 'bolivia', 'bosnia', 'herzegovina', 
#             'botswana', 'brazil', 'burkina', 'faso', 'burundi', 'verde', 'cambodia', 'cameroon', 'africa', 'chad', 
#             'colombia', 'comoros', 'congo', 'costa rica', 'ivoire', 'cuba', 'djibouti', 'dominica', 'dominica', 
#             'ecuador', 'egypt', 'salvador', 'guinea', 'eritrea', 'eswatini', 'ethiopia', 'fiji', 'gabon', 'gambia', 
#             'georgia', 'ghana', 'grenada', 'guatemala', 'guinea', 'guyana', 'haiti', 'honduras', 'india', 
#             'indonesia', 'iran', 'iraq', 'jamaica', 'jordan', 'kazakhstan', 'kenya', 'kiribati', 'dpr', 'north korea', 
#             'kosovo', 'kyrgyzstan', 'lao', 'lebanon', 'lesotho', 'liberia', 'libya', 'macedonia', 'madagascar', 'malawi', 
#             'malaysia', 'maldives', 'mali', 'marshall', 'mauritania', 'mauritius', 'mexico', 'micronesia', 'moldova', 
#             'mongolia', 'montenegro', 'montserrat', 'morocco', 'mozambique', 'myanmar', 'namibia', 'nauru', 'nepal', 
#             'nicaragua', 'niger', 'nigeria', 'niue', 'pakistan', 'palau', 'panama', 'papua', 'paraguay', 'peru', 
#             'philippines', 'rwanda', 'helena', 'samoa', 'príncipe', 'senegal', 'serbia', 'sierra leone', 'solomon', 
#             'somalia', 'south africa', 'sudan', 'sri lanka', 'saint lucia', 'saint vincent', 'grenadines', 'sudan', 
#             'suriname', 'syria', 'tajikistan', 'tanzania', 'thailand', 'timor', 'togo', 'tokelau', 'tonga', 'tunisia', 
#             'turkey', 'turkmenistan', 'tuvalu', 'uganda', 'ukraine', 'uzbekistan', 'vanuatu', 'venezuela', 'vietnam', 
#             'wallis', 'west bank', 'gaza', 'palestine', 'yemen', 'zambia', 'zimbabwe', 'low-income', 'middle-income', 
#             'lmic', 'scarce', 'resource limited', 'resource-limited']

## Final Tagged Dataset

In [131]:
#all_tagged = pd.concat([algo, feat, spec], axis=1)
#
print(len(labelled))

34179


In [132]:
labelled.info(verbose=1)

<class 'pandas.core.frame.DataFrame'>
Index: 34179 entries, 1 to 172538
Data columns (total 109 columns):
 #    Column               Dtype 
---   ------               ----- 
 0    pmid                 string
 1    doi                  string
 2    title                string
 3    abstract             string
 4    article_date         string
 5    pubmed_date          string
 6    article_type         string
 7    lang                 string
 8    journal              string
 9    journal_short        string
 10   journal_country      string
 11   authors              string
 12   author_affils        string
 13   keywords             string
 14   mesh_terms           string
 15   references_pmids     string
 16   feature              string
 17   include              string
 18   mature               string
 19   algo_neural_net      object
 20   algo_support_vector  object
 21   algo_regression      object
 22   algo_decision_tree   object
 23   algo_discriminant    object
 24   algo

In [133]:
labelled.head(10)

Unnamed: 0,pmid,doi,title,abstract,article_date,pubmed_date,article_type,lang,journal,journal_short,journal_country,authors,author_affils,keywords,mesh_terms,references_pmids,feature,include,mature,algo_neural_net,algo_support_vector,algo_regression,algo_decision_tree,algo_discriminant,algo_naive_bayes,algo_transfer,algo_federated,algo_k_nearest,algo_unsupervised,feat_xr,feat_ct,feat_mri,feat_eeg,feat_ecg,feat_emg,feat_us,feat_echo,feat_histo,feat_oct,feat_mamm,feat_endoscop,feat_derm,feat_gene,feat_bio,feat_nlp,feat_ehr,feat_sensor,feat_phone,feat_prom,feat_sound,subspec_icu,subspec_ed,spec_paeds,spec_dent,spec_audio,spec_id,subspec_sepsis,subspec_hiv,subspec_cov19,subspec_tb,subspec_malaria,subspec_tropic,spec_derm,subspec_dermca,spec_onc,subspec_rx,subspec_lungca,subspec_brainca,subspec_gica,subspec_hepca,subspec_prosca,subspec_gynonc,subspec_renalca,subspec_haemonc,subspec_breast,subspec_breastca,spec_psych,subspec_suicide,spec_msk,subspec_frac,spec_rheum,spec_gi,spec_hep,spec_resp,subspec_pneum,subspec_osa,subspec_pe,spec_neuro,subspec_epilep,subspec_cva,subspec_alzh,spec_cvs,subspec_ihd,subspec_hf,subspec_arrhyt,spec_endo,spec_dm,subspec_insulin,spec_eye,subspec_retina,spec_haem,spec_obs,spec_renal,subspec_ackd,spec_pubh,subspec_bci,subspec_prosth,subspec_assist,subspec_activity
1,34688173,10.1016/j.compbiomed.2021.104924,A convolutional neural network trained with dermoscopic images of psoriasis performed on par with 230 dermatologists.,Psoriasis is a common chronic inflammatory skin disease that causes physical and psychological burden to patients. A Convolutional Neural Network (CNN) focused on dermoscopic images would substantially aid the classification and increase the accuracy of diagnosis of psoriasis.,2021-10-06,2021-10-24,Journal Article,eng,Computers in biology and medicine,Comput Biol Med,United States,"['Yang Yiguang', 'Wang Juncheng', 'Xie Fengying', 'Liu Jie', 'Shu Chang', 'Wang Yukun', 'Zheng Yushan', 'Zhang Haopeng']","['Image Processing Center, School of Astronautics, Beihang University, Beijing, 100191, China; Beijing Advanced Innovation Center for Biomedical Engineering, Beihang University, Beijing, 100191, China.', 'Department of Dermatology, State Key Laboratory of Complex Severe and Rare Diseases, Peking Union Medical College Hospital, Chinese Academy of Medical Science and Peking Union Medical College, National Clinical Research Center for Dermatologic and Immunologic Diseases, Beijing, 100730, China.', 'Image Processing Center, School of Astronautics, Beihang University, Beijing, 100191, China; Beijing Advanced Innovation Center for Biomedical Engineering, Beihang University, Beijing, 100191, China. Electronic address: xfy_73@buaa.edu.cn.', 'Department of Dermatology, State Key Laboratory of Complex Severe and Rare Diseases, Peking Union Medical College Hospital, Chinese Academy of Medical Science and Peking Union Medical College, National Clinical Research Center for Dermatologic and Immunologic Diseases, Beijing, 100730, China. Electronic address: Liujie04672@pumch.cn.', 'Department of Dermatology, State Key Laboratory of Complex Severe and Rare Diseases, Peking Union Medical College Hospital, Chinese Academy of Medical Science and Peking Union Medical College, National Clinical Research Center for Dermatologic and Immunologic Diseases, Beijing, 100730, China.', 'Department of Dermatology, State Key Laboratory of Complex Severe and Rare Diseases, Peking Union Medical College Hospital, Chinese Academy of Medical Science and Peking Union Medical College, National Clinical Research Center for Dermatologic and Immunologic Diseases, Beijing, 100730, China.', 'Image Processing Center, School of Astronautics, Beihang University, Beijing, 100191, China; Beijing Advanced Innovation Center for Biomedical Engineering, Beihang University, Beijing, 100191, China.', 'Image Processing Center, School of Astronautics, Beihang University, Beijing, 100191, China; Beijing Advanced Innovation Center for Biomedical Engineering, Beihang University, Beijing, 100191, China.']","['Convolutional neural networks', 'Deep-learning', 'Dermoscopic images', 'Papulosquamous skin diseases', 'Psoriasis']",,,A convolutional neural network trained with dermoscopic images of psoriasis performed on par with 230 dermatologists. Psoriasis is a common chronic inflammatory skin disease that causes physical and psychological burden to patients. A Convolutional Neural Network (CNN) focused on dermoscopic images would substantially aid the classification and increase the accuracy of diagnosis of psoriasis.,1.0,1.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,34688172,10.1016/j.compbiomed.2021.104927,A large margin piecewise linear classifier with fusion of deep features in the diagnosis of COVID-19.,"The world has experienced epidemics of coronavirus infections several times over the last two decades. Recent studies have shown that using medical imaging techniques can be useful in developing an automatic computer-aided diagnosis system to detect pandemic diseases with high accuracy at an early stage. In this study, a large margin piecewise linear classifier was developed to diagnose COVID-19 compared to a wide range of viral pneumonia, including SARS and MERS, using chest x-ray images. In the proposed method, a preprocessing pipeline was employed. Moreover, deep pre- and post-rectified linear unit (ReLU) features were extracted using the well-known VGG-Net19, which was fine-tuned to optimize transfer learning. Afterward, the canonical correlation analysis was performed for feature fusion, and fused deep features were passed into the LMPL classifier. The introduced method reached the highest performance in comparison with related state-of-the-art methods for two different schemes (normal, COVID-19, and typical viral pneumonia) and (COVID-19, SARS, and MERS pneumonia) with 99.39% and 98.86% classification accuracy, respectively.",2021-10-11,2021-10-24,Journal Article,eng,Computers in biology and medicine,Comput Biol Med,United States,"['Azouji Neda', 'Sami Ashkan', 'Taheri Mohammad', 'Müller Henning']","['Department of Computer Science and Engineering and IT, School of Electrical and Computer Engineering, Shiraz University, Shiraz, Iran. Electronic address: azouji@shirazu.ac.ir.', 'Department of Computer Science and Engineering and IT, School of Electrical and Computer Engineering, Shiraz University, Shiraz, Iran. Electronic address: sami@shirazu.ac.ir.', 'Department of Computer Science and Engineering and IT, School of Electrical and Computer Engineering, Shiraz University, Shiraz, Iran. Electronic address: motaheri@shirazu.ac.ir.', 'Department of Business Information Systems University of Applied Sciences Western Switzerland, Sierre (HES SO), Switzerland. Electronic address: henning.mueller@hevs.ch.']","['COVID-19', 'Computer-aided diagnosis (CAD)', 'Deep feature extraction', 'Large margin classifier', 'MERS', 'SARS', 'X-ray']",,,"A large margin piecewise linear classifier with fusion of deep features in the diagnosis of COVID-19. The world has experienced epidemics of coronavirus infections several times over the last two decades. Recent studies have shown that using medical imaging techniques can be useful in developing an automatic computer-aided diagnosis system to detect pandemic diseases with high accuracy at an early stage. In this study, a large margin piecewise linear classifier was developed to diagnose COVID-19 compared to a wide range of viral pneumonia, including SARS and MERS, using chest x-ray images. In the proposed method, a preprocessing pipeline was employed. Moreover, deep pre- and post-rectified linear unit (ReLU) features were extracted using the well-known VGG-Net19, which was fine-tuned to optimize transfer learning. Afterward, the canonical correlation analysis was performed for feature fusion, and fused deep features were passed into the LMPL classifier. The introduced method reached the highest performance in comparison with related state-of-the-art methods for two different schemes (normal, COVID-19, and typical viral pneumonia) and (COVID-19, SARS, and MERS pneumonia) with 99.39% and 98.86% classification accuracy, respectively.",1.0,0.0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,34687858,10.1016/j.neuroimage.2021.118652,Causal Decoding of Individual Cortical Excitability States.,"Brain responsiveness to stimulation fluctuates with rapidly shifting cortical excitability state, as reflected by oscillations in the electroencephalogram (EEG). For example, the amplitude of motor-evoked potentials (MEPs) elicited by transcranial magnetic stimulation (TMS) of motor cortex changes from trial to trial. To date, individual estimation of the cortical processes leading to this excitability fluctuation has not been possible. Here, we propose a data-driven method to derive individually optimized EEG classifiers in healthy humans using a supervised learning approach that relates pre-TMS EEG activity dynamics to MEP amplitude. Our approach enables considering multiple brain regions and frequency bands, without defining them a priori, whose compound phase-pattern information determines the excitability. The individualized classifier leads to an increased classification accuracy of cortical excitability states from 57% to 67% when compared to μ-oscillation phase extracted by standard fixed spatial filters. Results show that, for the used TMS protocol, excitability fluctuates predominantly in the μ-oscillation range, and relevant cortical areas cluster around the stimulated motor cortex, but between subjects there is variability in relevant power spectra, phases, and cortical regions. This novel decoding method allows causal investigation of the cortical excitability state, which is critical also for individualizing therapeutic brain stimulation.",2021-10-20,2021-10-24,Journal Article,eng,NeuroImage,Neuroimage,United States,"['Metsomaa J', 'Belardinelli P', 'Ermolova M', 'Ziemann U', 'Zrenner C']","['Department of Neurology & Stroke, University of Tübingen, Tübingen, Germany; Hertie Institute for Clinical Brain Research, University of Tübingen.', 'Department of Neurology & Stroke, University of Tübingen, Tübingen, Germany; Hertie Institute for Clinical Brain Research, University of Tübingen; CIMeC, Center for Mind-Brain Sciences, University of Trento, Italy.', 'Department of Neurology & Stroke, University of Tübingen, Tübingen, Germany; Hertie Institute for Clinical Brain Research, University of Tübingen.', 'Department of Neurology & Stroke, University of Tübingen, Tübingen, Germany; Hertie Institute for Clinical Brain Research, University of Tübingen. Electronic address: ulf.ziemann@uni-tuebingen.de.', 'Department of Neurology & Stroke, University of Tübingen, Tübingen, Germany; Hertie Institute for Clinical Brain Research, University of Tübingen; Temerty Centre for Therapeutic Brain Intervention, Centre for Addiction and Mental Health, and Department of Psychiatry, University of Toronto, Toronto, ON, Canada.']","['EEG', 'TMS', 'brain state', 'classification', 'excitability', 'machine learning']",,,"Causal Decoding of Individual Cortical Excitability States. Brain responsiveness to stimulation fluctuates with rapidly shifting cortical excitability state, as reflected by oscillations in the electroencephalogram (EEG). For example, the amplitude of motor-evoked potentials (MEPs) elicited by transcranial magnetic stimulation (TMS) of motor cortex changes from trial to trial. To date, individual estimation of the cortical processes leading to this excitability fluctuation has not been possible. Here, we propose a data-driven method to derive individually optimized EEG classifiers in healthy humans using a supervised learning approach that relates pre-TMS EEG activity dynamics to MEP amplitude. Our approach enables considering multiple brain regions and frequency bands, without defining them a priori, whose compound phase-pattern information determines the excitability. The individualized classifier leads to an increased classification accuracy of cortical excitability states from 57% to 67% when compared to μ-oscillation phase extracted by standard fixed spatial filters. Results show that, for the used TMS protocol, excitability fluctuates predominantly in the μ-oscillation range, and relevant cortical areas cluster around the stimulated motor cortex, but between subjects there is variability in relevant power spectra, phases, and cortical regions. This novel decoding method allows causal investigation of the cortical excitability state, which is critical also for individualizing therapeutic brain stimulation.",1.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,34687853,10.1016/j.mri.2021.10.024,Radiomic machine learning for pretreatment assessment of prognostic risk factors for endometrial cancer and its effects on radiologists' decisions of deep myometrial invasion.,To evaluate radiomic machine learning (ML) classifiers based on multiparametric magnetic resonance images (MRI) in pretreatment assessment of endometrial cancer (EC) risk factors and to examine effects on radiologists' interpretation of deep myometrial invasion (dMI).,2021-10-20,2021-10-24,Journal Article,eng,Magnetic resonance imaging,Magn Reson Imaging,Netherlands,"['Otani Satoshi', 'Himoto Yuki', 'Nishio Mizuho', 'Fujimoto Koji', 'Moribata Yusaku', 'Yakami Masahiro', 'Kurata Yasuhisa', 'Hamanishi Junzo', 'Ueda Akihiko', 'Minamiguchi Sachiko', 'Mandai Masaki', 'Kido Aki']","['Department of Diagnostic Imaging and Nuclear Medicine, Graduate School of Medicine, Kyoto University, Kyoto 606-8507, Japan.', 'Department of Diagnostic Radiology and Nuclear Medicine, Kyoto University Hospital, Kyoto 606-8507, Japan. Electronic address: yhimoto@kuhp.kyoto-u.ac.jp.', 'Department of Diagnostic Imaging and Nuclear Medicine, Graduate School of Medicine, Kyoto University, Kyoto 606-8507, Japan.', 'Department of Real World Data Research and Developmentx, Graduate School of Medicine, Kyoto University, Kyoto 606-8507, Japan.', 'Department of Diagnostic Radiology and Nuclear Medicine, Kyoto University Hospital, Kyoto 606-8507, Japan; Preemptive Medicine and Lifestyle-related Disease Research Center, Kyoto University Hospital, Kyoto 606-8507, Japan.', 'Preemptive Medicine and Lifestyle-related Disease Research Center, Kyoto University Hospital, Kyoto 606-8507, Japan.', 'Department of Diagnostic Radiology and Nuclear Medicine, Kyoto University Hospital, Kyoto 606-8507, Japan.', 'Department of Gynecology and Obstetrics, Kyoto University, Kyoto 606-8507, Japan.', 'Department of Gynecology and Obstetrics, Kyoto University, Kyoto 606-8507, Japan.', 'Department of Diagnostic Pathology, Kyoto University, Kyoto 606-8507, Japan.', 'Department of Gynecology and Obstetrics, Kyoto University, Kyoto 606-8507, Japan.', 'Department of Diagnostic Radiology and Nuclear Medicine, Kyoto University Hospital, Kyoto 606-8507, Japan.']","['Endometrial cancer', 'Radiomic machine learning']",,,Radiomic machine learning for pretreatment assessment of prognostic risk factors for endometrial cancer and its effects on radiologists' decisions of deep myometrial invasion. To evaluate radiomic machine learning (ML) classifiers based on multiparametric magnetic resonance images (MRI) in pretreatment assessment of endometrial cancer (EC) risk factors and to examine effects on radiologists' interpretation of deep myometrial invasion (dMI).,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10,34687850,10.1016/j.mri.2021.10.023,MRI-based machine learning for determining quantitative and qualitative characteristics affecting the survival of glioblastoma multiforme.,Our current study aims to consider the image biomarkers extracted from the MRI images for exploring their effects on glioblastoma multiforme (GBM) patients' survival. Determining its biomarker helps better manage the disease and evaluate treatments. It has been proven that imaging features could be used as a biomarker. The purpose of this study is to investigate the features in MRI and clinical features as the biomarker association of survival of GBM.,2021-10-20,2021-10-24,Journal Article,eng,Magnetic resonance imaging,Magn Reson Imaging,Netherlands,"['Jajroudi Mahdie', 'Enferadi Milad', 'Homayoun Amir Azar', 'Reiazi Reza']","['Pharmaceutical Research Center, Mashhad University of Medical Sciences, Mashhad, Iran. Electronic address: Jajroudimh991@mums.ac.ir.', 'Research Center for Nuclear Medicine, Shariati Hospital, Tehran University of Medical Sciences, Tehran, Iran.', 'Sina Trauma Research Center, Tehran University of Medical Sciences, Tehran, Iran.', 'Radiation Medicine Program, Princess Margaret Cancer Centre, University Health Network, Toronto, Ontario, Canada. Electronic address: reza.reiazi@uhnresearch.ca.']","['Biomarker', 'Clinical features', 'Glioblastoma multiforme', 'MRI features', 'Machine learning']",,,MRI-based machine learning for determining quantitative and qualitative characteristics affecting the survival of glioblastoma multiforme. Our current study aims to consider the image biomarkers extracted from the MRI images for exploring their effects on glioblastoma multiforme (GBM) patients' survival. Determining its biomarker helps better manage the disease and evaluate treatments. It has been proven that imaging features could be used as a biomarker. The purpose of this study is to investigate the features in MRI and clinical features as the biomarker association of survival of GBM.,1.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
14,34687347,10.1007/s00330-021-08284-z,Automated detection of the contrast phase in MDCT by an artificial neural network improves the accuracy of opportunistic bone mineral density measurements.,To determine the accuracy of an artificial neural network (ANN) for fully automated detection of the presence and phase of iodinated contrast agent in routine abdominal multidetector computed tomography (MDCT) scans and evaluate the effect of contrast correction for osteoporosis screening.,2021-10-23,2021-10-24,Journal Article,eng,European radiology,Eur Radiol,Germany,"['Rühling Sebastian', 'Navarro Fernando', 'Sekuboyina Anjany', 'El Husseini Malek', 'Baum Thomas', 'Menze Bjoern', 'Braren Rickmer', 'Zimmer Claus', 'Kirschke Jan S']","['Department of Neuroradiology, School of Medicine, Klinikum rechts der Isar, Technical University of Munich, Ismaninger Str 22, 81675, Munich, Germany.', 'Department of Neuroradiology, School of Medicine, Klinikum rechts der Isar, Technical University of Munich, Ismaninger Str 22, 81675, Munich, Germany.', 'Department of Neuroradiology, School of Medicine, Klinikum rechts der Isar, Technical University of Munich, Ismaninger Str 22, 81675, Munich, Germany.', 'Department of Neuroradiology, School of Medicine, Klinikum rechts der Isar, Technical University of Munich, Ismaninger Str 22, 81675, Munich, Germany.', 'Department of Neuroradiology, School of Medicine, Klinikum rechts der Isar, Technical University of Munich, Ismaninger Str 22, 81675, Munich, Germany.', 'Department of Informatics, Technical University of Munich, Munich, Germany.', 'Department of Diagnostic and Interventional Radiology, School of Medicine, Klinikum rechts der Isar, Technical University of Munich, Munich, Germany.', 'Department of Neuroradiology, School of Medicine, Klinikum rechts der Isar, Technical University of Munich, Ismaninger Str 22, 81675, Munich, Germany.', 'Department of Neuroradiology, School of Medicine, Klinikum rechts der Isar, Technical University of Munich, Ismaninger Str 22, 81675, Munich, Germany. jan.kirschke@tum.de.']","['Bone density', 'Machine learning', 'Multidetector computed tomography', 'Osteoporosis', 'Screening']",,,Automated detection of the contrast phase in MDCT by an artificial neural network improves the accuracy of opportunistic bone mineral density measurements. To determine the accuracy of an artificial neural network (ANN) for fully automated detection of the presence and phase of iodinated contrast agent in routine abdominal multidetector computed tomography (MDCT) scans and evaluate the effect of contrast correction for osteoporosis screening.,1.0,0.0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
21,34686914,10.1007/s00467-021-05321-3,Posterior Urethral Valves Outcomes Prediction (PUVOP): a machine learning tool to predict clinically relevant outcomes in boys with posterior urethral valves.,Early kidney and anatomic features may be predictive of future progression and need for additional procedures in patients with posterior urethral valve (PUV). The objective of this study was to use machine learning (ML) to predict clinically relevant outcomes in these patients.,2021-10-22,2021-10-24,Journal Article,eng,"Pediatric nephrology (Berlin, Germany)",Pediatr Nephrol,Germany,"['Kwong Jethro Cc', 'Khondker Adree', 'Kim Jin Kyu', 'Chua Michael', 'Keefe Daniel T', 'Dos Santos Joana', 'Skreta Marta', 'Erdman Lauren', ""D'Souza Neeta"", 'Selman Antoine Fermin', 'Weaver John', 'Weiss Dana A', 'Long Christopher', 'Tasian Gregory', 'Teoh Chia Wei', 'Rickard Mandy', 'Lorenzo Armando J']","['Division of Urology, Department of Surgery, University of Toronto, Toronto, ON, Canada.', 'Division of Urology, Department of Surgery, Hospital for Sick Children, 555 University Avenue, Toronto, ON, M5G 1X8, Canada.', 'Division of Urology, Department of Surgery, University of Toronto, Toronto, ON, Canada.', 'Division of Urology, Department of Surgery, Hospital for Sick Children, 555 University Avenue, Toronto, ON, M5G 1X8, Canada.', 'Division of Urology, Department of Surgery, Hospital for Sick Children, 555 University Avenue, Toronto, ON, M5G 1X8, Canada.', 'Division of Urology, Department of Surgery, Hospital for Sick Children, 555 University Avenue, Toronto, ON, M5G 1X8, Canada.', 'Centre for Computational Medicine, The Hospital for Sick Children, Toronto, ON, Canada.', 'Centre for Computational Medicine, The Hospital for Sick Children, Toronto, ON, Canada.', ""Division of Urology, Children's Hospital of Philadelphia, Philadelphia, PA, USA."", ""Division of Urology, Children's Hospital of Philadelphia, Philadelphia, PA, USA."", ""Division of Urology, Children's Hospital of Philadelphia, Philadelphia, PA, USA."", ""Division of Urology, Children's Hospital of Philadelphia, Philadelphia, PA, USA."", ""Division of Urology, Children's Hospital of Philadelphia, Philadelphia, PA, USA."", ""Division of Urology, Children's Hospital of Philadelphia, Philadelphia, PA, USA."", 'Division of Nephrology, Hospital for Sick Children, Toronto, ON, Canada.', 'Division of Urology, Department of Surgery, Hospital for Sick Children, 555 University Avenue, Toronto, ON, M5G 1X8, Canada.', 'Division of Urology, Department of Surgery, University of Toronto, Toronto, ON, Canada. armando.lorenzo@sickkids.ca.']","['Catheterization', 'Chronic kidney disease', 'Dialysis', 'Machine learning', 'Posterior urethral valve', 'Transplant']",,,Posterior Urethral Valves Outcomes Prediction (PUVOP): a machine learning tool to predict clinically relevant outcomes in boys with posterior urethral valves. Early kidney and anatomic features may be predictive of future progression and need for additional procedures in patients with posterior urethral valve (PUV). The objective of this study was to use machine learning (ML) to predict clinically relevant outcomes in these patients.,1.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
25,34686815,10.1038/s41415-021-3526-6,The ADEPT study: a comparative study of dentists' ability to detect enamel-only proximal caries in bitewing radiographs with and without the use of AssistDent artificial intelligence software.,"Introduction Reversal of enamel-only proximal caries by non-invasive treatments is important in preventive dentistry. However, detecting such caries using bitewing radiography is difficult and the subtle patterns are often missed by dental practitioners.Aims To investigate whether the ability of dentists to detect enamel-only proximal caries is enhanced by the use of AssistDent artificial intelligence (AI) software.Materials and methods In the ADEPT (AssistDent Enamel-only Proximal caries assessmenT) study, 23 dentists were randomly divided into a control arm, without AI assistance, and an experimental arm, in which AI assistance provided on-screen prompts indicating potential enamel-only proximal caries. All participants analysed a set of 24 bitewings in which an expert panel had previously identified 65 enamel-only carious lesions and 241 healthy proximal surfaces.Results The control group found 44.3% of the caries, whereas the experimental group found 75.8%. The experimental group incorrectly identified caries in 14.6% of the healthy surfaces compared to 3.7% in the control group. The increase in sensitivity of 71% and decrease in specificity of 11% are statistically significant (p <0.01).Conclusions AssistDent AI software significantly improves dentists' ability to detect enamel-only proximal caries and could be considered as a tool to support preventive dentistry in general practice.",2021-10-22,2021-10-24,Journal Article,eng,British dental journal,Br Dent J,England,"['Devlin Hugh', 'Williams Tomos', 'Graham Jim', 'Ashley Martin']","['Professor of Restorative Dentistry, Division of Dentistry, School of Medical Sciences, University of Manchester, UK; Director, Manchester Imaging Ltd, UK.', 'Honorary Research Assistant, Division of Dentistry, School of Medical Sciences, University of Manchester, UK; Software Manager, Manchester Imaging Ltd, UK. tomos.williams@manchester.ac.uk.', 'Director, Manchester Imaging Ltd, UK; Honorary Reader, Division of Informatics, Imaging and Data Sciences, School of Health Sciences, University of Manchester, UK.', 'Consultant and MAHSC Honorary Professor in Restorative Dentistry and Oral Health, University Dental Hospital of Manchester, Manchester University NHS Foundation Trust, UK.']",,,,"The ADEPT study: a comparative study of dentists' ability to detect enamel-only proximal caries in bitewing radiographs with and without the use of AssistDent artificial intelligence software. Introduction Reversal of enamel-only proximal caries by non-invasive treatments is important in preventive dentistry. However, detecting such caries using bitewing radiography is difficult and the subtle patterns are often missed by dental practitioners.Aims To investigate whether the ability of dentists to detect enamel-only proximal caries is enhanced by the use of AssistDent artificial intelligence (AI) software.Materials and methods In the ADEPT (AssistDent Enamel-only Proximal caries assessmenT) study, 23 dentists were randomly divided into a control arm, without AI assistance, and an experimental arm, in which AI assistance provided on-screen prompts indicating potential enamel-only proximal caries. All participants analysed a set of 24 bitewings in which an expert panel had previously identified 65 enamel-only carious lesions and 241 healthy proximal surfaces.Results The control group found 44.3% of the caries, whereas the experimental group found 75.8%. The experimental group incorrectly identified caries in 14.6% of the healthy surfaces compared to 3.7% in the control group. The increase in sensitivity of 71% and decrease in specificity of 11% are statistically significant (p <0.01).Conclusions AssistDent AI software significantly improves dentists' ability to detect enamel-only proximal caries and could be considered as a tool to support preventive dentistry in general practice.",1.0,1.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
29,34686646,10.1097/CMR.0000000000000774,Machine learning for the identification of decision boundaries during the transition from radial to vertical growth phase superficial spreading melanomas.,"To compute threshold values for the diameter of superficial spreading melanomas (SSMs) at which the radial growth phase (RGP) evolves into an invasive vertical growth phase (VGP). We examined reports from 1995 to 2019 of 834 primary SSMs. All the patients underwent complete surgical removal of the tumor and the diagnosis was confirmed after histologic examination. Machine learning was used to compute the thresholds. For invasive non-naevus-associated SSMs, a threshold for the diameter was found at 13.2 mm (n = 634). For the lower limb (n = 209) the threshold was at 9.8 mm, whereas for the upper limb (n = 117) at 14.1 mm. For the back (n = 106) and the trunk (n = 173), the threshold was at 16.2 mm and 17.1 mm, respectively. When considering non-naevus-associated and naevus-associated SSMs together (n = 834) a threshold for the diameter was found at 16.8 mm. For the lower limb (n = 248) the threshold was at 11.7 mm, whereas for the upper limb (n = 146) at 16.4 mm. For the back (n = 170) and the trunk (n = 236), the threshold was at 18.6 mm and 14.1 mm, respectively. Thresholds for various anatomic locations and for each gender were defined. They were based on the diameter of the melanoma and computed to suggest a transition from RGP to VGP. The transition from a radial to a more invasive vertical phase is detected by an increase of tumor size with a numeric cutoff. Besides the anamnestic, clinical and dermatoscopic findings, our proposed approach may have practical relevance in vivo during clinical presurgical inspections.",2021-10-21,2021-10-24,Journal Article,eng,Melanoma research,Melanoma Res,England,"['Moglia Andrea', 'Cerri Amilcare', 'Moglia Alessandra', 'Berchiolli Raffaella', 'Ferrari Mauro', 'Betti Roberto']",,,,,"Machine learning for the identification of decision boundaries during the transition from radial to vertical growth phase superficial spreading melanomas. To compute threshold values for the diameter of superficial spreading melanomas (SSMs) at which the radial growth phase (RGP) evolves into an invasive vertical growth phase (VGP). We examined reports from 1995 to 2019 of 834 primary SSMs. All the patients underwent complete surgical removal of the tumor and the diagnosis was confirmed after histologic examination. Machine learning was used to compute the thresholds. For invasive non-naevus-associated SSMs, a threshold for the diameter was found at 13.2 mm (n = 634). For the lower limb (n = 209) the threshold was at 9.8 mm, whereas for the upper limb (n = 117) at 14.1 mm. For the back (n = 106) and the trunk (n = 173), the threshold was at 16.2 mm and 17.1 mm, respectively. When considering non-naevus-associated and naevus-associated SSMs together (n = 834) a threshold for the diameter was found at 16.8 mm. For the lower limb (n = 248) the threshold was at 11.7 mm, whereas for the upper limb (n = 146) at 16.4 mm. For the back (n = 170) and the trunk (n = 236), the threshold was at 18.6 mm and 14.1 mm, respectively. Thresholds for various anatomic locations and for each gender were defined. They were based on the diameter of the melanoma and computed to suggest a transition from RGP to VGP. The transition from a radial to a more invasive vertical phase is detected by an increase of tumor size with a numeric cutoff. Besides the anamnestic, clinical and dermatoscopic findings, our proposed approach may have practical relevance in vivo during clinical presurgical inspections.",1.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
31,34686573,10.1136/neurintsurg-2021-017976,"Prediction of bleb formation in intracranial aneurysms using machine learning models based on aneurysm hemodynamics, geometry, location, and patient population.",Bleb presence in intracranial aneurysms (IAs) is a known indication of instability and vulnerability.,2021-10-22,2021-10-24,Journal Article,eng,Journal of neurointerventional surgery,J Neurointerv Surg,England,"['Salimi Ashkezari Seyedeh Fatemeh', 'Mut Fernando', 'Slawski Martin', 'Cheng Boyle', 'Yu Alexander K', 'White Tim G', 'Woo Henry H', 'Koch Matthew J', 'Amin-Hanjani Sepideh', 'Charbel Fady T', 'Rezai Jahromi Behnam', 'Niemelä Mika', 'Koivisto Timo', 'Frosen Juhana', 'Tobe Yasutaka', 'Maiti Spandan', 'Robertson Anne M', 'Cebral Juan R']","['Department of Bioengineering, George Mason University, Fairfax, Virginia, USA ssalimia@gmu.edu.', 'Department of Bioengineering, George Mason University, Fairfax, Virginia, USA.', 'Department of Statistics, George Mason University, Fairfax, Virginia, USA.', 'Department of Neurosurgery, Allegheny General Hospital, Pittsburgh, Pennsylvania, USA.', 'Department of Neurosurgery, Allegheny General Hospital, Pittsburgh, Pennsylvania, USA.', 'Department of Neurosurgery, Donald and Barbara Zucker School of Medicine at Hofstra/Northwell, Manhasset, New York, USA.', 'Department of Neurosurgery, Donald and Barbara Zucker School of Medicine at Hofstra/Northwell, Manhasset, New York, USA.', 'Department of Neurosurgery, University of Illinois at Chicago, Chicago, Illinois, USA.', 'Department of Neurosurgery, University of Illinois at Chicago, Chicago, Illinois, USA.', 'Department of Neurosurgery, University of Illinois at Chicago, Chicago, Illinois, USA.', 'Neurosurgery Research Group, Biomedicum Helsinki, University of Helsinki, Helsinki, Uusimaa, Finland.', 'Department of Neurosurgery, Töölö Hospital, University of Helsinki, Helsinki, Finland.', 'Department of Neurosurgery, Kuopio University Hospital, Kuopio, Pohjois-Savo, Finland.', 'Department of Neurosurgery, Tampere University Hospital, Tampere, Finland.', 'Department of Mechanical Engineering and Material Science, University of Pittsburgh, Pittsburgh, Pennsylvania, USA.', 'Department of Mechanical Engineering and Material Science, University of Pittsburgh, Pittsburgh, Pennsylvania, USA.', 'Department of Mechanical Engineering and Material Science, University of Pittsburgh, Pittsburgh, Pennsylvania, USA.', 'Department of Bioengineering, George Mason University, Fairfax, Virginia, USA.']","['aneurysm', 'blood flow', 'hemorrhage', 'statistics']",,,"Prediction of bleb formation in intracranial aneurysms using machine learning models based on aneurysm hemodynamics, geometry, location, and patient population. Bleb presence in intracranial aneurysms (IAs) is a known indication of instability and vulnerability.",1.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [134]:
#final_ner = all_tagged[['text', 
#                        'algo_neural_net', 'algo_support_vector', 'algo_regression', 'algo_decision_tree', 
#                       'algo_discriminant', 'algo_naive_bayes', 'algo_transfer', 'algo_federated', 'algo_k_nearest',
#                       'algo_unsupervised',
#                        'feat_imaging', 'feat_xr', 'feat_ct', 'feat_mri', 'feat_eeg', 'feat_ecg',
#                       'feat_us', 'feat_echo', 'feat_histo', 'feat_oct', 'feat_mamm', 'feat_endoscop', 'feat_derm',
#                       'feat_gene', 'feat_bio', 'feat_nlp', 'feat_ehr', 'feat_sensor', 'feat_phone', 
#                        'subspec_icu', 'subspec_ed', 'spec_id', 'subspec_sepsis', 'subspec_hiv', 'subspec_cov19', 'subspec_tb',
#                       'subspec_malaria', 'spec_derm', 'subspec_dermca', 'spec_onc', 'subspec_rx', 'subspec_gynonc', 
#                       'subspec_lungca', 'subspec_brainca', 'subspec_gica', 'subspec_hepca', 'subspec_prosca',
#                       'subspec_renalca', 'subspec_haemonc', 'subspec_breast', 'spec_psych','subspec_suicide', 'spec_msk', 
#                        'subspec_frac', 'spec_rheum', 'spec_gi', 'spec_hep', 'spec_resp', 'subspec_pneum',
#                        'spec_neuro', 'subspec_epilep', 'subspec_cva', 'subspec_alzh', 'spec_cvs', 'subspec_ihd', 'subspec_hf', 
#                       'spec_endo', 'subspec_dm', 'spec_eye', 'subspec_retina', 'spec_haem', 'spec_obs', 'spec_renal', 
#                        'subspec_ackd', 'spec_paeds', 'spec_dent',  'spec_audio', 'spec_pubh', 'subspec_bci',
#                       'subspec_prosth', 'subspec_assist','subspec_activity', 'subspec_arrhyt', 'countries', 'lmic_flag']].copy()
#
#final_ner.to_csv('output/final_ner.csv')

In [135]:
labelled.to_csv('data/char_labelled.csv')

## Evaluation

In [164]:
ner_eval = labelled.drop(['doi', 'title', 'abstract', 'article_date', 'pubmed_date', 'article_type', 'lang', 'journal', 'journal_short',
                         'journal_country', 'authors', 'author_affils', 'keywords', 'mesh_terms', 'references_pmids', 'include', 'mature'], axis=1)

In [165]:
ner_eval.head(3)

Unnamed: 0,pmid,feature,algo_neural_net,algo_support_vector,algo_regression,algo_decision_tree,algo_discriminant,algo_naive_bayes,algo_transfer,algo_federated,algo_k_nearest,algo_unsupervised,feat_xr,feat_ct,feat_mri,feat_eeg,feat_ecg,feat_emg,feat_us,feat_echo,feat_histo,feat_oct,feat_mamm,feat_endoscop,feat_derm,feat_gene,feat_bio,feat_nlp,feat_ehr,feat_sensor,feat_phone,feat_prom,feat_sound,subspec_icu,subspec_ed,spec_paeds,spec_dent,spec_audio,spec_id,subspec_sepsis,subspec_hiv,subspec_cov19,subspec_tb,subspec_malaria,subspec_tropic,spec_derm,subspec_dermca,spec_onc,subspec_rx,subspec_lungca,subspec_brainca,subspec_gica,subspec_hepca,subspec_prosca,subspec_gynonc,subspec_renalca,subspec_haemonc,subspec_breast,subspec_breastca,spec_psych,subspec_suicide,spec_msk,subspec_frac,spec_rheum,spec_gi,spec_hep,spec_resp,subspec_pneum,subspec_osa,subspec_pe,spec_neuro,subspec_epilep,subspec_cva,subspec_alzh,spec_cvs,subspec_ihd,subspec_hf,subspec_arrhyt,spec_endo,spec_dm,subspec_insulin,spec_eye,subspec_retina,spec_haem,spec_obs,spec_renal,subspec_ackd,spec_pubh,subspec_bci,subspec_prosth,subspec_assist,subspec_activity
1,34688173,A convolutional neural network trained with dermoscopic images of psoriasis performed on par with 230 dermatologists. Psoriasis is a common chronic inflammatory skin disease that causes physical and psychological burden to patients. A Convolutional Neural Network (CNN) focused on dermoscopic images would substantially aid the classification and increase the accuracy of diagnosis of psoriasis.,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,34688172,"A large margin piecewise linear classifier with fusion of deep features in the diagnosis of COVID-19. The world has experienced epidemics of coronavirus infections several times over the last two decades. Recent studies have shown that using medical imaging techniques can be useful in developing an automatic computer-aided diagnosis system to detect pandemic diseases with high accuracy at an early stage. In this study, a large margin piecewise linear classifier was developed to diagnose COVID-19 compared to a wide range of viral pneumonia, including SARS and MERS, using chest x-ray images. In the proposed method, a preprocessing pipeline was employed. Moreover, deep pre- and post-rectified linear unit (ReLU) features were extracted using the well-known VGG-Net19, which was fine-tuned to optimize transfer learning. Afterward, the canonical correlation analysis was performed for feature fusion, and fused deep features were passed into the LMPL classifier. The introduced method reached the highest performance in comparison with related state-of-the-art methods for two different schemes (normal, COVID-19, and typical viral pneumonia) and (COVID-19, SARS, and MERS pneumonia) with 99.39% and 98.86% classification accuracy, respectively.",0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,34687858,"Causal Decoding of Individual Cortical Excitability States. Brain responsiveness to stimulation fluctuates with rapidly shifting cortical excitability state, as reflected by oscillations in the electroencephalogram (EEG). For example, the amplitude of motor-evoked potentials (MEPs) elicited by transcranial magnetic stimulation (TMS) of motor cortex changes from trial to trial. To date, individual estimation of the cortical processes leading to this excitability fluctuation has not been possible. Here, we propose a data-driven method to derive individually optimized EEG classifiers in healthy humans using a supervised learning approach that relates pre-TMS EEG activity dynamics to MEP amplitude. Our approach enables considering multiple brain regions and frequency bands, without defining them a priori, whose compound phase-pattern information determines the excitability. The individualized classifier leads to an increased classification accuracy of cortical excitability states from 57% to 67% when compared to μ-oscillation phase extracted by standard fixed spatial filters. Results show that, for the used TMS protocol, excitability fluctuates predominantly in the μ-oscillation range, and relevant cortical areas cluster around the stimulated motor cortex, but between subjects there is variability in relevant power spectra, phases, and cortical regions. This novel decoding method allows causal investigation of the cortical excitability state, which is critical also for individualizing therapeutic brain stimulation.",0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [166]:
nerdata = ner_eval.apply(lambda s: [s.name if v == "1" else np.nan for v in s])

In [167]:
nerdata.head(3)

Unnamed: 0,pmid,feature,algo_neural_net,algo_support_vector,algo_regression,algo_decision_tree,algo_discriminant,algo_naive_bayes,algo_transfer,algo_federated,algo_k_nearest,algo_unsupervised,feat_xr,feat_ct,feat_mri,feat_eeg,feat_ecg,feat_emg,feat_us,feat_echo,feat_histo,feat_oct,feat_mamm,feat_endoscop,feat_derm,feat_gene,feat_bio,feat_nlp,feat_ehr,feat_sensor,feat_phone,feat_prom,feat_sound,subspec_icu,subspec_ed,spec_paeds,spec_dent,spec_audio,spec_id,subspec_sepsis,subspec_hiv,subspec_cov19,subspec_tb,subspec_malaria,subspec_tropic,spec_derm,subspec_dermca,spec_onc,subspec_rx,subspec_lungca,subspec_brainca,subspec_gica,subspec_hepca,subspec_prosca,subspec_gynonc,subspec_renalca,subspec_haemonc,subspec_breast,subspec_breastca,spec_psych,subspec_suicide,spec_msk,subspec_frac,spec_rheum,spec_gi,spec_hep,spec_resp,subspec_pneum,subspec_osa,subspec_pe,spec_neuro,subspec_epilep,subspec_cva,subspec_alzh,spec_cvs,subspec_ihd,subspec_hf,subspec_arrhyt,spec_endo,spec_dm,subspec_insulin,spec_eye,subspec_retina,spec_haem,spec_obs,spec_renal,subspec_ackd,spec_pubh,subspec_bci,subspec_prosth,subspec_assist,subspec_activity
1,,,algo_neural_net,,,,,,,,,,,,,,,,,,,,,,feat_derm,,,,,,,,,,,,,,,,,,,,,spec_derm,,,,,,,,,,,,,,spec_psych,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,algo_transfer,,,,feat_xr,,,,,,,,,,,,,,,,,,,,,,,,,,spec_id,,,subspec_cov19,,,,,,,,,,,,,,,,,,,,,,,,,spec_resp,subspec_pneum,,,,,,,,,,,,,,,,,,,,,,,,
8,,,,,,,,,,,,,,,,feat_eeg,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,spec_neuro,,,,,,,,,,,,,,,,,,,,,


In [169]:
ner_eval['result'] = nerdata.apply(lambda x: ','.join(x.dropna().astype(str)),axis=1)

In [170]:
ner_eval.head(3)

Unnamed: 0,pmid,feature,algo_neural_net,algo_support_vector,algo_regression,algo_decision_tree,algo_discriminant,algo_naive_bayes,algo_transfer,algo_federated,algo_k_nearest,algo_unsupervised,feat_xr,feat_ct,feat_mri,feat_eeg,feat_ecg,feat_emg,feat_us,feat_echo,feat_histo,feat_oct,feat_mamm,feat_endoscop,feat_derm,feat_gene,feat_bio,feat_nlp,feat_ehr,feat_sensor,feat_phone,feat_prom,feat_sound,subspec_icu,subspec_ed,spec_paeds,spec_dent,spec_audio,spec_id,subspec_sepsis,subspec_hiv,subspec_cov19,subspec_tb,subspec_malaria,subspec_tropic,spec_derm,subspec_dermca,spec_onc,subspec_rx,subspec_lungca,subspec_brainca,subspec_gica,subspec_hepca,subspec_prosca,subspec_gynonc,subspec_renalca,subspec_haemonc,subspec_breast,subspec_breastca,spec_psych,subspec_suicide,spec_msk,subspec_frac,spec_rheum,spec_gi,spec_hep,spec_resp,subspec_pneum,subspec_osa,subspec_pe,spec_neuro,subspec_epilep,subspec_cva,subspec_alzh,spec_cvs,subspec_ihd,subspec_hf,subspec_arrhyt,spec_endo,spec_dm,subspec_insulin,spec_eye,subspec_retina,spec_haem,spec_obs,spec_renal,subspec_ackd,spec_pubh,subspec_bci,subspec_prosth,subspec_assist,subspec_activity,result
1,34688173,A convolutional neural network trained with dermoscopic images of psoriasis performed on par with 230 dermatologists. Psoriasis is a common chronic inflammatory skin disease that causes physical and psychological burden to patients. A Convolutional Neural Network (CNN) focused on dermoscopic images would substantially aid the classification and increase the accuracy of diagnosis of psoriasis.,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"algo_neural_net,feat_derm,spec_derm,spec_psych,algo_neural_net,feat_derm,spec_derm,spec_psych"
2,34688172,"A large margin piecewise linear classifier with fusion of deep features in the diagnosis of COVID-19. The world has experienced epidemics of coronavirus infections several times over the last two decades. Recent studies have shown that using medical imaging techniques can be useful in developing an automatic computer-aided diagnosis system to detect pandemic diseases with high accuracy at an early stage. In this study, a large margin piecewise linear classifier was developed to diagnose COVID-19 compared to a wide range of viral pneumonia, including SARS and MERS, using chest x-ray images. In the proposed method, a preprocessing pipeline was employed. Moreover, deep pre- and post-rectified linear unit (ReLU) features were extracted using the well-known VGG-Net19, which was fine-tuned to optimize transfer learning. Afterward, the canonical correlation analysis was performed for feature fusion, and fused deep features were passed into the LMPL classifier. The introduced method reached the highest performance in comparison with related state-of-the-art methods for two different schemes (normal, COVID-19, and typical viral pneumonia) and (COVID-19, SARS, and MERS pneumonia) with 99.39% and 98.86% classification accuracy, respectively.",0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"algo_transfer,feat_xr,spec_id,subspec_cov19,spec_resp,subspec_pneum,algo_transfer,feat_xr,spec_id,subspec_cov19,spec_resp,subspec_pneum"
8,34687858,"Causal Decoding of Individual Cortical Excitability States. Brain responsiveness to stimulation fluctuates with rapidly shifting cortical excitability state, as reflected by oscillations in the electroencephalogram (EEG). For example, the amplitude of motor-evoked potentials (MEPs) elicited by transcranial magnetic stimulation (TMS) of motor cortex changes from trial to trial. To date, individual estimation of the cortical processes leading to this excitability fluctuation has not been possible. Here, we propose a data-driven method to derive individually optimized EEG classifiers in healthy humans using a supervised learning approach that relates pre-TMS EEG activity dynamics to MEP amplitude. Our approach enables considering multiple brain regions and frequency bands, without defining them a priori, whose compound phase-pattern information determines the excitability. The individualized classifier leads to an increased classification accuracy of cortical excitability states from 57% to 67% when compared to μ-oscillation phase extracted by standard fixed spatial filters. Results show that, for the used TMS protocol, excitability fluctuates predominantly in the μ-oscillation range, and relevant cortical areas cluster around the stimulated motor cortex, but between subjects there is variability in relevant power spectra, phases, and cortical regions. This novel decoding method allows causal investigation of the cortical excitability state, which is critical also for individualizing therapeutic brain stimulation.",0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"feat_eeg,spec_neuro,feat_eeg,spec_neuro"


In [171]:
ner_eval.to_csv('data/char_labelled_evaluation.csv')