In [1]:
import os
import shutil
import sys
from tqdm import tqdm

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

# Raise TensorFlow's logger threshold so only ERROR-level (and above) messages are shown.
tf.get_logger().setLevel('ERROR')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk

#for BERT
import transformers

In [3]:
# Print the devices TensorFlow can see and the installed TensorFlow version.
# NOTE(review): `tensorflow.python.*` looks like an internal namespace — confirm
# whether the public `tf.config.list_physical_devices()` would be sufficient here.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
print(tf.__version__)

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 15838796034612933057
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 22723493888
locality {
  bus_id: 1
  links {
  }
}
incarnation: 4212270416112766604
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:01:00.0, compute capability: 8.6"
]
2.5.0


In [4]:
# GPU options to limit OOM errors: let TensorFlow grow its GPU allocation on
# demand instead of reserving the memory up front.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
# Configure every visible GPU rather than only index 0: the loop is a no-op on
# a CPU-only machine (where physical_devices[0] would raise IndexError) and
# keeps the setting consistent when more than one GPU is present.
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

## Reload model

In [5]:
# Reload the exported model from disk.

# Relative path, so it is resolved against the notebook's working directory.
saved_model_path = 'models/maturity_bert'

# The restored object is used as a callable on a string tensor in
# label_unlabelled (defined further down in this notebook).
reloaded_model = tf.saved_model.load(saved_model_path)

In [6]:
# Display the restored object's repr as a quick check that loading succeeded.
reloaded_model

<tensorflow.python.saved_model.load.Loader._recreate_base_user_object.<locals>._UserObject at 0x22117b43e20>

## Label data with maturity classification

In [7]:
# Load the article table; the first CSV column is used as the DataFrame index.
complete = pd.read_csv('data/inclusions.csv', index_col=0)

In [8]:
#My version of the DF didn't have the 'feature' column, so it was created here
#complete['feature'] = (complete.title + ' ' + complete.abstract)

In [9]:
# Overview of the loaded table: columns, non-null counts and dtypes.
complete.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42307 entries, 1 to 192947
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   pmid              42307 non-null  int64  
 1   doi               37409 non-null  object 
 2   title             42306 non-null  object 
 3   abstract          42307 non-null  object 
 4   article_date      32669 non-null  object 
 5   pubmed_date       42307 non-null  object 
 6   article_type      42307 non-null  object 
 7   lang              42307 non-null  object 
 8   journal           42307 non-null  object 
 9   journal_short     42307 non-null  object 
 10  journal_country   42307 non-null  object 
 11  authors           41281 non-null  object 
 12  author_affils     31156 non-null  object 
 13  keywords          24364 non-null  object 
 14  mesh_terms        32535 non-null  object 
 15  references_pmids  21512 non-null  object 
 16  feature           42307 non-null  objec

In [10]:
# Work on an independent single-column frame holding only the text to classify,
# so later edits to it cannot touch `complete`.
labeldf = complete.loc[:, ['feature']].copy()
labeldf.tail(10)

Unnamed: 0,feature
192918,TACHY: an expert system for the management of ...
192924,Detection of ECG waveforms by neural networks....
192928,Neural network analysis of breast cancer from ...
192929,Automated classification of patients with chro...
192932,On the use of neural network techniques to ana...
192933,Analysis of quantitative EEG with artificial n...
192935,Neural networks as a prognostic tool for patie...
192939,Automated interpretation of myocardial SPECT p...
192946,Acute pulmonary embolism: cost-effectiveness a...
192947,Algorithm analysis of lectin glycohistochemist...


In [11]:
# Show any rows whose 'feature' text is missing.
labeldf[labeldf['feature'].isna()]

Unnamed: 0,feature


In [12]:
# Discard rows that have no text to classify, then display whatever missing
# rows remain (none are expected after the drop).
labeldf = labeldf.dropna(subset=['feature'])
labeldf.loc[labeldf['feature'].isna()]

Unnamed: 0,feature


In [13]:
# Confirm the row count and that 'feature' has no nulls after the drop.
labeldf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42307 entries, 1 to 192947
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   feature  42307 non-null  object
dtypes: object(1)
memory usage: 661.0+ KB


In [14]:
def label_unlabelled(df, feature_column = 'feature', model=reloaded_model, number_to_label='all', cpu_labelling=False, chunk_size=100):
    """Score the texts in ``df[feature_column]`` with ``model``, chunk by chunk.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the texts to score.
    feature_column : str
        Name of the column of ``df`` whose values are passed to the model.
    model : callable
        Called as ``model(tf.constant(texts))``; its output is passed through
        ``tf.sigmoid``. The default is the object ``reloaded_model`` referred
        to when this ``def`` was executed, not when the function is called.
    number_to_label : 'all', list or int
        ``'all'`` scores every row; a list is used as index labels for
        ``df.loc``; an int draws that many rows with ``Series.sample``
        (unseeded, so the selection can differ between runs).
    cpu_labelling : bool
        Run the model under ``'/cpu:0'`` when true, otherwise ``'/gpu:0'``.
    chunk_size : int
        Number of rows sent to the model per call.

    Returns
    -------
    pandas.DataFrame
        The selected rows with the feature column plus an ``include`` column
        holding the sigmoid scores. If scoring a chunk raises, the exception
        is printed and the frame is returned as-is, with ``NaN`` in
        ``include`` for every row that was not scored.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    AssertionError
        If ``number_to_label`` is not ``'all'``, a list or an int, or is an
        int that is not smaller than ``len(df)``.
    """
    # A negative step would make the loop below run zero times and silently
    # return an all-NaN frame, so reject it up front.
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Either use the whole df, a list of index labels, or a random sample of the given size
    if number_to_label == 'all':
        selected = df[feature_column].copy()

    elif isinstance(number_to_label, list):
        selected = df.loc[number_to_label, feature_column].copy()

    else:
        assert isinstance(number_to_label, int), "Number to label must be 'all', a list of index labels or an integer subset to label"
        assert number_to_label < len(df), "When specifying a subset to label, must be less than the total number of samples"
        selected = df[feature_column].sample(number_to_label).copy()

    # Add a column to the DF for labels; NaN marks rows that have not been scored yet
    labelled_df = selected.to_frame()
    labelled_df['include'] = np.nan
    include_position = labelled_df.columns.get_loc('include')

    # Decide what device we want TF to use
    if cpu_labelling:
        device = '/cpu:0'
        print("Labelling with CPU...")
    else:
        device = '/gpu:0'
        print("Labelling with GPU...")

    # Label by specified chunk size. Rows are addressed by position rather than
    # by index label so that duplicate index labels cannot select extra rows.
    total_rows = len(labelled_df)
    with tqdm(total=total_rows, file=sys.stdout) as pbar:
        for start in range(0, total_rows, chunk_size):
            stop = min(start + chunk_size, total_rows)

            try:
                # Read from the caller-supplied column, not a hard-coded name.
                texts = labelled_df[feature_column].iloc[start:stop]
                with tf.device(device):
                    scores = tf.sigmoid(model(tf.constant(texts)))
                # Flatten to one value per row before writing into the column.
                labelled_df.iloc[start:stop, include_position] = np.asarray(scores).reshape(-1)
                pbar.update(stop - start)
            except Exception as e:
                # Best effort: keep the scores obtained so far instead of
                # losing them to a late failure.
                print(e)
                print("Returning (possibly) partially labelled dataset...")
                return labelled_df

    return labelled_df

In [15]:
# Score every row on the GPU, 50 texts per model call
# (the recorded run below took a little under nine minutes).
labelled = label_unlabelled(labeldf, number_to_label='all', cpu_labelling=False, chunk_size=50)

Labelling with GPU...
100%|████████████████████████████████████████████████████████████████████████████| 42307/42307 [08:51<00:00, 79.66it/s]


In [16]:
# Keep the rows the model is unsure about: score strictly between 0.1 and 0.9.
uncertain = labelled.loc[labelled['include'].gt(0.1) & labelled['include'].lt(0.9)]
uncertain.head(20)

Unnamed: 0,feature,include
16,Validation of a deep learning segmentation alg...,0.87453
31,Impact of Artificial Intelligence on Miss Rate...,0.858973
69,Artificial intelligence assistance in radiogra...,0.308002
77,Fully automated deep learning powered calcium ...,0.411124
201,Prediction of Aortic Contrast Enhancement on D...,0.290394
230,Federated learning for multi-center collaborat...,0.363134
282,Using a Convolutional Neural Network and Convo...,0.298519
373,Scrutinizing high-risk patients from ASC-US cy...,0.872286
457,Optimizing the radiomics-machine-learning mode...,0.713625
467,Unenhanced abdominal low-dose CT reconstructed...,0.827257


In [17]:
# Turn the sigmoid score into a hard label by rounding to the nearest integer.
labelled['include_rounded'] = labelled['include'].round()

In [18]:
# Number of rows per hard label.
labelled.include_rounded.value_counts()

0.0    38781
1.0     3526
Name: include_rounded, dtype: int64

In [19]:
# Spot-check the last few labelled rows.
labelled.tail(5)

Unnamed: 0,feature,include,include_rounded
192933,Analysis of quantitative EEG with artificial n...,0.000196,0.0
192935,Neural networks as a prognostic tool for patie...,6.6e-05,0.0
192939,Automated interpretation of myocardial SPECT p...,0.999428,1.0
192946,Acute pulmonary embolism: cost-effectiveness a...,0.210386,0.0
192947,Algorithm analysis of lectin glycohistochemist...,0.000411,0.0


In [20]:
# Check that every row received a score (no nulls in 'include').
labelled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42307 entries, 1 to 192947
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   feature          42307 non-null  object 
 1   include          42307 non-null  float64
 2   include_rounded  42307 non-null  float64
dtypes: float64(2), object(1)
memory usage: 2.3+ MB


In [21]:
# Disabled: would drop rows of `complete` that have no 'feature' text.
#complete.dropna(subset=['feature'], inplace=True)
# Show the last rows of the full table for comparison with `labelled`.
complete.tail(5)

Unnamed: 0,pmid,doi,title,abstract,article_date,pubmed_date,article_type,lang,journal,journal_short,journal_country,authors,author_affils,keywords,mesh_terms,references_pmids,feature,year,include
192933,9438271,,Analysis of quantitative EEG with artificial n...,Artificial neural networks (ANN) are widely us...,,1998-01-23,Clinical Trial,eng,Neuropsychobiology,Neuropsychobiology,Switzerland,"['Winterer G', 'Ziller M', 'Klöppel B', 'Heinz...",,,"['Alcoholism', 'Algorithms', 'Discriminant Ana...",,Analysis of quantitative EEG with artificial n...,1998,1.0
192935,9436967,,Neural networks as a prognostic tool for patie...,Patients with non-small cell carcinoma of the ...,,1998-01-22,Journal Article,eng,Modern pathology : an official journal of the ...,Mod Pathol,United States,"['Bellotti M', 'Elsner B', 'Paez De Lima A', '...",,,"['Adenocarcinoma', 'Antigens, Nuclear', 'Bioma...",,Neural networks as a prognostic tool for patie...,1998,1.0
192939,9430460,,Automated interpretation of myocardial SPECT p...,The purpose of this study was to develop a com...,,1998-01-16,Comparative Study,eng,Journal of nuclear medicine : official publica...,J Nucl Med,United States,"['Lindahl D', 'Palmer J', 'Ohlsson M', 'Peters...",,,"['Coronary Angiography', 'Coronary Disease', '...",,Automated interpretation of myocardial SPECT p...,1998,1.0
192946,9423655,,Acute pulmonary embolism: cost-effectiveness a...,To evaluate the cost-effectiveness of artifici...,,1998-01-10,Journal Article,eng,Radiology,Radiology,United States,"['Tourassi G D', 'Floyd C E', 'Coleman R E']",,,"['Acute Disease', 'Angiography', 'Cost-Benefit...",,Acute pulmonary embolism: cost-effectiveness a...,1998,1.0
192947,9415600,,Algorithm analysis of lectin glycohistochemist...,The aim of this study is to present a new clas...,,1998-01-10,Journal Article,eng,"The Annals of otology, rhinology, and laryngology",Ann Otol Rhinol Laryngol,United States,"['Hassid S', 'Decaestecker C', 'Hermans C', 'S...",,,"['Algorithms', 'Coloring Agents', 'Decision Tr...",,Algorithm analysis of lectin glycohistochemist...,1998,1.0


In [22]:
# Row counts of the full table and the labelled table, one per line.
print(len(complete), len(labelled), sep='\n')

42307
42307


In [23]:
# Disabled: would also copy the raw sigmoid score into the full table.
#complete['include_fuzzy'] = labelled['include']
# Copy the hard label into the full table. The assignment is matched on index
# labels, so it relies on `labelled` and `complete` sharing the same index.
complete['mature'] = labelled['include_rounded']

In [24]:
# SAVE FINAL FILES

In [25]:
#uncertain.to_csv("final_outputs/comparative_uncertain.csv")

In [26]:
# Write the full table, now including the 'mature' column, to CSV.
complete.to_csv("data/mature_labelled.csv")

In [27]:
# Distribution of the 'mature' label in the table that was just saved.
complete['mature'].value_counts()

0.0    38781
1.0     3526
Name: mature, dtype: int64