In [5]:
import ast
import json
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import stanza
import torch
from datasets import Dataset,load_dataset, load_from_disk
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import Trainer,AutoTokenizer,AutoModelForSequenceClassification, set_seed

from run_normalize_sections import normalize_section

# Fetch the English 'genia' stanza package, then build the pipeline from it.
# The download must come first; `nlp` is the sentence splitter used by split_sentences.
stanza.download('en', package='genia')
nlp = stanza.Pipeline('en', package='genia')

# Registers the pandas integration that provides `.progress_apply`, used in process_text.
tqdm.pandas()

# Use the GPU when one is available, otherwise the CPU; the model is moved here before prediction.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 213kB [00:00, 19.8MB/s]
2023-05-01 15:27:09 INFO: Downloading these customized packages for language: en (English)...
| Processor | Package |
-----------------------
| tokenize  | genia   |
| pos       | genia   |
| lemma     | genia   |
| depparse  | genia   |
| pretrain  | genia   |

2023-05-01 15:27:09 INFO: File exists: /Users/joemenke/stanza_resources/en/tokenize/genia.pt
2023-05-01 15:27:09 INFO: File exists: /Users/joemenke/stanza_resources/en/pos/genia.pt
2023-05-01 15:27:09 INFO: File exists: /Users/joemenke/stanza_resources/en/lemma/genia.pt
2023-05-01 15:27:09 INFO: File exists: /Users/joemenke/stanza_resources/en/depparse/genia.pt
2023-05-01 15:27:10 INFO: File exists: /Users/joemenke/stanza_resources/en/pretrain/genia.pt
2023-05-01 15:27:10 INFO: Finished downloading models and saved to /Users/joemenke/stanza_resources.
2023-05-01 15:27:10 INFO: Checking for updates to res

In [2]:
# One seed value, applied to every random number generator the notebook touches
# (torch, the stdlib `random` module, numpy, and the transformers helper).
random_state = 42

torch.manual_seed(random_state)
random.seed(random_state)
np.random.seed(random_state)
set_seed(random_state)

## Process data

In [25]:
def process_paper(inputs):
    """Load one paper's JSON file and pass it through ``normalize_section``.

    Parameters
    ----------
    inputs : str or path-like
        Path to a file whose whole content is a single JSON document.

    Returns
    -------
    The value returned by ``normalize_section`` for the parsed document
    (presumably a dict, since the caller assigns a ``"filename"`` key on it —
    verify against ``run_normalize_sections``).

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file is not valid UTF-8.
    json.JSONDecodeError
        If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # locale encoding (the default when `encoding` is omitted).
    with open(inputs, encoding="utf-8") as fp:
        paper_dict = json.load(fp)
    # Normalization does not need the file handle, so it runs after the file is closed.
    return normalize_section(paper_dict)

def directory_looper(directory):
    """List the ``.txt`` files located directly inside ``directory``.

    Parameters
    ----------
    directory : str
        Directory to scan (not recursive).

    Returns
    -------
    tuple[list[str], list[str]]
        ``(input_list, filenames)``: two parallel lists holding, for each
        matching file, its path joined onto ``directory`` and its bare name.
        Entries are sorted by name so the order is the same on every run.

    Raises
    ------
    OSError
        If ``directory`` cannot be listed.
    """
    # os.listdir returns entries in arbitrary order; sort them so downstream
    # results are reproducible from run to run.
    filenames = [
        name
        for name in sorted(os.listdir(directory))
        # Keep regular files only: a sub-directory that happens to be named
        # "*.txt" would otherwise make the later open() call fail.
        if name.endswith(".txt") and os.path.isfile(os.path.join(directory, name))
    ]
    input_list = [os.path.join(directory, name) for name in filenames]
    return input_list, filenames

def process_directory(input_directory):
    """Process every paper file found by ``directory_looper``.

    Each file is run through ``process_paper``; the resulting record is tagged
    with the file's bare name under the ``"filename"`` key.

    Returns a list with one record per file, in the order ``directory_looper``
    reported them.
    """
    paths, names = directory_looper(input_directory)
    data = []
    for path, name in zip(paths, names):
        record = process_paper(path)
        record["filename"] = name
        data.append(record)
    return data

In [41]:
def convert_list2str(row):
    """Flatten a section value into one space-separated string.

    Parameters
    ----------
    row : str, list or tuple
        Either an actual sequence of text pieces, or a string. A string that
        is the ``repr`` of a list/tuple (e.g. ``"['a', 'b']"``) is parsed and
        its items joined; any other string goes through the legacy clean-up.

    Returns
    -------
    str
        The pieces joined with single spaces.
    """
    # An actual sequence needs no parsing.
    if isinstance(row, (list, tuple)):
        return " ".join(map(str, row))

    try:
        parsed = ast.literal_eval(row)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal (ordinary prose, for instance).
        parsed = None

    if isinstance(parsed, (list, tuple)):
        return " ".join(map(str, parsed))

    # Legacy fallback for strings that are not list literals: strip surrounding
    # brackets, drop single quotes and re-join the ", "-separated pieces.
    return " ".join(row.strip('][').replace("'", "").split(', '))

def split_sentences(row):
    """Split ``row`` into sentences and return their texts as a list.

    Uses ``nlp``, the biomedical stanza model initialized in the import cell.
    """
    sentences = nlp(row).sentences
    return [sent.text for sent in sentences]

def process_text(df, col_name = 'methods'):
    """Turn one section column into a long frame with one sentence per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'filename'`` column and the ``col_name`` column.
    col_name : str, default 'methods'
        Name of the section column to process.

    Returns
    -------
    pandas.DataFrame
        Columns ``'filename'`` and ``'text'``. Rows keep the index label of
        the paper they came from, so labels repeat once per sentence.

    Raises
    ------
    KeyError
        If ``'filename'`` or ``col_name`` is not a column of ``df``.
    """
    # Work on an independent copy so the assignments below never write into
    # (or warn about writing into) a slice of the caller's frame.
    df_ = df[['filename', col_name]].dropna().copy()
    df_[col_name] = df_[col_name].apply(convert_list2str)
    # progress_apply is provided by tqdm.pandas(), which the import cell calls.
    df_[col_name] = df_[col_name].progress_apply(split_sentences)
    # Build the result without inplace=True; the in-place rename on a derived
    # frame is what raised SettingWithCopyWarning.
    df_ = df_.explode(col_name).rename(columns = {col_name:'text'})
    # explode() turns an empty sentence list into a NaN row; such rows carry no
    # text and cannot be tokenized, so drop them.
    return df_.dropna(subset=['text'])

def process_sections(infile_loc, sections2check):
    """Load every paper under ``infile_loc`` and stack the requested sections.

    The papers are read with ``process_directory`` and placed in a DataFrame;
    ``process_text`` is then applied once per name in ``sections2check`` and
    the per-section frames are concatenated in that order.
    """
    records = process_directory(infile_loc)
    papers = pd.DataFrame.from_records(records)
    per_section = [process_text(papers, section) for section in sections2check]
    return pd.concat(per_section)

In [16]:
# Section columns to extract from every paper.
sections = ['abstract', 'intro', 'methods', 'results', 'discussion', 'conclusions']

cai_text = process_sections('/data/cai/txt', sections) # Cai Prediabetes Meta-Analysis
guj_text = process_sections('/data/gujral/txt', sections) # Gujral Prediabetes Meta-Analysis

frames = [cai_text, guj_text]
text = pd.concat(frames)

# Remove repeated (filename, sentence) rows. The index inherited from
# explode/concat repeats labels, so replace it with a clean 0..n-1 range;
# later cells then align rows with predictions unambiguously.
unique_text = text.drop_duplicates().reset_index(drop=True)

## Predictions

In [68]:
# Name handed to AutoModelForSequenceClassification.from_pretrained in the prediction cell.
# NOTE(review): looks like a local checkpoint directory rather than a hub id — confirm.
model_name = "biolink_model"
# Tokenizer identifier handed to AutoTokenizer.from_pretrained below.
tokenizer_name = "michiyasunaga/BioLinkBERT-base"
# Length every sentence is padded/truncated to in tokenize_function.
max_length = 512

# Module-level tokenizer shared by tokenize_function.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

def tokenize_function(data):
    """Tokenize ``data["text"]`` with the module-level ``tokenizer``.

    The text is padded to and truncated at the module-level ``max_length``.
    Returns whatever the tokenizer call returns.
    """
    encoding_options = {
        "padding": 'max_length',
        "truncation": True,
        "max_length": max_length,
    }
    return tokenizer(data["text"], **encoding_options)

def preprocessing(dataset):
    """Convert a DataFrame with a ``'text'`` column into a tokenized ``Dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Only the ``'text'`` column is used; the index is discarded.

    Returns
    -------
    datasets.Dataset
        The ``'text'`` column plus the columns added by ``tokenize_function``.
    """
    text_only = dataset[['text']]
    hf_dataset = Dataset.from_pandas(text_only, preserve_index=False)
    # batched=True hands tokenize_function a list of texts per call instead of
    # one row at a time; the tokenizer accepts a list and yields the same
    # per-row encodings, only faster.
    return hf_dataset.map(tokenize_function, batched=True)

In [69]:
# Tokenize the de-duplicated sentences.
text_dataset = preprocessing(unique_text)

# Load the fine-tuned two-label classifier and wrap it in a Trainer, which is
# used here only for batched inference.
biolink_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 2).to(device)
trainer = Trainer(model = biolink_model)

text_pred = trainer.predict(text_dataset)

# Take the highest-scoring label per row along the last axis.
# No .squeeze(): with a single input row it would collapse the logits to 1-D,
# argmax would return a scalar, and list() would raise.
# Assumes text_pred.predictions is an (n_rows, 2) logits array — TODO confirm.
predictions = list(np.argmax(text_pred.predictions, axis=-1))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
                                                                                

In [70]:
# Collect the three result columns as plain positional lists.
# A separate name is used for the sentences so the `text` DataFrame built in
# the data-loading cell is not overwritten (that cell stays re-runnable).
# Converting the filename Series to a list avoids relying on its index labels
# lining up with the other columns.
text_sentences = list(text_dataset['text'])
filenames = unique_text['filename'].tolist()

# Each sentence must have exactly one filename and one prediction.
assert len(text_sentences) == len(filenames) == len(predictions), \
    "sentences, filenames and predictions must have the same length"

text_dict = {'text': text_sentences, 'name': filenames, 'prediction': predictions}

text_df = pd.DataFrame(text_dict)

In [71]:
# Write the per-sentence predictions to CSV, leaving the DataFrame index out of the file.
text_df.to_csv('/data/doc_summary_predictions.csv', index=False)