In [2]:
#conda install pytorch
#conda install -c huggingface transformers

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import sklearn
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
from transformers import AutoModel, AutoTokenizer, BertTokenizer
import transformers
import pandas as pd
import numpy as np

torch.set_grad_enabled(False)
device = "cpu"
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(torch.cuda.get_device_name())

In [12]:
len(pd.read_csv("/home/steve/Desktop/journalrater/deduped_rob.csv"))

15406

In [5]:
#setup
#put path to files here
path = ""
df = pd.read_csv(path+"rob_dev.csv")
validation=pd.read_csv(path+"rob_test.csv")

#training set
abstracts = df["ab"]
labels = df["all_low"]
#validation set
ABSTRACTS = validation["ab"]
LABELS = validation["all_low"]



In [None]:
#BERT model: BERT tasks run on colab
class BertBased(torch.nn.Module):
    def __init__(self):
        super(BertBased, self).__init__()
        self.bert = transformers.AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
        self.linear = torch.nn.Linear(768, 2)

    def forward(self, ids, mask):
        return torch.nn.functional.softmax(self.linear(self.bert(ids, attention_mask=mask)[0][:,0,:]),dim=1)

    
model = BertBased()
model.state_dict = torch.load(path+"scibertweights")
model.to(device)
tokenizer = transformers.AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
encoded = tokenizer.batch_encode_plus(list(abstracts), return_tensors='pt', padding=True, truncation=True,max_length=512, add_special_tokens = True)
ENCODED = tokenizer.batch_encode_plus(list(ABSTRACTS), return_tensors='pt', padding=True, truncation=True,max_length=512, add_special_tokens = True)

In [None]:
#Training loop for bert:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))
epochs = 4
batchsize = 32

model.train()
for epoch in range(epochs):
    print("epoch:")
    print(epoch)
    for batch in range(len(abstracts)//batchsize): 
        #print("batch:")
        #print(batch)
        data = encoded["input_ids"][batch:batch+batchsize].to(device)
        attention_mask = encoded['attention_mask'][batch:batch+batchsize].to(device)
        targets = torch.tensor(list(labels)[batch:batch+batchsize]).type(torch.LongTensor).to(device)  
        #print(targets)
        optimizer.zero_grad()  
        outputs = model(data,attention_mask) 
        loss = criterion(outputs, targets)
        loss.requires_grad=True
        #print("outputs: ")
        #print(outputs)
        #print("Loss:")
        #print(loss)
        loss.backward()
        optimizer.step()

In [None]:
#validation
model.eval()
predictions = []
for i in range(len(ABSTRACTS)):
    if i%100==0: print(i)
    data = ENCODED["input_ids"][i].to(device).reshape(1,-1)
    attention_mask = ENCODED['attention_mask'][i].to(device).reshape(1,-1)
    predictions.append(model(data,attention_mask))


print(sklearn.metrics.confusion_matrix(LABELS,[0 if i[0][0]>i[0][1] else 1 for i in predictions]))
print(sklearn.metrics.roc_auc_score(LABELS,[i[0][1] for i in predictions]))
print(sklearn.metrics.f1_score(LABELS, [0 if i[0][0]>i[0][1] else 1 for i in predictions]))

In [6]:
#Logistic Regression
m = LogisticRegression(max_iter=10000)
v = CountVectorizer()
a = abstracts.append(ABSTRACTS)

v = v.fit(a)
m.fit(v.transform(abstracts),labels)

#validation
print(sklearn.metrics.roc_auc_score(LABELS,m.predict(v.transform(ABSTRACTS))))
print(sklearn.metrics.confusion_matrix(LABELS,m.predict(v.transform(ABSTRACTS))))
print(sklearn.metrics.f1_score(LABELS,m.predict(v.transform(ABSTRACTS))))

0.5884146880782307
[[1219  108]
 [ 158   55]]
0.2925531914893617


In [None]:
#scraping code
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time

#rct_data.csv found at https://raw.githubusercontent.com/wmotte/frrp/master/data/data.csv
rct = pd.read_csv(path+"rct_data.csv")
alreadyhave = pd.read_csv(path+"abstracts-journals.csv")

def lookup(pmid, attempt=0):
    baseurl = "https://pubmed.ncbi.nlm.nih.gov/"
    try:
        soup = BeautifulSoup(requests.get(baseurl+str(pmid)).text)
        temp = soup.find(id="enc-abstract")
        abstract = ""
        for c in temp.findChildren("p"):
            abstract+=c.text
        journal = soup.find(id="full-view-journal-trigger").text
        return {"pmid": pmid,"abstract": abstract, "journal": journal}
    except Exception:
        print("excepting ", pmid)
        if attempt == 100: raise Exception("cant get ",pmid)
        time.sleep(2)
        return lookup(pmid, attempt+1)


with open(path+"abstracts-journals.csv","a") as f:
    print("have: ",len(alreadyhave))
    print("total:", len(rct["pmid"]))
    for i in range(len(alreadyhave),len(rct["pmid"])):
        result = []
        result.append(lookup(rct["pmid"][i]))
        result = pd.DataFrame(result)
        f.write(result.to_csv(index=False,header=False))
        if i!=0 and i%1000==0: 
            print(i)
            
print("done")

In [8]:
#all of the journals in the dataset: might need to look up full name on pubmed 
journals = pd.read_csv(path+"abstracts-journals.csv")
set(journals["journal"])

{'\n      Sex Abuse\n    ',
 '\n      J Interferon Cytokine Res\n    ',
 '\n      Ear Hear\n    ',
 '\n      Reprod Biomed Online\n    ',
 '\n      Theriogenology\n    ',
 '\n      Am J Clin Nutr\n    ',
 '\n      Pituitary\n    ',
 '\n      J Commun Disord\n    ',
 '\n      Clin Prostate Cancer\n    ',
 '\n      Korean J Hepatol\n    ',
 '\n      Qual Saf Health Care\n    ',
 '\n      Nicotine Tob Res\n    ',
 '\n      Cont Lens Anterior Eye\n    ',
 '\n      Arch Intern Med\n    ',
 '\n      Drugs\n    ',
 '\n      Respir Investig\n    ',
 '\n      Ann Noninvasive Electrocardiol\n    ',
 '\n      Biol Res Nurs\n    ',
 '\n      PLoS Med\n    ',
 '\n      Eur J Nutr\n    ',
 '\n      Free Radic Res\n    ',
 '\n      J Auton Nerv Syst\n    ',
 '\n      Int J Aging Hum Dev\n    ',
 '\n      Leuk Res\n    ',
 '\n      Int J Hyperthermia\n    ',
 '\n      J Psychosom Obstet Gynaecol\n    ',
 '\n      Acupunct Electrother Res\n    ',
 '\n      Age Ageing\n    ',
 '\n      Cogn Emot\n    ',

In [None]:
#to rate a journal, enter its name below
name = "N Engl J Med"

def rate(abstract):
    enc = tokenizer.encode_plus(abstract, return_tensors='pt', padding=True, truncation=True,max_length=512, add_special_tokens = True)
    out = model(enc["input_ids"],enc["attention_mask"])[0]
    return 0 if out[0] > out[1] else 1

journal = journals[journals.journal.str.contains(name)].copy()
journal["rating"] = journal["abstract"].apply(rate)
print(name, journal["rating"].mean())