In [0]:
!python -m spacy download en_core_web_md
%pip install thefuzz
%pip install beautifulsoup4

2023-10-26 16:12:21.486865: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Collecting en-core-web-md==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0-py3-none-any.whl (42.8 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/42.8 MB[0m [31m1.4 MB/s[0m eta [36m0:00:31[0m[2K     [91m━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/42.8 MB[0m [31m21.5 MB/s[0m eta [36m0:00:02[0m[2K     [91m━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/42.8 MB[0m [31m36.4 MB/s[0m eta [36m0:00:02[0m

In [0]:
import gzip
import json
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import spacy
from thefuzz import fuzz
from bs4 import BeautifulSoup
import requests
import re
from tqdm import tqdm

In [0]:
software_contexts = '/dbfs/FileStore/citation_intent/software_contexts_json.gz'
extra = '/dbfs/FileStore/citation_intent/dataset_software_extra_contexts_json.gz'

def open_gzip_json(fpath):
    with gzip.open(fpath, 'r') as gzip_file:
        reserve_data = gzip_file.read()
        j = gzip.decompress(reserve_data)
        data = json.loads(j.decode('utf-8')) 
    return data

In [0]:
extra_data = open_gzip_json(extra)
nlp = spacy.load('en_core_web_md')

extra_df = pd.DataFrame(columns=['id', 'sentence', 'context', 'used', 'created'])

for pub in extra_data['documents']:
    texts = pub['texts']
    for text in texts:
        try:
            id = pub['id']
            sentence = text['text'].strip()
            context = text['full_context']
            context_sentences = [i.text for i in nlp(context).sents]
            for t in context_sentences:
                ratio = fuzz.ratio(sentence, t)
                if ratio > 90:
                    sentence_index = context_sentences.index(t)
                    break
            try:
                prior_sentence = context_sentences[sentence_index-1]
            except:
                prior_sentence = None
            try:
                trailing_sentence = context_sentences[sentence_index+1]
            except:
                trailing_sentence = None

            if prior_sentence and trailing_sentence:
                context = ' '.join([prior_sentence, sentence, trailing_sentence])
            elif prior_sentence and not trailing_sentence:
                context = ' '.join([prior_sentence, sentence])
            else:
                context = ' '.join([sentence, trailing_sentence])


            try:
                used = text['class_attributes']['classification']['used']['value']
            except:
                used = False
            try:
                created = text['class_attributes']['classification']['created']['value']
            except:
                created = False
            try:
                shared = text['class_attributes']['classification']['created']['value']
            except:
                shared = False

            temp_df = pd.DataFrame({'id':[id], 
                                    'sentence':[sentence], 
                                    'context':[context], 
                                    'used':[used], 
                                    'created':[created],
                                    'shared':[shared]})
            
            extra_df = pd.concat([extra_df, temp_df])
        except:
            continue

In [0]:
def get_response(url):
    
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    session = requests.Session()
    response = session.get(url, headers=headers)
    
    return BeautifulSoup(response.text, 'xml')

def get_context(pmcid, software_sentence):
    try:
        resp = get_response(f'https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_xml/{pmcid}/ascii')
        texts = resp.findAll('text')
        context_sentences = [i.get_text() for i in texts]
        result = []
        for sent in context_sentences:
            sentences = nlp(sent).sents
            sentences = [i.text for i in sentences]
            result.append(sentences)


        for t in result:
            for sent in t:
                ratio = fuzz.ratio(software_sentence, sent)
                if ratio > 90:
                    sent_match = t
                    sent_match_ind = sent_match.index(sent)
                    break
        try:
            if sent_match_ind != 0:
                prior_sentence = sent_match[sent_match_ind-1]
            else:
                raise ValueError('no prior sentence')
        except:
            prior_sentence = None
        try:
            trailing_sentence = sent_match[sent_match_ind+1]
        except:
            trailing_sentence = None
        
        if prior_sentence and trailing_sentence:
            context = ' '.join([prior_sentence, software_sentence, trailing_sentence])
        elif prior_sentence and not trailing_sentence:
            context = ' '.join([prior_sentence, software_sentence])
        else:
            context = ' '.join([software_sentence, trailing_sentence])
    except:
        context = None
        
    return context


In [0]:
softcite_somesci_data = open_gzip_json(software_contexts)

softcite_somesci_data_df = pd.DataFrame(columns=['id', 'sentence', 'context', 'used', 'created', 'shared'])
for i in tqdm(softcite_somesci_data['documents']):
    for ind, t in enumerate(i['texts']):
        pmcid = i['pmcid']
        if 'entity_spans' in t:
            text = t['text']

            try:
                context = get_context(pmcid, text)
            except:
                context = None

            used = False
            created = False
            shared = False
            for ind in t['entity_spans']:
                if 'used' in ind:
                    if ind['used']:
                        used = True
                if 'created' in ind:
                    if ind['created']:
                        created = True
                if 'shared' in ind:
                    if ind['shared']:
                        shared = True
            temp_df = pd.DataFrame({'id':[pmcid], 'sentence':[text], 'context':[context], 'used':[used], 'created':[created], 'shared':[shared]})
            softcite_somesci_data_df = pd.concat([softcite_somesci_data_df, temp_df])
        else:
            continue


In [0]:
softcite_somesci_data_df

Unnamed: 0,id,sentence,context,used,created,shared
0,PMC4690612,"We used EventIDE (Okazolab Ltd, London, UK) fo...","We used EventIDE (Okazolab Ltd, London, UK) fo...",True,False,False
0,PMC6185835,Part of the controls were recruited via an onl...,Seven participants with ADHD reported using MP...,True,False,False
0,PMC6185835,The heartbeat perception task was programmed u...,"The Mental Tracking Method, a well-validated t...",True,False,False
0,PMC6185835,R-waves were counted offline by means of a cus...,The electrocardiogram was recorded via two ext...,True,False,False
0,PMC3515589,The Brain Vision Analyzer software (Brain Prod...,The Brain Vision Analyzer software (Brain Prod...,True,False,False
...,...,...,...,...,...,...
0,PMC5253549,We generated multilevel logistic regression mo...,,True,False,False
0,PMC5253549,We used reweighted iterative least squares and...,,True,False,False
0,PMC7471053,"The Statistical Analysis System for Windows, v...",Student's t-test was used to compare promotor ...,True,False,False
0,PMC3794969,Quantitative data were obtained using the Lumi...,These parameters were measured simultaneously ...,True,False,False


In [0]:
extra_df.head()

Unnamed: 0,id,sentence,context,used,created,shared
0,005f9f83-dabe-4ab0-b674-257b5ff1b4af,"Finally, Haft et al. (12) used REPfind (http:/...",The PYGRAM program is mostly efficient in visu...,True,False,False
0,005f9f83-dabe-4ab0-b674-257b5ff1b4af,"Recently, two CRISPR-dedicated software tools ...",They require further manual manipulations to e...,True,False,False
0,005f9f83-dabe-4ab0-b674-257b5ff1b4af,"In the first step, maximal repeats are found b...","Hereafter, the obtained maximal repeats are gr...",True,False,False
0,005f9f83-dabe-4ab0-b674-257b5ff1b4af,Vmatch is based on a comprehensive implementat...,"In the first step, maximal repeats are found b...",False,False,False
0,0114b264-4f0d-4ac0-9aac-56ba96e36f90,The FASTX application (www.molecularevolution....,Calculation of percent of reads by biotype was...,True,False,False


In [0]:
final_df = pd.concat([softcite_somesci_data_df, extra_df])

In [0]:
final_df.to_csv('/dbfs/FileStore/citation_intent/software_intent_data_with_context.csv')

In [0]:
curated_data = pd.read_csv('/dbfs/FileStore/citation_intent/software_intent_data___final_data.csv')

In [0]:
curated_data = curated_data.drop(['context', 'Unnamed: 6'], axis=1)

In [0]:
curated_data

Unnamed: 0,id,sentence,context,used,created,mention,Unnamed: 6
0,PMC5189946,All of this analysis was implemented using Mat...,"In turn, Pemp(I) was inferred using a well-est...",False,True,False,
1,PMC4511233,"Code for calculating partition similarity, obt...",Since the probability of getting a given MI is...,False,True,False,
2,PMC4186879,All behavioral statistical analyses were perfo...,"After 1,000 permutations, the cluster-level si...",False,False,True,
3,PMC5026371,"M-Track was written using Python 2.7, OpenCV 3...",The software and documentation for M-Track is ...,True,False,False,
4,PMC1283974,"Mindboggle is a freely downloadable, open sour...","In this paper, we have chosen to extend the us...",False,True,False,
...,...,...,...,...,...,...,...
3212,fa2b2905-5624-449c-bc30-13682e2f5e84,Microarray data are available in the ArrayExpr...,The pooled RNA samples of each experimental gr...,False,False,True,
3213,fbdacd7f-c51f-4453-99f8-07799f76fa27,AltAnalyze (http://www. altanalyze.org) identi...,There are few tools for analyzing AS isoforms ...,False,False,True,
3214,fdb400f9-9c0c-43c2-afb3-1d8d14d3f474,Information about the data and their availabil...,Information about the data and their availabil...,False,False,True,
3215,ff070b04-1c52-4506-a92d-3f46a0c612e7,All files are available from the SEANOE databa...,All files are available from the SEANOE databa...,False,False,True,


In [0]:
merged_df = pd.merge(curated_data, final_df, on=['id', 'sentence'])

In [0]:
merged_df = merged_df.drop(['used_y', 'created_y', 'shared'], axis=1)

In [0]:
filtered_merged = merged_df[~merged_df['context'].isnull()]

In [0]:
filtered_merged = filtered_merged.rename(columns={'used_x':'used', 'created_x':'created'})

In [0]:
filtered_merged.to_csv('/dbfs/FileStore/citation_intent/software_intent_data_with_context.csv')

In [0]:
filtered_merged

Unnamed: 0,id,sentence,used,created,mention,context
1,PMC4511233,"Code for calculating partition similarity, obt...",False,True,False,Since the probability of getting a given MI is...
2,PMC4186879,All behavioral statistical analyses were perfo...,False,False,True,All behavioral statistical analyses were perfo...
4,PMC1283974,"Mindboggle is a freely downloadable, open sour...",False,True,False,"Mindboggle is a freely downloadable, open sour..."
5,PMC3591454,All graph algorithms used in this article were...,False,True,False,The graph's community structure can be determi...
6,01a01273-2e2c-423c-b480-6a3161c2aa36,Data analysis was performed in R version 3.2.3...,True,False,False,Data sets obtained in this study have been upl...
...,...,...,...,...,...,...
2545,fa2b2905-5624-449c-bc30-13682e2f5e84,Microarray data are available in the ArrayExpr...,False,False,True,The pooled RNA samples of each experimental gr...
2546,fbdacd7f-c51f-4453-99f8-07799f76fa27,AltAnalyze (http://www. altanalyze.org) identi...,False,False,True,There are few tools for analyzing AS isoforms ...
2547,fdb400f9-9c0c-43c2-afb3-1d8d14d3f474,Information about the data and their availabil...,False,False,True,Information about the data and their availabil...
2548,ff070b04-1c52-4506-a92d-3f46a0c612e7,All files are available from the SEANOE databa...,False,False,True,All files are available from the SEANOE databa...



#### Extract unlabeled data from dataset to use as negative samples

In [0]:
unlabeled_df = pd.DataFrame(columns=['id', 'sentence', 'used', 'created', 'mention'])
for i in softcite_somesci_data['documents']:
    for ind, t in enumerate(i['texts']):
        pmcid = i['pmcid']
        if 'entity_spans' not in t:
            text = t['text']
            used = False
            created = False
            mention = False
            temp_df = pd.DataFrame({'id':[pmcid], 'sentence':[text], 'used':[used], 'created':[created], 'mention':[mention]})
            unlabeled_df = pd.concat([unlabeled_df, temp_df])

In [0]:
unlabeled_df_sample = unlabeled_df.sample(1000)
unlabeled_df_sample.to_csv('/dbfs/FileStore/citation_intent/unlabeled.csv')

In [0]:
unlabeled_df_sample['context'] = unlabeled_df_sample.apply(lambda x: get_context(x['id'], x['sentence']), axis=1)

In [0]:
unlabeled_df_sample.to_csv('/dbfs/FileStore/citation_intent/unlabeled_w_context.csv')

In [0]:
filtered_unlabeled_df_sample = unlabeled_df_sample[~unlabeled_df_sample['context'].isnull()]

In [0]:
final_df_w_unlabeled = pd.concat([filtered_merged, filtered_unlabeled_df_sample])

In [0]:
final_df_w_unlabeled.to_csv('/dbfs/FileStore/citation_intent/final_df_w_unlabeled_context.csv', index=None)

In [0]:
final_df_w_unlabeled

Unnamed: 0,id,sentence,used,created,mention,context
1,PMC4511233,"Code for calculating partition similarity, obt...",False,True,False,Since the probability of getting a given MI is...
2,PMC4186879,All behavioral statistical analyses were perfo...,False,False,True,All behavioral statistical analyses were perfo...
4,PMC1283974,"Mindboggle is a freely downloadable, open sour...",False,True,False,"Mindboggle is a freely downloadable, open sour..."
5,PMC3591454,All graph algorithms used in this article were...,False,True,False,The graph's community structure can be determi...
6,01a01273-2e2c-423c-b480-6a3161c2aa36,Data analysis was performed in R version 3.2.3...,True,False,False,Data sets obtained in this study have been upl...
...,...,...,...,...,...,...
0,PMC5388469,The study was approved by the Health Research ...,False,False,False,The protocol for this study has been published...
0,PMC3660501,U0216 and Wortmannin were dissolved in PBS whe...,False,False,False,"An ERK1/2 inhibitor, U0216; a PI3 kinase inhib..."
0,PMC4213368,The linker unit serves as an acceptor for the ...,False,False,False,The rhamnosyltransferase WbbL then attaches th...
0,PMC4451992,The Training and Recourse Center of CIDI in Be...,False,False,False,The WHO Composite International Diagnostic Int...
