https://docs.google.com/spreadsheets/d/1CFGZq90xc1V0ARubq3ZXT5G2KoMewEJWazLnCtIIa5M/edit#gid=0

https://huggingface.co/datasets/squad

https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering

Create question answering training dataset from GO table:
- answer_start = integer marking the start position of the annotated answer in the full text file
- text = text of the answer
- context = full text
- id = number from (0, len(GO))
- question: Does the Graphene Oxide {property} correlate positively or negatively with {effect} in {biosystem_organism}?
- title = title for question

In [1]:
import os
import pandas as pd
import re
from unstructured.partition.auto import partition


In [2]:
go = pd.read_csv('../data/go.tsv', sep="\t") #.dropna(how='any')
dois = [str(article) for article in set(go['doi']) if 'doi' in str(article)]
#go = go[go['doi'].isin(dois)]
#go['filenames'] = ['../data/docs/'+i.replace('http://', '').replace('doi.org/', '').replace('/', '.').replace('dx.', '') for i in go['doi']]
go.columns

Index(['property', 'effect', 'type_relation', 'dose', 'material',
       'biosystem_organism', 'dose.1', 'coating', 'diameter_type', 'diameter',
       'Material properties', 'quote', 'doi'],
      dtype='object')

In [3]:
go.describe(include='all')

Unnamed: 0,property,effect,type_relation,dose,material,biosystem_organism,dose.1,coating,diameter_type,diameter,Material properties,quote,doi
count,61,61,51,14,95,58,9,0.0,1,3,45,105,99
unique,16,23,4,11,8,35,5,,1,1,16,82,17
top,exposure,toxicity,PositivelyCorrelates,< 20 μg/mL,GO,Daphnia magnadoses (1–50 mg.L) and exposure ti...,2 μg/mL to 6 μg/mL,,hydrodynamic diameter,100nm to 110nm,GO layers with a median size of 200–300 nm. av...,This review reveals that the toxicity of graph...,dx.doi.org/10.1021/tx400385x
freq,17,15,21,2,68,6,4,,1,3,8,6,47
mean,,,,,,,,,,,,,
std,,,,,,,,,,,,,
min,,,,,,,,,,,,,
25%,,,,,,,,,,,,,
50%,,,,,,,,,,,,,
75%,,,,,,,,,,,,,


In [4]:
def strip_xml_tags(text):
    # Remove XML tags from the text using regular expressions
    return re.sub(r'<[^>]+>', '', text)

def get_full_text(doi):
    my_files = os.listdir('../data/docs')
    doi = doi.split("/")[-1]
    if '/' in doi:
        doi = doi.split('/')[1]
    
    for file in my_files:
        if doi in file:
            filepath = os.path.join('../data/docs', file)
            print(f'Parsing {filepath}')
            elements = partition(filepath)
            full_text = " ".join([strip_xml_tags(str(el)) for el in elements])
            return full_text
        else:
            if '/' in doi:
                doi_attempt = doi.split('/')[1]
                if doi_attempt in file:
                    print(f'Parsing {filepath}')
                    elements = partition(filepath)
                    full_text = " ".join([strip_xml_tags(str(el)) for el in elements])
                    return full_text

def find_answer_start(context, answer):

    max = 7
    start = context.find(answer[0:max])
    while start == -1 and start >0:
        max -=1
        start = context.find(answer[0:max])
    return start

In [5]:
# A. For each journal article, list the effects DONE
# B. For each nanomaterial mentioned, list  TODO
squad_sbd4nano = pd.DataFrame()
seen = {}
for index, row in go.iterrows():
    id = str(index)
    title = str(row['doi'])
    if title not in seen.keys(): 
        context = str(get_full_text(title))
        seen[title] = context
        question = f'Which effects on organisms or cells are described upon exposure to graphene oxide?' #A
        text = str(row['quote'])
        if re.match(r'^\s*$', text) is None:
            answer_start = str(find_answer_start(context, text))
            answers = {"text": [text], "answer_start": [answer_start]}
            new_row = pd.DataFrame([id,title,context,question,answers]).transpose()
            squad_sbd4nano = pd.concat([squad_sbd4nano, new_row], ignore_index=True)
    else:
        context = seen[title]
    if row['effect'] != 'NaN' and len(str(row['Material properties'])) > 3:
        effect = row['effect']
        question_b = f'Which are the properties of graphene oxide leading to {effect}?' #B
        props = row['Material properties'].split(".")
        answers = {"text": [], "answer_start": []}
        for prop in props:
            text = prop
            answer_start = str(find_answer_start(context, text))
            if str(answer_start) not in ['0', '-1']:
                answers['text'].append(text)
                answers['answer_start'].append(answer_start)   
        new_row = pd.DataFrame([id,title,context,question,answers]).transpose()
        squad_sbd4nano = pd.concat([squad_sbd4nano, new_row], ignore_index=True)


    answer_start = str(find_answer_start(context, text))
    answers = {"text": text, "answer_start": [answer_start]}
    new_row = pd.DataFrame([id,title,context,question,answers]).transpose()
    squad_sbd4nano = pd.concat([squad_sbd4nano, new_row], ignore_index=True)# B
    # C
    
squad_sbd4nano.columns = ['id', 'title', 'context', 'question', 'answers']


Parsing ../data/docs/tx400385x.pdf
Parsing ../data/docs/tx400385x.pdf
Parsing ../data/docs/j.toxlet.2010.11.016.xml
Parsing ../data/docs/cdd.2010.11.pdf
Parsing ../data/docs/j.biomaterials.2012.07.040.xml
Parsing ../data/docs/j.biomaterials.2012.02.021.xml
Parsing ../data/docs/nl202515a.pdf
Parsing ../data/docs/s11671-010-9751-6.xml
Parsing ../data/docs/smll.201201546.pdf
Parsing ../data/docs/nn101097v.pdf
Parsing ../data/docs/nn202699t.pdf
Parsing ../data/docs/A20060216.pdf
Parsing ../data/docs/ijms221910578.xml
Parsing ../data/docs/am300253c.pdf
Parsing ../data/docs/acsomega.2c03171.xml
Parsing ../data/docs/j.envpol.2017.12.034.xml


In [6]:
#QC
n = 0
for index, row in squad_sbd4nano.iterrows():
    if len(row['answers']['text']) > 0 and len(row['answers']['text']) == len(row['answers']['answer_start']):
        for i in range(len(row['answers']['text'])):
            n+=1
            print(f'QA #{n} | id: {row["id"]} | text: {row["answers"]["text"][i][0:10]} (...) | start position at: {row["answers"]["answer_start"][i]}')
    else:
        squad_sbd4nano.drop(index)
    
    

QA #1 | id: 0 | text: formation  (...) | start position at: 3335
QA #2 | id: 1 | text: Toxicologi (...) | start position at: -1
QA #3 | id: 6 | text: Exposure o (...) | start position at: 6198
QA #4 | id: 6 | text: diameter b (...) | start position at: 9975
QA #5 | id: 7 | text: diameter b (...) | start position at: 9975
QA #6 | id: 8 | text: diameter b (...) | start position at: 9975
QA #7 | id: 9 | text: The thickn (...) | start position at: 16124
QA #8 | id: 9 | text:  The size  (...) | start position at: 15084
QA #9 | id: 9 | text:  The absor (...) | start position at: 10750
QA #10 | id: 9 | text:  The oxyge (...) | start position at: 31769
QA #11 | id: 9 | text:   (...) | start position at: 10
QA #12 | id: 9 | text:   (...) | start position at: 10
QA #13 | id: 10 | text: The influe (...) | start position at: 14188
QA #14 | id: 10 | text: The thickn (...) | start position at: 30134
QA #15 | id: 10 | text:  S1) (...) | start position at: 30214
QA #16 | id: 10 | text:  Both larg (...

In [7]:
squad_sbd4nano.head(5)

Unnamed: 0,id,title,context,question,answers
0,0,doi.org/10.1021/tx400385x,"Downloaded via MAASTRICHT UNIV on June 26, 202...",Which effects on organisms or cells are descri...,{'text': ['formation of hydrogen bonds between...
1,0,doi.org/10.1021/tx400385x,"Downloaded via MAASTRICHT UNIV on June 26, 202...",Which effects on organisms or cells are descri...,{'text': 'formation of hydrogen bonds between ...
2,1,,,Which effects on organisms or cells are descri...,{'text': ['Toxicological aspects related to cy...
3,1,,,Which effects on organisms or cells are descri...,{'text': 'Toxicological aspects related to cyt...
4,2,,,Which effects on organisms or cells are descri...,{'text': 'Toxicological aspects related to cyt...


In [8]:
squad_sbd4nano.describe()

Unnamed: 0,id,title,context,question,answers
count,170,170,170,170,170
unique,110,18,16,1,59
top,102,dx.doi.org/10.1021/tx400385x,"Downloaded via MAASTRICHT UNIV on June 26, 202...",Which effects on organisms or cells are descri...,"{'text': 'GO film, beyond not exerting any cyt..."
freq,3,60,62,170,28


In [9]:
squad_sbd4nano.to_csv('../data/squad_sbd4nano.tsv', sep='\t')