https://docs.google.com/spreadsheets/d/1CFGZq90xc1V0ARubq3ZXT5G2KoMewEJWazLnCtIIa5M/edit#gid=0

https://huggingface.co/datasets/squad

https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering

Create question answering training dataset from GO table:
- answer_start = find the start of the annotated answer in the full text file
- text = text of the answer
- context = full text
- id = number from (0, len(GO))
- question: Does the Graphene Oxide {property} correlate positively or negatively with {effect} in {biosystem_organism}?
- title = title for question

In [1]:
%pip install PyPDF2


Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
# Necessary to import OpenAI
with open('assets/openai_api_key', 'r') as f:
    openai_api_key = f.read()
    os.environ['OPENAI_API_KEY'] = openai_api_key
from pathlib import Path

import json
import pandas as pd
import re

import PyPDF4
import xml.etree.ElementTree as ET
from unstructured.partition.auto import partition
from unstructured.documents.elements import Text
from unstructured.cleaners.core import replace_unicode_quotes
from unstructured.cleaners.core import bytes_string_to_string
from unstructured.cleaners.core import clean
from fuzzywuzzy import fuzz



In [3]:
dataset_path = '../data/graphene_oxide.tsv'
paper_path = '../data/docs/'
pickle_path = '../data/pickle/'
chroma_path = '../data/chroma/'
MODEL = 'gpt-3.5-turbo'

In [4]:
go = pd.read_csv('../data/graphene_oxide.tsv', sep="\t").dropna(how='any')
dois = [str(article) for article in set(go['doi']) if 'doi' in str(article)]
go = go[go['doi'].isin(dois)]
go['filenames'] = ['../data/docs/'+i.replace('http://', '').replace('doi.org/', '').replace('/', '.').replace('dx.', '') for i in go['doi']]
go.columns

Index(['property', 'effect', 'type_relation', 'dose', 'material',
       'biosystem_organism', 'quote', 'doi', 'filenames'],
      dtype='object')

In [5]:
go.describe()

Unnamed: 0,property,effect,type_relation,dose,material,biosystem_organism,quote,doi,filenames
count,14,14,14,14,14,14,14,14,14
unique,4,7,3,11,3,11,10,6,6
top,Dose,cell viability,DoesNotCorrelate,< 20 μg/mL,GO,Mice>0.4mg/mouse,graphene oxides under low dose (0.1 mg) and mi...,https://doi.org/10.1007/s11671-010-9751-6,../data/docs/https:..10.1007.s11671-010-9751-6
freq,8,3,6,2,12,2,3,6,6


In [6]:
def strip_xml_tags(text):
    # Remove XML tags from the text using regular expressions
    return re.sub(r'<[^>]+>', '', text)

def get_full_text(doi):
    my_files = os.listdir('../data/docs')
    doi = doi.split("/")[-1]
    if '/' in doi:
        doi = doi.split('/')[1]
    
    for file in my_files:
        if doi in file:
            filepath = os.path.join('../data/docs', file)
            print(f'Parsing {filepath}')
            elements = partition(filepath)
            full_text = " ".join([strip_xml_tags(str(el)) for el in elements])
            return full_text
        else:
            if '/' in doi:
                doi_attempt = doi.split('/')[1]
                if doi_attempt in file:
                    print(f'Parsing {filepath}')
                    elements = partition(filepath)
                    full_text = " ".join([strip_xml_tags(str(el)) for el in elements])
                    return full_text

def find_answer_start(context, answer):

    max = 7
    start = context.find(answer[0:max])
    while start == -1 and start >0:
        max -=1
        start = context.find(answer[0:max])
    return start

In [7]:
squad_sbd4nano = pd.DataFrame()

for index, row in go.iterrows():
    id = str(index)
    title = str(row['doi'])
    context = str(get_full_text(title))
    question = f'Which effects are described for graphene oxide (GO)   nanomaterials and how do they relate to its properties?'
    text = str(row['quote'])
    answer_start = str(find_answer_start(context, text))
    answers = {"text": text, "answer_start": answer_start}
    new_row = [id,title,context,question,answers]
    squad_sbd4nano = pd.concat([squad_sbd4nano, new_row])

squad_sbd4nano.columns = ['id', 'title', 'context', 'question', 'answers']
squad_sbd4nano.to_json('squad_sbd4nano_go.json')

Looking for https://doi.org/10.1016/j.biomaterials.2012.02.021 in ../data/docs
Parsing ../data/docs/j.biomaterials.2012.02.021.xml
Looking for https://doi.org/10.1007/s11671-010-9751-6 in ../data/docs
Parsing ../data/docs/s11671-010-9751-6.xml
Looking for dx.doi.org/10.1021/tx400385x in ../data/docs
Parsing ../data/docs/tx400385x.pdf
Looking for https://doi.org/10.1021/nn101097v in ../data/docs
Parsing ../data/docs/nn101097v.pdf
Looking for https://doi.org/10.1021/acsomega.2c03171 in ../data/docs
Parsing ../data/docs/acsomega.2c03171.xml
Looking for https://doi.org/10.1016/j.envpol.2017.12.034 in ../data/docs
Parsing ../data/docs/j.envpol.2017.12.034.xml


https://huggingface.co/datasets/squad


{

    "answers": {

        "answer_start": [1],

        "text": ["This is a test text"]

    },

    "context": "This is a test context.",

    "id": "1",

    "question": "Is this a test?",

    "title": "train test"
    
}


[{'answers': {'answer_start': int, 'text': str},
  'context': str,
  'id': int,
  'question': str,
  'title': str},
 {'answers': {'answer_start': int, 'text': str},
  'context': str,
  'id': int,
  'question': str,
  'title': str},
 {'answers': {'answer_start': int, 'text': str},
  'context': str,
  'id': int,
  'question': str,
  'title': str},
 {'answers': {'answer_start': int, 'text': str},
  'context': str,
  'id': int,
  'question': str,
  'title': str},
 {'answers': {'answer_start': int, 'text': str},
  'context': str,
  'id': int,
  'question': str,
  'title': str},
 {'answers': {'answer_start': int, 'text': str},
  'context': str,
  'id': int,
  'question': str,
  'title': str},
 {'answers': {'answer_start': int, 'text': str},
  'context': str,
  'id': int,
  'question': str,
  'title': str},
 {'answers': {'answer_start': int, 'text': str},
  'context': str,
  'id': int,
  'question': str,
  'title': str},
 {'answers': {'answer_start': int, 'text': str},
  'context': str,
  'id