# Data Extraction

- Get the MedQuAD dataset from the GitHub repository: https://github.com/abachaa/MedQuAD

- Extract question-answer pairs: Parse the XML files in the dataset to extract the question-answer pairs. Each pair should contain the question text and its corresponding answer.

- Clean and normalize the text: Remove any special characters, HTML tags, or irrelevant formatting.

- Convert all text to lowercase for consistency. Remove extra whitespace.

- Save the preprocessed data: Store the processed data in a format that's easy to load for training (e.g., JSON, CSV, or a custom binary format).



In [2]:
import requests
!pip install -q -U lxml
import pandas as pd
import xml.etree.ElementTree as ET
import os

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25h

Since data is stored in xml, xpath is used to extract node information

In [4]:
headers = {
    'Authorization':'token '
}
"""
Download xml files and process them
"""

list_with_ans = ['https://api.github.com/repos/komus/MedQuAD/contents/1_CancerGov_QA?ref=master',
                 'https://api.github.com/repos/komus/MedQuAD/contents/2_GARD_QA?ref=master',
                 'https://api.github.com/repos/komus/MedQuAD/contents/3_GHR_QA?ref=master',
                'https://api.github.com/repos/komus/MedQuAD/contents/4_MPlus_Health_Topics_QA?ref=master',
                'https://api.github.com/repos/komus/MedQuAD/contents/5_NIDDK_QA?ref=master',
                 'https://api.github.com/repos/komus/MedQuAD/contents/6_NINDS_QA?ref=master',
                 'https://api.github.com/repos/komus/MedQuAD/contents/7_SeniorHealth_QA?ref=master',
                 'https://api.github.com/repos/komus/MedQuAD/contents/8_NHLBI_QA_XML?ref=master',
                 'https://api.github.com/repos/komus/MedQuAD/contents/9_CDC_QA?ref=master',

                 ]

def download_process_xml(url):
  #print(url)
  try:
    resp = requests.get(url, headers)
    resp.raise_for_status()

    xml_content = resp.content
    root = ET.fromstring(xml_content)
    return parse_xml_key_pair(root)
  except Exception as e:
    print(f"Error {e}")
    return None

def parse_xml_key_pair(root):
  df = pd.DataFrame(columns=['focus','synonyms', 'semanticgroup', 'question', 'answer'])

  synonyms = [sy.text.strip() for sy in root.findall(".//Synonyms/Synonym")]
  focus = [sy.text.strip() for sy in root.findall(".//Focus")]
  #print(synonyms)
  semanticgroup = [se.text.strip() for se in root.findall(".//UMLS/SemanticGroup")]
  #print(semanticgroup)
  for qapair in root.findall(".//QAPair"):
    question = qapair.find("Question").text.strip() if qapair.find("Question") is not None else ""
    answer = qapair.find("Answer").text.strip() if qapair.find("Answer") is not None else ""

    temp_df = pd.DataFrame({
        'question': question,
        'answer': answer,
        'semanticgroup': ', '.join(semanticgroup),
        'synonyms': [synonyms],
        'focus': ', '.join(focus)
    })
    df = pd.concat([df, temp_df], ignore_index=True)
  return df


def parse_xml_to_dict(root):
  data = {}
  for elem in root.iter():
    if elem.text:
      data[elem.tag] = elem.text.strip()
    else:
      data[elem.tag] = None
  return data

output_path = "output_medplus.jsonl"

"""
Using the url, loop through the content of the repo and get the xml files
"""
def process_github_xml_files(url):
  df = pd.DataFrame()
  resp = requests.get(url, headers=headers)
  contents = resp.json()
  #print(contents)
  if contents:
    for item in contents:
      if item['type']:
        if item['type'] == 'file' and item['name'].endswith('.xml'):
          xml_data = download_process_xml(item['download_url'])
          if xml_data is not None:
          #print(xml_data)
            with open(output_path, "w") as f:
              f.write(xml_data.to_json(orient='records', lines=True, force_ascii=False))
            xml_data.to_csv('output_file1.csv', mode='a', header=not os.path.exists('output_file1.csv'), index=False)


In [5]:
for d in list_with_ans:
  process_github_xml_files(d)

Error 'NoneType' object has no attribute 'strip'
Error 'NoneType' object has no attribute 'strip'
Error 'NoneType' object has no attribute 'strip'
Error 'NoneType' object has no attribute 'strip'
Error 'NoneType' object has no attribute 'strip'
Error 'NoneType' object has no attribute 'strip'
Error 'NoneType' object has no attribute 'strip'
Error 'NoneType' object has no attribute 'strip'
Error 'NoneType' object has no attribute 'strip'
Error 'NoneType' object has no attribute 'strip'
Error 'NoneType' object has no attribute 'strip'
Error 'NoneType' object has no attribute 'strip'
Error 'NoneType' object has no attribute 'strip'
Error 'NoneType' object has no attribute 'strip'
Error 'NoneType' object has no attribute 'strip'
Error 'NoneType' object has no attribute 'strip'
Error 'NoneType' object has no attribute 'strip'
Error 'NoneType' object has no attribute 'strip'
Error 'NoneType' object has no attribute 'strip'
Error 'NoneType' object has no attribute 'strip'
Error 'NoneType' obj