# Content Extraction
Script to obtain protocols from an article URL.
   
## input files
- **config file** 

    - In the config file, please specify the following three items:

        - **input_link** : Link of the website (BioRxiv) 
        - **output_path** (string) : path for the obtained protocols **"./protocols/"** 
        - **num_keywords** (int) : Minimum number of keywords (and units) a section must contain to be kept as a protocol - higher number = higher quality 
        
## output files

The obtained protocols are stored in the **"./protocols/"** folder.


In [1]:
import os
import requests
import re
import configparser
import errno
import logging
import ast
import csv

from bs4 import BeautifulSoup

In [2]:
# Path of the INI file that is read further down to obtain the input link,
# the output path, the keyword threshold and the section/keyword lists.
config_path = "./content_config.ini"

## There are two different types of functions for extracting content, depending on the website
### e.g. If get_text is not working, please use get_text2

In [3]:
def modify_biorxiv_url(url, logger):
    """Return the ``.full`` (full-text) variant of a bioRxiv article URL.

    Args:
        url: Article link as a string.
        logger: Object with an ``error(msg)`` method; used to report links
            that do not contain ``biorxiv``.

    Returns:
        ``url`` unchanged when it does not contain ``biorxiv`` (an error is
        logged) or already contains ``.full``; otherwise the URL with
        ``.full`` appended to its path.
    """
    from urllib.parse import urlsplit, urlunsplit

    if 'biorxiv' not in url:
        logger.error("The input link is not from biorxiv!")
        return url

    if '.full' in url:
        # Already points at the full-text page; keep it unchanged.
        return url

    try:
        parts = urlsplit(url)
    except ValueError:
        # Malformed URL that cannot be split: fall back to plain appending.
        return url + '.full'

    # Append the suffix to the path only, so that a trailing slash, a query
    # string or a fragment does not end up in front of ".full".
    full_path = parts.path.rstrip('/') + '.full'
    return urlunsplit(parts._replace(path=full_path))
    
def get_text(url, exclude_sections, timeout=30):
    """Download a page and extract the text of its ``<div class="section">`` blocks.

    A section is used only if it contains an ``<h2>`` heading and that
    heading (with any leading numbering such as ``"2.1 "`` removed) does not
    contain any entry of ``exclude_sections`` (case-insensitive).

    Args:
        url: Address of the page to download.
        exclude_sections: Iterable of strings; sections whose heading
            contains one of them are skipped.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A 4-tuple ``(paragraph, section_titles, subsections, sub_subsections)``:
        ``paragraph`` is every kept section as ``"<title>\\n<text>\\n"``
        concatenated; ``section_titles`` lists the kept ``<h2>`` headings;
        ``subsections`` / ``sub_subsections`` map each ``<h3>`` / ``<h4>``
        heading text to a list holding the text of the first ``<p>`` that
        follows each occurrence of that heading.
    """
    # A timeout keeps the call from hanging forever on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    section_titles = []
    subsections = {}
    sub_subsections = {}
    paragraph_parts = []

    def collect(section, tag_name, bucket):
        # Store the first paragraph that follows every `tag_name` heading.
        for heading in section.find_all(tag_name):
            first_paragraph = heading.find_next("p")
            if first_paragraph is None:
                # Heading with no paragraph after it: nothing to store.
                continue
            bucket.setdefault(heading.text.strip(), []).append(first_paragraph.get_text())

    for section in soup.find_all("div", class_="section"):
        title_tag = section.find("h2")
        if title_tag is None:
            continue

        section_title = title_tag.text.strip()
        # Drop leading numbering before matching against the exclusions.
        title_without_number = re.sub(r'^[\d.]+\s*', '', section_title).lower()
        if any(excluded.lower() in title_without_number for excluded in exclude_sections):
            continue

        section_titles.append(section_title)
        # Remove only the first occurrence (the heading itself); later
        # mentions of the same words belong to the body text.
        section_text = section.get_text().replace(section_title, "", 1).strip()
        paragraph_parts.append(f"{section_title}\n{section_text}\n")

        collect(section, "h3", subsections)      # subtitles under titles
        collect(section, "h4", sub_subsections)  # titles under subtitles

    return "".join(paragraph_parts), section_titles, subsections, sub_subsections

def get_text2(url, exclude_sections, timeout=30):
    """Download a page and extract the text of its ``<section>`` elements.

    Alternative to ``get_text`` for pages whose sections are ``<section>``
    elements with an ``<h2 class="c-article-section__title">`` heading. A
    section is used only if it has such a heading and the heading (with any
    leading numbering removed) does not contain any entry of
    ``exclude_sections`` (case-insensitive).

    Args:
        url: Address of the page to download.
        exclude_sections: Iterable of strings; sections whose heading
            contains one of them are skipped.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A 4-tuple ``(paragraph, section_titles, subsections, sub_subsections)``:
        ``paragraph`` is every kept section as ``"<title>\\n<text>\\n"``
        concatenated; ``section_titles`` lists the kept headings;
        ``subsections`` / ``sub_subsections`` map each ``<h3>`` / ``<h4>``
        heading text to a list holding the text of the first ``<p>`` that
        follows each occurrence of that heading.
    """
    # A timeout keeps the call from hanging forever on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    section_titles = []
    subsections = {}
    sub_subsections = {}
    paragraph_parts = []

    def collect(section, tag_name, bucket):
        # Values are always lists (never bare strings) so that repeated
        # headings can be appended and consumers can index the first entry.
        for heading in section.find_all(tag_name):
            first_paragraph = heading.find_next("p")
            if first_paragraph is None:
                # Heading with no paragraph after it: nothing to store.
                continue
            bucket.setdefault(heading.text.strip(), []).append(first_paragraph.get_text())

    for section in soup.find_all("section"):
        title_tag = section.find("h2", class_="c-article-section__title")
        if title_tag is None:
            continue

        section_title = title_tag.text.strip()
        # Drop leading numbering before matching against the exclusions.
        title_without_number = re.sub(r'^[\d.]+\s*', '', section_title).lower()
        if any(excluded.lower() in title_without_number for excluded in exclude_sections):
            continue

        section_titles.append(section_title)
        # Remove only the first occurrence (the heading itself); later
        # mentions of the same words belong to the body text.
        section_text = section.get_text().replace(section_title, "", 1).strip()
        paragraph_parts.append(f"{section_title}\n{section_text}\n")

        collect(section, "h3", subsections)      # subtitles under titles
        collect(section, "h4", sub_subsections)  # titles under subtitles

    return "".join(paragraph_parts), section_titles, subsections, sub_subsections

def prt_selection(sections, protocol_title, subsections_exclude):
    """Split sub-headings into information sections and protocol sections.

    Args:
        sections: Dict mapping a heading to its content. The content may be
            a list of paragraph strings (only the first entry is used) or a
            single string.
        protocol_title: Strings that mark a heading as a protocol heading
            when contained in it (case-insensitive).
        subsections_exclude: Strings that cause a heading to be dropped when
            contained in it (case-insensitive).

    Returns:
        ``(info_sections, protocol_sections)``: two dicts mapping heading to
        content string. Headings matching ``subsections_exclude`` and
        headings without any content appear in neither.
    """
    info_sections = {}
    protocol_sections = {}
    excluded = [name.lower() for name in subsections_exclude]
    wanted = [name.lower() for name in protocol_title]

    for heading, value in sections.items():
        heading_lower = heading.lower()
        if any(name in heading_lower for name in excluded):
            continue

        if isinstance(value, str):
            # Indexing a string would return only its first character.
            content = value
        elif value:
            content = value[0]
        else:
            # No paragraph was collected for this heading.
            continue

        # Classify as protocol section or information section.
        if any(name in heading_lower for name in wanted):
            protocol_sections[heading] = content
        else:
            info_sections[heading] = content

    return info_sections, protocol_sections

def _matched_terms(text, keywords, units):
    """Return the lower-cased keywords and the lower-cased units found in ``text``."""
    found_keywords = [
        needle
        for needle in (raw.lower() for raw in keywords)
        if needle in text.lower()
    ]
    found_units = []
    for needle in (raw.lower() for raw in units):
        # The unit must start at a word boundary or directly after a digit
        # and must end at a word boundary.
        unit_regex = r'(\b|\d)' + re.escape(needle) + r'\b'
        if re.search(unit_regex, text, re.IGNORECASE):
            found_units.append(needle)
    return found_keywords, found_units


def pcr_decider(sections, keywords, units, num):
    """Select the text that mentions enough keywords and units.

    A text qualifies when at least one keyword occurs in it and the number
    of matched keywords plus matched units is at least ``num``.

    Args:
        sections: Either a dict mapping heading to content string, or a
            single string.
        keywords: Strings searched as case-insensitive substrings.
        units: Strings searched case-insensitively as units (see
            ``_matched_terms``).
        num: Minimum combined number of matched keywords and units.

    Returns:
        ``(protocol_sections, k)``. For a dict input, ``protocol_sections``
        is a dict of the qualifying headings and ``k`` maps every heading to
        the list of its matched terms. For any other input,
        ``protocol_sections`` is the input itself when it qualifies
        (otherwise an empty dict) and ``k`` is the list of matched terms.
    """
    def qualifies(n_keywords, n_units):
        # Units alone are never enough: at least one keyword is required.
        return n_keywords != 0 and n_keywords + n_units >= num

    if not isinstance(sections, dict):
        # Single block of text.
        hit_keywords, hit_units = _matched_terms(sections, keywords, units)
        selected = sections if qualifies(len(hit_keywords), len(hit_units)) else {}
        return selected, hit_keywords + hit_units

    selected = {}
    matches = {}
    for heading, content in sections.items():
        hit_keywords, hit_units = _matched_terms(content, keywords, units)
        matches[heading] = hit_keywords + hit_units
        if qualifies(len(hit_keywords), len(hit_units)):
            selected[heading] = content
    return selected, matches

def store_pcr(subsections, protocol_title, subsections_exclude, keywords, units, num):
    """Build the list of protocols found in a set of sub-headings.

    The sub-headings are split by ``prt_selection`` into information and
    protocol sections. If any protocol section passes ``pcr_decider``, each
    passing section is returned together with all information sections.
    Otherwise the information sections themselves are run through
    ``pcr_decider`` and each passing one is returned on its own.

    Returns:
        ``(protocols, k, info_sections, protocol_sections)`` where
        ``protocols`` is a list of dicts (one per protocol), ``k`` is the
        matched-term result of the last ``pcr_decider`` call (an empty list
        if it was never called), and the last two items are the dicts
        produced by ``prt_selection`` (empty dicts if it was never called).
    """
    # Nothing to process at all.
    if not subsections:
        return [], [], {}, {}

    info_sections, protocol_sections = prt_selection(
        subsections, protocol_title, subsections_exclude
    )
    # No heading looks like a protocol heading: nothing is checked further.
    if not protocol_sections:
        return [], [], info_sections, protocol_sections

    confirmed, k = pcr_decider(protocol_sections, keywords, units, num)
    if confirmed:
        # Every protocol is stored together with the information sections.
        protocols = [
            {**info_sections, heading: content}
            for heading, content in confirmed.items()
        ]
        return protocols, k, info_sections, protocol_sections

    # No protocol heading qualified: look inside the information sections.
    fallback, k = pcr_decider(info_sections, keywords, units, num)
    protocols = [{heading: content} for heading, content in fallback.items()]
    return protocols, k, info_sections, protocol_sections

In [4]:
def main(url, sections_exclude, num_keywords, subsections_exclude, keywords, units, protocol_title, results_save_path, logger):
    """Extract protocols from an article and write them to a CSV file.

    The page text is fetched with ``get_text`` and, if that yields nothing,
    with ``get_text2``. When the page has no sub-headings the whole text is
    checked by ``pcr_decider``; otherwise sub-headings and sub-sub-headings
    are processed by ``store_pcr``. Every protocol found is written to
    ``results_save_path`` with the columns "protocol index", "heading name"
    and "content".

    Args:
        url: Link of the article.
        sections_exclude: Main headings to skip.
        num_keywords: Minimum number of keyword/unit matches; anything
            accepted by ``int()``.
        subsections_exclude: Sub-headings to skip.
        keywords: Keywords searched in the text.
        units: Units searched in the text.
        protocol_title: Strings identifying protocol sub-headings.
        results_save_path: Path of the CSV file to write.
        logger: Logger used to report errors.

    Raises:
        ValueError: If ``num_keywords`` is not an integer, if extraction
            fails, if no text is extracted, or if no protocol is found.
    """
    headers = ["protocol index", "heading name", "content"]

    try:
        num_keywords = int(num_keywords)
    except Exception as e:
        logger.error(f"Error occured in number of keywords selections:{str(e)}")
        raise ValueError(f"Error occured in number of keywords selections:{str(e)}") from e

    # Check whether the link is from BioRXiv or not
    full_url = modify_biorxiv_url(url, logger)

    try:
        # get text from filtered headings
        spec_paragraph, titles, subsections, sub_subsections = get_text(full_url, sections_exclude)
        if not spec_paragraph:
            spec_paragraph, titles, subsections, sub_subsections = get_text2(full_url, sections_exclude)
    except Exception as e:
        logger.error(f"Error occured when filtering headings and extractint text:{str(e)}")
        raise ValueError(f"Error occured in filtered headings and extractint text:{str(e)}") from e

    if not spec_paragraph:
        logger.error("There is no text extracted from the link you've provided!")
        raise ValueError("There is no text extracted from the link you've provided!")

    # All three results are initialised so that the "nothing found" check
    # below works no matter which branch is taken.
    protocol = None
    subsection_protocol = []
    sub_subsection_protocol = []

    if not subsections and not sub_subsections:
        try:
            protocol, _ = pcr_decider(spec_paragraph, keywords, units, num_keywords)
        except Exception as e:
            logger.error(f"Error occured in obtain text from main headings:{str(e)}")
            raise ValueError(f"Error occured in obtain text from main headings:{str(e)}") from e
    else:
        # subsections
        try:
            subsection_protocol, _, _, _ = store_pcr(subsections, protocol_title, subsections_exclude, keywords, units, num_keywords)
        except Exception as e:
            logger.error(f"Error occured in obtain text from subheadings:{str(e)}")
            raise ValueError(f"Error occured in obtain text from subheadings:{str(e)}") from e

        # sub-subsections
        try:
            sub_subsection_protocol, _, _, _ = store_pcr(sub_subsections, protocol_title, subsections_exclude, keywords, units, num_keywords)
        except Exception as e:
            logger.error(f"Error occured in obtain text from sub-subheadings:{str(e)}")
            raise ValueError(f"Error occured in obtain text from sub-subheadings:{str(e)}") from e

    # Check whether the article mentions PCR or not
    if not protocol and not subsection_protocol and not sub_subsection_protocol:
        logger.error("There is no PCR mentioned in the article!")
        raise ValueError("There is no PCR mentioned in the article!")

    # pcr_decider returns the text itself (a string) when it was given a
    # string, so the heading-level result is normalised to (name, content)
    # pairs; the kept main headings serve as the name of that single row.
    if isinstance(protocol, dict):
        heading_rows = list(protocol.items())
    elif protocol:
        heading_rows = [("; ".join(titles), protocol)]
    else:
        heading_rows = []

    # Make sure the target folder exists before opening the file.
    output_dir = os.path.dirname(results_save_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # save content into csv file
    with open(results_save_path, mode='w', newline='', encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(headers)
        index = 0

        # Heading content: one protocol index per row.
        for heading_name, content in heading_rows:
            writer.writerow([index, heading_name, content])
            index += 1

        # Subheading and sub-subheading content: all rows of one protocol
        # share the same protocol index.
        for protocol_group in (subsection_protocol, sub_subsection_protocol):
            for data_dict in protocol_group:
                for name, content in data_dict.items():
                    writer.writerow([index, name, content])
                index += 1

In [5]:
### read ini file
# Reads the settings from `config_path`, sets up file logging and runs `main`.
config = configparser.ConfigParser()

if not os.path.exists(config_path):
    raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), config_path)

try:
    config.read(config_path)

    ## input link
    url = config.get("input_link", "url")

    ## output dir
    results_save_path = config.get("output_path", "out_path")

    ## number of keywords
    num_keywords = config.get("num_keywords", "num_keywords")

    # Set the file path for the log file
    log_file = "./error.log"

    # Configure the logging settings (INFO and above are recorded)
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Create a FileHandler to save logs to the specified file path
    file_handler = logging.FileHandler(log_file)
    file_handler.setLevel(logging.INFO)

    # Create a formatter to customize the log message format
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)
    # Add the FileHandler to the logger
    logger.addHandler(file_handler)

    ## get sections
    # Each value is stored in the ini file as the string form of a Python
    # list and is parsed into an actual list with ast.literal_eval.
    sections_exclude = ast.literal_eval(config.get("sections", "sections_exclude"))
    subsections_exclude = ast.literal_eval(config.get("sections", "subsections_exclude"))
    keywords = ast.literal_eval(config.get("sections", "keywords"))
    units = ast.literal_eval(config.get("sections", "units"))
    protocol_title = ast.literal_eval(config.get("sections", "protocol_title"))

except Exception as e:
    print("Error occured in reading setting.ini:", e)
    # Stop here: continuing would call main() with undefined settings and
    # hide the real cause behind a NameError.
    raise

main(url, sections_exclude, num_keywords, subsections_exclude, keywords, units, protocol_title, results_save_path, logger)

ValueError: There is no text extracted from the link you've provided!