In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import re  # Import the re module for regular expressions
import random  # Import the random module
import pandas as pd

# Function to extract the abstract of a research paper from its link
def extract_abstract(link):
    response = requests.get(link)
    soup = BeautifulSoup(response.text, "html.parser")
    abstract = soup.find("div", class_="abstract-content selected")
    return abstract.get_text(strip=True) if abstract else None

# Function to clean text by removing all non-alphanumeric characters and numbers except for years
def clean_text(text):
    # Remove all numbers except for years starting with '19' or '20'
    text = re.sub(r'\b(?!19\d{2}\b)(?!20\d{2}\b)\d+\b', '', text)  # Matches numbers not starting with '19xx' or '20xx' and removes them
    # Remove all other non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

# Function to partition text into exactly 200 partitions with specified words per partition
def partition_text(text, num_partitions=200, words_per_partition=100):
    words = text.split()
    partitions = []
    start_index = random.randint(0, max(0, len(words) - num_partitions * words_per_partition))
    for i in range(num_partitions):
        start = start_index + i * words_per_partition
        end = start + words_per_partition
        partition = " ".join(words[start:end])
        if len(partition.split()) == words_per_partition:  # Ensure partition has exactly 100 words
            partitions.append(partition)
        if len(partitions) >= num_partitions:
            break
    return partitions

# Function to scrape PubMed for a specific query
def scrape_pubmed(query, max_results=200):
    url = f"https://pubmed.ncbi.nlm.nih.gov/?term={query.replace(' ', '+')}&size=200"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    abstracts = []
    results = soup.find_all("article", class_="full-docsum")

    links = ["https://pubmed.ncbi.nlm.nih.gov" + article.find("a", class_="docsum-title")["href"] for article in results]

    with ThreadPoolExecutor(max_workers=10) as executor:
        for abstract in executor.map(extract_abstract, links):
            if abstract:  # Only include articles with an abstract
                clean_abstract = clean_text(abstract)
                abstracts.append(clean_abstract)
            if len(abstracts) >= max_results:
                break

    return abstracts

# Scrape data for each category
scraped_data = []
# Define categories
categories = {
    "Category1": "clinical depression",
    "Category2": "bipolar disorder",
    "Category3": "anxiety disorder",
    "Category4": "post-traumatic stress disorder",
    "Category5": "schizophrenia"
}

for idx, (label, category) in enumerate(categories.items()):
    data = []
    while len(data) < 200:
        data.extend(scrape_pubmed(category, max_results=200 - len(data)))
    partitions = []
    for abstract in data:
        if abstract:  # Ensure abstract is not None
            partitions.extend(partition_text(abstract))
        if len(partitions) >= 200:
            break
    partitions = partitions[:200]  # Ensure exactly 200 partitions per category
    for partition in partitions:
        if partition.strip():  # Ensure partition is not empty
            scraped_data.append({"Label": category, "Abstract": partition})

# Save the scraped data to a CSV
df_scraped = pd.DataFrame(scraped_data, columns=["Label", "Abstract"])
df_scraped.to_csv("scraped_pubmed_abstract.csv", index=False)

print("Scraped data saved to 'scraped_pubmed_abstract.csv'.")

print(df_scraped.head(5))
print(df_scraped.shape)

ConnectionError: HTTPSConnectionPool(host='pubmed.ncbi.nlm.nih.gov', port=443): Max retries exceeded with url: /21306217/ (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x126fa4040>: Failed to resolve 'pubmed.ncbi.nlm.nih.gov' ([Errno 8] nodename nor servname provided, or not known)"))