# Extract literature from OpenAlex
- Fetch literature (paper title, abstract, and metadata) from the OpenAlex API
- Define search string: search_term = '("keywordA" OR "keywordB" OR "keywordC")'
- Define time range of relevant publications: publication_year_range = '2014-2024'

## Environment setup
- Import libraries
- Define working directory
- Define helper functions

In [None]:
# Import libraries
import os
import pandas as pd
import requests
import nltk
from collections import namedtuple
from nltk.corpus import brown
import time
import re
import urllib.parse
import html
import logging
import datetime
from tqdm import tqdm

# Download the Brown corpus; its word list (brown.words()) is what the
# English-language check in this notebook compares paper text against.
nltk.download('brown')

# Reaching this line means every import above resolved.
print("All imports successful.")

In [None]:
# Show where the notebook is running; all output paths hang off this directory
print("Current Working Directory:", os.getcwd())

# Global flags
FETCH_PAPERS = True  # Toggle for paper fetching
ALLOW_DOWNLOAD = True  # Toggle for downloading functionality
LLM_EXECUTION = True  # Toggle for LLM-based tasks

# Output locations: a base folder under the working directory, plus an
# 'Analysis' subfolder inside it
base_folder = os.path.join(os.getcwd(), 'FOLDER NAME')
analysis_folder = os.path.join(base_folder, 'Analysis')

# Create each folder if it is missing and report which case applied
for _label, _path in (('FOLDER NAME', base_folder), ('Analysis', analysis_folder)):
    if os.path.exists(_path):
        print(f"Folder '{_label}' already exists at: {_path}")
    else:
        os.makedirs(_path, exist_ok=True)
        print(f"Folder '{_label}' created at: {_path}")
del _label, _path  # keep the loop variables out of the notebook namespace

# Errors raised while fetching/processing are written to a log file in the base folder
error_log_file = os.path.join(base_folder, 'fetch_errors.log')
logging.basicConfig(filename=error_log_file, level=logging.ERROR)

# Confirm save location
print(f"Logging and outputs will be saved in: {base_folder}")

# Today's date (YYYY-MM-DD), used to stamp output file names
today_date = datetime.date.today().isoformat()
print(f"Today's date: {today_date}")

In [None]:
# Define helper functions
def to_abstract(paper):
    """Rebuild the plain-text abstract from an OpenAlex inverted index.

    The ``abstract_inverted_index`` entry maps each word to the list of
    positions at which it occurs. The words are laid back out in ascending
    position order and joined with single spaces. A missing or empty index
    yields an empty string.
    """
    if 'abstract_inverted_index' not in paper:
        return ''
    inverted_index = paper['abstract_inverted_index']
    if not inverted_index:
        return ''

    # One (position, word) pair per occurrence of every word.
    placed = []
    for token, positions in inverted_index.items():
        for position in positions:
            placed.append((position, token))

    # Sort on position only; the sort is stable, so ties keep index order.
    placed.sort(key=lambda pair: pair[0])
    return ' '.join(token for _, token in placed)

def is_english(text, brown_corpus, threshold=0.15):
    """Classify text as English by its vocabulary overlap with a reference word set.

    The text is lower-cased and split into unique ``\\w+`` tokens; the share of
    those tokens found in ``brown_corpus.value`` is compared with ``threshold``.

    Returns a ``(status, language)`` pair: ``('EMPTY', 'NA')`` for empty text,
    otherwise ``('PASS', 'en')`` when the share exceeds the threshold and
    ``('PASS', 'non-en')`` when it does not.
    """
    if not text:
        return 'EMPTY', 'NA'

    vocabulary = set(re.findall(r"\w+", text.lower()))
    if vocabulary:
        recognised = vocabulary & brown_corpus.value
        lang_ratio = len(recognised) / len(vocabulary)
    else:
        # Text with no word characters at all counts as zero overlap.
        lang_ratio = 0

    if lang_ratio > threshold:
        return 'PASS', 'en'
    return 'PASS', 'non-en'

def clean_text(text):
    """Clean a text field down to tidy, ASCII-only content.

    Steps, in order: decode HTML entities, drop HTML tags, repair known
    mis-decoded (mojibake) character sequences, fold typographic punctuation
    and accented letters to their closest ASCII form, delete any non-ASCII
    characters that remain, and finally collapse whitespace.

    Folding happens *before* the ASCII strip so that hyphens, quotes,
    ellipses and accented letters survive as ASCII instead of being deleted,
    and whitespace is collapsed *after* the strip so that deleted characters
    do not leave double spaces behind.

    Args:
        text: Value to clean. Non-string values are returned unchanged.

    Returns:
        The cleaned string; ``""`` if cleaning raised an error (the error is
        logged); or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    # Function-scope import so this block does not depend on the import cell.
    import unicodedata

    # Mis-decoded sequences and their ASCII replacements. Order matters: the
    # full sequences must be replaced before the bare lead characters at the
    # end, which are simply removed.
    mojibake_repairs = (
        ('‚Äê', '-'),    # hyphen / dashes
        ('‚Äì', '-'),
        ('‚Äî', '-'),
        ('‚Äò', "'"),    # single quotes
        ('‚Äô', "'"),
        ('‚Ä¢', ''),     # bullet (dropped)
        ('‚Äû', '"'),    # double quotes
        ('‚Äú', '"'),
        ('‚Ä¶', '...'),  # ellipsis
        ('¬†', ' '),     # non-breaking space
        ('√∫', 'u'),     # accented letters
        ('√©', 'e'),
        ('√±', 'n'),
        ('√≥', 'o'),
        ('√∂', 'o'),
        ('√', ''),       # leftover lead characters
        ('‚Ä', ''),
        ('‚', ''),
    )

    # Genuine Unicode punctuation that has an obvious ASCII stand-in.
    punctuation_folds = str.maketrans({
        '\u2010': '-', '\u2012': '-', '\u2013': '-', '\u2014': '-',
        '\u2015': '-', '\u2212': '-',
        '\u2018': "'", '\u2019': "'",
        '\u201c': '"', '\u201d': '"', '\u201e': '"',
        '\u2044': '/',
    })

    try:
        # Decode HTML entities, then remove HTML tags
        text = html.unescape(text)
        text = re.sub(r'<[^>]+>', '', text)

        for broken, repaired in mojibake_repairs:
            text = text.replace(broken, repaired)

        # Turn every Unicode whitespace run into a plain space while it can
        # still be recognised (the ASCII strip below would delete it instead).
        text = re.sub(r'\s+', ' ', text)

        # NFKD splits accented letters into base letter + combining mark and
        # expands compatibility characters (e.g. the ellipsis), so the ASCII
        # strip keeps the base letters.
        text = unicodedata.normalize('NFKD', text)
        text = text.translate(punctuation_folds)

        # Remove whatever non-ASCII characters are left
        text = re.sub(r'[^\x00-\x7F]+', '', text)

        # Normalize whitespace last so removed characters leave no gaps
        text = re.sub(r'\s+', ' ', text).strip()
    except Exception as e:
        # Best-effort cleaning: a failure yields an empty field, not a crash.
        logging.error(f"Text cleaning failed: {e}")
        return ""
    return text

def format_ids(items, base_url):
    """Return the items' IDs, stripped of ``base_url`` and cleaned, joined with ``|``.

    Items without an ``'id'`` key are skipped.
    """
    short_ids = []
    for entry in items:
        if 'id' not in entry:
            continue
        short_ids.append(clean_text(entry['id'].replace(base_url, '')))
    return '|'.join(short_ids)

def format_authors(authorships):
    """Return the cleaned author display names as a ``', '``-separated string.

    Args:
        authorships: Iterable of authorship dicts, each optionally holding an
            ``'author'`` dict with a ``'display_name'``. ``None`` is treated
            as no authorships.

    Returns:
        The joined names. Authorships with a missing, null or non-string
        display name, and names that are empty after cleaning, are skipped,
        so one incomplete author entry no longer raises and never leaves an
        empty slot in the result.
    """
    names = []
    for authorship in authorships or []:
        author = authorship.get('author') or {}
        display_name = author.get('display_name')
        if not isinstance(display_name, str):
            continue
        cleaned = clean_text(display_name)
        if cleaned:
            names.append(cleaned)
    return ', '.join(names)

def format_citations(referenced_works):
    """Return the referenced work IDs, without the OpenAlex work URL prefix, joined with ``|``."""
    work_prefix = 'https://openalex.org/W'
    short_refs = (
        clean_text(work_url.replace(work_prefix, ''))
        for work_url in referenced_works
    )
    return '|'.join(short_refs)

def fetch_with_retries(url, headers, retries=3, delay=2, timeout=30):
    """GET ``url``, retrying when the request fails or does not return 200.

    Args:
        url: Full request URL.
        headers: HTTP headers passed to ``requests.get``.
        retries: Maximum number of attempts.
        delay: Seconds to wait between attempts.
        timeout: Seconds before a single request is abandoned; without it a
            stalled connection would block the whole run indefinitely.

    Returns:
        The response of the first attempt that returned status 200, or
        ``None`` if every attempt failed.
    """
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # Connection errors and timeouts are retried like bad status codes.
            logging.error(f"Request attempt {attempt} failed: {exc}")
            failure = f"error {exc}"
        else:
            if response.status_code == 200:
                return response
            failure = f"status code {response.status_code}"

        if attempt < retries:
            print(f"Attempt {attempt} failed with {failure}. Retrying in {delay} seconds...")
            time.sleep(delay)
        else:
            # No point sleeping (or announcing a retry) after the last attempt.
            print(f"Attempt {attempt} failed with {failure}. Giving up.")
    return None

def _paper_to_record(paper):
    """Convert one OpenAlex work dict into an output row, or ``None`` if it is not English."""
    # `or ''` instead of a .get() default: a key that is present with a null
    # value would otherwise come back as None and end up as the text "None".
    abstract = clean_text(to_abstract(paper))
    title = clean_text(paper.get('title') or '')
    pub_date = clean_text(paper.get('publication_date') or '')
    year = paper.get('publication_year')
    pub_year = '' if year is None else str(year)

    _status, lang = is_english(f"{title} {abstract}", bc_brown)
    if lang != 'en':
        return None

    return {
        'PaperId': clean_text(paper['id'].replace('https://openalex.org/W', '')),
        'PaperTitle': title,
        'Citations': format_citations(paper.get('referenced_works') or []),
        # NOTE(review): column 'c' holds the concept IDs; the short name is
        # kept as-is because the saved CSV uses it.
        'c': format_ids(paper.get('concepts') or [], 'https://openalex.org/C'),
        'Authors': format_authors(paper.get('authorships') or []),
        'Abstract': abstract,
        'Language': 'en',
        'PubYear': pub_year,
        'PubDate': pub_date
    }


def fetch_articles(base_url, headers):
    """Fetch all works matching ``base_url`` from the OpenAlex API, page by page.

    Pages of 100 results are requested with cursor paging until the API stops
    returning a next cursor or a page cannot be fetched. Each work is cleaned,
    language-checked against the module-level ``bc_brown`` word set, and kept
    only if it is classified as English.

    Args:
        base_url: OpenAlex works URL including filters and sort order; paging
            parameters are appended here.
        headers: HTTP headers forwarded to ``fetch_with_retries``.

    Returns:
        A list of row dicts with the keys ``PaperId``, ``PaperTitle``,
        ``Citations``, ``c``, ``Authors``, ``Abstract``, ``Language``,
        ``PubYear`` and ``PubDate``. If fetching stops early, the rows
        collected so far are returned.
    """
    cursor = '*'
    papers_list = []
    total_pages = None

    start_time = time.time()

    with tqdm(desc="Fetching data", unit="page") as pbar:
        while cursor:
            # Percent-encode the cursor so characters such as '+' or '='
            # reach the API unchanged ('*' is the initial cursor value).
            page_url = f"{base_url}&per_page=100&cursor={urllib.parse.quote(cursor, safe='*')}"
            response = fetch_with_retries(page_url, headers)
            if response is None or response.status_code != 200:
                print("Failed to fetch data or rate limit exceeded.")
                break

            try:
                data = response.json()
            except ValueError as e:
                # A non-JSON body ends the run but keeps what was collected.
                logging.error(f"Could not decode API response: {e}")
                print("Failed to fetch data or rate limit exceeded.")
                break

            meta = data.get('meta') or {}

            # Initialize total_pages on the first successful response
            if total_pages is None and 'count' in meta:
                total_pages = -(-meta['count'] // 100)  # ceiling division
                print(f"Total number of pages to process: {total_pages}")
                pbar.total = total_pages
                pbar.refresh()

            for paper in data.get('results', []):
                try:
                    record = _paper_to_record(paper)
                except Exception as e:
                    # Best effort: one malformed work must not abort the run.
                    logging.error(f"Error processing paper: {e}")
                    continue
                if record is not None:
                    papers_list.append(record)

            cursor = meta.get('next_cursor')
            pbar.update(1)
            if cursor:
                time.sleep(1.5)  # Delay to avoid hitting rate limits!

    total_time_minutes = round((time.time() - start_time) / 60, 2)
    print(f"Total fetch time: {total_time_minutes} minutes.")
    return papers_list

# Confirmation that the cell defining the helper functions ran to completion.
print("Helper functions successfully defined.")

## Fetch publications

In [None]:
# Initialize Brown corpus for language detection: a lower-cased word set,
# wrapped so the helpers can read it as `bc_brown.value`
Brown = namedtuple("Brown", field_names=['value'])
bc_brown = Brown(value={word.lower() for word in brown.words()})

# Identify the caller to OpenAlex. The API looks for a `mailto:` address in
# the User-Agent header (or a `mailto=` query parameter); a custom `email`
# header is not used for this. Replace EMAIL ADDRESS with a real address.
headers = {'User-Agent': 'mailto:EMAIL ADDRESS'}

# Define search parameters (here: cancer-related publications)
search_term = '("cancer" OR "carcinoma" OR "tumor")'
publication_year_range = '2014-2024'
encoded_search_term = urllib.parse.quote(search_term)
base_url = (
    f'https://api.openalex.org/works?filter=has_abstract:true,title_and_abstract.search:'
    f'{encoded_search_term},publication_year:{publication_year_range}&sort=publication_year:desc'
)

# Set global flags for fetching papers
FETCH_PAPERS = True

# Create a folder for saving results
drive_folder = os.path.join(os.getcwd(), 'FOLDER NAME')
if not os.path.exists(drive_folder):
    # exist_ok guards against the folder appearing between check and creation
    os.makedirs(drive_folder, exist_ok=True)
    print(f"Created folder: {drive_folder}")
else:
    print(f"Using existing folder: {drive_folder}")


# Fetch and save articles
if FETCH_PAPERS:
    print("Fetching articles...")
    papers = fetch_articles(base_url, headers)
    if papers:
        print(f"Fetched {len(papers)} articles.")

        # Convert articles to a DataFrame
        df = pd.DataFrame(papers)

        # Create the output filename with the current date
        file_name = f"search_{today_date}_articles.csv"
        output_file = os.path.join(drive_folder, file_name)

        # Save the DataFrame to a CSV file
        df.to_csv(output_file, index=False)
        print(f"Saved articles to CSV in FOLDER NAME folder: {output_file}")
    else:
        print("No articles fetched.")
else:
    print("Fetching is disabled.")