In [None]:
import requests
import os
import re
import time
import json
import urllib.parse
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET

# Namespace prefixes used in the XPath queries against the article XML.
_ARTICLE_XML_NS = {
    'dc': 'http://purl.org/dc/elements/1.1/',
    'prism': 'http://prismstandard.org/namespaces/basic/2.0/',
    'ja': 'http://www.elsevier.com/xml/ja/dtd',
    'ce': 'http://www.elsevier.com/xml/common/dtd',
    'xocs': 'http://www.elsevier.com/xml/xocs/dtd'
}

# Arbitrary cut-off (in characters): extracted text longer than this is
# counted as "full text", anything shorter as "partial text".
_FULL_TEXT_THRESHOLD = 5000


def _first_match(elem, paths, ns):
    """Return the first element matched by any of *paths*, or None."""
    for path in paths:
        found = elem.find(path, ns)
        # Compare against None explicitly: an Element without children is
        # falsy, so chaining find() calls with "or" discards valid matches.
        if found is not None:
            return found
    return None


def _element_text(elem):
    """Return all text inside *elem*, including text of inline children."""
    return "".join(elem.itertext()).strip()


def _paragraph_texts(section_elem, ns):
    """Return the non-empty paragraph texts found under *section_elem*."""
    candidates = (section_elem.findall('.//ce:para', ns)
                  + section_elem.findall('.//p', ns))
    # A paragraph nested inside another candidate is already covered by the
    # outer paragraph's itertext(), so skip it to avoid duplicated text.
    nested_ids = {
        id(inner)
        for outer in candidates
        for inner in outer.iter()
        if inner is not outer
    }
    texts = []
    for para in candidates:
        if id(para) in nested_ids:
            continue
        text = _element_text(para)
        if text:
            texts.append(text)
    return texts


def _extract_sections(root, ns):
    """Return a list of {"title", "text"} dicts for sections with paragraphs."""
    sections = []
    section_elems = (root.findall('.//ce:sections/ce:section', ns)
                     + root.findall('.//section', ns))
    for section_elem in section_elems:
        paragraphs = _paragraph_texts(section_elem, ns)
        if not paragraphs:
            continue
        title_elem = _first_match(
            section_elem, ('./ce:section-title', './section-title'), ns
        )
        title = _element_text(title_elem) if title_elem is not None else ""
        sections.append({
            "title": title or "Unnamed Section",
            "text": "\n\n".join(paragraphs)
        })
    return sections


def _parse_article_xml(xml_bytes, content):
    """Fill *content* from the article XML and classify what was found.

    Returns the name of the stats counter to increment ("full_text",
    "partial_text" or "abstract_only"), or None when nothing usable was
    found. Raises whatever the XML parsers raise on malformed input.
    """
    ns = _ARTICLE_XML_NS
    root = ET.fromstring(xml_bytes)

    # Prefer the abstract carried in the XML over the one supplied by the caller
    abstract_elem = root.find('.//dc:description', ns)
    if abstract_elem is not None and abstract_elem.text:
        content["abstract"] = abstract_elem.text

    sections = _extract_sections(root, ns)
    if sections:
        content["sections"] = sections
        content["full_text"] = "\n\n".join(
            f"## {section['title']}\n\n{section['text']}" for section in sections
        )
        if len(content["full_text"]) > _FULL_TEXT_THRESHOLD:
            return "full_text"
        return "partial_text"

    if content["abstract"]:
        return "abstract_only"

    # Last resort: a second, more lenient pass with BeautifulSoup
    soup = BeautifulSoup(xml_bytes, 'xml')
    abstract = soup.find('abstract') or soup.find('dc:description')
    if abstract and abstract.text:
        content["abstract"] = abstract.text.strip()
        return "abstract_only"
    return None


def extract_article_content(docs, api_key, output_dir="article_content",
                            *, request_timeout=30, delay=2):
    """
    Extract available article content, focusing on what's actually accessible
    through the API rather than attempting to get complete full text.

    For every document that has a DOI, the article XML is requested from
    api.elsevier.com. On an HTTP 200 response the raw XML is written to
    ``output_dir`` and the extracted abstract/sections are written as JSON
    to ``output_dir/extracted_text``. A summary of the outcomes is printed
    at the end.

    Args:
        docs: Sequence of dicts. The keys read are "doi", "title",
            "publication_date" and "abstract". Documents that were saved
            get "content_file" and "xml_file" keys added in place.
        api_key: Value sent in the "X-ELS-APIKey" request header.
        output_dir: Directory for the raw XML files (created if missing).
        request_timeout: Seconds to wait for each HTTP request before
            treating it as failed.
        delay: Seconds to sleep after each request (rate limiting).

    Returns:
        The same ``docs`` object that was passed in.
    """
    # Creating the nested directory also creates output_dir itself
    text_output_dir = os.path.join(output_dir, "extracted_text")
    os.makedirs(text_output_dir, exist_ok=True)

    stats = {
        "total": len(docs),
        "abstract_only": 0,
        "partial_text": 0,
        "full_text": 0,
        "not_available": 0
    }

    # XML is requested because it is the most structured representation
    headers = {
        "X-ELS-APIKey": api_key,
        "Accept": "text/xml"
    }

    print(f"Extracting content for {len(docs)} articles...")

    for i, doc in enumerate(docs):
        print(f"\nProcessing document {i+1}/{len(docs)}...")

        # Skip documents without DOI
        if not doc.get("doi"):
            print("No DOI available, skipping")
            stats["not_available"] += 1
            continue

        doi = doc["doi"]
        title = doc.get("title") or ""
        print(f"Title: {title}")
        print(f"DOI: {doi}")

        # Generate a safe filename from title
        safe_title = re.sub(r"[^\w\s-]", "", title)
        safe_title = re.sub(r"\s+", "_", safe_title)[:50]
        if not safe_title:
            # No usable title characters: fall back to the DOI so that the
            # output files of untitled documents do not overwrite each other
            safe_title = re.sub(r"[^\w-]", "_", doi)[:50]
        year = doc.get("publication_date", "")

        # Object that accumulates everything extracted for this document
        content = {
            "doi": doi,
            "title": title,
            "year": year,
            "abstract": doc.get("abstract", ""),
            "sections": [],
            "full_text": "",
            "metadata": {}
        }

        encoded_doi = urllib.parse.quote(doi, safe="")
        url = f"https://api.elsevier.com/content/article/doi/{encoded_doi}"

        # Name of the stats counter for this document; None = nothing retrieved
        category = None

        try:
            print("Requesting XML content...")
            # A timeout keeps one stalled connection from hanging the whole run
            response = requests.get(url, headers=headers, timeout=request_timeout)

            if response.status_code == 200:
                # Save the raw XML for reference
                xml_filepath = os.path.join(output_dir, f"{safe_title}_{year}.xml")
                with open(xml_filepath, "wb") as f:
                    f.write(response.content)

                # Parsing is best-effort: on failure the JSON below is still
                # written with whatever metadata is already known
                try:
                    category = _parse_article_xml(response.content, content)
                except Exception as e:
                    print(f"Error parsing XML: {str(e)}")

                # Save the extracted content as JSON
                text_filepath = os.path.join(text_output_dir, f"{safe_title}_{year}.json")
                with open(text_filepath, "w", encoding="utf-8") as f:
                    json.dump(content, f, indent=2, ensure_ascii=False)

                print(f"Content saved to {text_filepath}")
                doc["content_file"] = text_filepath
                doc["xml_file"] = xml_filepath

            else:
                print(f"Failed to retrieve XML content: {response.status_code}")
                print(f"Error response: {response.text[:200]}...")

        except Exception as e:
            # Per-document boundary: report the failure and move on to the next
            print(f"Error retrieving XML: {str(e)}")

        stats[category or "not_available"] += 1

        # Rate limiting
        time.sleep(delay)

    total = stats["total"]

    def percent(count):
        # Guard against an empty input list (total == 0)
        return count * 100 / total if total else 0.0

    # Print final summary
    print("\nExtraction Summary:")
    print(f"Total documents processed: {total}")
    print(f"Full text extracted: {stats['full_text']} ({percent(stats['full_text']):.1f}%)")
    print(f"Partial text extracted: {stats['partial_text']} ({percent(stats['partial_text']):.1f}%)")
    print(f"Abstract only: {stats['abstract_only']} ({percent(stats['abstract_only']):.1f}%)")
    print(f"No content available: {stats['not_available']} ({percent(stats['not_available']):.1f}%)")

    return docs

# Install BeautifulSoup if not already installed.
# NOTE(review): bs4 is also imported unconditionally at the top of this cell,
# so this guard only helps if it runs before that import.
try:
    from bs4 import BeautifulSoup
except ImportError:
    import subprocess
    import sys
    # Run pip through the current interpreter so the package lands in the
    # environment this code is executing in. A subprocess call is used instead
    # of the IPython-only "!" shell escape so the code is also valid Python.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "beautifulsoup4"])
    from bs4 import BeautifulSoup

# Run the extraction over the filtered documents
output_dir = "article_content"
api_key = client.api_key

# NOTE: the whole doc_info list is passed through as-is; slice it here to
# try the pipeline on a smaller subset first
test_docs = doc_info
docs_with_content = extract_article_content(test_docs, api_key, output_dir)

# Report, document by document, what ended up on disk
print("\nExtracted content details:")
for doc in docs_with_content:
    print(f"\nTitle: {doc['title']}")
    print(f"DOI: {doc.get('doi', 'N/A')}")

    # Nothing was saved for this document
    if "content_file" not in doc:
        print("No content was extracted")
        continue

    print(f"Content file: {doc['content_file']}")

    # Reload the saved JSON to summarise the abstract and sections
    try:
        with open(doc['content_file'], 'r', encoding='utf-8') as fh:
            content = json.load(fh)

        has_abstract = 'Yes' if content.get('abstract') else 'No'
        print(f"Abstract available: {has_abstract}")
        print(f"Number of extracted sections: {len(content.get('sections', []))}")

        # List each section title with the size of its text
        if content.get('sections'):
            print("Extracted sections:")
            for sec in content['sections']:
                print(f"- {sec['title']} ({len(sec['text'])} characters)")
    except Exception as e:
        print(f"Error loading content file: {str(e)}")

# Persist the (now annotated) document list for future reference
metadata_path = os.path.join(output_dir, "document_metadata.json")
with open(metadata_path, "w") as fh:
    json.dump(docs_with_content, fh, indent=2)

print(f"\nDocument metadata saved to {metadata_path}")

Extracting content for 325 articles...

Processing document 1/325...
Title: Instruction-Tuning Llama-3-8B Excels in City-Scale Mobility Prediction
DOI: 10.1145/3681771.3699908
Requesting XML content...
Failed to retrieve XML content: 404
Error response: <service-error><status><statusCode>RESOURCE_NOT_FOUND</statusCode><statusText>The resource specified cannot be found.</statusText></status></service-error>...

Processing document 2/325...
Title: Large language model as parking planning agent in the context of mixed period of autonomous vehicles and Human-Driven vehicles
DOI: 10.1016/j.scs.2024.105940
Requesting XML content...


  section_title_elem = section_elem.find('./ce:section-title', ns) or section_elem.find('./section-title', ns)


Content saved to article_content/extracted_text/Large_language_model_as_parking_planning_agent_in__2024.json

Processing document 3/325...
Title: Evaluating a global citizenship course on developing business students' AI literacy skills
DOI: 10.1108/978-1-83608-852-320241010
Requesting XML content...
Failed to retrieve XML content: 404
Error response: <service-error><status><statusCode>RESOURCE_NOT_FOUND</statusCode><statusText>The resource specified cannot be found.</statusText></status></service-error>...

Processing document 4/325...
Title: Arabic Opinion Classification of Customer Service Conversations Using Data Augmentation and Artificial Intelligence
DOI: 10.3390/bdcc8120196
Requesting XML content...
Failed to retrieve XML content: 404
Error response: <service-error><status><statusCode>RESOURCE_NOT_FOUND</statusCode><statusText>The resource specified cannot be found.</statusText></status></service-error>...

Processing document 5/325...
Title: Integrating Large Language Models a

KeyboardInterrupt: 