# In [None]:
import requests
import pandas as pd
import time

# In [None]:
def _rebuild_abstract(inverted_index):
    """Rebuild plain text from a ``{word: [positions]}`` inverted index.

    Anything that is not a dict yields an empty string, and words whose
    position value is not a list are ignored. The remaining words are joined
    with single spaces in ascending position order.
    """
    if not isinstance(inverted_index, dict):
        return ""

    word_positions = []
    for word, positions in inverted_index.items():
        if isinstance(positions, list):
            word_positions.extend((pos, word) for pos in positions)
    word_positions.sort()
    return ' '.join(word for _, word in word_positions)


def _work_to_record(work):
    """Turn one API work dict into a flat record, or None if unusable.

    A work is unusable when its rebuilt abstract is empty or, after
    stripping whitespace, is 50 characters or shorter.
    """
    abstract_text = _rebuild_abstract(work.get('abstract_inverted_index'))
    if not abstract_text or len(abstract_text.strip()) <= 50:
        return None

    # Keys can be present with a null value, in which case ``.get(key, {})``
    # returns None rather than the default; ``or`` covers both situations.
    location = work.get('primary_location') or {}
    source = location.get('source') or {}

    return {
        'id': work.get('id', ''),
        'title': work.get('title', ''),
        'abstract': abstract_text,
        'year': work.get('publication_year'),
        'doi': work.get('doi', ''),
        'authors_count': len(work.get('authorships') or []),
        'source': source.get('display_name', ''),
    }


def get_abstracts_paginated(total=100, start_year=2018, end_year=2025,
                            max_rate_limit_retries=5):
    """Collect article abstracts from the OpenAlex ``/works`` endpoint.

    Pages through the endpoint 25 works at a time, rebuilding each abstract
    from its inverted index and keeping only abstracts longer than 50
    characters.

    Args:
        total: Number of abstracts to collect. Values <= 0 return an empty
            list without issuing any request.
        start_year: First publication year to include (inclusive).
        end_year: Last publication year to include (inclusive).
        max_rate_limit_retries: How many consecutive HTTP 429 responses to
            wait out (10 seconds each) before giving up.

    Returns:
        A list of dicts with the keys ``id``, ``title``, ``abstract``,
        ``year``, ``doi``, ``authors_count`` and ``source``. The list may be
        shorter than ``total`` when results run out, a request fails, or the
        rate limit does not clear.
    """
    base_url = "https://api.openalex.org/works"
    headers = {
        'User-Agent': 'AcademicResearchBot/1.0',
        'Accept': 'application/json'
    }

    filters = [
        "type:article",
        f"publication_year:>{start_year-1}",
        f"publication_year:<{end_year+1}",
        "has_abstract:true"
    ]

    abstracts = []
    page = 1
    per_page = 25
    rate_limit_hits = 0

    # Everything except the page number is the same for every request.
    params = {
        'filter': ','.join(filters),
        'select': 'id,title,abstract_inverted_index,publication_year,authorships,primary_location,doi',
        'per-page': per_page,
        'page': page
    }

    print(f"Starting to collect {total} abstracts...")

    while len(abstracts) < total:
        params['page'] = page

        try:
            print(f"Fetching page {page}...")
            response = requests.get(base_url, params=params, headers=headers, timeout=30)

            if response.status_code == 429:
                # Bound the retries so a persistent rate limit cannot spin
                # this loop forever.
                rate_limit_hits += 1
                if rate_limit_hits > max_rate_limit_retries:
                    print(f"Still rate limited after {max_rate_limit_retries} retries. Stopping.")
                    break
                print("Rate limited. Waiting 10 seconds...")
                time.sleep(10)
                continue

            rate_limit_hits = 0
            response.raise_for_status()
            data = response.json()

        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions whose
            # decode error is not a RequestException subclass.
            print(f"Error on page {page}: {e}")
            break

        results = data.get('results') if isinstance(data, dict) else None
        if not results:
            print("No more results available")
            break

        for work in results:
            if len(abstracts) >= total:
                break

            # Skip if work is None or empty
            if not work:
                continue

            try:
                record = _work_to_record(work)
            except (AttributeError, KeyError, TypeError, ValueError) as e:
                # Best effort: one malformed work must not abort the run.
                print(f"Error processing work: {e}")
                continue

            if record is not None:
                abstracts.append(record)
                print(f"Collected {len(abstracts)}/{total} abstracts")

        page += 1
        # Pause between requests, but not after the final page.
        if len(abstracts) < total:
            time.sleep(1)

    return abstracts

def save_abstracts_to_csv(abstracts, filename='abstracts_sample.csv'):
    """Write the collected abstract records to a UTF-8 CSV file.

    Args:
        abstracts: Records to write, passed straight to ``pd.DataFrame``.
        filename: Destination path for the CSV file.

    Returns:
        True if a file was written, False if ``abstracts`` was empty or None
        (in which case nothing is written).
    """
    # Nothing to save: report that to the caller instead of writing a file.
    if not abstracts:
        return False

    frame = pd.DataFrame(abstracts)
    frame.to_csv(filename, index=False, encoding='utf-8')
    print(f"Saved {len(abstracts)} abstracts to {filename}")
    return True

# Main execution: fetch a sample of abstracts, save them to CSV and print a
# short preview of the first record.
if __name__ == "__main__":
    print("Starting abstract extraction using paginated approach...")

    abstracts = get_abstracts_paginated(total=100, start_year=2018, end_year=2025)

    if abstracts:
        save_abstracts_to_csv(abstracts)
        print(f"\nSuccess! Collected {len(abstracts)} abstracts")
        first = abstracts[0]
        # The title is copied as-is from the API record and can be None, so
        # fall back to '' before slicing.
        print(f"Sample title: {(first['title'] or '')[:50]}...")
        print(f"Abstract preview: {first['abstract'][:100]}...")
    else:
        print("Failed to retrieve abstracts")