In [1]:
import requests
import json
import csv
import time
import logging
from tqdm import tqdm

# Configure the root logger once at import time: emit INFO and above, with each
# record formatted as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def search_wikipedia_topics(search_term, max_words, headers, max_articles):
    """Search Hindi Wikipedia for ``search_term`` and collect matching articles.

    Each search hit is passed to ``fetch_wikipedia_article``; hits for which
    that function returns a falsy value are skipped. Result pages are followed
    until ``max_articles`` articles have been collected, the API reports no
    further results, or a request fails.

    Args:
        search_term: Query string sent as ``srsearch``.
        max_words: Word limit forwarded to ``fetch_wikipedia_article``.
        headers: HTTP headers sent with every request.
        max_articles: Maximum number of articles to return. A value <= 0
            returns an empty list without issuing any request.

    Returns:
        A list with at most ``max_articles`` article dicts, in search order.
        On a request/parse/API error the articles gathered so far are
        returned and the error is logged.
    """
    base_url = "https://hi.wikipedia.org/w/api.php"
    request_timeout = 30  # seconds; without it a stalled connection blocks forever
    params = {
        "action": "query",
        "list": "search",
        "srsearch": search_term,
        "srnamespace": "0",
        # "max" requests the largest batch the API permits for this client.
        # The previous hard-coded "1000" was above the cap and got clamped.
        "srlimit": "max",
        "format": "json",
    }

    articles = []

    while len(articles) < max_articles:
        # Only the network call and JSON decoding are guarded here; the
        # per-article fetch handles its own request errors.
        try:
            response = requests.get(
                base_url, params=params, headers=headers, timeout=request_timeout
            )
            response.raise_for_status()
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions where the
            # decode error is not a RequestException subclass.
            logging.error(f"Error searching for topics with term {search_term}: {e}")
            break

        if "error" in data:
            # The API can answer HTTP 200 with an error payload; surface it
            # instead of silently treating it as "no results".
            logging.error(
                f"Error searching for topics with term {search_term}: {data['error']}"
            )
            break

        results = data.get('query', {}).get('search', [])
        if not results:
            break  # No more results to fetch

        for result in tqdm(results, desc=f"Searching articles for {search_term}"):
            article = fetch_wikipedia_article(
                result['title'], max_words, headers, search_term
            )
            if article:
                articles.append(article)

            # Check the quota before sleeping so the final article does not
            # cost an extra, pointless delay.
            if len(articles) >= max_articles:
                break

            time.sleep(1.0)  # Delay to respect rate limits

        # Follow the server-provided continuation parameters rather than
        # computing the offset by hand; their absence marks the last batch.
        continuation = data.get('continue')
        if not continuation:
            break
        params.update(continuation)

    return articles

def fetch_wikipedia_article(title, max_words, headers, search_term):
    """Fetch the plain-text intro extract of one Hindi Wikipedia article.

    Args:
        title: Page title to look up (sent as ``titles``).
        max_words: Maximum whitespace-separated word count the extract may
            have for the article to be accepted.
        headers: HTTP headers sent with the request.
        search_term: The search query that produced this title; its first
            whitespace-separated word is stored as the article's ``topic``.

    Returns:
        A dict with keys ``title``, ``content``, ``word_count`` and ``topic``
        for the first returned page whose extract is within ``max_words``;
        ``None`` if the request fails, the response has no pages, no page has
        an extract, or every extract is too long.
    """
    base_url = "https://hi.wikipedia.org/w/api.php"
    request_timeout = 30  # seconds; without it a stalled connection blocks forever
    params = {
        "action": "query",
        "titles": title,
        "prop": "extracts",
        "exintro": "",
        "explaintext": "",
        "format": "json"
    }

    try:
        response = requests.get(
            base_url, params=params, headers=headers, timeout=request_timeout
        )
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions where the
        # decode error is not a RequestException subclass.
        logging.error(f"Error fetching {title}: {e}")
        return None

    # An API error payload or an unexpected response has no 'query'/'pages'
    # section; treat that as "nothing found" instead of raising KeyError.
    pages = data.get('query', {}).get('pages', {})
    if not pages:
        logging.warning(f"No extract found for {title}")
        return None

    # First word of the search term is used as the topic label. Guard against
    # an empty/whitespace-only term, where split() yields no words at all.
    term_words = search_term.split()
    topic = term_words[0] if term_words else search_term  # Adjust if Hindi topics have spaces

    for page in pages.values():
        if 'extract' not in page:
            logging.warning(f"No extract found for {title}")
            continue

        content = page['extract']
        words = len(content.split())
        if words > max_words:
            logging.info(f"Article '{title}' exceeded word count limit")
            continue

        logging.info(f"Successfully fetched article: {title}")
        return {
            'title': page['title'],
            'content': content,
            'word_count': words,
            'topic': topic,
        }

    return None

def main():
    """Collect short Hindi Wikipedia articles per topic and dump them to CSV.

    Iterates over the configured topics, asking ``search_wikipedia_topics``
    for as many articles as are still missing from the overall quota, then
    writes the collected rows to ``multi_topic_articles.csv`` (UTF-8) with the
    columns title, content, word_count and topic.
    """
    max_articles = 1500
    max_words = 300

    headers = {
        'User-Agent': 'MultiTopicArticlesFetcher/1.0 (yourname@example.com)'
    }

    # List of topics in Hindi
    topics = [
        "खेल"
    ]

    collected = []
    for topic in topics:
        remaining = max_articles - len(collected)
        if remaining <= 0:
            # Quota already met; no need to query further topics.
            break
        collected.extend(
            search_wikipedia_topics(topic, max_words, headers, remaining)
        )

    # Save to CSV
    columns = ['title', 'content', 'word_count', 'topic']
    with open('multi_topic_articles.csv', 'w', newline='', encoding='utf-8') as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=columns)
        csv_writer.writeheader()
        # Slice defensively so the file never holds more than the quota.
        csv_writer.writerows(collected[:max_articles])

# Script entry point: run the scraper only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()

Searching articles for खेल:   0%|          | 0/500 [00:00<?, ?it/s]2025-01-28 00:50:02,207 - INFO - Article 'खेल' exceeded word count limit
Searching articles for खेल:   0%|          | 1/500 [00:01<13:47,  1.66s/it]2025-01-28 00:50:03,773 - INFO - Successfully fetched article: राष्ट्रमण्डल खेल
Searching articles for खेल:   0%|          | 2/500 [00:03<13:22,  1.61s/it]2025-01-28 00:50:05,357 - INFO - Article 'ओलम्पिक खेल' exceeded word count limit
Searching articles for खेल:   1%|          | 3/500 [00:04<13:11,  1.59s/it]2025-01-28 00:50:06,979 - INFO - Successfully fetched article: 2010 राष्ट्रमण्डल खेल
Searching articles for खेल:   1%|          | 4/500 [00:06<13:16,  1.60s/it]2025-01-28 00:50:08,572 - INFO - Successfully fetched article: 2014 राष्ट्रमण्डल खेल
Searching articles for खेल:   1%|          | 5/500 [00:08<13:11,  1.60s/it]2025-01-28 00:50:10,170 - INFO - Successfully fetched article: एशियाई खेल
Searching articles for खेल:   1%|          | 6/500 [00:09<13:10,  1.60s/it]2025-