In [1]:
import requests
import json
import csv
import time
import logging
from tqdm import tqdm
import pandas as pd
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Configure the root logger once at import time: records at INFO level and
# above are emitted, each prefixed with a timestamp and the level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def requests_retry_session(
    retries=3,
    backoff_factor=0.3,
    session=None,
):
    """Return a session whose HTTP and HTTPS requests are retried automatically.

    Args:
        retries: Retry budget used for the total, read and connect counters.
        backoff_factor: Backoff factor handed to the ``Retry`` policy.
        session: Existing session to configure. A new ``requests.Session``
            is created when this is falsy.

    Returns:
        The session, with one retrying adapter mounted for both URL schemes.
    """
    if not session:
        session = requests.Session()

    # Responses with these status codes are retried as well as
    # connection/read failures.
    retry_policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)
    for scheme in ('http://', 'https://'):
        session.mount(scheme, retrying_adapter)
    return session

def search_wikipedia_topics(search_term, max_words, headers, max_articles):
    """Search Hindi Wikipedia and collect matching articles.

    Runs a full-text search in the main namespace, fetches every hit through
    ``fetch_wikipedia_article`` and keeps the ones that function accepts.
    Result pages are followed until ``max_articles`` articles are collected,
    the API reports no further results, or a request fails.

    Args:
        search_term: Query string sent as ``srsearch``; also forwarded to
            ``fetch_wikipedia_article``.
        max_words: Word-count ceiling forwarded to ``fetch_wikipedia_article``.
        headers: HTTP headers sent with every request.
        max_articles: Maximum number of articles to return.

    Returns:
        A list of article dicts as returned by ``fetch_wikipedia_article``.
        It may be shorter than ``max_articles`` and is empty when the first
        request fails.
    """
    base_url = "https://hi.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "list": "search",
        "srsearch": search_term,
        "srnamespace": "0",
        # "max" asks for the largest batch this client is allowed. A
        # hard-coded 1000 is above the documented limit for ordinary clients
        # and only produces an API warning.
        "srlimit": "max",
        "format": "json",
    }
    # (connect, read) timeout in seconds so a stalled connection cannot hang
    # the crawl indefinitely.
    request_timeout = (5, 30)

    articles = []

    # The context manager releases pooled connections when the search ends.
    with requests_retry_session() as session:
        while len(articles) < max_articles:
            try:
                response = session.get(
                    base_url,
                    params=params,
                    headers=headers,
                    timeout=request_timeout,
                )
                response.raise_for_status()
                data = response.json()
            except (requests.exceptions.RequestException, ValueError) as e:
                # ValueError covers a non-JSON body on requests versions whose
                # JSON decode error is not a RequestException.
                logging.error(f"Error searching for topics with term {search_term}: {e}")
                break

            if 'error' in data:
                # Report API-level failures instead of treating them as an
                # empty result set.
                logging.error(f"Error searching for topics with term {search_term}: {data['error']}")
                break

            results = data.get('query', {}).get('search', [])
            if not results:
                break  # No more results to fetch

            for result in tqdm(results, desc=f"Searching articles for {search_term}"):
                article = fetch_wikipedia_article(result['title'], max_words, headers, search_term)
                if article:
                    articles.append(article)
                    if len(articles) >= max_articles:
                        break  # Cap reached; skip the pointless final delay

                time.sleep(0.2)  # Delay between fetches to respect rate limits

            # Follow the API's own continuation token rather than computing
            # offsets by hand. Its absence means the result set is exhausted.
            continuation = data.get('continue')
            if not continuation:
                break
            params.update(continuation)

    return articles

def fetch_wikipedia_article(title, max_words, headers, search_term, session=None):
    """Fetch the plain-text intro extract of one Hindi Wikipedia page.

    Args:
        title: Page title to look up.
        max_words: Largest whitespace-separated word count accepted.
        headers: HTTP headers sent with the request.
        search_term: Search query that produced ``title``; its first
            whitespace-separated token is stored as the article's ``topic``.
        session: Optional ``requests`` session to reuse. When omitted, a
            retrying session is created for this call and closed afterwards.

    Returns:
        A dict with keys ``title``, ``content``, ``word_count`` and ``topic``
        for the first returned page whose extract has at most ``max_words``
        words. ``None`` when the request fails, the response carries no page
        data, the page has no extract, or the extract is too long.
    """
    base_url = "https://hi.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "titles": title,
        "prop": "extracts",
        "exintro": "",
        "explaintext": "",
        "format": "json"
    }

    # Use the retrying session helper so transient 429/5xx responses are
    # retried here too, as they are for the search request.
    owns_session = session is None
    if owns_session:
        session = requests_retry_session()

    try:
        # (connect, read) timeout in seconds; without it a stalled server
        # would block forever.
        response = session.get(base_url, params=params, headers=headers, timeout=(5, 30))
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException.
        logging.error(f"Error fetching {title}: {e}")
        return None
    finally:
        if owns_session:
            session.close()

    # An error or warning-only payload has no 'query'/'pages'. Treat that as
    # "nothing found" instead of letting a KeyError abort the caller's crawl.
    pages = data.get('query', {}).get('pages', {})
    if not pages:
        logging.warning(f"No extract found for {title}")
        return None

    for page in pages.values():
        if 'extract' not in page:
            logging.warning(f"No extract found for {title}")
            continue

        content = page['extract']
        words = len(content.split())
        if words > max_words:
            logging.info(f"Article '{title}' exceeded word count limit")
            continue

        logging.info(f"Successfully fetched article: {title}")
        return {
            'title': page['title'],
            'content': content,
            'word_count': words,
            # NOTE(review): only the first token of the search term is kept,
            # so a multi-word topic is truncated. Confirm this is intended.
            'topic': search_term.split()[0]
        }

    return None

def main():
    """Collect Hindi Wikipedia articles for the configured topics and export them.

    Articles are gathered topic by topic through ``search_wikipedia_topics``
    and kept only when their word count lies between ``min_words`` and
    ``max_words``. Collection stops at ``max_articles``. The result is
    written to a UTF-8 CSV file and to an Excel workbook with the same
    columns.
    """
    articles = []
    max_articles = 3000
    min_words = 301
    max_words = 450

    headers = {
        'User-Agent': 'MultiTopicArticlesFetcher/1.0 (yourname@example.com)'
    }

    # List of new topics in Hindi
    topics = [
           "राजनीतिक व्यक्तित्व"
    ]

    for topic in topics:
        remaining = max_articles - len(articles)
        if remaining <= 0:
            break
        found = search_wikipedia_topics(topic, max_words, headers, remaining)
        # The search step only enforces the upper bound. Apply the lower bound
        # here so the output matches the advertised 300-450 word range.
        in_range = [article for article in found if article['word_count'] >= min_words]
        if len(in_range) < len(found):
            logging.info(
                "Dropped %d article(s) shorter than %d words for topic %s",
                len(found) - len(in_range), min_words, topic,
            )
        articles.extend(in_range)

    rows = articles[:max_articles]
    fieldnames = ['title', 'content', 'word_count', 'topic']

    # Save to CSV
    csv_file = 'multi_topic_articles_300_to_450_18.csv'
    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

    # Build the workbook from the in-memory rows rather than re-reading the
    # CSV. A CSV round trip re-infers dtypes, which can turn numeric-looking
    # titles into numbers or strings such as "NA" into missing values.
    df = pd.DataFrame(rows, columns=fieldnames)
    df.to_excel('multi_topic_articles_300_to_450_18.xlsx', index=False)
    logging.info("CSV file has been converted to Excel format.")

# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
Searching articles for राजनीतिक व्यक्तित्व:   0%|          | 0/469 [00:00<?, ?it/s]2025-02-02 14:47:09,482 - INFO - Successfully fetched article: विधिक व्यक्तित्व
Searching articles for राजनीतिक व्यक्तित्व:   0%|          | 1/469 [00:00<06:33,  1.19it/s]2025-02-02 14:47:10,292 - INFO - Successfully fetched article: भीण्डर
Searching articles for राजनीतिक व्यक्तित्व:   0%|          | 2/469 [00:01<06:18,  1.23it/s]2025-02-02 14:47:11,056 - INFO - Successfully fetched article: नवाब फजल अली खान
Searching articles for राजनीतिक व्यक्तित्व:   1%|          | 3/469 [00:02<06:11,  1.2