In [1]:
import requests
import json
import csv
import time
import logging
from tqdm import tqdm
import pandas as pd
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Configure the root logger once at import time: INFO and above, each record
# prefixed with a timestamp and its level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def requests_retry_session(
    retries=3,
    backoff_factor=0.3,
    session=None,
):
    """Return a session whose HTTP and HTTPS requests are retried automatically.

    Args:
        retries: Retry budget used for the total, read and connect counters.
        backoff_factor: Backoff factor handed to the retry policy.
        session: Existing session to configure; a new ``requests.Session`` is
            created when none is supplied.

    Returns:
        The configured session (the one passed in, or the newly created one).
    """
    if not session:
        session = requests.Session()

    # Retry on rate limiting (429) and on transient server-side failures.
    retry_policy = Retry(
        status_forcelist=[429, 500, 502, 503, 504],
        backoff_factor=backoff_factor,
        connect=retries,
        read=retries,
        total=retries,
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    # The same adapter instance serves both URL schemes.
    for scheme_prefix in ('http://', 'https://'):
        session.mount(scheme_prefix, retrying_adapter)

    return session

def search_wikipedia_topics(search_term, max_words, headers, max_articles):
    """Search Hindi Wikipedia for ``search_term`` and collect matching articles.

    Search results are paged through the MediaWiki ``list=search`` API. Every
    hit is passed to ``fetch_wikipedia_article``; hits for which that function
    returns a falsy value are skipped.

    Args:
        search_term: Text to search for (main namespace only).
        max_words: Word limit forwarded to ``fetch_wikipedia_article``.
        headers: HTTP headers sent with every request.
        max_articles: Stop as soon as this many articles have been collected.

    Returns:
        A list of at most ``max_articles`` article dicts. The list may be
        shorter if the results run out or a search request fails; a failure is
        logged and whatever was collected so far is returned.
    """
    base_url = "https://hi.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "list": "search",
        "srsearch": search_term,
        "srnamespace": "0",
        # "max" requests the largest page size the API grants this client; a
        # literal above the ceiling only gets clamped with a warning.
        "srlimit": "max",
        "format": "json",
    }
    # (connect, read) seconds. Without a timeout a stalled connection would
    # block the whole crawl indefinitely.
    request_timeout = (5, 30)

    articles = []

    # The context manager releases pooled connections when the search is done.
    with requests_retry_session() as session:
        while len(articles) < max_articles:
            # Only the search request itself is guarded here; the per-article
            # fetch handles its own request errors.
            try:
                response = session.get(
                    base_url,
                    params=params,
                    headers=headers,
                    timeout=request_timeout,
                )
                response.raise_for_status()
                data = response.json()
            except (requests.exceptions.RequestException, ValueError) as e:
                # ValueError covers a non-JSON body on requests versions whose
                # JSON decode error is not a RequestException.
                logging.error(
                    "Error searching for topics with term %s: %s", search_term, e
                )
                break

            results = data.get('query', {}).get('search', [])
            if not results:
                break  # No more results to fetch

            for result in tqdm(results, desc=f"Searching articles for {search_term}"):
                article = fetch_wikipedia_article(
                    result['title'], max_words, headers, search_term
                )
                if article:
                    articles.append(article)
                    if len(articles) >= max_articles:
                        break  # Quota met: skip the trailing delay

                time.sleep(0.2)  # Pause between article requests to respect rate limits

            # Follow the server-provided continuation instead of guessing the
            # next offset; when it is absent there are no further pages.
            continuation = data.get('continue')
            if not continuation:
                break
            params.update(continuation)

    return articles

def fetch_wikipedia_article(title, max_words, headers, search_term):
    """Fetch the plain-text intro extract of one Hindi Wikipedia page.

    Args:
        title: Page title to look up.
        max_words: Upper bound (inclusive) on the whitespace-separated word
            count of the extract.
        headers: HTTP headers sent with the request.
        search_term: Search term that produced this title; stored, stripped of
            surrounding whitespace, as the article's ``topic``.

    Returns:
        A dict with keys ``title``, ``content``, ``word_count`` and ``topic``
        for the first returned page whose extract fits within ``max_words``.
        Returns ``None`` when the request fails, the response carries no page
        data, no page has an extract, or every extract is too long.
    """
    base_url = "https://hi.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "titles": title,
        "prop": "extracts",
        "exintro": "",       # intro section only
        "explaintext": "",   # plain text instead of HTML
        "format": "json",
    }

    try:
        # (connect, read) seconds; without a timeout a stalled connection
        # would hang the caller forever.
        response = requests.get(
            base_url, params=params, headers=headers, timeout=(5, 30)
        )
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException.
        logging.error("Error fetching %s: %s", title, e)
        return None

    # An API error payload has no 'query'/'pages' section; treat that as
    # "nothing found" rather than letting a KeyError abort the whole crawl.
    pages = data.get('query', {}).get('pages', {})
    if not pages:
        logging.warning("No page data returned for %s", title)
        return None

    for page in pages.values():
        if 'extract' not in page:
            logging.warning("No extract found for %s", title)
            continue

        content = page['extract']
        words = len(content.split())
        if words > max_words:
            logging.info("Article '%s' exceeded word count limit", title)
            continue

        logging.info("Successfully fetched article: %s", title)
        return {
            'title': page.get('title', title),
            'content': content,
            'word_count': words,
            # Keep the whole term: taking only its first word truncated
            # multi-word topics and raised IndexError on a blank term.
            'topic': search_term.strip(),
        }

    return None

def main():
    """Collect Hindi Wikipedia articles per topic and export them to CSV and Excel.

    For each topic, articles are gathered with ``search_wikipedia_topics``
    (which applies the ``max_words`` limit) and then filtered so that only
    those with at least ``min_words`` words are kept. The lower bound is a
    post-filter, so a topic can contribute fewer articles than the quota that
    was requested for it.

    The result is written to a UTF-8 CSV file and to an Excel workbook with
    the same stem, both in the current working directory.
    """
    max_articles = 3000
    min_words = 301
    max_words = 450

    headers = {
        'User-Agent': 'MultiTopicArticlesFetcher/1.0 (yourname@example.com)'
    }

    # List of new topics in Hindi
    topics = [
        "आधुनिक वास्तुकला",
    ]

    articles = []
    for topic in topics:
        remaining = max_articles - len(articles)
        if remaining <= 0:
            break

        found = search_wikipedia_topics(topic, max_words, headers, remaining)

        # Enforce the lower bound, which was previously declared but never
        # applied, so the export matches the range its file name advertises.
        kept = [article for article in found if article['word_count'] >= min_words]
        if len(kept) < len(found):
            logging.info(
                "Dropped %d article(s) for topic %s with fewer than %d words",
                len(found) - len(kept), topic, min_words,
            )
        articles.extend(kept)

    articles = articles[:max_articles]

    # One stem for both outputs so the two file names cannot drift apart.
    output_stem = 'multi_topic_articles_300_to_450_48'
    csv_file = f'{output_stem}.csv'
    excel_file = f'{output_stem}.xlsx'
    fieldnames = ['title', 'content', 'word_count', 'topic']

    # Save to CSV
    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(articles)

    # Build the workbook from the in-memory records rather than re-reading the
    # CSV: parsing it back lets pandas re-infer types (e.g. a title of "NA"
    # becomes a missing value, numeric-looking titles become numbers).
    pd.DataFrame(articles, columns=fieldnames).to_excel(excel_file, index=False)
    logging.info("CSV file has been converted to Excel format.")

# Run the crawl only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

Searching articles for आधुनिक वास्तुकला:   0%|          | 0/463 [00:00<?, ?it/s]2025-02-02 16:23:40,447 - INFO - Successfully fetched article: उत्तर आधुनिक वास्तुकला
Searching articles for आधुनिक वास्तुकला:   0%|          | 1/463 [00:00<05:45,  1.34it/s]2025-02-02 16:23:41,138 - INFO - Successfully fetched article: मुग़ल वास्तुकला
Searching articles for आधुनिक वास्तुकला:   0%|          | 2/463 [00:01<05:29,  1.40it/s]2025-02-02 16:23:41,856 - INFO - Successfully fetched article: वास्तुकला
Searching articles for आधुनिक वास्तुकला:   1%|          | 3/463 [00:02<05:29,  1.40it/s]2025-02-02 16:23:42,592 - INFO - Successfully fetched article: हिंदू वास्तुकला
Searching articles for आधुनिक वास्तुकला:   1%|          | 4/463 [00:02<05:32,  1.38it/s]2025-02-02 16:23:43,325 - INFO - Successfully fetched article: बंगाल की वास्तुकला
Searching articles for आधुनिक वास्तुकला:   1%|          | 5/463 [00:03<05:40,  1.34it/s]2025-02-02 16:23:44,074 - INFO - Successfully fetched article: वास्तुकला सिद्धांत