In [1]:
import requests
import json
import csv
import time
import logging
from tqdm import tqdm
import pandas as pd
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry


In [2]:

# Install the Excel writer backend used by DataFrame.to_excel into the running
# kernel's environment; the version is pinned so a re-run resolves the same package.
%pip install openpyxl==3.1.5


Collecting openpyxlNote: you may need to restart the kernel to use updated packages.

  Obtaining dependency information for openpyxl from https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl.metadata
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Obtaining dependency information for et-xmlfile from https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl.metadata
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
   ---------------------------------------- 0.0/250.9 kB ? eta -:--:--
   - -------------------------------------- 10.2/250.9 kB ? eta -:--:--
   ------ -------------------------------- 41.0/250.9 kB 495.5 kB/s eta 0:00:01
   ------ -------------------------------- 41.0/250.


[notice] A new release of pip is available: 23.2.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Root logger setup: records at INFO and above, each rendered as
# "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

def requests_retry_session(
    retries=3,
    backoff_factor=0.3,
    session=None,
):
    """Return a session whose HTTP and HTTPS requests are retried automatically.

    Args:
        retries: Retry budget, applied to the total, read and connect counters.
        backoff_factor: Backoff factor handed to the retry policy.
        session: Existing session to configure; a new ``requests.Session`` is
            created when this is falsy.

    Returns:
        The configured session (the one passed in, or the newly created one).
    """
    if not session:
        session = requests.Session()

    # Responses with these status codes are retried as well as failed connections/reads.
    retry_policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    # One adapter instance serves both URL schemes.
    for scheme in ('http://', 'https://'):
        session.mount(scheme, retrying_adapter)

    return session

def search_wikipedia_topics(search_term, max_words, headers, max_articles,
                            session=None, timeout=10):
    """Search Hindi Wikipedia for ``search_term`` and collect short article intros.

    Search results are paged through the MediaWiki API; each hit is fetched with
    ``fetch_wikipedia_article`` and kept only if that function returns a record.

    Args:
        search_term: Text to search for (main namespace only).
        max_words: Maximum word count of an intro for it to be kept.
        headers: HTTP headers sent with every request.
        max_articles: Stop once this many articles have been collected.
        session: Optional ``requests``-style session to use. When omitted, a
            retrying session is created here and closed before returning.
        timeout: Per-request timeout in seconds, so a stalled connection cannot
            hang the crawl.

    Returns:
        A list of article dicts as produced by ``fetch_wikipedia_article``;
        empty when nothing matched or the search request failed.
    """
    base_url = "https://hi.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "list": "search",
        "srsearch": search_term,
        "srnamespace": "0",
        # "max" asks for the largest page size the API allows this client
        # (a literal 1000 is above the regular-user cap and gets clamped).
        "srlimit": "max",
        "format": "json"
    }

    articles = []
    if max_articles <= 0:
        return articles

    owns_session = session is None
    if owns_session:
        session = requests_retry_session()

    try:
        while len(articles) < max_articles:
            try:
                response = session.get(base_url, params=params, headers=headers, timeout=timeout)
                response.raise_for_status()
                data = response.json()
            except (requests.exceptions.RequestException, ValueError) as e:
                # ValueError covers a non-JSON body on requests versions whose
                # JSON decode error is not a RequestException.
                logging.error(f"Error searching for topics with term {search_term}: {e}")
                break

            # The API reports problems in an "error" object on an HTTP 200 response.
            if 'error' in data:
                logging.error(f"Error searching for topics with term {search_term}: {data['error']}")
                break

            results = data.get('query', {}).get('search', [])
            if not results:
                break  # No more results to fetch

            for result in tqdm(results, desc=f"Searching articles for {search_term}"):
                # Reuse the same session so connections are kept alive across fetches.
                article = fetch_wikipedia_article(
                    result['title'], max_words, headers, search_term,
                    session=session, timeout=timeout,
                )
                if article:
                    articles.append(article)
                    if len(articles) >= max_articles:
                        break  # Quota reached: skip the pointless trailing sleep

                time.sleep(0.2)  # Delay between article requests to respect rate limits

            # The API supplies the parameters for the next page in "continue";
            # its absence means the result set is exhausted.
            continuation = data.get('continue')
            if not continuation:
                break
            params.update(continuation)
    finally:
        if owns_session:
            session.close()

    return articles

def fetch_wikipedia_article(title, max_words, headers, search_term,
                            session=None, timeout=10):
    """Fetch the plain-text intro of one Hindi Wikipedia page.

    Args:
        title: Page title to fetch.
        max_words: Intros with more whitespace-separated words than this are rejected.
        headers: HTTP headers sent with the request.
        search_term: Search term that led to this page; its first
            whitespace-separated word is stored as the record's ``topic``.
        session: Optional ``requests``-style session to use. When omitted, a
            retrying session is created for this call and closed afterwards.
        timeout: Request timeout in seconds.

    Returns:
        A dict with keys ``title``, ``content``, ``word_count`` and ``topic``,
        or ``None`` when the request fails, the response carries no extract,
        or the intro exceeds ``max_words``.
    """
    base_url = "https://hi.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "titles": title,
        "prop": "extracts",
        "exintro": "",
        "explaintext": "",
        "format": "json"
    }

    owns_session = session is None
    if owns_session:
        session = requests_retry_session()

    try:
        response = session.get(base_url, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        logging.error(f"Error fetching {title}: {e}")
        return None
    finally:
        if owns_session:
            session.close()

    # An error payload has no "query"/"pages"; treat it as "nothing found"
    # instead of letting a KeyError abort the caller's whole crawl.
    pages = data.get('query', {}).get('pages', {})
    if not pages:
        logging.warning(f"No extract found for {title}")
        return None

    # First word of the search term is the topic label (adjust if Hindi topics
    # have spaces); a blank search term yields an empty label rather than IndexError.
    topic_words = search_term.split()
    topic = topic_words[0] if topic_words else ''

    for page in pages.values():
        if 'extract' not in page:
            logging.warning(f"No extract found for {title}")
            continue

        content = page['extract']
        words = len(content.split())
        if words > max_words:
            logging.info(f"Article '{title}' exceeded word count limit")
            continue

        logging.info(f"Successfully fetched article: {title}")
        return {
            'title': page.get('title', title),
            'content': content,
            'word_count': words,
            'topic': topic
        }

    return None

def main(topics=None, max_articles=4000, max_words=300,
         csv_file='multiple-topic-articles_34.csv',
         xlsx_file='multi_topic_articles_34.xlsx'):
    """Collect short Hindi Wikipedia articles for each topic and save them.

    Args:
        topics: Iterable of search terms; defaults to the single topic "यात्रा".
        max_articles: Overall cap on collected articles across all topics.
        max_words: Maximum intro length (in words) for an article to be kept.
        csv_file: Path of the UTF-8 CSV file to write.
        xlsx_file: Path of the Excel file to write; a falsy value skips the
            Excel export.

    Returns:
        None. The results are written to ``csv_file`` and, if requested,
        ``xlsx_file``.
    """
    # NOTE(review): the contact address looks like a placeholder — confirm and
    # replace with a real one before running against the live API.
    headers = {
        'User-Agent': 'MultiTopicArticlesFetcher/1.0 (yourname@example.com)'
    }

    # List of topics in Hindi
    if topics is None:
        topics = ["यात्रा"]

    articles = []
    for topic in topics:
        remaining = max_articles - len(articles)
        if remaining <= 0:
            break
        articles.extend(search_wikipedia_topics(topic, max_words, headers, remaining))
    articles = articles[:max(max_articles, 0)]

    fieldnames = ['title', 'content', 'word_count', 'topic']

    # Save to CSV
    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(articles)

    # Write the Excel copy straight from the in-memory records. Re-reading the
    # CSV with pandas would turn cells such as "NA", "null" or "" into NaN and
    # re-type numeric-looking titles, so the two files could disagree.
    if xlsx_file:
        pd.DataFrame(articles, columns=fieldnames).to_excel(xlsx_file, index=False)
        logging.info("CSV file has been converted to Excel format.")

# Entry point: run the crawl when this code is executed directly (script or
# notebook cell), but not when it is imported as a module.
if __name__ == "__main__":
    main()

Searching articles for यात्रा:   0%|          | 0/500 [00:00<?, ?it/s]2025-01-29 15:15:16,748 - INFO - Successfully fetched article: यात्रा
Searching articles for यात्रा:   0%|          | 1/500 [00:00<06:27,  1.29it/s]2025-01-29 15:15:17,445 - INFO - Successfully fetched article: २०१७ अमरनाथ यात्रा आक्रमण
Searching articles for यात्रा:   0%|          | 2/500 [00:01<06:03,  1.37it/s]2025-01-29 15:15:19,809 - INFO - Successfully fetched article: जेल यात्रा
Searching articles for यात्रा:   1%|          | 3/500 [00:03<12:13,  1.47s/it]2025-01-29 15:15:20,657 - INFO - Successfully fetched article: यात्रावृत्तांत
Searching articles for यात्रा:   1%|          | 4/500 [00:04<10:08,  1.23s/it]2025-01-29 15:15:21,370 - INFO - Successfully fetched article: कांवड़ यात्रा
Searching articles for यात्रा:   1%|          | 5/500 [00:05<08:35,  1.04s/it]2025-01-29 15:15:23,230 - INFO - Successfully fetched article: सेवा राम यात्री
Searching articles for यात्रा:   1%|          | 6/500 [00:07<10:52,  1.32