In [5]:
import requests
import json
import csv
import time
import logging
from tqdm import tqdm
import pandas as pd
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry


In [6]:

pip install openpyxl


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
# Configure the root logger once for the whole notebook: emit INFO and above,
# with each record prefixed by its timestamp and level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def requests_retry_session(
    retries=3,
    backoff_factor=0.3,
    session=None,
):
    """Return a requests session whose HTTP and HTTPS adapters retry failures.

    Args:
        retries: Number used for the total, read and connect retry budgets.
        backoff_factor: Forwarded to ``Retry`` as its ``backoff_factor``.
        session: Existing session to configure. When falsy, a fresh
            ``requests.Session`` is created instead.

    Returns:
        The configured session (the one passed in, or the newly created one).
    """
    if not session:
        session = requests.Session()

    # Retry on rate limiting (429) and on transient server-side errors.
    retry_policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    # The same adapter instance serves both URL schemes.
    for scheme in ('http://', 'https://'):
        session.mount(scheme, retrying_adapter)
    return session

def search_wikipedia_topics(search_term, max_words, headers, max_articles, timeout=10):
    """Search Hindi Wikipedia for ``search_term`` and collect short articles.

    Each search hit is passed to ``fetch_wikipedia_article``; hits for which
    that function returns an article dict are kept. Searching continues page
    by page until ``max_articles`` articles are collected, the API reports no
    further results, or a request fails.

    Args:
        search_term: Text sent as the ``srsearch`` query.
        max_words: Word limit forwarded to ``fetch_wikipedia_article``.
        headers: HTTP headers sent with every request.
        max_articles: Maximum number of articles to return.
        timeout: Seconds to wait for each search request before giving up.

    Returns:
        A list of the article dicts returned by ``fetch_wikipedia_article``
        (possibly empty, never longer than ``max_articles``).
    """
    base_url = "https://hi.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "list": "search",
        "srsearch": search_term,
        "srnamespace": "0",
        # "max" asks for the largest batch this client is allowed; a literal
        # above the cap (the old "1000") is only clamped by the server.
        "srlimit": "max",
        "format": "json",
    }

    articles = []
    if max_articles <= 0:
        return articles

    # The context manager closes the session's pooled connections on exit.
    with requests_retry_session() as session:
        while len(articles) < max_articles:
            try:
                response = session.get(
                    base_url, params=params, headers=headers, timeout=timeout
                )
                response.raise_for_status()
                data = response.json()
            except (requests.exceptions.RequestException, ValueError) as e:
                # ValueError covers a non-JSON body on older requests versions.
                logging.error(f"Error searching for topics with term {search_term}: {e}")
                break

            # The API reports many failures inside a 200 response.
            if 'error' in data:
                logging.error(
                    f"Error searching for topics with term {search_term}: {data['error']}"
                )
                break

            results = data.get('query', {}).get('search', [])
            if not results:
                break  # No more results to fetch

            for result in tqdm(results, desc=f"Searching articles for {search_term}"):
                title = result['title']
                article = fetch_wikipedia_article(title, max_words, headers, search_term)
                if article:
                    articles.append(article)
                    if len(articles) >= max_articles:
                        break  # Done; skip the pointless final delay.

                time.sleep(0.7)  # Delay between article fetches to respect rate limits

            # Follow the server-provided continuation instead of guessing the
            # next offset; its absence means this was the last page.
            continuation = data.get('continue')
            if not continuation:
                break
            params.update(continuation)

    return articles

def fetch_wikipedia_article(title, max_words, headers, search_term, session=None, timeout=10):
    """Fetch the plain-text intro of one Hindi Wikipedia page.

    Args:
        title: Page title to request.
        max_words: Extracts with more whitespace-separated words than this
            are rejected.
        headers: HTTP headers sent with the request.
        search_term: Search term that produced this title; its first
            whitespace-separated word is stored as the article's ``topic``.
        session: Optional requests-style session to reuse. When omitted, a
            retrying session from ``requests_retry_session`` is created for
            this call and closed afterwards.
        timeout: Seconds to wait for the request before giving up.

    Returns:
        A dict with ``title``, ``content``, ``word_count`` and ``topic`` for
        the first page whose extract fits within ``max_words``; ``None`` if
        the request fails, the response has no page data, or no page
        qualifies.
    """
    base_url = "https://hi.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "titles": title,
        "prop": "extracts",
        "exintro": "",
        "explaintext": "",
        "format": "json"
    }

    # Use a retrying session so 429/5xx responses are retried with backoff
    # rather than dropping the article on the first transient failure.
    owns_session = session is None
    if owns_session:
        session = requests_retry_session()
    try:
        response = session.get(base_url, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on older requests versions.
        logging.error(f"Error fetching {title}: {e}")
        return None
    finally:
        # Only close what this call created; a caller's session stays open.
        if owns_session:
            session.close()

    # An API-level error comes back as HTTP 200 without a "query" section;
    # indexing it directly would raise KeyError and abort the whole crawl.
    pages = data.get('query', {}).get('pages', {})
    if not pages:
        logging.warning(f"No page data returned for {title}: {data.get('error', data)}")
        return None

    # First word of the search term; guards the blank-term case where
    # split() yields no words. Adjust if Hindi topics have spaces.
    topic_words = search_term.split()
    topic = topic_words[0] if topic_words else search_term

    for page in pages.values():
        if 'extract' not in page:
            logging.warning(f"No extract found for {title}")
            continue

        content = page['extract']
        words = len(content.split())
        if words <= max_words:
            logging.info(f"Successfully fetched article: {title}")
            return {
                'title': page.get('title', title),
                'content': content,
                'word_count': words,
                'topic': topic
            }
        logging.info(f"Article '{title}' exceeded word count limit")

    return None

def main():
    """Collect short Hindi Wikipedia articles per topic and save them.

    Searches each topic in turn via ``search_wikipedia_topics`` until
    ``max_articles`` articles are gathered, then writes the records to a
    UTF-8 CSV file and to an Excel workbook in the current directory.
    """
    articles = []
    max_articles = 3000
    max_words = 300

    # NOTE(review): the contact address is a placeholder; replace it with a
    # real one before running against the live API.
    headers = {
        'User-Agent': 'MultiTopicArticlesFetcher/1.0 (yourname@example.com)'
    }

    # List of topics in Hindi
    topics = [
         "भाषाएँ"

    ]

    for topic in topics:
        remaining = max_articles - len(articles)
        if remaining <= 0:
            break
        articles.extend(search_wikipedia_topics(topic, max_words, headers, remaining))

    articles = articles[:max_articles]
    fieldnames = ['title', 'content', 'word_count', 'topic']

    # Save to CSV
    csv_file = 'multiple-topic-articles_36.csv'
    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(articles)

    # Build the workbook from the in-memory records rather than re-reading
    # the CSV: read_csv re-infers types, so text such as "NA" or "null"
    # would become missing values and numeric-looking titles would become
    # numbers in the Excel output.
    df = pd.DataFrame(articles, columns=fieldnames)
    df.to_excel('multi_topic_articles_36.xlsx', index=False)
    logging.info("CSV file has been converted to Excel format.")

# Start the scrape only when this code runs as the top-level module
# (__name__ is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

Searching articles for भाषाएँ:   0%|          | 0/500 [00:00<?, ?it/s]2025-01-29 17:12:59,041 - INFO - Article 'भाषा' exceeded word count limit
Searching articles for भाषाएँ:   0%|          | 1/500 [00:01<10:17,  1.24s/it]2025-01-29 17:13:00,274 - INFO - Successfully fetched article: भारत की भाषाएँ
Searching articles for भाषाएँ:   0%|          | 2/500 [00:02<10:14,  1.23s/it]2025-01-29 17:13:01,491 - INFO - Successfully fetched article: दार्दी भाषाएँ
Searching articles for भाषाएँ:   1%|          | 3/500 [00:03<10:09,  1.23s/it]2025-01-29 17:13:03,210 - INFO - Article 'पहाड़ी भाषाएँ' exceeded word count limit
Searching articles for भाषाएँ:   1%|          | 4/500 [00:05<11:44,  1.42s/it]2025-01-29 17:13:04,445 - INFO - Successfully fetched article: हिन्द-आर्य भाषाएँ
Searching articles for भाषाएँ:   1%|          | 5/500 [00:06<11:10,  1.35s/it]2025-01-29 17:13:05,676 - INFO - Successfully fetched article: हिन्द-ईरानी भाषाएँ
Searching articles for भाषाएँ:   1%|          | 6/500 [00:07<10:4

KeyboardInterrupt: 