In [20]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os
from datetime import datetime, timedelta
import time
import json
import re
import logging
from tqdm import tqdm

In [21]:
# Root-logger setup for the notebook: records at INFO and above are written
# to the 'scraping.log' file and echoed to the console stream.
# logging.basicConfig() does not reconfigure a root logger that already has
# handlers, so running this cell again in a live kernel leaves the existing
# configuration in place.
logging.basicConfig(
    handlers=[logging.FileHandler('scraping.log'), logging.StreamHandler()],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [22]:
def setup_logging():
    """Configure the root logger for the scraper and return it.

    The root logger is set to INFO and is guaranteed to have
    * a file handler writing to 'scraping_progress.log', and
    * a plain console (stream) handler.

    The function is idempotent: a handler is only attached when an equivalent
    one is not already present. Calling it again (for example by re-running the
    notebook cell) therefore does not stack up handlers, which would make every
    record appear several times in the file and on the console.

    Returns:
        logging.Logger: The root logger.
    """
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Shared formatter for the handlers created here
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # FileHandler stores the absolute path in baseFilename, so compare against that.
    log_path = os.path.abspath('scraping_progress.log')
    has_file_handler = any(
        isinstance(handler, logging.FileHandler) and handler.baseFilename == log_path
        for handler in logger.handlers
    )
    if not has_file_handler:
        fh = logging.FileHandler('scraping_progress.log')
        fh.setLevel(logging.INFO)
        fh.setFormatter(formatter)
        logger.addHandler(fh)

    # FileHandler is a subclass of StreamHandler, hence the exact type check:
    # only a plain StreamHandler counts as "console output already configured".
    has_console_handler = any(
        type(handler) is logging.StreamHandler for handler in logger.handlers
    )
    if not has_console_handler:
        ch = logging.StreamHandler()
        ch.setLevel(logging.INFO)
        ch.setFormatter(formatter)
        logger.addHandler(ch)

    return logger

logger = setup_logging()

In [26]:
# --- Scraper configuration ---------------------------------------------------

# Listing page on proza.ru; the query string is appended to it per request.
BASE_URL = "https://proza.ru/texts/list.html"

# Root folder that receives the saved JSON files.
OUTPUT_DIR = "proza_texts"

# First and last calendar year to walk through.
START_YEAR, END_YEAR = 2012, 2025

# Pauses in seconds: after each article request / after each processed day.
REQUEST_DELAY, DAY_DELAY = 1, 2

# Headers sent with the HTTP requests (browser-like User-Agent).
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

In [24]:
def is_leap_year(year):
    """Tell whether ``year`` is a leap year.

    A year is a leap year when it is divisible by 4, except for century
    years, which must additionally be divisible by 400.
    """
    divisible_by_four = year % 4 == 0
    century_year = year % 100 == 0
    return divisible_by_four and (not century_year or year % 400 == 0)

def get_days_in_month(year, month):
    """Return number of days in a month, accounting for leap years.

    Args:
        year: Calendar year (decides the length of February).
        month: Month number, 1 (January) through 12 (December).

    Returns:
        int: 28, 29, 30 or 31.

    Raises:
        ValueError: If ``month`` is outside 1..12. Without this check an
            invalid month would silently be reported as having 31 days.
    """
    # Function-scope import keeps this helper self-contained.
    import calendar

    if not 1 <= month <= 12:
        raise ValueError(f"month must be in 1..12, got {month!r}")

    # monthrange() returns (weekday of the 1st, number of days in the month).
    return calendar.monthrange(year, month)[1]

def generate_dates(start_year=None, end_year=None):
    """Yield every date from January 1 of the first year to December 31 of the last.

    Args:
        start_year: First year of the range. Defaults to START_YEAR.
        end_year: Last year of the range (inclusive). Defaults to END_YEAR.

    Yields:
        datetime: One value per calendar day, in ascending order, at midnight.
        Nothing is yielded when ``start_year`` is greater than ``end_year``.
    """
    # Resolve the defaults at call time so later changes to the module
    # constants are picked up.
    if start_year is None:
        start_year = START_YEAR
    if end_year is None:
        end_year = END_YEAR

    current_date = datetime(start_year, 1, 1)
    end_date = datetime(end_year, 12, 31)
    one_day = timedelta(days=1)

    while current_date <= end_date:
        yield current_date
        current_date += one_day

In [25]:
def get_article_links(date, topic, timeout=30):
    """Get all article links listed for a specific date and topic.

    Args:
        date: Day to look up. Its ``year``, ``month`` and ``day`` fill the query
            string and the link pattern; ``strftime`` is used for log messages.
        topic: Topic id, sent as the ``topic`` query parameter of the listing URL.
        timeout: Seconds to wait for the server before the request is abandoned.

    Returns:
        list[str]: Absolute article URLs found on the listing page. An empty list
        is returned when the request fails (including a timeout) or the response
        status is not 200.
    """
    # The requested topic must go into the URL; a fixed topic id here would make
    # the ``topic`` argument (and the log messages that mention it) meaningless.
    url = f"{BASE_URL}?topic={topic}&year={date.year}&month={date.month:02d}&day={date.day}"
    logger.info(f"Going to url {url}...")
    try:
        logger.info(f"Fetching articles for {date.strftime('%Y-%m-%d')} in topic {topic}")
        # Without a timeout a stalled connection would block the whole run.
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()

        # raise_for_status() only rejects error statuses; any other non-200
        # answer is treated as "no links".
        if response.status_code != 200:
            logger.warning(f"Unexpected status code {response.status_code} for {url}")
            return []

        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all links that match the pattern /YYYY/MM/DD/ followed by numbers
        # (month and day are accepted with or without a leading zero).
        pattern = re.compile(rf'/{date.year}/(0?{date.month}|{date.month})/(0?{date.day}|{date.day})/\d+$')

        links = [
            a['href']
            for a in soup.find_all('a', href=True)
            if pattern.search(a['href'])
        ]
        logger.info(f"These are the links formed: {links}")

        # Convert relative URLs to absolute
        absolute_links = [urljoin("https://proza.ru/", link) for link in links]
        # Pause after a successful listing request before the caller continues.
        time.sleep(2)
        logger.info(f"Found {len(absolute_links)} articles for {date.strftime('%Y-%m-%d')}")
        return absolute_links

    except requests.RequestException as e:
        logger.error(f"Error fetching links for topic {topic}, {date}: {str(e)}")
        return []

def extract_article_text(article_url, timeout=30):
    """Extract text from a single article page.

    Args:
        article_url: Absolute URL of the article page.
        timeout: Seconds to wait for the server before the request is abandoned.

    Returns:
        str | None: Text of the first ``<div class="text">`` element with its
        whitespace kept, or ``None`` when the request fails (including a
        timeout), the status is not 200, or the page has no such element.
    """
    try:
        logger.debug(f"Fetching article: {article_url}")
        # Without a timeout a stalled connection would block the whole run.
        response = requests.get(article_url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()

        if response.status_code != 200:
            logger.warning(f"Unexpected status code {response.status_code} for {article_url}")
            return None

        soup = BeautifulSoup(response.text, 'html.parser')
        text_div = soup.find('div', class_='text')

        if not text_div:
            logger.warning(f"No text div found in {article_url}")
            return None

        logger.debug(f"Successfully extracted text from {article_url}")
        return text_div.get_text(strip=False)  # Keep original formatting

    except requests.RequestException as e:
        logger.error(f"Error fetching article {article_url}: {str(e)}")
        return None

def save_text(article_url, text, date):
    """Save an article's text and metadata as a JSON file.

    The file is written to ``OUTPUT_DIR/<year>/<month>/<day>_<id>.json``, where
    ``<id>`` is the last path segment of ``article_url``. The day is part of the
    file name because all days of a month share one directory, and the trailing
    id alone does not identify an article across days (in the URLs collected by
    this notebook it follows the /YYYY/MM/DD/ part — presumably a per-day
    counter). Without the day, such articles would overwrite each other.

    Args:
        article_url: Absolute URL of the article.
        text: Article text to store.
        date: Date the article is listed under; used for the directory layout,
            the file name and the ``date`` field.

    Errors are logged and swallowed so a single failed save does not stop the
    scraping run.
    """
    try:
        # Last path segment of the URL; a trailing slash must not yield an empty name.
        article_id = article_url.rstrip('/').split('/')[-1]

        # Store data in a structured format
        data = {
            'url': article_url,
            'date': date.strftime('%Y-%m-%d'),
            'text': text,
            'timestamp': datetime.now().isoformat()
        }

        # Create year/month subdirectories
        year_dir = os.path.join(OUTPUT_DIR, str(date.year))
        month_dir = os.path.join(year_dir, f"{date.month:02d}")
        os.makedirs(month_dir, exist_ok=True)

        # Save as JSON to preserve metadata and text structure
        output_path = os.path.join(month_dir, f"{date.day:02d}_{article_id}.json")

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        logger.debug(f"Saved article to {output_path}")

    except Exception as e:
        logger.error(f"Error saving article {article_url}: {str(e)}")

In [27]:
def main():
    """Run the full scrape: walk every date, download its articles, save them.

    For each date produced by generate_dates() the listing is fetched, every
    linked article is downloaded and stored, and the run pauses between
    requests and between days. Summary statistics are logged at the end.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    logger.info("Starting scraping process")
    logger.info(f"Date range: {START_YEAR}-01-01 to {END_YEAR}-12-31")
    logger.info(f"Output directory: {OUTPUT_DIR}")

    started_at = time.time()
    total_articles = 0

    for day in generate_dates():
        date_str = day.strftime('%Y-%m-%d')
        logger.info(f"\nProcessing date: {date_str}")

        article_links = get_article_links(day, topic=30)
        if article_links:
            # Progress bar over this day's articles
            progress = tqdm(article_links, desc=f"Articles for {date_str}", leave=False)
            for article_url in progress:
                article_text = extract_article_text(article_url)
                if article_text:
                    save_text(article_url, article_text, day)
                    total_articles += 1

                # Pause after every article request, successful or not
                time.sleep(REQUEST_DELAY)

            logger.info(f"Finished processing {len(article_links)} articles for {date_str}")
        else:
            logger.info(f"No articles found for {date_str}")

        # Pause before moving on to the next day
        time.sleep(DAY_DELAY)

    # Final statistics
    elapsed_time = time.time() - started_at
    logger.info("\nScraping completed!")
    logger.info(f"Total articles processed: {total_articles}")
    logger.info(f"Total time elapsed: {elapsed_time/60:.2f} minutes")
    logger.info(f"Average time per article: {elapsed_time/max(1, total_articles):.2f} seconds")

In [None]:
# Entry point of the notebook: run the scraper and log how the run ended
# instead of letting an exception escape from the cell.
try:
    main()
except KeyboardInterrupt:
    # Interrupting the kernel by hand is logged as a normal stop, not an error.
    logger.info("Scraping interrupted by user")
except Exception as e:
    # exc_info=True makes the logger record the traceback along with the message.
    logger.error(f"Unexpected error: {str(e)}", exc_info=True)