<a href="https://colab.research.google.com/github/kalyugwasi/timesofindia-scraper/blob/main/open_deep_researcher.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

!pip install requests beautifulsoup4 pandas tqdm scrapy
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

# Scraper settings.
# Archive pages are generated for every calendar day from START_YEAR through
# END_YEAR (both inclusive).
START_YEAR, END_YEAR = 1984, 2025

# Directory that receives the final CSV file.
OUTPUT_DIR = "cleaned_articles"

# Headers sent with every HTTP request (a desktop-browser User-Agent).
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
}

def validate_article(url):
    """Return True when *url* looks like a real article link.

    A link qualifies when it contains ``/articleshow/`` and none of the
    markers for archive pages, sitemaps, newsletters, videos or photo stories.
    """
    if '/articleshow/' not in url:
        return False

    excluded_markers = (
        '/archive/', '/sitemap', 'newsletter', '/videos/', '/photostory/',
    )
    return not any(marker in url for marker in excluded_markers)

def extract_actual_date(soup):
    """Return the article's publication date taken from its metadata.

    Looks up the ``<meta property="article:published_time">`` tag and returns
    the part of its ``content`` value before the ``T`` separator.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The date portion of the timestamp as a string, or ``None`` when the
        tag is absent or carries no usable ``content`` attribute.
    """
    date_meta = soup.find('meta', {'property': 'article:published_time'})
    if date_meta is None:
        return None

    # Use .get() so a meta tag without a content attribute yields None
    # instead of raising KeyError.
    published = date_meta.get('content')
    if not published:
        return None
    return published.split('T')[0]

def clean_article(url, expected_date):
    """Download one article and return its cleaned fields.

    Args:
        url: Absolute URL of the article page.
        expected_date: Date string the article is expected to carry, in the
            same format that ``extract_actual_date`` returns.

    Returns:
        A dict with ``headline``, ``content``, ``date`` and ``url`` keys, or
        ``None`` when the request fails, the response is not HTTP 200, the
        page's own date disagrees with ``expected_date``, or the headline or
        body element cannot be found.
    """
    # This runs as a thread-pool worker: a failure on one article is reported
    # and turned into None so that it does not abort the whole batch.
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        if response.status_code != 200:
            return None

        soup = BeautifulSoup(response.text, 'html.parser')

        # Date validation: reject only when the page states a different date;
        # pages without date metadata are accepted.
        actual_date = extract_actual_date(soup)
        if actual_date and actual_date != expected_date:
            return None

        # Extract content.
        # NOTE(review): '_23498' and '_3YYSt' look like generated CSS class
        # names tied to the current site markup - confirm they still match.
        headline = soup.find('h1', class_=lambda x: x and '_23498' in x)
        content = soup.find('div', class_='_3YYSt')
        if headline is None or content is None:
            return None

        # A Tag has no strip() method (attribute access on a Tag is a child
        # lookup), so take the text first and then collapse whitespace runs.
        body_text = content.get_text(separator=' ')
        return {
            'headline': headline.get_text(strip=True),
            'content': ' '.join(body_text.split()),
            'date': expected_date,
            'url': url
        }
    except Exception as e:
        print(f"Error processing {url}: {str(e)}")
        return None

def generate_all_urls():
    """Build the archive-page URL for every day in the configured range.

    Covers each calendar day from 1 January of ``START_YEAR`` through
    31 December of ``END_YEAR``, in chronological order.

    Returns:
        A list of URL strings, one per day; empty when ``START_YEAR`` is
        greater than ``END_YEAR``.
    """
    from datetime import timedelta

    # starttime is the number of days since 1900-01-01, plus one.
    # NOTE(review): the offset the site expects cannot be verified from this
    # file - confirm against a known archive URL.
    epoch = datetime(1900, 1, 1)
    one_day = timedelta(days=1)

    urls = []
    # Walking real calendar days avoids probing impossible dates (e.g. Feb 30)
    # and catching the resulting errors.
    current = datetime(START_YEAR, 1, 1)
    last = datetime(END_YEAR, 12, 31)
    while current <= last:
        date_path = current.strftime('%Y/%m/%d')
        start_time = (current - epoch).days + 1
        urls.append(
            f"https://timesofindia.indiatimes.com/{date_path}/archivelist/"
            f"year-{current.year},month-{current.month},starttime-{start_time}.cms"
        )
        current += one_day
    return urls

def process_url(url):
    """Fetch one archive page and return the article links found on it.

    Args:
        url: Absolute URL of an archive listing page.

    Returns:
        A list of absolute article URLs that pass ``validate_article``; an
        empty list when the request fails or the response is not HTTP 200.
    """
    from urllib.parse import urljoin

    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        if response.status_code != 200:
            return []

        soup = BeautifulSoup(response.text, 'html.parser')
        # urljoin leaves absolute hrefs untouched and resolves relative ones
        # against the page URL; blindly prefixing the site origin would
        # corrupt links that are already absolute.
        return [
            urljoin(url, anchor['href'])
            for anchor in soup.find_all('a', href=True)
            if validate_article(anchor['href'])
        ]
    except Exception as e:
        # Worker boundary: report and continue with the other archive pages.
        print(f"Error processing {url}: {str(e)}")
        return []

def main():
    """Run the full scrape: archive pages -> article links -> cleaned CSV.

    Steps:
        1. Generate the archive URL for every day in the configured range.
        2. Fetch the archive pages concurrently and collect article links,
           remembering the date of the archive page that listed each one.
        3. Fetch and clean every unique article concurrently.
        4. Write the results to ``<OUTPUT_DIR>/all_articles.csv`` and print
           the number of articles collected.
    """
    # Generate all possible URLs
    archive_urls = generate_all_urls()

    # Extract all article links. Article URLs are not parsed for a date;
    # the expected date comes from the archive page URL, whose path
    # is https://<host>/<YYYY>/<MM>/<DD>/archivelist/...
    # The dict also removes duplicates, keeping the first date seen per link.
    article_dates = {}
    with ThreadPoolExecutor(max_workers=10) as executor:
        page_results = zip(archive_urls, executor.map(process_url, archive_urls))
        for archive_url, links in tqdm(page_results, total=len(archive_urls), desc="Processing URLs"):
            if not links:
                continue
            year, month, day = archive_url.split('/')[3:6]
            archive_date = datetime(int(year), int(month), int(day)).strftime('%Y-%m-%d')
            for link in links:
                article_dates.setdefault(link, archive_date)

    # Clean and extract articles
    article_links = list(article_dates)
    expected_dates = list(article_dates.values())
    with ThreadPoolExecutor(max_workers=10) as executor:
        results = executor.map(clean_article, article_links, expected_dates)
        all_articles = [
            article
            for article in tqdm(results, total=len(article_links), desc="Processing Articles")
            if article
        ]

    # Save to CSV
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    df = pd.DataFrame(all_articles)
    df.to_csv(os.path.join(OUTPUT_DIR, "all_articles.csv"), index=False)
    print(f"Total articles collected: {len(df)}")

# Entry point: start the scrape only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Collecting scrapy
  Downloading Scrapy-2.12.0-py2.py3-none-any.whl.metadata (5.3 kB)
Collecting Twisted>=21.7.0 (from scrapy)
  Downloading twisted-24.11.0-py3-none-any.whl.metadata (20 kB)
Collecting cssselect>=0.9.1 (from scrapy)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting itemloaders>=1.0.1 (from scrapy)
  Downloading itemloaders-1.3.2-py3-none-any.whl.metadata (3.9 kB)
Collecting parsel>=1.5.0 (from scrapy)
  Downloading parsel-1.10.0-py2.py3-none-any.whl.metadata (11 kB)
Collecting queuelib>=1.4.2 (from scrapy)
  Downloading queuelib-1.7.0-py2.py3-none-any.whl.metadata (5.7 kB)
Collecting service-identity>=18.1.0 (from scrapy)
  Downloading service_identity-24.2.0-py3-none-any.whl.metadata (5.1 kB)
Collecting w3lib>=1.17.0 (from scrapy)
  Downloading w3lib-2.3.1-py3-none-any.whl.metadata (2.3 kB)
Collecting zope.interface>=5.1.0 (from scrapy)
  Downloading zope.interface-7.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_6

Processing URLs:   5%|▍         | 708/15341 [01:41<41:42,  5.85it/s]