In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
import pandas as pd
import random
import re

# NLTK imports for text processing
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download required NLTK data
# Fetch each NLTK resource independently so one failure does not prevent the
# others from being downloaded. This stays best-effort (the resources may
# already be installed locally), but failures are reported instead of being
# silently swallowed, and KeyboardInterrupt/SystemExit are no longer caught.
for _resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords'):
    try:
        # nltk.download returns False (rather than raising) when a fetch fails
        if not nltk.download(_resource, quiet=True):
            print(f"Warning: could not download NLTK resource '{_resource}'")
    except Exception as exc:
        print(f"Warning: NLTK download of '{_resource}' failed: {exc}")


class TextProcessor:
    """Text preprocessing pipeline: cleanup, tokenization, stemming, lemmatization."""

    def __init__(self):
        """Create the stemmer, the lemmatizer and the English stopword set."""
        self.porter = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def clean_text(self, text):
        """Strip numeric reference markers and normalize whitespace.

        Markers of the form ``[<digits>]`` are deleted, every run of
        whitespace is collapsed to a single space, and leading/trailing
        whitespace is trimmed.
        """
        without_refs = re.sub(r'\[\d+\]', '', text)
        single_spaced = re.sub(r'\s+', ' ', without_refs)
        return single_spaced.strip()

    def process_text(self, text):
        """Clean *text* and derive tokens, stems and lemmas from it.

        Returns a dict with the keys ``original_text`` (the cleaned text),
        ``tokens``, ``stems``, ``lemmas`` (parallel lists) and
        ``token_count``.
        """
        cleaned_text = self.clean_text(text)

        # Keep lowercase, purely alphabetic, non-stopword tokens longer
        # than two characters.
        kept = []
        for word in word_tokenize(cleaned_text.lower()):
            if not word.isalpha():
                continue
            if word in self.stop_words:
                continue
            if len(word) <= 2:
                continue
            kept.append(word)

        stemmed = [self.porter.stem(word) for word in kept]
        # Every token is lemmatized as a verb (pos='v')
        lemmatized = [self.lemmatizer.lemmatize(word, pos='v') for word in kept]

        result = {}
        result['original_text'] = cleaned_text
        result['tokens'] = kept
        result['stems'] = stemmed
        result['lemmas'] = lemmatized
        result['token_count'] = len(kept)
        return result


class WikipediaSpider(scrapy.Spider):
    """Spider that crawls English Wikipedia outward from a seed article.

    Each visited page has its paragraph text extracted and run through
    ``TextProcessor``; the results are collected in ``self.data`` and written
    to a parquet file when the spider closes. From every page a random sample
    of in-article links is followed until ``max_pages`` pages were visited.
    """

    name = 'wikipedia_spider'
    allowed_domains = ['en.wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Madagascar']

    # Number of randomly chosen outgoing links followed from each page
    links_per_page = 10

    # Custom settings: one request at a time with a delay, and FIFO scheduler
    # queues combined with DEPTH_PRIORITY=1 for breadth-first crawl order.
    custom_settings = {
        'ROBOTSTXT_OBEY': True,
        'CONCURRENT_REQUESTS': 1,
        'DOWNLOAD_DELAY': 0.5,
        'COOKIES_ENABLED': False,
        'RETRY_TIMES': 3,
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'DEPTH_PRIORITY': 1,
        'SCHEDULER_DISK_QUEUE': 'scrapy.squeues.PickleFifoDiskQueue',
        'SCHEDULER_MEMORY_QUEUE': 'scrapy.squeues.FifoMemoryQueue',
    }

    def __init__(self, *args, max_pages=100, **kwargs):
        """Initialise crawl state.

        Args:
            max_pages: Maximum number of pages to visit (default 100). It is
                converted with ``int`` so that a string value, as passed by
                spider arguments, is accepted too.
        """
        super().__init__(*args, **kwargs)
        self.visited_urls = set()
        self.max_pages = int(max_pages)
        self.pages_scraped = 0
        self.data = []
        self.text_processor = TextProcessor()

    def parse(self, response):
        """Extract and store the article on *response*, then follow links.

        Yields follow-up requests for up to ``links_per_page`` random article
        links. Once ``max_pages`` pages were visited the spider is closed via
        ``CloseSpider`` so the remaining queued requests are not downloaded.
        """
        # Local import keeps this block self-contained
        from scrapy.exceptions import CloseSpider

        # Responses that were already in flight when the limit was reached
        if self.pages_scraped >= self.max_pages:
            return

        url = response.url

        # Skip if already visited
        if url in self.visited_urls:
            return

        self.visited_urls.add(url)
        self.pages_scraped += 1

        title = self._extract_title(response)

        # Prefer top-level paragraphs of the parser output; fall back to any
        # paragraph in the content area when that yields no text.
        content = self._extract_paragraph_text(
            response, '#mw-content-text .mw-parser-output > p'
        )
        if not content:
            content = self._extract_paragraph_text(response, '#mw-content-text p')

        if content:
            processed = self.text_processor.process_text(content)

            # Token lists are stored as space-joined strings
            self.data.append({
                'url': url,
                'title': title,
                'original_text': processed['original_text'],
                'tokens': ' '.join(processed['tokens']),
                'stems': ' '.join(processed['stems']),
                'lemmas': ' '.join(processed['lemmas']),
                'token_count': processed['token_count'],
                'text_length': len(processed['original_text'])
            })

            if self.pages_scraped % 50 == 0:
                self.logger.info(f"Progress: {self.pages_scraped}/{self.max_pages} pages scraped")

        if self.pages_scraped >= self.max_pages:
            # Stop the crawl instead of fetching and discarding the backlog
            self.logger.info(f"Reached maximum pages ({self.max_pages}), stopping")
            raise CloseSpider('max_pages_reached')

        valid_links = self._collect_article_links(response)

        # Shuffle and keep only a fixed-size random sample
        random.shuffle(valid_links)
        selected_urls = valid_links[:self.links_per_page]

        self.logger.debug(f"Found {len(valid_links)} valid links, selected {len(selected_urls)}")

        for new_url in selected_urls:
            yield scrapy.Request(
                new_url,
                callback=self.parse,
                priority=0,
                errback=self.handle_error
            )

    @staticmethod
    def _extract_title(response):
        """Return the page heading text, or a title derived from the URL."""
        # '*::text' also picks up heading text nested inside child elements
        title = ''.join(response.css('h1.firstHeading *::text').getall()).strip()
        if not title:
            title = response.url.split('/')[-1].replace('_', ' ')
        return title

    @staticmethod
    def _extract_paragraph_text(response, selector):
        """Return the text of all paragraphs matched by *selector*.

        Text nodes inside a paragraph are concatenated without a separator:
        they already carry their own whitespace, and inserting spaces would
        split words that span inline tags and break up reference markers
        such as ``[1]`` before they can be cleaned. Paragraphs are joined
        with a single space; the result is stripped.
        """
        paragraphs = (
            ''.join(para.css('::text').getall())
            for para in response.css(selector)
        )
        return ' '.join(paragraphs).strip()

    def _collect_article_links(self, response):
        """Return de-duplicated absolute URLs of unvisited article links."""
        links = set()
        for href in response.css('#mw-content-text a::attr(href)').getall():
            # Must be a site-relative article path
            if not href.startswith('/wiki/'):
                continue

            # Skip special pages (hrefs containing ':'), the main page and
            # fragment links
            if any(marker in href for marker in (':', 'Main_Page', '#')):
                continue

            full_url = response.urljoin(href)
            if full_url not in self.visited_urls:
                links.add(full_url)
        return list(links)

    def handle_error(self, failure):
        """Log the URL of a failed request."""
        self.logger.error(f"Request failed: {failure.request.url}")

    def closed(self, reason):
        """Called when spider is closed - save data to parquet and print a summary."""
        if not self.data:
            self.logger.warning("No data collected!")
            return

        df = pd.DataFrame(self.data)

        # Save to parquet
        output_file = 'wikipedia_articles.parquet'
        df.to_parquet(output_file, engine='pyarrow', compression='snappy')

        self.logger.info(f"Saved {len(self.data)} articles to {output_file}")
        self.logger.info(f"DataFrame shape: {df.shape}")

        # Print summary statistics
        print("\n" + "="*60)
        print("SCRAPING SUMMARY")
        print("="*60)
        print(f"Total pages scraped: {len(self.data)}")
        print(f"Output file: {output_file}")
        print(f"\nDataFrame columns: {list(df.columns)}")
        print(f"\nFirst 10 titles:")
        for i, title in enumerate(df['title'].head(10), 1):
            print(f"  {i}. {title}")
        print(f"\nLast 10 titles:")
        # 1-based position of the first tail row; clamped so that fewer than
        # ten rows are not numbered with zero or negative values
        first_tail_position = max(len(df) - 10, 0) + 1
        for i, title in enumerate(df['title'].tail(10), first_tail_position):
            print(f"  {i}. {title}")
        print(f"\nAverage tokens per article: {df['token_count'].mean():.0f}")
        print(f"Average text length: {df['text_length'].mean():.0f} characters")
        print(f"Total tokens across all articles: {df['token_count'].sum()}")
        print("="*60)


def run_spider():
    """Print a banner describing the crawl, then run the Wikipedia spider.

    Blocks until the crawl has finished.
    """
    # Read the seed URL from the spider so the banner cannot drift from it
    start_url = WikipediaSpider.start_urls[0]
    # Matches the spider's page cap of 100; the banner previously claimed 1000
    target_pages = 100

    print("="*60)
    print("Wikipedia BFS Crawler")
    print("="*60)
    print(f"Starting from: {start_url}")
    print(f"Target pages: {target_pages}")
    print("Strategy: BFS with 10 random links per page")
    # The captured run crawled roughly 94 pages per minute
    print("Estimated time: 1-2 minutes")
    print("="*60 + "\n")

    process = CrawlerProcess(settings={
        'FEEDS': {},
        'LOG_LEVEL': 'INFO',
    })

    process.crawl(WikipediaSpider)
    # NOTE(review): the captured notebook output shows "RuntimeError: This
    # event loop is already running" for this cell, presumably raised by this
    # call because Jupyter already runs an event loop - confirm, and prefer
    # running this file as a plain script.
    process.start()


# Entry point: start the crawl only when this module is executed directly.
if __name__ == "__main__":
    run_spider()

2025-11-07 14:49:26 [scrapy.utils.log] INFO: Scrapy 2.13.3 started (bot: scrapybot)
2025-11-07 14:49:26 [scrapy.utils.log] INFO: Versions:
{'lxml': '6.0.2',
 'libxml2': '2.14.6',
 'cssselect': '1.3.0',
 'parsel': '1.10.0',
 'w3lib': '2.3.1',
 'Twisted': '25.5.0',
 'Python': '3.12.3 (main, Aug 14 2025, 17:47:21) [GCC 13.3.0]',
 'pyOpenSSL': '25.3.0 (OpenSSL 3.5.4 30 Sep 2025)',
 'cryptography': '46.0.3',
 'Platform': 'Linux-6.14.0-33-generic-x86_64-with-glibc2.39'}
2025-11-07 14:49:26 [scrapy.addons] INFO: Enabled addons:
[]
2025-11-07 14:49:26 [scrapy.extensions.telnet] INFO: Telnet Password: f74febe7d424450a
2025-11-07 14:49:26 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2025-11-07 14:49:26 [scrapy.crawler] INFO: Overridden settings:
{'CONCURRENT_REQUESTS': 1,
 'COOKIES_ENABLED': False,
 'DEPTH_PRIORITY': 1,
 'DOWNLOA

Wikipedia BFS Crawler
Starting from: https://en.wikipedia.org/wiki/Madagascar
Target pages: 1000
Strategy: BFS with 10 random links per page
Estimated time: 10-15 minutes



2025-11-07 14:49:26 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.offsite.OffsiteMiddleware',
 'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
 'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
 'scrapy.downloadermiddlewares.stats.DownloaderStats']
2025-11-07 14:49:26 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.start.StartSpiderMiddleware',
 'scrapy.spidermiddlewar

RuntimeError: This event loop is already running

2025-11-07 14:49:26 [scrapy.core.engine] INFO: Spider opened
2025-11-07 14:49:26 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2025-11-07 14:49:26 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2025-11-07 14:50:00 [wikipedia_spider] INFO: Progress: 50/1000 pages scraped
2025-11-07 14:50:26 [scrapy.extensions.logstats] INFO: Crawled 94 pages (at 94 pages/min), scraped 0 items (at 0 items/min)
2025-11-07 14:50:31 [wikipedia_spider] INFO: Progress: 100/1000 pages scraped
