In [1]:
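# Aim: Scrape the articles of a Wikipedia category to create a text corpus.
# Process: Read the category page's alphabetical index, collect the article links listed
#          under each letter, then download every article and save its text as a .txt file
#          in a per-letter folder.
# Result: The run recorded below scraped 740 articles; their URLs and file paths are
#         listed in info.json inside the data folder.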

In [2]:
import requests
from bs4 import BeautifulSoup
import bs4

from tqdm import tqdm
import os
import json

In [3]:
class WikiCategoryDataScrapper:
    """Scrape the articles listed on a Wikipedia category page into text files.

    Constructing an instance downloads the category page and builds the
    letter index. Calling ``main`` then collects the article links for every
    index letter, saves each article's text under ``<dataPath>/<letter>/`` and
    writes the scraped URLs and file paths to ``<dataPath>/info.json``.

    Attributes are documented where they are assigned in ``__init__``.
    """

    def __init__(self, url, articlePrefix, dataPath=None):
        """Fetch the category page and prepare the output directory.

        Args:
            url: URL of the category page to scrape.
            articlePrefix: String prepended to every article href found on
                the category pages to form the article URL.
            dataPath: Directory for the scraped files. Defaults to ``'Data'``.
                The directory is created when it does not exist yet.
        """
        self.categoryURL = url
        # Use the constructor argument; the previous code read a module-level
        # `prefix` variable and failed whenever that global was not defined.
        self.articlePrefix = articlePrefix
        self.categoryPageSoup = self.soup(self.request(self.categoryURL))
        # Maps article title -> {'url': ..., 'path': ...}; filled by main().
        self.meta = {}

        self.dataPath = 'Data' if dataPath is None else dataPath
        # Also create a caller-supplied directory (and its parents), otherwise
        # main() fails when it tries to create the per-letter folders.
        os.makedirs(self.dataPath, exist_ok=True)

        # Maps index entry text -> href of the listing page for that entry.
        self.index = self.letterIndices()

    def request(self, url):
        """Return the ``requests`` response of a GET request to ``url``."""
        return requests.get(url)

    def soup(self, response):
        """Parse the body of ``response`` with the built-in HTML parser."""
        return BeautifulSoup(response.content, 'html.parser')

    def letterIndices(self):
        """Return a dict mapping each index entry's text to its href.

        The entries are the ``<li>`` items of the ``CategoryIndex`` element
        inside ``#mw-pages`` on the category page.
        """
        categoryIndex = (
            self.categoryPageSoup.body
            .find(id='mw-pages')
            .find(class_='CategoryIndex')
        )
        # The first two list items are skipped - presumably they are not
        # letter entries; TODO confirm against the live page markup.
        return {item.text: item.a['href'] for item in categoryIndex.find_all('li')[2:]}

    def letterLinks(self, letter):
        """Return a dict mapping article title to article URL for ``letter``.

        Only the first ``mw-category-group`` element of the letter's listing
        page is read. The index href is requested as-is, so it is assumed to
        be an absolute URL - TODO confirm.
        """
        letterSoup = self.soup(self.request(self.index[letter]))
        groupSoup = letterSoup.body.find(class_='mw-category-group')
        return {
            item.text: self.articlePrefix + item.a['href']
            for item in groupSoup.find_all('li')
        }

    def scrapeArticle(self, articleURL):
        """Download an article and return its extracted text as one string.

        The text is the concatenation of every top-level ``<p>`` of the
        ``mw-parser-output`` element, followed by the text of the Tag children
        of that element's last top-level Tag, stopping at the first child that
        contains the ``Voir_aussi`` or ``Notes_et_références`` anchor.

        Returns an empty string when the page has no ``mw-parser-output``
        element, so one malformed page does not abort a long scraping run.
        """
        articleSoup = self.soup(self.request(articleURL))
        contentSoup = articleSoup.find(class_='mw-parser-output')
        if contentSoup is None:
            return ''

        children = list(contentSoup.children)
        intro = [child.text for child in children if child.name == 'p']

        # Last top-level Tag (trailing text nodes are skipped). None when the
        # element has no Tag children at all; that case used to raise.
        lastTag = next(
            (child for child in reversed(children)
             if isinstance(child, bs4.element.Tag)),
            None,
        )

        content = []
        if lastTag is not None:
            for child in lastTag.children:
                if not isinstance(child, bs4.element.Tag):
                    continue
                reachedEnd = (
                    child.find(id='Voir_aussi') is not None
                    or child.find(id='Notes_et_références') is not None
                )
                if reachedEnd:
                    break
                content.append(child.text)
        return ''.join(intro + content)

    @staticmethod
    def _safeFileName(title):
        """Keep only alphanumeric characters, hyphens and spaces of ``title``."""
        return ''.join(c for c in title if c.isalnum() or c in ('-', ' '))

    def main(self):
        """Scrape every article of the category and write the metadata file.

        Each article is saved as ``<dataPath>/<letter>/<sanitised title>.txt``
        (UTF-8). ``self.meta`` is filled with the URL and file path of every
        article and dumped to ``<dataPath>/info.json``.
        """
        urls = {}
        print('Identifying articles to scrape.')
        for letter in tqdm(self.index.keys()):
            urls[letter] = self.letterLinks(letter)

        print('Scraping articles')
        for letter in tqdm(urls.keys()):
            letterPath = os.path.join(self.dataPath, letter)
            os.makedirs(letterPath, exist_ok=True)
            for articleTitle, articleURL in urls[letter].items():
                article = self.scrapeArticle(articleURL)
                articlePath = os.path.join(
                    letterPath, self._safeFileName(articleTitle) + '.txt'
                )
                with open(articlePath, 'w', encoding="utf-8") as articleFile:
                    articleFile.write(article)
                self.meta[articleTitle] = {
                    'url': articleURL,
                    'path': articlePath
                }
        print(f'Scraped {len(self.meta)} articles.')

        metaFilePath = os.path.join(self.dataPath, 'info.json')
        with open(metaFilePath, 'w', encoding="utf-8") as metaFile:
            # Dump this instance's metadata; the previous code referenced a
            # notebook-level `scraper` variable instead of `self`.
            json.dump(self.meta, metaFile, indent=4)
        print(f'Meta data present in file {metaFilePath}')

In [4]:
# Category page whose alphabetical index and article listings are scraped.
categoryURL = 'https://fr.wikipedia.org/w/index.php?title=Cat%C3%A9gorie:Portail:Robotique/Articles_li%C3%A9s&pageuntil=Essaim+de+drones#mw-pages'
# Site root prepended to the article hrefs collected from the category pages.
prefix = 'https://fr.wikipedia.org'
# Directory in which the per-letter article folders and info.json are written.
path = "Data"
# Constructing the scraper already downloads the category page and builds the letter index.
scraper = WikiCategoryDataScrapper(url=categoryURL, articlePrefix=prefix, dataPath=path)

In [5]:
# Run the full scrape: collect the article links for every index letter, then
# download each article, save its text and write the info.json metadata file.
scraper.main()

Identifying articles to scrape.


100%|███████████████████████████████████████████████████████████████████████████████| 26/26 [00:17<00:00,  1.47it/s]


Scraping articles


100%|███████████████████████████████████████████████████████████████████████████████| 26/26 [07:53<00:00, 18.22s/it]

Scraped 740 articles.
Meta data present in file Data\info.json



