<a href="https://colab.research.google.com/github/kartik727/ml-projects/blob/master/wiki_scrape/Wiki_Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
from bs4 import BeautifulSoup as bsp
from queue import Queue
from tqdm import tqdm
import random
from urllib.parse import unquote

In [2]:
config = {
    'loc' : {
        'base_dir' : '/content/drive/MyDrive/ML/Data/wiki_scrape/',
        'title_file' : 'names.txt',
        'data_dir' : 'articles/',
        'data_ext' : '.txt'
    },
    'base_url' : 'https://en.wikipedia.org',
    'starting_link' : '/wiki/Hypernova',
    'num_articles' : 100
}

In [3]:
base_dir = '/content/drive/MyDrive/ML/Data/wiki_scrape/'
articles = base_dir+config['loc']['data_dir']

In [4]:
# Clean the working directory
! rm {articles}*

rm: cannot remove '/content/drive/MyDrive/ML/Data/wiki_scrape/articles/*': No such file or directory


In [14]:
class Wiki_Article:
    """
    Representation of a wikipedia article. Stores the relative link of an article and creates a Beautiful Soup object
    using the url

    ...

    Attributes
    ----------
    url : str
        Relative url of the article

    Methods
    -------
    parse_data:
        Parses the paragraphs of the article into list of strings

    parse_links:
        Parses all hyperlinks to other wikipedia articles into list of strings

    get_paragraphs:
        Returns the paragraphs (primary text) of the article

    get_wiki_links:
        Returns a list of all the links to other wikipedia articles in the current artcle
    """

    BASE_URL=config['base_url']

    def __init__(self, url:str):
        self.url = url
        response = requests.get(url=self.BASE_URL+url)
        self.soup = bsp(response.content, 'html.parser')
        self.paragraphs = None
        self.links = None
        
        # Setting the title of the article
        title = self.soup.find(id='firstHeading')
        self.title = title.string

    def parse_data(self, add_title:bool=False)->None:
        '''Parses the paragraphs of the article into list of strings'''

        data = self.soup.find(id='mw-content-text')
        data = data.find(class_='mw-parser-output')
        paragraph_list = []

        if add_title:
            paragraph_list.append(self.title)

        for para in data.find_all('p'):
            paragraph_list.append(para.get_text())

        self.paragraphs = '\n'.join(paragraph_list)

    def parse_links(self)->None:
        '''Parses all hyperlinks to other wikipedia articles into list of strings'''

        self.links = []
        all_links = self.soup.find(id='bodyContent').find_all('a')
        for link in all_links:
            try:
                txt = link['href']

                # Don't want non-wikipedia links
                if txt[:6] != '/wiki/':
                    continue
                
                # Don't want special pages
                if ':' in txt:
                    continue

                # Remove any shortcuts
                txt = txt.split('#')[0]

                # Remove disambiguation links
                if txt[-17:] == '_(disambiguation)':
                    continue

                # Add to the list of links
                self.links.append(txt)
            except KeyError:
                pass                

    def get_paragraphs(self, add_title:bool=False):
        '''Returns the paragraphs (primary text) of the article'''
        if self.paragraphs is None:
            self.parse_data(add_title=add_title)

        return self.paragraphs

    def get_wiki_links(self):
        '''Returns a list of all the links to other wikipedia articles in the current artcle'''
        if self.links is None:
            self.parse_links()

        return self.links

In [6]:
def scrape_wiki_articles(articles:set, start_link:str, num_articles:int, base_dir:str, ext:str)->None:
    """
    Scrapes the requested number of articles from wikipedia that are not already present in the given set of articles

    Parameters
    ----------
    articles (set):
        The set of articles that have already been scraped and should be avoided

    start_link (str):
        The link to the starting article from which new links will be recursively generated

    num_artices (int):
        Number of articles to be scraped

    base_dir (int):
        Directory in which the results will be stored

    ext (str):
        Extension of the files containing the scraped articles
    """

    target_len = len(articles) + num_articles
    new_links = set()
    new_links.add(start_link)

    
    while len(articles) < target_len:

        # Stop if ran out of articles
        if len(new_links)==0:
            print('Article links exhausted')
            break

        # Select a random article and remove from the set
        link = random.choice(list(new_links))
        new_links.remove(link)
        created = False

        # Read seen articles to get more links if links set getting small
        if len(new_links) < 100:
            article = Wiki_Article(link)
            created = True
            for l in article.get_wiki_links():
                new_links.add(l)

        # Parse unseen articles and add their links to the set
        if link not in articles:
            if not created:
                article = Wiki_Article(link)
                for l in article.get_wiki_links():
                    new_links.add(l)
            title = article.title
            if title is not None:
                articles.add(link)
                title_unq = unquote(link.split('/')[-1])
                data = article.get_paragraphs(add_title=True)
                with open(base_dir+title_unq+ext, 'w') as f:
                    f.write(data)

    print('Scraping complete')


In [12]:
loc = config['loc']

# Reading the file with all article links
try:
    with open(loc['base_dir']+loc['title_file'], 'r') as f:
        names = set(f.read().split('\n'))
        starting_link = random.sample(list(names), 1)[0]
        print('Title file loaded.')
except FileNotFoundError:
    names = set()
    starting_link = config['starting_link']
    print('Title file not found. Starting from scratch.')

print(f'First link: {starting_link}')

Title file loaded.
First link: /wiki/Pressure_cooker


In [13]:
# Scraping Wikipedia for new articles
print(f'Number of articles before batch: {len(names)}')

scrape_wiki_articles(names, starting_link, config['num_articles'], loc['base_dir']+loc['data_dir'], loc['data_ext'])

Number of articles before batch: 101
Scraping complete


In [15]:
# Writing the title names (including new ones) to disk
with open(loc['base_dir']+loc['title_file'], 'w') as f:
    for name in names:
        f.write(name + '\n')
    print('Written links to disk')

Written links to disk
