# Clue Scraper

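Scrapes crossword clue/answer pairs and appends them to one CSV file per source site. Sources include [XWord Info](https://www.xwordinfo.com/) and [The Guardian](https://www.theguardian.com/crosswords) quick crosswords.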

In [1]:
import os
import sys

# Put the parent of the current working directory on the import path so that
# packages living one level above this notebook can be imported below.
module_path = os.path.abspath(os.pardir)
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

In [22]:
import json
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

import generation.constants as const

# HTTP headers sent with every request issued by `scrape`: a desktop-browser
# User-Agent string is presented instead of the HTTP library's default.
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246",
}

# Column layout shared by the CSV header row and every appended batch of rows.
_CSV_COLUMNS = ['clue', 'answer', 'url']


def _parse_xwordinfo(soup, url):
    """Return `[clue, answer, url]` rows found on a parsed XWord Info page.

    Returns an empty list when the page has no `numclue` container.
    """
    # NOTE(review): only the FIRST `numclue` container is read — confirm the
    # page does not keep further clues in additional `numclue` containers.
    table = soup.find('div', attrs={'class': 'numclue'})
    if table is None:
        # `find` returns None when nothing matches; calling `.find_all` on it
        # used to raise AttributeError and abort the whole scraping run.
        return []

    rows = []
    for row in table.find_all('div'):
        # A clue row reads "<clue> : <answer>"; any div whose text does not
        # split into exactly two parts is skipped.
        parts = row.get_text().split(" : ")
        if len(parts) == 2:
            rows.append([parts[0], parts[1], url])
    return rows


def _parse_theguardian(soup, url):
    """Return `[clue, answer, url]` rows found on a parsed Guardian page.

    The puzzle is read from the JSON stored in the `data-crossword-data`
    attribute of the `js-crossword` element. Returns an empty list when that
    element, attribute or any expected key is missing, or the JSON is
    malformed (e.g. the url does not exist).
    """
    node = soup.find(attrs={'class': 'js-crossword'})
    if node is None:
        return []
    try:
        entries = json.loads(node['data-crossword-data'])['entries']
        # All-or-nothing: a malformed entry discards the whole page, exactly
        # as the original `continue` skipped the write for that url.
        return [[e['clue'], e['solution'], url] for e in entries]
    except (KeyError, TypeError, ValueError):
        # ValueError covers json.JSONDecodeError. Deliberately NOT a bare
        # `except`, so Ctrl-C can still interrupt a long run.
        return []


# Maps the site name derived from the url to the parser for that site.
_PARSERS = {
    'xwordinfo': _parse_xwordinfo,
    'theguardian': _parse_theguardian,
}


def scrape(urls, timeout=30):
    """Scrapes all pages in `urls` from the same site.

    Clue/answer pairs are appended to
    `<const.DATA_PATH>/scraped-clues-<site>.csv` (columns: clue, answer, url).
    The file is created with a header row if it does not exist yet. Rows are
    appended after every page, so calling this twice with the same urls
    duplicates rows.

    Parameters
    ----------
    urls : iterable of str
        Page urls. The site name is taken as the text between the first and
        second '.' of each url, lower-cased, so urls are expected to look like
        `https://www.<site>.<tld>/...`. Any iterable is accepted, including a
        one-shot generator.
    timeout : float or tuple, optional
        Passed to `requests.get`, so a stalled connection cannot hang the run
        forever. Defaults to 30 seconds.

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If the urls do not all belong to the same site.
    IndexError
        If a url contains no '.'.

    Notes
    -----
    Urls whose request fails, or whose page yields no clues, are skipped.
    Request failures are reported with `tqdm.write`. A site with no known
    parser gets its CSV created but no requests are made.
    """
    # `urls` is walked twice (site-name check, then download), so a generator
    # must be materialised first or the second pass would see nothing.
    urls = list(urls)
    if not urls:
        return  # nothing to do; previously `site_names[0]` raised IndexError

    site_names = [url.split('.')[1].lower() for url in urls]
    site_name = site_names[0]
    assert all(n == site_name for n in site_names), \
        "all urls must belong to the same site"

    file_name = f'scraped-clues-{site_name}.csv'
    path = os.path.join(const.DATA_PATH, file_name)
    if not os.path.exists(path):
        pd.DataFrame(columns=_CSV_COLUMNS).to_csv(path, index=False)

    parse = _PARSERS.get(site_name)
    if parse is None:
        # Same end state as before (header-only CSV, no rows) but without
        # issuing one pointless HTTP request per url.
        tqdm.write(f"No parser for site '{site_name}'; nothing scraped.")
        return

    for url in tqdm(urls):
        try:
            r = requests.get(url=url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # One flaky request must not throw away the rest of a long run.
            tqdm.write(f"Skipping {url}: {exc}")
            continue

        soup = BeautifulSoup(r.content, 'html5lib')
        data = parse(soup, url)
        if data:
            pd.DataFrame(data, columns=_CSV_COLUMNS).to_csv(
                path, mode='a', index=False, header=False)

In [42]:
from datetime import datetime

# One XWord Info puzzle page per calendar day, from 1 January 2000 up to the
# day this cell is run. The page is selected with a month/day/year query
# parameter.
daily_index = pd.date_range(start="2000-01-01", end=datetime.today())
xwordinfo_dates = list(daily_index.to_pydatetime())
xwordinfo_urls = [
    f"https://www.xwordinfo.com/Crossword?date={date.month}/{date.day}/{date.year}"
    for date in xwordinfo_dates
]

# Long-running: issues one HTTP request per url.
scrape(xwordinfo_urls)

100%|██████████| 8412/8412 [54:29<00:00,  2.57it/s]  


In [23]:
# Guardian quick crosswords are addressed by puzzle number; every number
# below 16444 is tried, and pages that yield no crossword data are skipped
# inside `scrape`.
theguardian_urls = [
    f"https://www.theguardian.com/crosswords/quick/{num}"
    for num in range(0, 16444)
]

# Long-running: issues one HTTP request per url.
scrape(theguardian_urls)

100%|██████████| 16444/16444 [1:36:32<00:00,  2.84it/s]
