# Clue Scraper

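Scrapes crossword clue/answer pairs. Sources include [XWord Info](https://www.xwordinfo.com/) and, as work in progress, [The Guardian](https://www.theguardian.com/crosswords) quick crosswords.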

In [1]:
import os
import sys

# Put the parent of the current working directory on the import path so the
# `generation` package imported in later cells can be found.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [41]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

import generation.constants as const

# Browser-like User-Agent header; `scrape` passes this dict to every `requests.get` call.
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"}

def scrape(urls, timeout=30):
    """Scrape clue/answer pairs from every page in `urls` and append them to a CSV.

    All URLs must belong to the same site. The site name is taken to be the
    second dot-separated component of each URL (``xwordinfo`` for
    ``https://www.xwordinfo.com/...``). Rows are appended to
    ``scraped-clues-<site>.csv`` under ``const.DATA_PATH`` after each page, so
    an interrupted run keeps the pages it already processed.

    Only ``xwordinfo`` pages are parsed; for any other site the pages are
    fetched but no rows are written.

    Args:
        urls: Iterable of page URLs. One-shot iterators are accepted.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        None. An empty `urls` is a no-op.

    Raises:
        AssertionError: If the URLs do not all share the same site name.
    """
    # `urls` is walked twice (site check, then fetch loop); a generator would
    # be exhausted by the first pass and nothing would be fetched.
    urls = list(urls)
    if not urls:
        # Nothing to scrape; also avoids an IndexError on site_names[0].
        return

    site_names = [url.split('.')[1].lower() for url in urls]
    site_name = site_names[0]
    assert all(n == site_name for n in site_names)

    columns = ['clue', 'answer', 'url']
    file_name = f'scraped-clues-{site_name}.csv'
    path = os.path.join(const.DATA_PATH, file_name)
    if not os.path.exists(path):
        # Write the header row once; later writes append without a header.
        pd.DataFrame(columns=columns).to_csv(path, index=False)

    for url in tqdm(urls):
        # Without a timeout a single stalled connection blocks the whole run.
        r = requests.get(url=url, headers=headers, timeout=timeout)
        soup = BeautifulSoup(r.content, 'html5lib')
        data = []

        if site_name == 'xwordinfo':
            # NOTE(review): `find` returns only the first 'numclue' container;
            # confirm the page does not hold further clue lists in sibling
            # containers of the same class.
            table = soup.find('div', attrs={'class': 'numclue'})
            # Pages without a clue container yield None; skip them rather than
            # crash on `None.find_all`.
            rows = table.find_all('div') if table is not None else []
            for row in rows:
                # Keep only rows of the exact form "<clue> : <answer>". An
                # explicit length check replaces the former bare `except`,
                # which also swallowed KeyboardInterrupt.
                parts = row.get_text().split(" : ")
                if len(parts) != 2:
                    continue
                clue, answer = parts
                data.append([clue, answer, url])

        pd.DataFrame(data, columns=columns).to_csv(path, mode='a', index=False, header=False)

In [42]:
from datetime import datetime

# One puzzle URL per calendar day, starting 2000-01-01 and running through today.
xwordinfo_dates = list(
    pd.date_range(start="2000-01-01", end=datetime.today()).to_pydatetime()
)
xwordinfo_urls = list(map(
    lambda day: f"https://www.xwordinfo.com/Crossword?date={day.month}/{day.day}/{day.year}",
    xwordinfo_dates,
))

# Fetches every page in sequence and appends the parsed rows to the CSV.
scrape(xwordinfo_urls)

100%|██████████| 8412/8412 [54:29<00:00,  2.57it/s]  


In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

# Let webdriver_manager install chromedriver, then start Chrome through that binary.
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=ChromeService(chromedriver_path))
driver.maximize_window()

[WDM] - Downloading: 100%|██████████| 6.79M/6.79M [00:00<00:00, 29.9MB/s]


In [3]:
from selenium.webdriver.common.by import By

# Open a Guardian quick crossword page in the Selenium-controlled browser.
guardian_quick_url = 'https://www.theguardian.com/crosswords/quick/16442'
driver.get(guardian_quick_url)

In [40]:
from dataclasses import dataclass, field

import generation.constants as const
from generation.grid import Direction, Grid, Entry

@dataclass
class FakeEntry(Entry):
    """An `Entry` with an explicit `id` whose length and direction are derived from its cells."""
    id: int

    def __post_init__(self):
        """Set `length` and `direction` from `self.cells`."""
        cells = self.cells
        self.length = len(cells)
        # First two cells on the same row => ACROSS; anything else
        # (including fewer than two cells) => DOWN.
        if len(cells) >= 2 and cells[0].row == cells[1].row:
            self.direction = Direction.ACROSS
        else:
            self.direction = Direction.DOWN

In [58]:
# Click every "Reveal all" button, in two passes.
# NOTE(review): presumably the second pass confirms the reveal prompt shown
# after the first click — confirm against the page.
for _ in range(2):
    for button in driver.find_elements(By.CLASS_NAME, 'button--secondary'):
        if 'Reveal all' == button.get_attribute('data-link-name'):
            # Click via JavaScript rather than WebElement.click().
            driver.execute_script("arguments[0].click();", button)
# Only referenced by the commented-out grid reconstruction code later in this cell.
cell_size = 32

# By.CLASS_NAME takes a single class name; a space-separated pair matches
# nothing. A compound CSS selector selects elements carrying both classes.
for clue in driver.find_elements(By.CSS_SELECTOR, '.crossword__clue.crossword__clue--answered'):
    print(clue)
    # number = clue.find_element(By.CLASS_NAME, 'crossword__clue__number')
    # clue_text = clue.find_element(By.CLASS_NAME, 'crossword__clue__text')
    # print(number, clue_text)

# grid_content, entry_locs, n = [], [], 0
# for cell in driver.find_elements(By.CLASS_NAME, 'crossword__cell-text'):
#     r = int(float(cell.get_attribute('y')) / cell_size) + 1
#     c = int(float(cell.get_attribute('x')) / cell_size) + 1
#     label = cell.text
#     n = max(n, r)
#     grid_content.append((r, c, label))
    
# for cell in driver.find_elements(By.CLASS_NAME, 'crossword__cell-number'):
#     r = int(float(cell.get_attribute('y')) / cell_size) + 1
#     c = int(float(cell.get_attribute('x')) / cell_size) + 1
#     number = int(cell.text)
#     entry_locs.append((r, c, number))

# print('web processed')

# g = Grid(n, generate_layout=False, verbose=False)
# for r in g.cell_range:
#     for c in g.cell_range:
#         g.cell(r, c).make_block()
# for r, c, label in grid_content:
#     g.cell(r, c).label = label
# for r, c, number in entry_locs:
#     cell = g.cell(r, c)
#     e = FakeEntry(g, cell.get_across(), number)
#     if e.length in const.WORD_LENGTH_RANGE and cell is e.get_start_cell():
#         g.across[number] = g.cell(r, c)
#         g.ids[(r, c)] = number
#         g.entries.append(e)
#     e = FakeEntry(g, g.cell(r, c).get_down(), number)
#     if e.length in const.WORD_LENGTH_RANGE and cell is e.get_start_cell():
#         g.down[number] = g.cell(r, c)
#         g.ids[(r, c)] = number
#         g.entries.append(e)

In [51]:
# `g` is the Grid assembled by the (currently commented-out) reconstruction
# code in the previous cell; this cell fails on a fresh kernel until that
# code is restored.
for entry in g.entries:
    print(entry.get_contents(), entry.id, entry.direction)

JACKFROST 1 Direction.ACROSS
ACRE 2 Direction.DOWN
KING 3 Direction.DOWN
REVOLT 4 Direction.DOWN
STAFFS 5 Direction.DOWN
SUPERSTAR 6 Direction.DOWN
COWBOYHAT 7 Direction.DOWN
SUMO 8 Direction.ACROSS
TRANSVAAL 9 Direction.ACROSS
HERB 10 Direction.ACROSS
SKINFLINT 11 Direction.DOWN
LITTERBUG 12 Direction.DOWN
CLIFF 13 Direction.ACROSS
CHIEF 13 Direction.DOWN
FLOAT 14 Direction.DOWN
KNIGHT 15 Direction.ACROSS
SLUSHY 16 Direction.ACROSS
NUTMEG 17 Direction.ACROSS
GLITCH 18 Direction.DOWN
PANAMA 19 Direction.ACROSS
PUNNET 19 Direction.DOWN
FLOUT 20 Direction.ACROSS
LURK 21 Direction.ACROSS
SHAW 22 Direction.DOWN
BOSS 23 Direction.DOWN
STANCHION 24 Direction.ACROSS
NOUS 25 Direction.ACROSS
WHITEWASH 26 Direction.ACROSS


In [None]:
from selenium.webdriver.common.by import By

def interact_browser(url):
    """Open `url`, click the generate button, and return the generated text.

    Uses the notebook-level `driver`.

    Args:
        url: Page to load in the Selenium-controlled browser.

    Returns:
        The text of the ``texto_cpf`` element once it is non-empty and no
        longer reads 'Gerando...'.

    Raises:
        selenium.common.exceptions.TimeoutException: If the text has not
            appeared within 10 seconds.
    """
    # Function-scope import: the original called an undefined `wait`, and
    # nothing else in the notebook imports WebDriverWait.
    from selenium.webdriver.support.ui import WebDriverWait

    driver.get(url)
    # The find_element_by_* helpers were removed in Selenium 4.3; use By locators.
    driver.find_element(By.ID, 'bt_gerar_cpf').click()
    text_field = driver.find_element(By.ID, 'texto_cpf')
    # `until` returns the first truthy value produced by the callable, i.e.
    # the final text once the placeholder has been replaced.
    text = WebDriverWait(driver, 10).until(
        lambda _driver: text_field.text != 'Gerando...' and text_field.text
    )
    return text

# NOTE(review): `open_browser` is not defined anywhere in the visible notebook,
# so this line raises NameError. Presumably it was meant to be
# `interact_browser(<url>)` — confirm the intended URL before running.
print(open_browser())