### Final version: Scraping Sudoku UK


In [0]:
from bs4 import BeautifulSoup
import urllib3
import requests 
import datetime
import numpy as np
import pandas as pd
from urllib.request import urlopen 
from urllib.error import HTTPError 
from urllib.error import URLError

## FUNCTIONS

## List_dates()
To collect the data we first build the solution-page URL for each past day and check which of those pages can actually be opened on the website.

In [0]:
def list_dates(start=2, stop=112):
  """Probe the daily solution page for a run of past days.

  For every offset ``i`` in ``range(start, stop)`` the date ``today - i days``
  is formatted as ``day/month/year`` (no zero padding) and appended to the
  solution URL. Each URL is opened once to find out whether it is reachable.

  Args:
    start: First day offset, counted back from today, to probe.
    stop: Day offset at which probing stops (exclusive).

  Returns:
    A tuple ``(dates, urls, dates_error)`` where ``dates`` and ``urls`` are
    parallel lists for the pages that opened, and ``dates_error`` holds the
    formatted dates whose request raised ``HTTPError`` or ``URLError``.
  """
  base_url = "http://www.sudoku.org.uk/DailySudoku.asp?solution=please&day="
  # Read the clock once so every offset is relative to the same day, even if
  # the probing loop runs across midnight.
  today = datetime.date.today()

  dates = []
  dates_error = []
  urls = []
  for i in range(start, stop):
    d = today - datetime.timedelta(days=i)
    # NOTE(review): the sample output stored in this notebook shows dates as
    # month/day/year; confirm which order the site expects.
    d_format = '{}/{}/{}'.format(d.day, d.month, d.year)
    url = base_url + d_format
    try:
      # Only reachability matters here, so release the connection immediately
      # instead of leaving one open response per probed day.
      with urlopen(url):
        pass
    except URLError:
      # HTTPError is a subclass of URLError, so this covers both cases.
      dates_error.append(d_format)
    else:
      dates.append(d_format)
      urls.append(url)
  return dates, urls, dates_error


## Get_html()
Using the list of the days we're going to open each URL and extract all the HTML code.

In [0]:
def get_html(url, timeout=30):
  """Download ``url`` and return its body parsed with BeautifulSoup.

  Args:
    url: Address of the page to fetch.
    timeout: Seconds to wait for the server before ``requests`` gives up
      and raises a timeout exception.

  Returns:
    A ``BeautifulSoup`` tree built from the response body with the
    ``html5lib`` parser.
  """
  # Without an explicit timeout a stalled server would block the whole
  # scraping run indefinitely.
  r = requests.get(url, timeout=timeout)
  return BeautifulSoup(r.content, 'html5lib')

## Get_sudokus()
Extract all the sudokus and their solutions.

In [0]:
def get_sudokus(urls):
  """Scrape the puzzle and its solution from each solution page.

  Every ``<td>`` whose class is ``InnerTDone2`` or ``InnerTDone`` contributes
  one character. The solution string takes the text of every such cell. The
  puzzle string takes the text only for ``InnerTDone2`` cells and a ``'.'``
  for the others.

  Args:
    urls: Iterable of solution-page URLs.

  Returns:
    A tuple ``(sudoku, solution)`` of lists with exactly one entry per URL,
    in the same order as ``urls``. Pages that contain an ``<h2>`` element are
    not parsed and get ``None`` in both lists.
  """
  sudoku = []
  solution = []

  for url in urls:
    soup = get_html(url)
    if soup.find('h2'):
      # NOTE(review): an <h2> presumably marks a page without a puzzle grid;
      # confirm against the site.
      print('yes')
      # Keep a placeholder so the results stay aligned with `urls`; dropping
      # the entry would shift every later puzzle onto the wrong date once the
      # lists are zipped together.
      sudoku.append(None)
      solution.append(None)
      continue

    puzzle_cells = []
    solved_cells = []
    for cell in soup.find_all('td', attrs={'class': ['InnerTDone2', 'InnerTDone']}):
      if cell.attrs['class'] == ['InnerTDone2']:
        puzzle_cells.append(cell.text)
      else:
        puzzle_cells.append('.')
      solved_cells.append(cell.text)
    sudoku.append(''.join(puzzle_cells))
    solution.append(''.join(solved_cells))

  return sudoku, solution

## Get_info()

Extract the level, the number of people who solved the sudoku, and the average solving time together with its unit.

In [0]:
def get_info(urls):
  """Collect the per-puzzle statistics shown on each solution page.

  The values are read from the third child of the first ``<td>`` of the
  page's first ``<table>``: the level comes from the text of that node's
  child 1, the other three from whitespace-separated tokens of child 3.

  Args:
    urls: Iterable of solution-page URLs.

  Returns:
    A tuple ``(level, people, av_sln, unit_time)`` of parallel lists of
    strings, one entry per URL.
  """
  level, people, av_sln, unit_time = [], [], [], []

  for url in urls:
    page = get_html(url)
    nodes = list(list(page.table.td)[2])

    # Level: first word after the ", " in the text of node 1.
    heading = str(nodes[1].get_text())
    level.append(heading.split(", ")[1].split()[0])

    # Tokens 0, 6 and 7 of node 3 hold the solver count, the average time
    # and the time unit.
    tokens = str(nodes[3]).split()
    people.append(tokens[0])
    av_sln.append(tokens[6])
    unit_time.append(tokens[7])

  return level, people, av_sln, unit_time

# Extract Data

In [0]:
# Probe the site once per past day: keep the dates/URLs that opened and,
# separately, the dates whose request failed.
dates, urls, dates_error = list_dates()

In [0]:
# Scrape the puzzle string and the solution string from the reachable pages.
sudoku, solution= get_sudokus(urls)

In [0]:
# Scrape level, solver count, average time and time unit from the same pages.
level,people,av_sln, unit_time = get_info(urls)

# Create Dataframe

In [0]:
# Assemble one row per scraped day. zip() stops at the shortest input list,
# so the table has as many rows as the shortest of the eight lists.
df = pd.DataFrame(
    data=list(zip(dates, urls, level, people, av_sln, unit_time, sudoku, solution)),
    columns=[
        'Date',
        'URL',
        'Level',
        'People',
        'Average-Time',
        'Unit-Time',
        'Sudoku',
        'Solution',
    ],
)

In [98]:
# Preview the first rows of the assembled table.
df.head()

Unnamed: 0,Date,URL,Level,People,Average-Time,Unit-Time,Sudoku,Solution
0,11/25/2019,http://www.sudoku.org.uk/DailySudoku.asp?solut...,Gentle,287,12,minutes,..96417..78.9....4...3...........27.4.3...8.1....,2396417857819526345643871299581342764235768911...
1,11/24/2019,http://www.sudoku.org.uk/DailySudoku.asp?solut...,Diabolical,254,27,minutes,..35..7...75.....2.2...7..42..39.....9.....4.....,4135297686758439128291673542683945713972158465...
2,11/23/2019,http://www.sudoku.org.uk/DailySudoku.asp?solut...,Moderate,276,15,minutes,...73.4...9.1...8......6.....5.78....49...71.....,8617354293971245864528961731359786426492537187...
3,11/22/2019,http://www.sudoku.org.uk/DailySudoku.asp?solut...,Diabolical,252,27,minutes,.7...5....2.7..3..5..8.....95..6..3.8.6...1.9....,1789352464297163855638429719572618348463571292...
4,11/21/2019,http://www.sudoku.org.uk/DailySudoku.asp?solut...,Tough,256,23,minutes,.89...17.3.....985....7......5.8....8..7.6..3....,2895341763471629855619783421753892648247165939...


# Function to visualize Sudoku

In [0]:
def split(element):
    """Return the items of ``element`` (the characters, for a string) as a list."""
    return list(element)

def transform_matrix(element):
  """Lay out the items of ``element`` as a NumPy array with 9 columns.

  The items are listed with ``split`` and reshaped to ``(-1, 9)``, so the
  number of rows follows from the number of items.
  """
  cells = np.asarray(split(element))
  return cells.reshape(-1, 9)

In [100]:
# Display the first stored solution as a grid of single characters.
transform_matrix(df.Solution[0])

array([['2', '3', '9', '6', '4', '1', '7', '8', '5'],
       ['7', '8', '1', '9', '5', '2', '6', '3', '4'],
       ['5', '6', '4', '3', '8', '7', '1', '2', '9'],
       ['9', '5', '8', '1', '3', '4', '2', '7', '6'],
       ['4', '2', '3', '5', '7', '6', '8', '9', '1'],
       ['1', '7', '6', '8', '2', '9', '4', '5', '3'],
       ['8', '9', '2', '4', '6', '5', '3', '1', '7'],
       ['6', '1', '7', '2', '9', '3', '5', '4', '8'],
       ['3', '4', '5', '7', '1', '8', '9', '6', '2']], dtype='<U1')

In [101]:
# Display the first stored puzzle as a grid; '.' marks a cell that is not given.
transform_matrix(df.Sudoku[0])

array([['.', '.', '9', '6', '4', '1', '7', '.', '.'],
       ['7', '8', '.', '9', '.', '.', '.', '.', '4'],
       ['.', '.', '.', '3', '.', '.', '.', '.', '.'],
       ['.', '.', '.', '.', '.', '.', '2', '7', '.'],
       ['4', '.', '3', '.', '.', '.', '8', '.', '1'],
       ['.', '7', '6', '.', '.', '.', '.', '.', '.'],
       ['.', '.', '.', '.', '.', '5', '.', '.', '.'],
       ['6', '.', '.', '.', '.', '3', '.', '4', '8'],
       ['.', '.', '5', '7', '1', '8', '9', '.', '.']], dtype='<U1')

# Export Data Frame

In [0]:
# Write the table to a CSV file in the current working directory.
# NOTE(review): with pandas' default index=True the row index is written as an
# extra unnamed first column; pass index=False if that column is unwanted.
df.to_csv('dataset_sudoku_uk.csv')