### Final version Scrapping Sudoku UK


In [1]:
from bs4 import BeautifulSoup
import urllib3
import requests 
import datetime
import numpy as np
import pandas as pd
from urllib.request import urlopen 
from urllib.error import HTTPError 
from urllib.error import URLError
from tqdm import tqdm

## FUNCTIONS

## Get_dates():
To extract all the data we need to open all the different solutions that have been posted during the time in the web site. 

In [2]:
def list_dates():
  dates = []
  dates_error = []
  urls = []
  for i in tqdm(range(2,5252)):
    d = datetime.date.today()- datetime.timedelta(days = i)
    d_format = str(d.day)+'/'+str(d.month) +'/'+ str(d.year)
    URL = "http://www.sudoku.org.uk/DailySudoku.asp?solution=please&day="+d_format
    try:
      urlopen(URL)
    except HTTPError as e:
        pass
#       dates_error.append(d_format)
    except URLError as e:
        pass
#       dates_error.append(d_format)
    else:
#       dates.append(d_format)
      urls.append(URL)
  return urls


## Get_html()
Using the list of the days we're going to open each URL and extract all the HTML code.

In [3]:
def get_html(url):
  r = requests.get(url) 
  return BeautifulSoup(r.content,'html.parser')

## Consolidate()
extract all the sudokus and their solutions, the level of difficulty, number of people that solved the soduku and the average time in minutes.

In [4]:
def consolidate(urls):
    a, b, solution, sudoku, level, people,av_time, unit = ([] for i in range(8))

    for url in tqdm(urls):
        soup = get_html(url)
        for link in soup.find_all('td', attrs={'class': ['InnerTDone2','InnerTDone'] }):
            if link.attrs['class'] == ['InnerTDone2']:
              b.append(link.text)
            else:
              b.append('.')
            a.append(link.text)
            
        sudoku.append(''.join(b))
        solution.append(''.join(a))
             
        p = list(list(soup.table.td)[2])
        level.append(str(p[1].get_text()).split(", ")[1].split()[0])
        people.append(str(p[3]).split()[0])
        av_time.append(str(p[3]).split()[6])
        unit.append(str(p[3]).split()[7])
        
    return  urls, level, people, av_time, unit, sudoku, solution 

# Extract Dates

In [5]:
# dates, urls, dates_error = list_dates()
urls = list_dates()

100%|██████████| 5250/5250 [27:34<00:00,  3.17it/s]


In [6]:
len(urls)

5213

# Export URLs to CSV

In [7]:
urls_df = pd.DataFrame(list(urls), 
               columns =['urls'])
urls_df.to_csv('urls.csv')

# Load URLs

In [8]:
df_urls = pd.read_csv('urls.csv', sep=',').drop('Unnamed: 0', axis=1)

In [9]:
df_urls

Unnamed: 0,urls
0,http://www.sudoku.org.uk/DailySudoku.asp?solut...
1,http://www.sudoku.org.uk/DailySudoku.asp?solut...
2,http://www.sudoku.org.uk/DailySudoku.asp?solut...
3,http://www.sudoku.org.uk/DailySudoku.asp?solut...
4,http://www.sudoku.org.uk/DailySudoku.asp?solut...
...,...
5208,http://www.sudoku.org.uk/DailySudoku.asp?solut...
5209,http://www.sudoku.org.uk/DailySudoku.asp?solut...
5210,http://www.sudoku.org.uk/DailySudoku.asp?solut...
5211,http://www.sudoku.org.uk/DailySudoku.asp?solut...


#### before Mar 7 of 2006 location 5010 there are not registers about the players or the average time in the website, for this reason we're cut this urls 

In [20]:
index = df_urls.index[df_urls['urls'].str.contains('=7/3/2006', regex=False)]

In [22]:
print(index[0])

5014


In [23]:

new_urls = df_urls.iloc[:index[0]]

In [24]:
new_urls.shape

(5014, 1)

In [25]:
#Export to csv
new_urls.to_csv('new_urls.csv')

# Extract Data

In [26]:
urls, level, people,av_time, unit, sudoku, solution = consolidate(new_urls['urls'])

100%|██████████| 5014/5014 [28:43<00:00,  2.91it/s] 


# Create Dataframe

In [27]:
df = pd.DataFrame(list(zip(urls, level, people,av_time, unit, sudoku, solution)), 
               columns =['URL', 'Level','People','Average-Time', 'Unit-Time', 'Sudoku', 'Solution']) 

In [28]:
df.head()

Unnamed: 0,URL,Level,People,Average-Time,Unit-Time,Sudoku,Solution
0,http://www.sudoku.org.uk/DailySudoku.asp?solut...,Moderate,274,14,minutes,.54....8.....4..7.79...8.....26..1....35..2......,1547639823289456717962183459726341588435712965...
1,http://www.sudoku.org.uk/DailySudoku.asp?solut...,Diabolical,244,28,minutes,.54....8.....4..7.79...8.....26..1....35..2......,1547639823289456717962183459726341588435712965...
2,http://www.sudoku.org.uk/DailySudoku.asp?solut...,Tough,251,27,minutes,.54....8.....4..7.79...8.....26..1....35..2......,1547639823289456717962183459726341588435712965...
3,http://www.sudoku.org.uk/DailySudoku.asp?solut...,Moderate,264,15,minutes,.54....8.....4..7.79...8.....26..1....35..2......,1547639823289456717962183459726341588435712965...
4,http://www.sudoku.org.uk/DailySudoku.asp?solut...,Moderate,285,16,minutes,.54....8.....4..7.79...8.....26..1....35..2......,1547639823289456717962183459726341588435712965...


In [29]:
df.shape

(5014, 7)

# Function to visualize Sudoku

In [None]:
def split(element): 
    return [char for char in element]  

def transform_matrix(element):
  return np.reshape(split(element), (-1, 9))

In [100]:
transform_matrix(df.Solution[0])

array([['2', '3', '9', '6', '4', '1', '7', '8', '5'],
       ['7', '8', '1', '9', '5', '2', '6', '3', '4'],
       ['5', '6', '4', '3', '8', '7', '1', '2', '9'],
       ['9', '5', '8', '1', '3', '4', '2', '7', '6'],
       ['4', '2', '3', '5', '7', '6', '8', '9', '1'],
       ['1', '7', '6', '8', '2', '9', '4', '5', '3'],
       ['8', '9', '2', '4', '6', '5', '3', '1', '7'],
       ['6', '1', '7', '2', '9', '3', '5', '4', '8'],
       ['3', '4', '5', '7', '1', '8', '9', '6', '2']], dtype='<U1')

In [101]:
transform_matrix(df.Sudoku[0])

array([['.', '.', '9', '6', '4', '1', '7', '.', '.'],
       ['7', '8', '.', '9', '.', '.', '.', '.', '4'],
       ['.', '.', '.', '3', '.', '.', '.', '.', '.'],
       ['.', '.', '.', '.', '.', '.', '2', '7', '.'],
       ['4', '.', '3', '.', '.', '.', '8', '.', '1'],
       ['.', '7', '6', '.', '.', '.', '.', '.', '.'],
       ['.', '.', '.', '.', '.', '5', '.', '.', '.'],
       ['6', '.', '.', '.', '.', '3', '.', '4', '8'],
       ['.', '.', '5', '7', '1', '8', '9', '.', '.']], dtype='<U1')

# Export Data Frame

In [30]:
df.to_csv('sudoku_uk.csv')