# Collection
- Determines the most recent lottery draw that has taken place
- Determines the most recent lottery draw stored in the database
- Collects the winning numbers, number of winners, and total prize money for the recent draws not yet stored in the database
- Collects information on the stores where the 1st & 2nd place winners bought their tickets
- Stores the collected data in the 'lottery.db' database

The notebook collects and stores only the draws that are available on the lottery site but not yet stored in the database.<br>
For example, if the lottery's most recent draw was the 4149th and the database holds draws up to the 4146th, then running this notebook collects the 4147th to 4149th draws and saves them into the database.

## Setup

In [69]:
import sys
sys.path.append('..')

In [70]:
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup
from lib.progress_bar import progress_bar
import json
import sqlite3

In [71]:
# Lottery result page; recentDraw() requests it without a draw number.
lotto_result_url = "https://www.dhlottery.co.kr/gameResult.do?method=byWin"
# Prefix for the result page of one specific draw: append the draw number.
lotto_result_draw = f"{lotto_result_url}&drwNo="
# Prefix for the winning-stores page of one specific draw: append the draw number.
store_url = (
    "https://www.dhlottery.co.kr/store.do"
    "?method=topStore&pageGubun=L645&drwNo="
)

## Most Recent Draw
Let us determine what the most recent draw was in the real world.

In [97]:
def recentDraw():
    """Return the number of the most recent draw shown on the lottery result page.

    The draw number is read from the page's ``<meta id="desc" name="description">``
    tag, whose content is searched for the first ``<digits>회`` occurrence.

    Returns:
        The draw number as an int.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched
            (including when the request times out).
        ValueError: if the description meta tag or the draw number is missing,
            e.g. because the site layout changed or an error page was served.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(lotto_result_url, timeout=30)
    soup = BeautifulSoup(response.text, "lxml")

    meta = soup.find("meta", {"id": "desc", "name": "description"})
    content = meta.get("content") if meta is not None else None
    if not content:
        raise ValueError(
            f"No description meta tag with content found at {lotto_result_url}"
        )

    # The draw number is the run of digits directly followed by '회'.
    match = re.search(r'(\d+)회', content)
    if match is None:
        raise ValueError(f"No draw number found in page description: {content!r}")
    return int(match.group(1))

In [102]:
# Fetch the latest draw number from the lottery site once; the collection
# functions defined later use it as the default upper bound.
mostRecent = recentDraw()
print(f"Most recent draw happend in the real world : {mostRecent}")

Latest draw happend in the real world : 976


## Most Recent Draw in our Database
Let us now determine what the most recent draw stored in our database is.

In [108]:
# SQLite database file (relative path) that the collected data is read from and written to.
db_path = "lottery.db"

In [109]:
def recentDrawInDB(db_path):
    """Return the highest draw number (``turn``) stored in the DRAWS table.

    Args:
        db_path: Path of the SQLite database file.

    Returns:
        The largest ``turn`` value as an int, or 0 when the DRAWS table holds
        no rows, so that ``recentDrawInDB(...) + 1`` is the first draw to collect.

    Raises:
        sqlite3.OperationalError: if the DRAWS table does not exist.
    """
    con = sqlite3.connect(db_path)
    try:
        # MAX() over an empty table yields a single row containing NULL (None).
        latest = con.execute("SELECT MAX(turn) FROM DRAWS").fetchone()[0]
    finally:
        # Read-only query: nothing to commit, but always release the connection.
        con.close()
    return 0 if latest is None else int(latest)

In [110]:
# Look up the newest draw already stored; the collection functions defined
# later start from the draw after this one by default.
dbMostRecent = recentDrawInDB(db_path)
print(f"Most recent draw stored in our database : {dbMostRecent}")

Most recent draw stored in our database : 976


## Draws Data Collection
The function below crawls over the Korean 6/45 lottery web html files and collects:
- Winning Numbers (1st, 2nd, ... 6th) + Bonus #
- \# of Winners (1st Place, ... 5th Place)
- Total Prize Money (1st Place, ... 5th Place)

In [79]:
def collectDraws(lst, start=dbMostRecent + 1, end=mostRecent):
    """Scrape the result pages of draws ``start``..``end`` (inclusive) into ``lst``.

    One dict per draw is appended to ``lst`` with the keys:
      - ``'draw'``: the draw number
      - ``'nums'``: the numbers parsed from the page description as ints
        (the five comma-separated numbers, then the two joined by ``+``)
      - ``'winners'``: list of 5 ints, one per prize rank (0 where not computed)
      - ``'price'``: list of the amounts found in the even-positioned table cells

    Args:
        lst: List that receives the collected draw dicts (mutated in place).
        start: First draw number to collect.
        end: Last draw number to collect.

    Returns:
        None. Prints a notice and returns immediately when ``start > end``.

    NOTE: the default values of ``start`` and ``end`` are evaluated once, when
    this ``def`` is executed; re-run this cell after ``dbMostRecent`` or
    ``mostRecent`` change, or pass the bounds explicitly.
    """
    if start > end:
        print("No new draw to be added.")
        return
    print(f"Collecting Draws from Draw #{start} to #{end}")

    # Compile the patterns once instead of on every iteration.
    first = re.compile(r'(\d+,){4}\d+')   # five comma-separated numbers
    last = re.compile(r'\d+\+\d+')        # "<number>+<number>"
    amount = re.compile(r'>\d+.*원')      # cell text: digits ... up to '원'

    for idx in range(start, end + 1):
        progress_bar(idx - start, end - start + 1)

        draw = {'draw': idx}

        # A timeout keeps a stalled connection from hanging the whole run.
        req = requests.get(lotto_result_draw + str(idx), timeout=30)
        soup = BeautifulSoup(req.text, 'lxml')

        meta = soup.find("meta", {"id" : "desc", "name" : "description"})['content']
        first_mo, last_mo = first.search(meta), last.search(meta)
        draw['nums'] = list(map(int, first_mo.group().split(',') + last_mo.group().split('+')))

        total, winners = [], [0] * 5
        table = soup.find("table").find_all("td", {"class":"tar"})
        for i, c in enumerate(table):
            mo = amount.search(str(c))
            # Drop the leading '>' and trailing '원', then the thousands separators.
            num = int(mo.group()[1:-1].replace(",", ""))
            if i % 2 == 0:
                total.append(num)
            elif num > 0:
                # Odd cells are presumably the per-winner amount, so dividing
                # the preceding total by it gives the winner count - TODO confirm.
                winners[i // 2] = total[-1] // num

        draw['winners'] = winners
        draw['price'] = total

        lst.append(draw)
    # Final call after the loop; presumably renders the bar as complete.
    progress_bar(end, end)
    print("")
    print("Done Collecting Draws!")

In [80]:
# Collect every draw in the default range (dbMostRecent + 1 .. mostRecent) into `draws`.
draws = []
collectDraws(lst=draws)

No new draw to be added.


In [84]:
# Report how many draws were collected and show the first and last of them.
if not draws:
    print("No additional draw collected.")
else:
    print(f"We have collected {len(draws)} draws")
    print(f"First draw : {draws[0]}")
    print(f"Last draw : {draws[-1]}")

No additional draw collected.


## Stores Data Collection
The functions below crawl over the Korean 6/45 lottery web html files and collect:
- Stores' names (1st & 2nd Place Winners)
- Stores' addresses (1st & 2nd Place Winners)
- Whether the winning numbers were chosen automatically or manually (1st Place Winners Only)

In [85]:
def parseStores(content, win = 1):
    """Extract one list of cell texts per ``<tr>`` found in ``content``.

    For every row, the ``<td>`` cells at positions 1-3 are kept (position 0 is
    skipped); when ``win == 2`` the last of those is dropped as well.

    Args:
        content: Object exposing ``find_all`` (e.g. a parsed ``<tbody>``).
        win: 1 for the first-prize table, 2 for the second-prize table.

    Returns:
        A list with one list of stripped strings per row.
    """
    between_tags = re.compile(r'>.*<')
    hangul_run = re.compile(r'[가-힣]+')

    def cell_text(position, cell):
        markup = str(cell)
        if win == 1 and position == 1:
            # First-prize table, second kept cell: keep only the first run of
            # Hangul syllables found in the cell markup.
            text = str(hangul_run.search(markup).group())
        else:
            # Everything between the first '>' and the last '<' on the matched line.
            text = str(between_tags.search(markup).group()[1:-1])
        return text.strip()

    parsed = []
    for row in content.find_all("tr"):
        cells = row.find_all('td')[1:4]
        if win == 2:
            cells = cells[:-1]
        parsed.append([cell_text(pos, cell) for pos, cell in enumerate(cells)])
    return parsed

In [86]:
##### Store information is provided from the 924th draw onward
def collectLocations(lst, start=dbMostRecent + 1, end=mostRecent):
    """Scrape the winning-store pages of draws ``start``..``end`` (inclusive) into ``lst``.

    One dict per draw is appended to ``lst`` with the keys:
      - ``'draw'``: the draw number
      - ``'first'``: rows parsed from the first matching table (``parseStores`` with win=1)
      - ``'second'``: rows parsed from the second matching table on every
        result page (``parseStores`` with win=2)

    Args:
        lst: List that receives the collected location dicts (mutated in place).
        start: First draw number to collect; nothing is collected if it is below 924.
        end: Last draw number to collect.

    Returns:
        None. Prints a notice and returns immediately when ``start > end`` or
        ``start < 924``.

    NOTE: the default values of ``start`` and ``end`` are evaluated once, when
    this ``def`` is executed; re-run this cell after ``dbMostRecent`` or
    ``mostRecent`` change, or pass the bounds explicitly.
    """
    if start > end:
        print("No new draws to be added.")
        return
    if start < 924:
        print("Stores information is available since 924th draw.")
        return
    print(f"Collecting Locations from Draw #{start} to #{end}")

    table_class = "tbl_data tbl_data_col"

    for idx in range(start, end + 1):
        progress_bar(idx - start, end - start + 1)

        locations = {'draw': idx}
        idx_url = store_url + str(idx)

        # A timeout keeps a stalled connection from hanging the whole run.
        soup = BeautifulSoup(requests.get(idx_url, timeout=30).text, 'lxml')
        tables = [x.find('tbody') for x in soup.find_all("table", {"class": table_class})]

        locations['first'] = parseStores(content=tables[0])
        secondLocations = parseStores(content=tables[1], win=2)

        # The number of links in the pagination box is used as the page count
        # (presumably one link per page - TODO confirm). A page without the
        # box is treated as a single-page result instead of crashing.
        page_box = soup.find('div', {"id": "page_box"})
        maxPage = len(page_box.find_all('a')) if page_box is not None else 1
        for p in range(2, maxPage + 1):
            page = BeautifulSoup(
                requests.get(idx_url + "&nowPage=" + str(p), timeout=30).text, 'lxml'
            )
            page_tables = page.find_all("table", {"class": table_class})
            # Only the second table is read on the follow-up pages.
            secondLocations.extend(
                parseStores(content=page_tables[1].find('tbody'), win=2)
            )
        locations['second'] = secondLocations
        lst.append(locations)
    # Final call after the loop; presumably renders the bar as complete.
    progress_bar(end, end)
    print("")
    print("Done Collecting Locations!")

In [87]:
# Collect the winning-store data for the default range (dbMostRecent + 1 .. mostRecent) into `stores`.
stores = []
collectLocations(lst=stores)

No new draws to be added.


In [89]:
# Report how many draws' store data were collected and show the first and last set.
if not stores:
    print("No additional stores collected.")
else:
    print(f"We have collected {len(stores)} draws")
    print(f"First Set : {stores[0]}")
    print(f"Last Set : {stores[-1]}")

No additional stores collected.


## Raw Data to the Database
Insert each draw's data and the stores' data into the 'DRAWS', 'WINNERS', 'PRIZES' & 'STORES' tables in 'lottery.db'.

In [90]:
# SQL used by rawToDB(). Every table is keyed by `turn` (the draw number) and
# all INSERT statements use `?` placeholders, filled from tuples.

# DRAWS: the six winning numbers plus the bonus number of a draw.
sql_draw_create = """CREATE TABLE IF NOT EXISTS 
`DRAWS`(turn int, num_1 int, num_2 int, num_3 int, num_4 int, num_5 int, num_6 int, num_bonus int)"""
sql_draw = """INSERT INTO `DRAWS`(`turn`, `num_1`, `num_2`, `num_3`, `num_4`, `num_5`, `num_6`, `num_bonus`)
VALUES(?, ?, ?, ?, ?, ?, ?, ?)"""
# WINNERS: one winner count per prize rank (1-5).
sql_winner_create = """CREATE TABLE IF NOT EXISTS 
`WINNERS`(turn int, winner_1 int, winner_2 int, winner_3 int, winner_4 int, winner_5 int)"""
sql_winner = """INSERT INTO `WINNERS`(`turn`, `winner_1`, `winner_2`, `winner_3`, `winner_4`, `winner_5`)
VALUES(?, ?, ?, ?, ?, ?)"""
# PRIZES: one prize amount per prize rank (1-5).
sql_prize_create = """CREATE TABLE IF NOT EXISTS 
`PRIZES`(turn int, prize_1 int, prize_2 int, prize_3 int, prize_4 int, prize_5 int)"""
sql_prize = """INSERT INTO `PRIZES`(`turn`, `prize_1`, `prize_2`, `prize_3`, `prize_4`, `prize_5`)
VALUES(?, ?, ?, ?, ?, ?)"""
# STORES: one row per winning store; `firstPrize` separates 1st-prize rows
# from 2nd-prize rows, and `auto` is only filled in for 1st-prize rows.
sql_store_create = """CREATE TABLE IF NOT EXISTS 
`STORES`(idx int, turn int, name varchar(255), auto BOOLEAN, firstPrize BOOLEAN, address varchar(255))"""
sql_store = """INSERT INTO `STORES`(`idx`, `turn`, `name`, `auto`, `firstPrize`, `address`) 
VALUES(?, ?, ?, ?, ?, ?)"""

In [94]:
def rawToDB(draws, stores, db_path):
    """Write the collected draws and winning-store rows into the SQLite database.

    Creates the DRAWS, WINNERS, PRIZES and STORES tables when missing, then
    inserts one DRAWS/WINNERS/PRIZES row per entry of ``draws`` and one STORES
    row per store listed in ``stores``. Everything is committed together at
    the end; if any statement fails, nothing from this call is committed.

    Args:
        draws: List of dicts with the keys 'draw', 'nums', 'winners', 'price'.
        stores: List of dicts with the keys 'draw', 'first', 'second', where
            'first' and 'second' are lists of per-store string lists (name
            first, address last; 'first' entries carry the pick type at index 1).
        db_path: Path of the SQLite database file.

    Returns:
        None. Prints a notice and does nothing when ``draws`` or ``stores`` is empty.
    """
    # Check before connecting so the early return never leaves an open
    # connection (or a freshly created empty database file) behind.
    if len(draws) < 1 or len(stores) < 1:
        print("No additional data to be added to the database.")
        return

    con = sqlite3.connect(db_path)
    try:
        cur = con.cursor()

        for create_stmt in (sql_draw_create, sql_winner_create,
                            sql_prize_create, sql_store_create):
            cur.execute(create_stmt)

        for draw_info in draws:
            draw = draw_info["draw"]
            cur.execute(sql_draw, (draw, *map(int, draw_info["nums"])))
            cur.execute(sql_winner, (draw, *map(int, draw_info["winners"])))
            cur.execute(sql_prize, (draw, *map(int, draw_info["price"])))

        # Continue numbering after the rows already stored, so that repeated
        # incremental runs do not hand out the same `idx` values again.
        cur.execute("SELECT MAX(idx) FROM STORES")
        last_idx = cur.fetchone()[0]
        row_idx = 0 if last_idx is None else int(last_idx) + 1

        for stores_info in stores:
            draw = stores_info["draw"]
            for store in stores_info["first"]:
                # auto = 1 only when the pick type is exactly "자동".
                is_auto = 1 if str(store[1]).strip() == "자동" else 0
                cur.execute(
                    sql_store,
                    (row_idx, draw, str(store[0]), is_auto, 1, str(store[-1])),
                )
                row_idx += 1
            for store in stores_info["second"]:
                # Second-prize rows have no pick type: `auto` is stored as NULL.
                cur.execute(
                    sql_store,
                    (row_idx, draw, str(store[0]), None, 0, str(store[-1])),
                )
                row_idx += 1

        con.commit()
    finally:
        # Closing without a commit discards any partially written batch.
        con.close()

In [95]:
# Persist whatever the cells above collected into the database at db_path.
rawToDB(draws, stores, db_path)

No additional data to be added to the database.
