# 台灣大樂透 爬取資料以及預測

In [25]:
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from collections import Counter

# ---------------------------
# 1. 網頁爬取與資料抓取
# ---------------------------
def fetch_page(url, session, timeout=10):
    """Download ``url`` through ``session`` and return the page text.

    Args:
        url: Address of the page to fetch.
        session: Object exposing a ``get(url, timeout=...)`` method that
            returns a response with ``status_code`` and ``text``
            attributes (a ``requests.Session`` in this file).
        timeout: Seconds to wait for the server before the request is
            abandoned. Without a timeout a stalled connection would block
            the whole crawl indefinitely.

    Returns:
        The response text when the status code is 200, otherwise ``""``.
    """
    response = session.get(url, timeout=timeout)
    return response.text if response.status_code == 200 else ""

def parse_numbers(html):
    """Extract lottery draws from one results page.

    Every ``<span>`` whose ``style`` attribute matches the large bold
    black style used below is treated as one draw. The first six integers
    in its text are the main numbers and the seventh is the special number.

    Args:
        html: HTML source of a results page.

    Returns:
        A list of ``(main_numbers, special_number)`` tuples, where
        ``main_numbers`` is a list of six ints and ``special_number`` is
        an int. Spans containing fewer than seven integers are skipped.
    """
    soup = BeautifulSoup(html, 'html.parser')
    spans = soup.find_all('span', {'style': 'font-size: 32px; font-weight: bold; color: #000000'})
    data = []
    for span in spans:
        # Pull the digits out of the span text once and reuse them for
        # both the main numbers and the special number.
        text = span.get_text(separator=' ', strip=True)
        numbers = [int(token) for token in re.findall(r'\d+', text)]
        if len(numbers) < 7:
            # An incomplete span would otherwise raise IndexError and
            # abort the whole crawl; ignore it instead.
            continue
        data.append((numbers[:6], numbers[6]))
    return data

def crawl_data(base_url, page_range):
    """Fetch every page in ``page_range`` and collect the parsed draws.

    Args:
        base_url: URL template containing one ``{}`` placeholder that is
            filled with the page index via ``str.format``.
        page_range: Iterable of page indices to fetch.

    Returns:
        A ``(X_list, y_list)`` tuple: ``X_list`` holds the main-number
        lists and ``y_list`` the matching special numbers, in crawl order.
        Pages for which ``fetch_page`` returns an empty string are skipped.
    """
    # Retry up to three times, with back-off, on the listed 5xx statuses.
    retry = Retry(total=3, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)

    X_list, y_list = [], []
    # The context manager closes the session, and its pooled connections,
    # even when a request or the parser raises part-way through the crawl.
    with requests.Session() as session:
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        for page in page_range:
            html = fetch_page(base_url.format(page), session)
            if not html:
                continue
            for x_vals, y_val in parse_numbers(html):
                X_list.append(x_vals)
                y_list.append(y_val)
    return X_list, y_list

# Results listing, newest first; ``{}`` receives the page index.
BASE_URL = 'https://www.pilio.idv.tw/ltobig/listbigBIG.asp?indexpage={}&orderby=new'
# Page indices 1 through 23.
PAGES = range(1, 24)

# Crawl every page, then turn the collected lists into arrays:
# X holds the main numbers of each draw, y the special numbers.
X_list, y_list = crawl_data(base_url=BASE_URL, page_range=PAGES)
X, y = np.array(X_list), np.array(y_list)
print('資料抓取完畢')

# ---------------------------
# 2. 頻率分析與組合數學預測
# ---------------------------
def frequency_analysis_np(X, y):
    """Return the most frequent value of each column of X, and of y.

    Values are tallied with ``np.bincount`` using at least 50 bins; bin 0
    is ignored, so only values of 1 and above can be chosen. When several
    values tie, the smallest one wins.

    Args:
        X: 2-D array of non-negative ints, one row per draw.
        y: 1-D array of non-negative ints, one special number per draw.

    Returns:
        ``(modes_X, mode_y)`` where ``modes_X`` is an array holding the
        most frequent value of every column of ``X`` and ``mode_y`` is the
        most frequent value in ``y``.
    """
    def most_frequent(values):
        # Drop bin 0, then shift the winning index back to the real number.
        tallies = np.bincount(values, minlength=50)
        return np.argmax(tallies[1:]) + 1

    column_modes = [most_frequent(X[:, col]) for col in range(X.shape[1])]
    return np.array(column_modes), most_frequent(y)

def combination_analysis_np(X, y):
    """Return the complete draw that occurs most often in the history.

    Each row of ``X`` is joined with its entry of ``y`` into one tuple and
    the tuples are counted. Among equally frequent draws the one seen
    first is returned.

    Args:
        X: Sequence of main-number rows, one per draw.
        y: Sequence of special numbers aligned with ``X``.

    Returns:
        ``(next_X, next_y)``: the main numbers of the winning draw as an
        array, and its special number.
    """
    draw_counts = Counter((*row, special) for row, special in zip(X, y))
    top_draw = draw_counts.most_common(1)[0][0]
    # The last element is the special number; the rest are main numbers.
    *main_numbers, special_number = top_draw
    return np.array(main_numbers), special_number

# Run both predictors over the full crawled history.
next_x_frequency, next_y_frequency = frequency_analysis_np(X, y)
next_x_combination, next_y_combination = combination_analysis_np(X, y)

# Blend the two main-number predictions position by position
# (floored mean of the pair).
combined_X = np.floor_divide(next_x_frequency + next_x_combination, 2)
# The combined special number is taken from frequency analysis alone.
combined_y = next_y_frequency

# ---------------------------
# 3. 能信度（Confidence）計算
# ---------------------------
def compute_frequency_confidence(X, predicted_X, predicted_y, y=None):
    """Measure how often the frequency-based predictions occur in the data.

    Args:
        X: 2-D array of main numbers, one row per draw.
        predicted_X: Predicted number for each column of ``X``.
        predicted_y: Predicted special number.
        y: Observed special numbers, one per draw. When omitted, the
            module-level ``y`` is used so that existing three-argument
            calls keep working.

    Returns:
        ``(confidences_X, confidence_y)``: an array with, for every
        column, the fraction of draws whose value equals the prediction
        for that column, and the fraction of draws whose special number
        equals ``predicted_y``.

    Raises:
        NameError: If ``y`` is omitted and no module-level ``y`` exists.
    """
    if y is None:
        # Legacy path: the function used to read the global implicitly.
        try:
            y = globals()["y"]
        except KeyError:
            raise NameError("name 'y' is not defined") from None

    n = X.shape[0]
    # Share of rows in each column that match that column's prediction.
    confidences = np.array([
        np.sum(X[:, i] == predicted_X[i]) / n
        for i in range(X.shape[1])
    ])
    conf_y = np.sum(np.asarray(y) == predicted_y) / n
    return confidences, conf_y

def compute_combination_confidence(X, y, combo):
    """Return the share of historical draws that equal ``combo``.

    Args:
        X: Sequence of main-number rows, one per draw.
        y: Sequence of special numbers aligned with ``X``.
        combo: Tuple of main numbers followed by the special number.

    Returns:
        Occurrences of ``combo`` divided by the total number of draws.
    """
    draw_counts = Counter((*row, special) for row, special in zip(X, y))
    # Summing the tallies gives back the number of draws counted.
    total_draws = sum(draw_counts.values())
    return draw_counts[combo] / total_draws

# Confidence of the frequency-based prediction, per position and for the
# special number.
freq_conf_X, freq_conf_y = compute_frequency_confidence(
    X, next_x_frequency, next_y_frequency
)

# Confidence of the most common full draw (main numbers plus special).
comb_conf = compute_combination_confidence(
    X, y, (*next_x_combination, next_y_combination)
)

# The combined prediction reuses the frequency method: its confidence is
# the plain average of the per-position confidences.
combined_conf = freq_conf_X.mean()

# ---------------------------
# 4. 格式化輸出
# ---------------------------
def format_output(np_array, special_num):
    """Render a predicted draw as two display strings.

    Args:
        np_array: Iterable of main numbers.
        special_num: The special number.

    Returns:
        ``(x_str, y_str)``: the main numbers as comma-separated integers,
        and the special number with its label in front.
    """
    main_numbers = map(int, np_array)
    special = int(special_num)
    return ", ".join(map(str, main_numbers)), f"特別號: {special}"

# Build the display strings for each of the three predictions.
freq_x_str, freq_y_str = format_output(next_x_frequency, next_y_frequency)
comb_x_str, comb_y_str = format_output(next_x_combination, next_y_combination)
combined_x_str, combined_y_str = format_output(combined_X, combined_y)

# Print each prediction followed by its confidence figures.
# Frequency analysis: one percentage per position plus the special number.
print(f'頻率分析法預測的下一筆資料: {freq_x_str}, {freq_y_str}')
print(f'--> 能信度: 每位能信度分別為 {", ".join(f"{conf*100:.1f}%" for conf in freq_conf_X)}, 特別號信度: {freq_conf_y*100:.1f}%')
# Combination method: share of past draws identical to the predicted draw.
print(f'組合數學方法預測的下一筆資料: {comb_x_str}, {comb_y_str}')
print(f'--> 能信度: 此組合出現機率: {comb_conf*100:.1f}%')
# Combined prediction: reports the mean per-position frequency confidence.
print(f'綜合預測的下一筆資料: {combined_x_str}, {combined_y_str}')
print(f'--> 能信度 (以頻率分析法平均能信度計算): {combined_conf*100:.1f}%')


資料抓取完畢
頻率分析法預測的下一筆資料: 1, 11, 23, 30, 41, 49, 特別號: 41
--> 能信度: 每位能信度分別為 13.3%, 5.8%, 5.4%, 5.1%, 5.8%, 12.2%, 特別號信度: 2.9%
組合數學方法預測的下一筆資料: 14, 29, 35, 36, 37, 49, 特別號: 40
--> 能信度: 此組合出現機率: 0.0%
綜合預測的下一筆資料: 7, 20, 29, 33, 39, 49, 特別號: 41
--> 能信度 (以頻率分析法平均能信度計算): 7.9%
