# Add data from [MTGTop8](https://www.mtgtop8.com/)

In [1]:
import requests

from bs4 import BeautifulSoup
from pymongo import MongoClient
from datetime import datetime

In [2]:
uri = "mongodb://localhost:27017"
db_name = "mtg_decks_db"
client = MongoClient(uri)
db = client[db_name]

In [3]:
base_url = "https://www.mtgtop8.com/"

In [4]:
def insert_or_get_pilot_id(name):
    collection = db["pilots"]

    existing_pilot = collection.find_one({"name": name})
    if existing_pilot:
        return existing_pilot["_id"]
    else:
        print("Cadastrando novo piloto")
        return collection.insert_one({
            "name": name,
            "createdAt": datetime.now(),
            "updatedAt": datetime.now(),
        }).inserted_id

In [5]:
def insert_or_get_deck_id(name, maindeck, sideboard, format):
    collection = db["decks"]

    maindeck.sort()
    sideboard.sort()

    md_text = "|".join(maindeck)
    sb_text = "|".join(sideboard)

    existing_deck = collection.find_one({
        "maindeckText": md_text,
        "sideboardText": sb_text,
        "format": format
    })
    if existing_deck:
        return existing_deck["_id"]
    else:
        print("Cadastrando novo deck")
        return collection.insert_one({
            "name": name,
            "format": format,
            "maindeck": maindeck,
            "sideboard": sideboard,
            "maindeckText": md_text,
            "sideboardText": sb_text,
            "createdAt": datetime.now(),
            "updatedAt": datetime.now(),
        }).inserted_id

In [6]:
def insert_or_get_event_id(name, location, date, format, participants):
    collection = db["events"]

    event = collection.find_one({
        "name": name,
        "date": date,
        "format": format,
    })
    if event:
        return event["_id"]
    else:
        print("Cadastrando novo evento")
        return collection.insert_one({
            "name": name,
            "date": date,
            "format": format,
            "location": location,
            "participants": participants,
            "createdAt": datetime.now(),
            "updatedAt": datetime.now(),
        }).inserted_id

In [7]:
def extract_event_links(url):
    response = requests.get(url)
    if response.status_code != 200:
        return f"Falha ao obter a página: {response.status_code}"

    bs = BeautifulSoup(response.text, "html.parser")
    events_table = bs.find_all(class_="Stable")

    if not events_table[1]:
        return f"Table de eventos não encontrada"

    events_link = []
    for row in events_table[1].find_all("tr"):
        event_link = row.find("a")
        if event_link:
            events_link.append(base_url + event_link["href"])

    return events_link

In [8]:
def string_to_date(date_str):
    date = date_str.split(" - ")
    date_format = "%d/%m/%y"

    if len(date) > 1:
        return datetime.strptime(date[1], date_format)
    return datetime.strptime(date_str, date_format)

In [9]:
def extract_event_data(url):
    response = requests.get(url)
    if response.status_code != 200:
        return f"Falha ao obter a página: {response.status_code}"

    bs = BeautifulSoup(response.text, "html.parser")

    name = bs.find("div", class_="event_title").text.strip()
    format = bs.find("div", class_="meta_arch").text.strip()
    date_str = bs.find("div", class_="meta_arch").find_next("div").text.strip()
    date = string_to_date(date_str)

    decks = bs.find_all(class_=['hover_tr', 'chosen_tr'])

    decks_url = []
    for el in decks:
        if not el:
            continue

        anchor = el.find("a")
        if not anchor:
            continue

        link = anchor["href"]
        if link:
            decks_url.append(f"{base_url}event{link}")

    return name, format, date, decks_url

In [10]:

def rename_name_sticker_goblin(list):
    return [card.replace("________ Goblin", "_____ Goblin", 1) for card in list]


def remove_unused_snow_covered_lands(list):
    snow_cards = [
        "Arctic Foxes", "Balduvian Conjurer", "Barbarian Guides", "Drift of the Dead", "Frost Bite",
        "Icequake", "Into the North", "Kjeldoran Guard", "Priest of the Haunted Edge", "Rimewind Taskmage",
        "Ronom Serpent", "Sculptor of Winter", "Skred", "Snow Devil", "Thermokarst",
        "Whiteout", "Woolly Mammoths", "Zombie Musher"
    ]
    snow_lands = ["Snow-Covered Plains", "Snow-Covered Island",
                  "Snow-Covered Black", "Snow-Covered Mountain", "Snow-Covered Forest"]

    if not any(item in list for item in snow_cards) and any(item in list for item in snow_lands):
        print("Removendo Snow-Lands")
        return [card.replace("Snow-Covered ", "", 1) for card in list]
    return list


def extract_deck_data(url):
    response = requests.get(url)
    if response.status_code != 200:
        return f"Falha ao obter a página: {response.status_code}"

    bs = BeautifulSoup(response.text, "html.parser")
    info = bs.find(class_="chosen_tr").find_all("a")
    decklist = bs.find_all("div", class_="deck_line")

    name = info[1].text.strip()
    pilot = info[2].text.strip()

    md = []
    sb = []
    for el in decklist:
        id = el["id"]
        card = el.text.split(maxsplit=1)

        card_quantity = card[0].strip()
        card_name = card[1].strip()

        if id.startswith("md"):
            md.extend([card_name] * int(card_quantity))
        elif id.startswith("sb"):
            sb.extend([card_name] * int(card_quantity))

    md = rename_name_sticker_goblin(md)
    sb = rename_name_sticker_goblin(sb)

    md = remove_unused_snow_covered_lands(md)
    sb = remove_unused_snow_covered_lands(sb)

    return name, pilot, md, sb

In [11]:
def get_event_name_and_location(event):
    splitted_event = event.split(" @ ")
    if len(splitted_event) > 1:
        return splitted_event[0].strip(), splitted_event[1].strip()

    return event.strip(), None

In [12]:
events = []

max_pages = 2
for i in range(1, max_pages + 1):
    events.extend(extract_event_links(
        f"{base_url}format?f=PAU&meta=282&cp={i}"))
    print(f"Eventos da página {i} adicionados")

Eventos da página 1 adicionados
Eventos da página 2 adicionados


In [14]:
for event in events:
    e_name_location, e_format, e_date, decks_url = extract_event_data(event)
    e_name, e_location = get_event_name_and_location(e_name_location)

    participants = []
    for deck_url in decks_url:
        name, pilot, md, sb = extract_deck_data(deck_url)

        pilot_id = insert_or_get_pilot_id(pilot)
        deck_id = insert_or_get_deck_id(name, md, sb, e_format)

        participants.append({"pilot": pilot_id, "deck": deck_id})

    event_id = insert_or_get_event_id(
        e_name, e_location, e_date, e_format, participants)

ConnectTimeout: HTTPSConnectionPool(host='www.mtgtop8.com', port=443): Max retries exceeded with url: /event?e=54417&d=604154&f=PAU (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x1099225d0>, 'Connection to www.mtgtop8.com timed out. (connect timeout=None)'))