In [86]:
import requests
from bs4 import BeautifulSoup
import re
import json
import pandas as pd
import time

In [None]:

#
# ---------------------  EXTRACT
#
def card_html(card_url):
    """Download a single card page and parse it.

    Args:
        card_url: Address of the card page; it is handed to
            ``requests.get`` unchanged.

    Returns:
        A ``BeautifulSoup`` tree built from the response body with the
        ``lxml`` parser.
    """
    response = requests.get(card_url)
    return BeautifulSoup(response.content, 'lxml')

def get_card_json(card_soup):
    """Decode the card's JSON-LD payload embedded in a card page.

    The page is rendered to text with ``str`` and scanned for
    ``<script type="application/ld+json">...</script>`` elements whose
    opening and closing tags sit on the same line (``.`` does not match a
    newline here). The match is greedy, so several such elements on one
    line are captured as a single match.

    Args:
        card_soup: Parsed card page; anything whose ``str()`` is the page
            markup works.

    Returns:
        The decoded JSON of the fourth match (index 3).

    Raises:
        IndexError: Fewer than four matches were found.
        json.JSONDecodeError: The fourth match is not valid JSON.
    """
    page_text = str(card_soup)
    ld_json_payloads = [
        match.group(1)
        for match in re.finditer(
            r'<script type="application/ld\+json">(.+)</script>', page_text
        )
    ]
    # The card data is expected in the fourth ld+json block of the page.
    return json.loads(ld_json_payloads[3])

def card_links(set_code):
    """Collect every card link listed on a set page, plus the set's name.

    Args:
        set_code: Set code appended to ``https://scryfall.com/sets/``
            (e.g. "vow", "soi").

    Returns:
        A tuple ``(links, collection_name)``: ``links`` holds the hrefs of
        the regular cards followed by those of the double-faced cards, and
        ``collection_name`` is the page title with the upper-cased
        "(CODE)" suffix removed. Callers use that name to strip the set
        name from card names.

    Raises:
        IndexError: The set title heading was not found in the page.
    """
    response = requests.get('https://scryfall.com/sets/' + set_code)
    page_text = str(BeautifulSoup(response.content, 'lxml'))

    link_patterns = (
        # regular cards
        r'<a class="card-grid-item-card" href="(.+)">',
        # double-faced cards
        r'<a class="card-grid-item-card" data-component="card-grid-dfc" href="(.+)"',
    )
    links = []
    for pattern in link_patterns:
        links.extend(re.findall(pattern, page_text))

    # The title text is on the line right after the <h1> opening tag.
    title = re.findall(r'<h1 class="set-header-title-h1">\n(.+)', page_text)[0].strip()
    code_suffix = "(" + set_code.upper() + ")"
    return (links, title.replace(code_suffix, "").strip())

#
# ---------------------  TRANSFORM
#
 
def get_card_data(card_json, collection_name):
    """Flatten a card's JSON-LD record into one row of card data.

    The ``description`` value is split on '•' and individual fields are
    picked by position; which positions are used depends on the card type
    (see the branch comments below). The positions are the ones this code
    has always read -- they are not validated against the page layout.

    Args:
        card_json: Mapping with at least ``name``, ``description`` and
            ``image`` (indexable; element 0 is used). ``offers`` is
            optional.
        collection_name: Full set name; "(<collection_name>)" is removed
            from the card name and the name is stored in the row.

    Returns:
        A list in this order: name, mana cost, card type,
        power/toughness (or loyalty), card text, artist, collection,
        low price, high price, front image, back image. Missing values
        are the string '-'.

    Raises:
        KeyError: ``name``, ``description`` or ``image`` is missing.
        IndexError: The description has fewer fields than the card type
            requires.
    """
    # Split once instead of re-splitting the description for every field.
    parts = card_json['description'].split('•')

    card_name = card_json['name'].replace("(" + collection_name + ")", "").strip()

    # Lands and tokens have no mana cost, so the first field is already
    # the type line.
    mana_cost = parts[0].strip()
    if ('land' in mana_cost.lower()) or ('token' in mana_cost.lower()):
        card_type = mana_cost
        mana_cost = '-'
    else:
        card_type = parts[1].strip()

    # Choose which description fields hold power/toughness, rules text and
    # artist. power_idx is None when the card type has no such value.
    type_lower = card_type.lower()
    is_creature = "creature" in type_lower
    strip_loyalty = False
    if "token" in type_lower:
        power_idx = 1 if is_creature else None
        text_idx, artist_idx = 2, 4
    elif is_creature:
        power_idx, text_idx, artist_idx = 2, 3, 5
    elif "land" in type_lower:
        power_idx, text_idx, artist_idx = None, 1, 3
    elif "planeswalker" in type_lower:
        # Planeswalkers carry "Loyalty: N" where creatures carry P/T.
        power_idx, text_idx, artist_idx = 2, 3, 5
        strip_loyalty = True
    else:
        power_idx, text_idx, artist_idx = None, 2, 4

    if power_idx is None:
        power_resistence = "-"
    else:
        power_resistence = parts[power_idx]
        if strip_loyalty:
            power_resistence = power_resistence.replace("Loyalty: ", "")
        power_resistence = power_resistence.strip()

    card_text = parts[text_idx].strip()
    # Drop the fixed prefix so only the artist's name remains.
    artist = parts[artist_idx].replace("Illustrated by ", "").strip()

    # Prices are optional. Only the lookup failures that mean "no price"
    # are caught, so unrelated errors and Ctrl-C are no longer swallowed.
    try:
        first_offer = card_json['offers'][0]
        lowPrice = first_offer['lowPrice']
        highPrice = first_offer['highPrice']
    except (KeyError, IndexError, TypeError):
        lowPrice = '-'
        highPrice = '-'

    front_image = card_json['image'][0]
    # A '//' in the name marks a double-faced card; its back image URL is
    # derived from the front one.
    if '//' in card_name:
        back_image = front_image.replace("front", "back")
    else:
        back_image = "-"

    return [
        card_name,
        mana_cost,
        card_type,
        power_resistence,
        card_text,
        artist,
        collection_name,
        lowPrice,
        highPrice,
        front_image,
        back_image,
    ]
    
    
def get_set_soup():
    """Download and parse the page that lists all sets.

    Returns:
        A ``BeautifulSoup`` tree of ``https://scryfall.com/sets`` built
        with the ``lxml`` parser.
    """
    response = requests.get('https://scryfall.com/sets')
    return BeautifulSoup(response.content, 'lxml')
    
if __name__ == "__main__":
    # Column order must match the list returned by get_card_data().
    dfColumns = ["name", "mana_cost", "card_type", "power", "card_text", "artist", "collection", "lowPrice", 'highPrice', 'front_image', 'back_image']
    soup = get_set_soup()  # parsed page listing every set

    # Pull every set code (like "vow", "soi") out of the set links, then
    # drop the matches that dragged extra markup along with them.
    set_codes = re.findall(r'https://scryfall.com/sets/(.+)">', str(soup))
    set_codes = [x for x in set_codes if "tabindex" not in x]

    all_cards = []  # one row per successfully parsed card
    # i drives the pause before each set (i * 6 seconds) and triggers a
    # checkpoint each time it exceeds 6. j is only a running counter shown
    # in the "Saving" message.
    # NOTE(review): the start values 68 and 88 look like resume offsets
    # from an earlier partial run -- confirm before starting from scratch.
    i = 1
    j = 68
    for set_code in set_codes[88:]:
        time.sleep(i*6)
        if i > 6:
            i = 1
            print("Saving " + str(j))
            # Checkpoint everything scraped so far.
            pd.DataFrame(data=all_cards, columns=dfColumns).to_csv('./mtg_data.csv', index=False, header=True)
        else:
            i += 1
        print("--- Collection: " + set_code)
        j += 1
        # All card links of this set plus the set's full name.
        card_urls, collection_name = card_links(set_code)

        for card_url in card_urls:
            # Best effort per card: a card that cannot be downloaded or
            # parsed is reported and skipped instead of being dropped
            # silently. The download is inside the try so one failed
            # request no longer aborts the whole run. Exception (not a
            # bare except) keeps Ctrl-C working.
            try:
                card_soup = card_html(card_url)
                card_json = get_card_json(card_soup)
                all_cards.append(get_card_data(card_json, collection_name))
            except Exception as exc:
                print("Skipping " + str(card_url) + ": " + repr(exc))

    df = pd.DataFrame(data=all_cards, columns=dfColumns)
    df.to_excel('./mtg_data.xlsx', index=False, header=True)
    print("ACABOU")  # Portuguese for "finished"
#Output columns: name, mana cost, card type, power/toughness, card text, artist, collection, lowPrice, highPrice, front image, back image (the "id" field is not collected)

--- Collection: 2xm
--- Collection: t2xm
--- Collection: htr19
--- Collection: jmp
--- Collection: ajmp
--- Collection: fjmp
Saving 74
--- Collection: m21
--- Collection: pm21
--- Collection: tm21
--- Collection: ss3
--- Collection: ha3
--- Collection: plg20
--- Collection: iko
Saving 81
--- Collection: piko
--- Collection: tiko
--- Collection: c20
--- Collection: tc20
--- Collection: oc20
--- Collection: ha2
--- Collection: und
Saving 88
--- Collection: tund
--- Collection: thb
--- Collection: pthb
--- Collection: tthb
--- Collection: j20
--- Collection: pf20
--- Collection: sld


'1.12'