In [1]:
from bs4 import BeautifulSoup
import requests, os, json, time
import pandas as pd

# Data is scraped from
# https://dbz-dokkanbattle.fandom.com/wiki/All_Cards:_(1)001_to_(1)100

# When True, cards whose ID is found in the previously saved IDs are skipped
# during the crawl, and the newly scraped rows are appended to the existing
# CSV files in data/ instead of replacing them.
only_new = False # set to True to update the existing CSVs

In [2]:
# Kinds of character change recognised on a card page. Each name is combined
# with ' Condition.png' to form the icon alt text that is looked up.
condition_list = [
    'Transformation',
    'Exchange',
    'Rejuvenation',
    'Evolution',
    'Special Fusion',
    'Potara Fusion',
    'Mutation',
    'Fusion',
    'Boost',
    'Rage',
    'Deactivation',
    'Awakening',
    'Absorption',
    'Outfit Change',
]

def getSoup(url, timeout=30):
    """Download ``url`` and return its HTML parsed with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up; passed straight to
        ``requests.get``. Without a timeout a stalled connection would block
        the whole crawl indefinitely.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the built-in ``html.parser``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (via ``raise_for_status``).
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    return BeautifulSoup(r.text, "html.parser")

def cleanSkillText(row):
    """Flatten the child nodes of a skill cell into one string.

    Every item of ``row`` is converted with ``str()``. Items whose markup
    does not contain ``<a href`` are kept verbatim. Links are replaced by a
    short text form:

    * markup containing ``Category:`` -> the text after ``Category:`` in the
      part that precedes the ``title`` attribute
    * footnote links (``<a href="#cite_note-``) -> the link text
    * any other link -> the text between ``" title="`` and the next ``">``
    """
    pieces = []
    for node in row:
        markup = str(node)
        if '<a href' not in markup:
            pieces.append(markup)
            continue

        if 'Category:' in markup:
            before_title = markup.split('" title="')[0]
            pieces.append(before_title.split('Category:')[1])
        elif '<a href="#cite_note-' in markup:
            up_to_close = markup.split('</a')[0]
            pieces.append(up_to_close.split('">')[-1])
        else:
            after_title = markup.split('" title="')[1]
            pieces.append(after_title.split('">')[0])
    return ''.join(pieces)

def _collect_stat_values(rows, start):
    """Return the stripped texts of the stat cells that follow ``rows[start]``.

    The cell right after the stat icon is always taken; further cells are
    taken for as long as their text is all digits or the '∞' sign. The scan
    stops at the end of ``rows`` instead of indexing past it.
    """
    values = [rows[start + 1].text.strip()]
    idx = start + 2
    while idx < len(rows) and (rows[idx].text.isdigit() or rows[idx].text == '∞'):
        values.append(rows[idx].text.strip())
        idx += 1
    return values


def get_dokkan_card_data(soup):
    """Parse one card page into per-form card dicts and their Pettan card dicts.

    The page's ``<td>`` cells are scanned in document order. Label cells
    ('Max Lv', 'Rarity', 'ID', ...) and icon cells (recognised by the ``alt``
    text of an ``<img>``) decide which field is filled from the cells that
    follow. A cell with ``rowspan="3"`` closes the current dict and starts a
    new one (used for transforming units), so one page can yield several
    dicts. List-valued fields are stored as JSON strings.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a card page.

    Returns
    -------
    tuple of (list of dict, list of dict)
        ``card_list`` holds one dict per form found on the page.
        ``pettan_list`` holds, at the same position, the dict returned by
        ``get_pettan_card_data`` for that form, or an empty dict when no
        Pettan card was picked up for it.

    Notes
    -----
    Following a Pettan link performs an extra HTTP request through
    ``get_pettan_card_data``.
    """
    card_data = {}
    pettan_data = {}
    prefix = '' # switch to 'EZA'
    stats_switch = ''
    card_list = []
    pettan_list = []
    pettan_url = ''

    rows = soup.find_all('td')

    # The page does not change while parsing, so test its text only once.
    # Values sit 3 cells after their label on unreleased pages, 6 otherwise.
    base_offset = 3 if 'This character is unreleased' in soup.text else 6
    row_offset = base_offset

    # go through the rows
    for i in range(1, len(rows)):
        if i == 1:
            temp = rows[1].find('b').contents
            if len(temp) == 3:
                card_data['Title'] = temp[0]
                card_data['Character'] = temp[2]
            else:
                card_data['Character'] = temp[0].text
                card_data['Title'] = ' '
        elif 'Max Lv' == rows[i].text:
            # check for Pettan battle card: its icon within the next 15 cells
            # pushes the values one cell further. Slicing keeps the look-ahead
            # inside the list when fewer than 15 cells remain.
            row_offset = base_offset
            if any('Letter P.png' in str(cell) for cell in rows[i:i+15]):
                row_offset += 1
            card_data['Max Level'] = rows[i+row_offset].text if row_offset in [6, 7] else 'NA'
        elif 'SA Lv' == rows[i].text:
            card_data['SA Level'] = rows[i+row_offset].text if row_offset in [6, 7] else 'NA'
        elif 'Rarity' == rows[i].text:
            card_data['Rarity'] = ' to '.join([a.get('title').split(':')[1] for a in rows[i+row_offset].find_all('a')])
        elif 'Type' == rows[i].text:
            try:
                card_data['Type'] = ' to '.join([a.get('title').split(':')[1] for a in rows[i+row_offset].find_all('a')])
            except Exception:
                # best-effort fallback to the icon's alt text; unlike a bare
                # except this lets KeyboardInterrupt / SystemExit through
                card_data['Type'] = rows[i+row_offset].find('img').get('alt').split('.')[0]
        elif 'Cost' == rows[i].text:
            card_data['Cost'] = rows[i+row_offset].text if row_offset in [6, 7] else 'NA'
        elif 'ID' == rows[i].text:
            card_data['ID'] = rows[i+row_offset].text
            # what to connect the transformation
            if len(card_list) > 0:
                card_list[-1]['Transforms Into'] = card_data['ID']

        # for transforming units
        if rows[i].get('rowspan') == '3':
            card_list.append(card_data)
            pettan_list.append(pettan_data)
            pettan_data = {}
            prefix = ''
            if len(rows[i+1].find('b').contents) == 3:
                card_data = {
                    'Title': rows[i+1].find('b').contents[0],
                    'Character': rows[i+1].find('b').contents[2],
                }
            else:
                card_data = {
                    'Title': ' ',
                    'Character': rows[i+1].find('b').contents[0].text,
                }

        # Release Dates
        # NOTE(review): both guards test the un-prefixed key even when prefix
        # is 'EZA ' -- confirm that this is intended.
        if rows[i].find('img', attrs={'alt': 'Japan server.png'}) is not None and 'JP Release Date' not in card_data:
            card_data[prefix + 'JP Release Date'] = rows[i+1].text.strip()
            if rows[i+2].find('img', attrs={'alt': 'Z Only.png'}):
                card_data[prefix + 'JP EZA Release Date'] = rows[i+2].text.strip()
        elif rows[i].find('img', attrs={'alt': 'Global server.png'}) is not None and 'Global Release Date' not in card_data:
            card_data[prefix + 'Global Release Date'] = rows[i+1].text
            if rows[i+2].find('img', attrs={'alt': 'Z Only.png'}):
                card_data[prefix + 'Global EZA Release Date'] = rows[i+2].text.strip()

        # Leader skill
        elif rows[i].find('img', attrs={'alt': 'Leader Skill.png'}) is not None:
            # a second, non-empty leader skill switches later keys to the 'EZA ' prefix
            if 'Leader Skill' in card_data and card_data['Leader Skill'] != '':
                prefix='EZA '
            card_data[prefix + 'Leader Skill'] = cleanSkillText(rows[i+1].contents)

        # Passive
        elif rows[i].find('img', attrs={'alt': 'Passive skill.png'}) is not None:
            temp = rows[i+1].find('strong')
            card_data['Passive Skill Name'] = temp.text if temp is not None else rows[i+1].text
            card_data[prefix + 'Passive Skill'] = cleanSkillText(rows[i+2].contents)
        # Active
        elif rows[i].find('img', attrs={'alt': 'Active skill.png'}) is not None:
            temp = rows[i+1].find('strong')
            card_data[prefix + 'Active Skill Name'] = temp.text if temp is not None else rows[i+1].text
            card_data[prefix + 'Active Skill'] = rows[i+2].text.strip()
        elif rows[i].find('img', attrs={'alt': 'Activation Condition.png'}) is not None:
            card_data[prefix + 'Activation Condition'] = rows[i+1].text.strip()

        # Super attack
        elif rows[i].find('img', attrs={'alt': 'Super atk.png'}) is not None:
            card_data[prefix + 'Super Attack Name'] = rows[i+1].text.strip()
            card_data[prefix + 'Super Attack Effect'] = rows[i+2].text
        # 18 ki
        elif rows[i].find('img', attrs={'alt': 'Ultra Super atk.png'}) is not None:
            card_data[prefix + 'Ultra Super Attack Name'] = rows[i+1].text
            card_data[prefix + 'Ultra Super Attack Effect'] = rows[i+2].text
        elif rows[i].find('img', attrs={'alt': 'Unit SA.png'}) is not None:
            card_data[prefix + 'Unit Super Attack Name'] = rows[i+1].text
            card_data[prefix + 'Unit Super Attack Effect'] = rows[i+2].text

        # Different types of character changes
        for condition in condition_list:
            if rows[i].find('img', attrs={'alt': condition + ' Condition.png'}) is not None:
                card_data['Transformation Type'] = condition
                card_data[prefix + 'Transformation Condition'] = rows[i+1].text
                break

        # Links and categories
        if rows[i].find('img', attrs={'alt': 'Link skill.png'}) is not None:
            card_data['Links'] = json.dumps([link.strip() for link in rows[i+1].contents[0].text.split('   -')])
        elif rows[i].find('img', attrs={'alt': 'Category.png'}) is not None:
            card_data['Categories'] = json.dumps([link.strip() for link in rows[i+1].contents[0].text.split(' - ')])

        # Stats (one shared, bounds-checked scan for all three)
        elif rows[i].find('img', attrs={'alt': 'HP.png'}) is not None:
            card_data[stats_switch + 'HP'] = json.dumps(_collect_stat_values(rows, i))
        elif rows[i].find('img', attrs={'alt': 'Atk.png'}) is not None:
            card_data[stats_switch + 'Attack'] = json.dumps(_collect_stat_values(rows, i))
        elif rows[i].find('img', attrs={'alt': 'Def.png'}) is not None:
            card_data[stats_switch + 'Defense'] = json.dumps(_collect_stat_values(rows, i))

        # Special Skills
        elif rows[i].find('img', attrs={'alt': 'Special Skill.png'}) is not None:
            card_data['Special Skills'] = json.dumps([img.get('title') for img in rows[i+1].find_all('a')])
        # Details
        elif rows[i].find('img', attrs={'alt': 'Details.png'}) is not None:
            card_data['Details'] = []
            j = 1
            # first cell is always taken, then every following cell containing '►'
            while True:
                card_data['Details'].append(rows[i+j].text.strip())
                j += 1
                if i+j >= len(rows): break
                if '►' not in rows[i+j].text:
                    break
            card_data['Details'] = json.dumps(card_data['Details'])

        # References
        elif rows[i].find('div', class_='mw-references-wrap'):
            card_data['References'] = json.dumps([l.text for l in rows[i].find_all('li')])

        # Pettan card
        elif rows[i].find('img', attrs={'alt': 'Letter P.png'}) is not None:
            href = rows[i].find('a').get('href')
            if pettan_url == 'https://dbz-dokkanbattle.fandom.com'+href: # already grabbed it
                # for dokkan cards that have a different global and jp version
                continue
            pettan_data = get_pettan_card_data('https://dbz-dokkanbattle.fandom.com'+href)
            pettan_url = 'https://dbz-dokkanbattle.fandom.com'+href

        # Switch for stats / break point
        elif rows[i].find('img', attrs={'alt': 'Step1.png'}) is not None:
            stats_switch = 'EZA '
        elif rows[i].get('rowspan') == '3':
            stats_switch = ''

    # the form still being filled when the cells run out
    card_list.append(card_data)
    pettan_list.append(pettan_data)
    return card_list, pettan_list

def get_pettan_card_data(url):
    """Fetch a Pettan card page and return the fields found on it as a dict.

    The page is downloaded with ``getSoup`` and its ``<tr>`` rows are walked
    in order. Depending on the row, the dict receives 'Title'/'Character',
    'Series', 'Number'/'Rarity'/'Type', 'HP', 'Attack', 'Description' and the
    'Japan Release Date' / 'Global Release Date' entries.
    """
    pettan = {}
    table_rows = getSoup(url).find_all('tr')

    for i, tr in enumerate(table_rows):
        # Title and character: a row with a 4-column cell is handled here only,
        # whether or not it contains a <b>
        header_cell = tr.find('td', attrs={'colspan': '4'})
        if header_cell is not None:
            bold = header_cell.find('b')
            if bold is not None:
                parts = bold.contents
                pettan['Title'] = parts[0]
                pettan['Character'] = parts[2]
            continue

        # Card info lives in the rows at index 1 and 2
        if i == 1:
            pettan['Series'] = tr.find_all('td')[2].text
        elif i == 2:
            cells = tr.find_all('td')
            pettan['Number'] = cells[2].text
            pettan['Rarity'] = cells[0].text
            pettan['Type'] = cells[1].find('a').get('title').split(':')[1]

        # Stats
        elif tr.find('img', attrs={'alt': 'HP.png'}) is not None:
            pettan['HP'] = tr.text
        elif tr.find('img', attrs={'alt': 'Atk.png'}) is not None:
            pettan['Attack'] = tr.text

        # Description text is in the row after the 'Description' row
        elif tr.text == 'Description':
            pettan['Description'] = table_rows[i + 1].text

        # Release dates: last cell of the row carrying the server icon
        elif tr.find('img', attrs={'alt': 'Japan server.png'}) is not None:
            pettan['Japan Release Date'] = tr.find_all('td')[-1].text
        elif tr.find('img', attrs={'alt': 'Global server.png'}) is not None:
            pettan['Global Release Date'] = tr.find_all('td')[-1].text

    return pettan

In [3]:
# URL stubs of the list pages to crawl. The "(1)" pages run in steps of 100
# from '(1)001_to_(1)100' up to '(1)2301_to_(1)2400'; "(2)" and "(3)" have a
# single page each.
stubs_to_check = [
    '(1){:03d}_to_(1){}'.format(first, first + 99)
    for first in range(1, 2400, 100)
]
stubs_to_check += ['(2)001_to_(2)1000', '(3)001_to_(3)1000']
#     '(4)001_to_(4)_unknown',

In [4]:
# create 'data' directory if it doesn't exist already
path = os.path.join('data')
os.makedirs(path, exist_ok=True)

# IDs of the cards stored by a previous run (empty when there is none).
# They are kept as a set of plain Python values because `x in Series` tests
# the index labels, not the stored values, so a Series gives wrong answers
# when used for the "already scraped?" membership check.
# NOTE(review): assumes the CSV 'ID' column holds the same integer IDs that
# are parsed from the list pages -- confirm against a saved file.
try:
    old_cards_df = pd.read_csv(os.path.join('data', 'dokkan_cards.csv'))
    old_ids = set(old_cards_df['ID'].tolist())
except (OSError, KeyError, ValueError):
    # missing/unreadable file, empty or malformed CSV, or no 'ID' column:
    # treat it as "nothing scraped yet"
    old_ids = set()

In [5]:
# Accumulators filled while crawling, plus the two URL templates
rolling_card_list = []
rolling_pettan_list = []
card_url = 'https://dbz-dokkanbattle.fandom.com{}'
list_link = 'https://dbz-dokkanbattle.fandom.com/wiki/All_Cards:_{}'

# Walk every list page, then every card linked from it
for stub in stubs_to_check:
    print('Starting:', stub)
    soup = getSoup(list_link.format(stub))

    # the first <tr> is skipped; every other row is treated as one card
    for row in soup.find_all('tr')[1:]:
        cells = row.find_all('td')

        # the first cell holds the ID; drop the parentheses before converting
        listed_id = int(cells[0].text.strip().replace('(', '').replace(')', ''))
        if listed_id in old_ids and only_new:
            # already stored and only new cards were requested
            continue

        # the fourth cell links to the card's own page
        href = cells[3].find('a').get('href') # card's path
        card_soup = getSoup(card_url.format(href))

        # parse the page and keep every non-empty dict
        card_list, pettan_list = get_dokkan_card_data(card_soup)
        rolling_card_list.extend(d for d in card_list if d)
        rolling_pettan_list.extend(d for d in pettan_list if d)

        time.sleep(.1)
    time.sleep(5) # wait a bit

Starting: (1)001_to_(1)100
Starting: (1)101_to_(1)200
Starting: (1)201_to_(1)300
Starting: (1)301_to_(1)400
Starting: (1)401_to_(1)500
Starting: (1)501_to_(1)600
Starting: (1)601_to_(1)700
Starting: (1)701_to_(1)800
Starting: (1)801_to_(1)900
Starting: (1)901_to_(1)1000
Starting: (1)1001_to_(1)1100
Starting: (1)1101_to_(1)1200
Starting: (1)1201_to_(1)1300
Starting: (1)1301_to_(1)1400
Starting: (1)1401_to_(1)1500
Starting: (1)1501_to_(1)1600
Starting: (1)1601_to_(1)1700
Starting: (1)1701_to_(1)1800
Starting: (1)1801_to_(1)1900
Starting: (1)1901_to_(1)2000
Starting: (1)2001_to_(1)2100
Starting: (1)2101_to_(1)2200
Starting: (1)2201_to_(1)2300
Starting: (1)2301_to_(1)2400
Starting: (2)001_to_(2)1000
Starting: (3)001_to_(3)1000


In [6]:
# Turn the accumulated dicts into tables, one row per dict
cards_df = pd.DataFrame(rolling_card_list)
pettan_df = pd.DataFrame(rolling_pettan_list)

In [7]:
# Quick look at how often each scraped 'SA Level' value occurs
cards_df['SA Level'].value_counts()

1/10    1681
1/15     189
1/20      93
N/A       17
1/25      15
Name: SA Level, dtype: int64

In [8]:
# if only updating, append new data to old data
if only_new:
    old_cards_df = pd.read_csv(os.path.join('data', 'dokkan_cards.csv'))
    old_pettan_df = pd.read_csv(os.path.join('data', 'pettan_cards.csv'))
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows
    # of both frames in the same way
    cards_df = pd.concat([old_cards_df, cards_df])
    pettan_df = pd.concat([old_pettan_df, pettan_df])

# write both tables without the index column
cards_df.to_csv(os.path.join('data', 'dokkan_cards.csv'), index=False)
pettan_df.to_csv(os.path.join('data', 'pettan_cards.csv'), index=False)

# For testing / misc.

In [10]:
# Deliberate stop: the failing assert halts "Run All" here, so the scratch
# cell below only runs when it is executed by hand.
assert False # so it doesn't hit this stuff

AssertionError: 

In [None]:
# Scratch cell: run the parser on a single card page and display the result.
# Alternative: take the first card linked from the last list page fetched:
# url = 'https://dbz-dokkanbattle.fandom.com' + soup.find_all('tr')[1].find_all('td')[3].find('a').get('href')
url = 'https://dbz-dokkanbattle.fandom.com/wiki/Despair%27s_Onslaught_Frieza_(1st_Form)'
card_soup = getSoup(url)
char_list, pettan_list = get_dokkan_card_data(card_soup)
char_list