In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import re
import ssl
import pandas as pd
from time import sleep

In [2]:
# Site root; the index page and every alien link are resolved against it.
base_URL = "https://www.aonsrd.com/"
index_URL = "{}Aliens.aspx?Letter=All".format(base_URL)

# Browser-like User-Agent header sent with every request.
headers = {
    'User-Agent': " ".join([
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/35.0.1916.47 Safari/537.36",
    ])
}

# NOTE(review): this swaps the default HTTPS context factory for the
# unverified one, i.e. certificate verification is disabled for every HTTPS
# request made through urllib in this process.
ssl._create_default_https_context = ssl._create_unverified_context

In [3]:
def open_page(request, wait_seconds=120, max_retries=None):
    """Open ``request`` with ``urlopen``, retrying when the connection is reset.

    Args:
        request: URL string or ``urllib.request.Request`` handed to ``urlopen``.
        wait_seconds: Seconds to sleep after a reset before trying again.
        max_retries: Maximum number of retries after the first attempt.
            ``None`` (the default) retries indefinitely, which is the
            original behaviour.

    Returns:
        The response object returned by ``urlopen``.

    Raises:
        ConnectionResetError: If the connection is still being reset once
            ``max_retries`` retries have been used up.
    """
    retries = 0
    while True:
        try:
            return urlopen(request)
        except ConnectionResetError:
            # Give up only when the caller asked for a bounded number of retries.
            if max_retries is not None and retries >= max_retries:
                raise
            retries += 1
            print(">>> Connection reset, waiting and trying again")
            sleep(wait_seconds)

def get_page(url):
    """Download ``url`` and parse the response into a BeautifulSoup tree.

    The request is sent with the module-level ``headers`` and opened through
    ``open_page``, so connection resets are retried.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.
    """
    request = Request(url, data=None, headers=headers)
    # The response is a context manager; the soup is fully built inside the
    # block, so the connection can be closed as soon as parsing is done
    # instead of being left open for every scraped page.
    with open_page(request) as page:
        soup = BeautifulSoup(page, 'html.parser')
    return soup

def replace_spaces_in_link(link):
    """Return ``link`` with every space character percent-encoded as ``%20``."""
    return "%20".join(link.split(" "))

In [4]:
# Fetch the index page and collect every anchor inside its first table.
index_soup = get_page(index_URL)
index_anchors = index_soup.table.find_all('a')

# The first four anchors are skipped -- presumably they are not alien entries
# (TODO confirm against the live page). Spaces in hrefs are percent-encoded
# so the links can be appended to base_URL.
links_to_aliens = [replace_spaces_in_link(anchor["href"]) for anchor in index_anchors[4:]]
len(links_to_aliens)

277

In [5]:
def get_last_value_as_int(result):
    """Convert the last whitespace-separated token of a regex match to ``int``.

    Thousands separators (``,``) are stripped before conversion, so a match
    such as ``"XP 1,200"`` yields ``1200``.

    Args:
        result: A regex match object.

    Returns:
        The final token of the matched text as an integer.
    """
    last_token = result.group().split()[-1]
    return int(last_token.replace(",", ""))

def get_second_last_value_as_int(result):
    """Convert the second-to-last token of a regex match to ``int``.

    Used where the number is followed by a trailing word (for example a
    unit); commas are stripped before conversion.

    Args:
        result: A regex match object.

    Returns:
        The penultimate whitespace-separated token as an integer.
    """
    penultimate_token = result.group().split()[-2]
    return int(penultimate_token.replace(",", ""))

def get_average_damage(text):
    """Return the average roll of the first ``NdM+C`` dice expression in ``text``.

    Each of the ``N`` dice with ``M`` sides averages ``(M + 1) / 2``; the flat
    bonus ``C`` is added on top.

    Args:
        text: String containing a dice expression such as ``"2d6+3"``.

    Returns:
        The expected damage as a float.
    """
    dice_match = re.search(r"([0-9]+)d([0-9]+)\+([0-9]+)", text)
    dice_count, die_sides, flat_bonus = (int(part) for part in dice_match.groups())
    return dice_count * (die_sides + 1) / 2 + flat_bonus

def get_nth_group(n, conversion = str):
    """Build an extractor that returns capture group ``n`` of a regex match.

    Args:
        n: Index of the capture group to read.
        conversion: Callable applied to the group's text (defaults to ``str``).

    Returns:
        A one-argument function taking a match object and returning
        ``conversion(match.group(n))``.
    """
    return lambda result: conversion(result.group(n))


# Pattern fragments shared by several FEATURES entries.
_SIGNED_NUMBER = r"[-+]?\d+"
_STAT_LINE = r"([LCN][GEN]|N) (\w+) ([a-z ]+)"
_STAT_LINE_TYPE = r"([LCN][GEN]|N) (\w+) ([a-z]+( [a-z]+)?)"
_ATTACK_TAIL = r" ([\w ]*) ([-+]?\d+) \(([0-9]+d[0-9]+\+[0-9]+)"


def _number_after(column, label):
    """Feature entry for '<label> <signed integer>', stored under ``column``."""
    return (column, label + " " + _SIGNED_NUMBER, get_last_value_as_int)


def _attack_features(prefix, label):
    """The four feature entries (name, attack bonus, raw and average damage) of one attack line."""
    pattern = label + _ATTACK_TAIL
    return [
        (prefix + "_name", pattern, get_nth_group(1)),
        (prefix + "_ab", pattern, get_nth_group(2, int)),
        (prefix + "_damage_raw", pattern, get_nth_group(3)),
        (prefix + "_damage_avg", pattern, get_nth_group(3, get_average_damage)),
    ]


# Each entry is (column name, regex searched in the page text, extractor that
# turns the match object into the stored value). Order matters: it defines
# the column order, and the skill entries must stay at the end of the list.
FEATURES = (
    [
        ('XP', r"XP [0-9,]*", get_last_value_as_int),
        ('alignment', _STAT_LINE, get_nth_group(1)),
        ('size', _STAT_LINE, get_nth_group(2)),
        ('type', _STAT_LINE_TYPE, get_nth_group(3)),
    ]
    + [_number_after(column, label) for column, label in [
        ('Init', "Init"),
        ('HP', "HP"),
        ('EAC', "EAC"),
        ('KAC', "KAC"),
        ('fortitude', "Fort"),
        ('reflex', "Ref"),
        ('will', "Will"),
    ]]
    + [('speed', r"Speed [\w ]*[-+]?\d+ ft", get_second_last_value_as_int)]
    + [_number_after(ability, ability) for ability in ("STR", "DEX", "CON", "INT", "WIS", "CHA")]
    + _attack_features("melee", "Melee")
    + _attack_features("ranged", "Ranged")
    # Skills
    + [_number_after(column, label) for column, label in [
        ('acrobatics', "Acrobatics"),
        ('athletics', "Athletics"),
        ('bluff', "Bluff"),
        ('computers', "Computers"),
        ('culture', "Culture"),
        ('diplomacy', "Diplomacy"),
        ('disguise', "Disguise"),
        ('engineering', "Engineering"),
        ('intimidate', "Intimidate"),
        ('life_science', "Life Science"),
        ('medicine', "Medicine"),
        ('mysticism', "Mysticism"),
        ('perception', "Perception"),
        ('physical_science', "Physical Science"),
        ('piloting', "Piloting"),
        ('profession', r"Profession \(\w+\)"),
        ('sense_motive', "Sense Motive"),
        ('sleight_of_hand', "Sleight of Hand"),
        ('stealth', "Stealth"),
        ('survival', "Survival"),
    ]]
)

# Fractional challenge ratings as written in the page title, mapped to the
# numeric value stored in the CR column.
FRACTIONS = {
    "1/2": 0.5,
    "1/3": 0.3,
}

def get_name_and_CR(page):
    """Extract the creature name and challenge rating from the title header.

    The first ``<h2 class="title">`` whose text contains `` CR `` is split on
    whitespace: the last token is the CR, the token before it is dropped, and
    everything earlier is the name.

    Args:
        page: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        A ``(name, CR)`` tuple; ``CR`` is an ``int``, or the ``FRACTIONS``
        value when the CR token is not an integer.

    Raises:
        KeyError: If the CR token is neither an integer nor a key of
            ``FRACTIONS``.
    """
    title = page.find("h2", class_="title", text = re.compile('.* CR .*'))
    tokens = title.text.split()
    cr_token = tokens[-1]
    try:
        CR = int(cr_token)
    except ValueError:
        CR = FRACTIONS[cr_token]
    return " ".join(tokens[:-2]), CR

def get_main_text(page):
    """Return the text of the element with the main-content label id.

    Args:
        page: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The result of ``get_text()`` on the matched element.
    """
    content = page.find(id = "ctl00_MainContent_DataListTalentsAll_ctl00_LabelName")
    return content.get_text()

def process_features(alien, text):
    """Fill ``alien`` with one value per entry of ``FEATURES``.

    Each feature's regex is searched in ``text``. On a match the feature's
    extractor is applied to the match object; otherwise ``None`` is stored.

    Args:
        alien: Dict to update in place.
        text: Text to search.

    Returns:
        The same ``alien`` dict, for convenience.
    """
    for column, pattern, extract in FEATURES:
        found = re.search(pattern, text)
        alien[column] = extract(found) if found is not None else None
    return alien

# Scrape every linked alien page into one flat dict per creature.
aliens_list = []
for link in links_to_aliens:
    alien = {}
    alien_url = base_URL + link
    alien_page = get_page(alien_url)

    try:
        alien["name"], alien["CR"] = get_name_and_CR(alien_page)
    except Exception as error:
        # Catch Exception instead of using a bare except so that
        # KeyboardInterrupt / SystemExit can still stop this long-running loop;
        # the entry is reported and skipped.
        print(">>> Processing error, please check the entry")
        print(alien_url)
        print(repr(error))
        continue

    main_text = get_main_text(alien_page)
    alien = process_features(alien, main_text)
    print(alien["name"])
    aliens_list.append(alien)
    

Acrochor
Aeon Guard
Aeon Guard Specialist
Aeon Stone Network
Tekhoinos
AHAV
Akata
Void Zombie
Anacite Ambassador
Anacite Laborer
Anacite Predator Drone
Anacite Wingbot
Barachius
Apari
Apari Constituent
Power Archon
Arquand Gazelle
Assembly Ooze
Asteray
Atrocite
Tritidair
Barathu
Barathu (Early Stage)
Baykok
Uplifted Bear Avenger
Uplifted Bear Constellate
Bloodbrother
Bodysnatcher Autocrat
Bodysnatcher Slime
Bolida Miner
Bolida Overseer
Bone Trooper
Bone Trooper Captain
Bone Trooper Technomancer
Bryrvath
Calecor
Carnivorous Crystal
Carrion Dreg
Caypin
Cerebric Fungus
Cerebric Fungus Voyager
Colour out of Space
Comanide
Contemplative
Contemplative Mentor
Corpsefolk Marine
Corpsefolk Operative
Crest-Eater
Damai
Damai Guardian
Deh-Nolo
Pluprex
Prexian Mutantspawn
Endbringer Devil (Dhalochar)
Warmonger Devil (Levaloch)
Ceratopsid
Dromaeosaurid
Plesiosaur
Pterosaur
Sauropod
Theropod
Thyreophoran
Draelik
Young Adult Blue Dragon
Adult Silver Dragon
Old Void Dragon
Dragonkin
Radiation Drake
Dre

In [10]:
# Number of aliens scraped successfully (entries that hit a processing error
# were skipped and are not in the list).
len(aliens_list)

273

In [7]:
# Column order: identifying fields first, then every feature in FEATURES order.
column_names = ["name", "CR"] + [column for column, _, _ in FEATURES]
# The skill columns are the last 20 entries of FEATURES.
skill_list = column_names[-20:]

# One row per alien, indexed and sorted by name.
aliens = (
    pd.DataFrame(aliens_list)[column_names]
    .set_index('name')
    .sort_index()
)
# A skill that was not found in the page text (None) is stored as 0.
aliens[skill_list] = aliens[skill_list].fillna(0).astype(int)

In [8]:
# Summary statistics of the numeric columns, as a quick sanity check of the scrape.
aliens.describe()

Unnamed: 0,CR,XP,Init,HP,EAC,KAC,fortitude,reflex,will,speed,...,medicine,mysticism,perception,physical_science,piloting,profession,sense_motive,sleight_of_hand,stealth,survival
count,273.0,273.0,273.0,273.0,273.0,273.0,273.0,273.0,273.0,273.0,...,273.0,273.0,273.0,273.0,273.0,273.0,273.0,273.0,273.0,273.0
mean,6.313187,12563.186813,3.47619,98.681319,18.362637,19.824176,7.849817,7.509158,7.490842,32.948718,...,0.805861,4.212454,14.128205,1.358974,1.758242,0.509158,2.893773,0.106227,5.736264,3.040293
std,4.481117,38245.745259,2.432996,85.151234,5.591189,5.63718,4.875223,4.288419,5.038903,12.522228,...,3.541098,9.156659,6.971189,4.995152,5.90116,2.636165,7.047089,1.06052,8.089802,6.616756
min,0.3,135.0,-1.0,5.0,9.0,10.0,0.0,-1.0,-2.0,0.0,...,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,800.0,2.0,35.0,14.0,15.0,4.0,4.0,4.0,30.0,...,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,6.0,2400.0,3.0,84.0,18.0,20.0,7.0,7.0,7.0,30.0,...,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,9.0,6400.0,5.0,135.0,22.0,23.0,11.0,10.0,10.0,40.0,...,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0
max,20.0,307200.0,12.0,485.0,35.0,37.0,23.0,20.0,22.0,120.0,...,25.0,39.0,34.0,30.0,33.0,19.0,34.0,14.0,31.0,34.0


In [9]:
# Write the table (indexed by alien name) to a CSV file in the working directory.
aliens.to_csv("starfinder_aliens.csv")