In [53]:
from urllib.request import Request, urlopen, urlretrieve
from bs4 import BeautifulSoup
import pandas as pd

In [116]:
class HTMLTableParser:
    """Fetch a web page and convert its first HTML ``<table>`` to a DataFrame.

    The page is downloaded when the parser is constructed; the first
    ``<table>`` element found is stored on ``self.table`` and converted on
    demand by ``parse_html_table``.
    """

    def __init__(self, url):
        """Store ``url`` and immediately fetch the page's first table.

        Args:
            url: Address of the page to download.
        """
        self.url = url
        self.table = self.find_table()

    def find_table(self):
        """Download ``self.url`` and return its first ``<table>`` element.

        Returns:
            The first element returned by ``soup.find_all('table')``.

        Raises:
            IndexError: If the page contains no ``<table>`` element.
        """
        # ``urlopen`` is imported at the top of the notebook; the earlier
        # ``requests.get`` call used a module the notebook never imports.
        with urlopen(self.url) as response:
            html = response.read()

        # Raw bytes are handed over so BeautifulSoup can work out the encoding.
        soup = BeautifulSoup(html, "html.parser")
        tables = soup.find_all('table')
        if not tables:
            raise IndexError("No <table> element found at " + str(self.url))
        return tables[0]

    def parse_html_table(self):
        """Convert ``self.table`` into a pandas DataFrame.

        Every ``<tr>`` that contains at least one ``<td>`` becomes one row.
        The number of columns is taken from the first such row.  Column
        labels come from the first ``<tr>`` that contains ``<th>`` cells; if
        there is none, the columns are numbered from 0.  Rows with fewer
        cells than the table has columns are left as NaN in the missing
        positions.  Columns whose values all convert to ``float`` are
        converted; the others keep their text values.

        Returns:
            pandas.DataFrame: One row per data row of the table.

        Raises:
            ValueError: If header cells were found but their count differs
                from the number of columns.
            IndexError: If a row has more cells than the table has columns.
        """
        column_names = []
        data_rows = []

        # Single pass over the table: collect the text of every data row and
        # pick up the column titles from the first header row encountered.
        for row in self.table.find_all('tr'):
            cells = [td.get_text() for td in row.find_all('td')]
            if cells:
                data_rows.append(cells)

            if not column_names:
                column_names = [th.get_text() for th in row.find_all('th')]

        n_rows = len(data_rows)
        # The first data row fixes the table width.
        n_columns = len(data_rows[0]) if data_rows else 0

        # Safeguard on column titles
        if column_names and len(column_names) != n_columns:
            raise ValueError("Column titles do not match the number of columns")

        columns = column_names if column_names else range(n_columns)
        # Pre-allocated frame: cells that are never assigned stay NaN.
        df = pd.DataFrame(columns=columns, index=range(n_rows))

        for row_marker, cells in enumerate(data_rows):
            if len(cells) > n_columns:
                raise IndexError(
                    "Row %d has %d cells but the table has %d columns"
                    % (row_marker, len(cells), n_columns)
                )
            for column_marker, text in enumerate(cells):
                df.iat[row_marker, column_marker] = text

        # Convert to float if possible
        for col in df:
            try:
                df[col] = df[col].astype(float)
            except ValueError:
                # Non-numeric column: keep the original text values.
                pass

        return df

In [119]:
# Build a parser for the i5k Arthropoda directory listing; constructing it
# downloads the page and stores its first table.
i5k = HTMLTableParser(url="https://i5k.nal.usda.gov/data/Arthropoda/")
# Last expression of the cell, so the resulting DataFrame is displayed.
i5k.parse_html_table()

Unnamed: 0,Unnamed: 1,Name,Last modified,Size,Description
0,,Parent Directory,,-,
1,,aettum-(Aethina_tumida)/,30-Mar-2018 14:40,-,
2,,agrpla-(Agrilus_planipennis)/,08-Dec-2017 15:26,-,
3,,amytra-(Amyelois_transitella)/,30-Mar-2018 14:43,-,
4,,anogla-(Anoplophora_glabripennis)/,10-Oct-2018 23:56,-,
5,,apimel-(Apis_mellifera)/,18-Sep-2018 12:28,-,
6,,athros-(Athalia_rosae)/,10-Oct-2018 23:57,-,
7,,baccuc-(Bactrocera_cucurbitae)/,30-Mar-2018 14:43,-,
8,,bacdor-(Bactrocera_dorsalis)/,30-Mar-2018 14:43,-,
9,,bacole-(Bactrocera_oleae)/,30-Mar-2018 14:43,-,
