In [94]:
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import requests

In [95]:
def extract_data(html_content):
    """Extract the Energy and mu_en/rho columns from one element page.

    The page is searched for the first ``<table cellpadding="1">``; inside it,
    every ``<tr valign="top" align="right">`` is treated as a candidate data
    row. For each accepted row the first ``<td>`` is read as the energy and
    the third ``<td>`` as mu_en/rho.

    Rows are skipped (not an error) when:
      * they have fewer than three ``<td>`` cells, or
      * their first cell is not a number (the original notes describe these
        as rows carrying subshell information).
    NOTE(review): skipping those rows drops their data entirely — confirm
    that this is intended for the downstream tables.

    Parameters
    ----------
    html_content : str
        Raw HTML of the page.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``(energy_array, mu_en_rho_array)``, one entry per accepted row, in
        document order. Both are empty when no row qualifies.

    Raises
    ------
    ValueError
        If the page has no matching table, or if the third cell of an
        accepted row cannot be parsed as a float.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    table = soup.find('table', cellpadding="1")
    if table is None:
        raise ValueError('no <table cellpadding="1"> found in html_content')

    energy_values = []
    mu_en_rho_values = []

    for row in table.find_all('tr', valign="top", align="right"):
        cols = row.find_all('td')

        # Index 2 is read below, so anything shorter cannot be a data row.
        if len(cols) < 3:
            continue

        # A non-numeric first cell marks a row that is not plain data.
        try:
            energy = float(cols[0].text.strip())
        except ValueError:
            continue

        energy_values.append(energy)
        mu_en_rho_values.append(float(cols[2].text.strip()))

    # Convert the lists to NumPy arrays
    return np.array(energy_values), np.array(mu_en_rho_values)


In [96]:
def get_html_content(url, timeout=30):
    """Fetch a page over HTTP and return its body as text.

    Parameters
    ----------
    url : str
        Address to request with GET.
    timeout : float or tuple, optional
        Passed to ``requests.get``; without it a stalled server would block
        the call indefinitely. Defaults to 30.

    Returns
    -------
    str
        The decoded response body.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so that an error page
        is never handed to the parser as if it were data.
    requests.RequestException
        For connection failures and timeouts.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text

In [97]:
def get_element_url(Z):
    """Build the URL of the element page for ``Z``.

    Parameters
    ----------
    Z : int or str
        Element number. It is converted to text and left-padded with zeros
        to at least two characters, so ``1``, ``"1"`` and ``"01"`` all give
        the same address. Values that are already two or more characters
        long are used unchanged.

    Returns
    -------
    str
        ``".../ElemTab/z<ZZ>.html"`` under physics.nist.gov.
    """
    base_url = "https://physics.nist.gov/PhysRefData/XrayMassCoef/ElemTab/z"
    # Pad here as well as in callers, so a bare integer below 10 still
    # yields a two-digit page name.
    return base_url + str(Z).zfill(2) + ".html"


In [98]:
def create_dataframe(first_element=1, last_element=92):
    """Download and parse the element pages and collect them in one table.

    For every element number in ``first_element..last_element`` (inclusive)
    the page URL is built with ``get_element_url``, fetched with
    ``get_html_content`` and parsed with ``extract_data``. Progress is
    printed as one ``ElementID:  <n>`` line per element.

    Parameters
    ----------
    first_element, last_element : int, optional
        Inclusive range of element numbers to process. The defaults
        (1 and 92) reproduce the original hard-coded range.

    Returns
    -------
    pandas.DataFrame
        Columns:
          * ``index``       – running row number across all elements, from 0
          * ``ElementID``   – element number the row belongs to
          * ``Energy``      – parsed energy multiplied by 1E6
            (presumably a MeV -> eV conversion; confirm against the
            source table header)
          * ``Coefficient`` – parsed mu_en/rho value
        An empty frame with these columns is returned when no rows were
        collected.
    """
    columns = ["index", "ElementID", "Energy", "Coefficient"]

    # Collect per-element frames and concatenate once at the end: growing a
    # frame inside the loop copies all previous rows on every iteration, and
    # seeding it with an empty object-dtype frame degrades the column dtypes.
    element_frames = []
    next_index = 0

    for ElementID in range(first_element, last_element + 1):
        print("ElementID: ", ElementID)

        # Page names use a two-digit element number.
        ElementID_str = str(ElementID).zfill(2)

        url = get_element_url(ElementID_str)
        html_content = get_html_content(url)
        energy_array, mu_en_rho_array = extract_data(html_content)

        row_count = len(energy_array)
        if row_count == 0:
            # Nothing parsed for this element; an empty frame adds no rows.
            continue

        element_frames.append(pd.DataFrame({
            "index": np.arange(next_index, next_index + row_count),
            "ElementID": ElementID,
            "Energy": energy_array * 1E6,
            "Coefficient": mu_en_rho_array,
        }))
        next_index += row_count

    if not element_frames:
        # pd.concat raises on an empty list.
        return pd.DataFrame(columns=columns)

    return pd.concat(element_frames, ignore_index=True)

In [99]:
# Build the full table for elements 1..92. create_dataframe() fetches one page
# per element over HTTP and prints an "ElementID:" progress line for each.
df = create_dataframe()

ElementID:  1
ElementID:  2
ElementID:  3
ElementID:  4
ElementID:  5
ElementID:  6
ElementID:  7
ElementID:  8
ElementID:  9
ElementID:  10
ElementID:  11
ElementID:  12
ElementID:  13
ElementID:  14
ElementID:  15
ElementID:  16
ElementID:  17
ElementID:  18
ElementID:  19
ElementID:  20
ElementID:  21
ElementID:  22
ElementID:  23
ElementID:  24
ElementID:  25
ElementID:  26
ElementID:  27
ElementID:  28
ElementID:  29
ElementID:  30
ElementID:  31
ElementID:  32
ElementID:  33
ElementID:  34
ElementID:  35
ElementID:  36
ElementID:  37
ElementID:  38
ElementID:  39
ElementID:  40
ElementID:  41
ElementID:  42
ElementID:  43
ElementID:  44
ElementID:  45
ElementID:  46
ElementID:  47
ElementID:  48
ElementID:  49
ElementID:  50
ElementID:  51
ElementID:  52
ElementID:  53
ElementID:  54
ElementID:  55
ElementID:  56
ElementID:  57
ElementID:  58
ElementID:  59
ElementID:  60
ElementID:  61
ElementID:  62
ElementID:  63
ElementID:  64
ElementID:  65
ElementID:  66
ElementID:  67
Elem

In [104]:
# Export the DataFrame to a SQLite database file.
import sqlite3
import sqlalchemy  # not used in this cell; kept in case other cells rely on it

# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable data directory so the notebook runs on other machines.
sqlite_connection = sqlite3.connect('/home/john/Documents/MCXrayTransport/data/data_sources/EPDL/EPDL.db')
sqlite_table = "MassEnergyAbsorptionCoefficients"

try:
    # if_exists='replace' drops an existing table of this name before writing;
    # index=False keeps the DataFrame's row labels out of the table.
    df.to_sql(sqlite_table, sqlite_connection, if_exists='replace', index=False)
finally:
    # Release the database file even when the write fails.
    sqlite_connection.close()