In [1]:
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import hashlib

In [2]:
#name of file to be read
filename = "CyclotronSched.html"

#use BeautifulSoup package to put parser onto file
#the file is opened in binary mode so that BeautifulSoup detects the document
#encoding itself instead of depending on the platform's default text encoding
with open(filename, "rb") as fp:
    soup = BeautifulSoup(fp, "html.parser")

In [3]:
#find all span elements and collect the whitespace-stripped text of each one
#get_text(strip=True) is used instead of handing the Tag objects to numpy:
#numpy would iterate each tag's children, which only yields a clean array when
#every span holds exactly one string (an empty or nested span breaks it)
data = np.array(
    [span.get_text(strip=True) for span in soup.find_all("span")],
    dtype=str,
)

#total number of extracted text values (header entries included)
data_length = data.size

In [4]:
#layout of the extracted text: the leading values are header text and column
#indices; after them, each consecutive group of values forms one table row
HEADER_VALUES = 8
COLUMN_NAMES = ["ID", "Time", "Desc.", "Location", "MD", "Notes"]

#remove header text and column indices
data = data[HEADER_VALUES:]

#fail early with a readable message if the values do not split into whole rows
if data.size % len(COLUMN_NAMES) != 0:
    raise ValueError(
        f"{data.size} values cannot be split into rows of {len(COLUMN_NAMES)} columns"
    )

#resize the data into rows and columns; the row count (-1) is inferred from the
#sliced data itself rather than from a length computed in an earlier cell
data = np.reshape(data, (-1, len(COLUMN_NAMES)))

#create a dataframe using data with appropriate column headers
df = pd.DataFrame(data, columns=COLUMN_NAMES)

In [5]:
#remove rows with lunch and repeated column headers
#.copy() makes the filtered frame independent of the original, so the column
#assignment below does not trigger pandas' SettingWithCopyWarning
df = df[~df["ID"].isin(["null", "App_DtTm"])].copy()

#hash the ID column to protect patient info
#NOTE(review): this is an unsalted SHA-256; if the IDs come from a small or
#guessable space the digests could be brute-forced -- confirm this is acceptable
df["ID"] = df["ID"].apply(
    lambda patient_id: hashlib.sha256(patient_id.encode()).hexdigest()
)

#drop unnecessary columns
df = df.drop(columns=["MD", "Location"])

#reset indices after dropping rows (drop=True discards the old index directly)
df = df.reset_index(drop=True)

In [6]:
#create a new column in the dataframe indicating event type
#(IMNT, Conformal, VSIM, or TBI; any other description is kept unchanged)

def _event_type(desc, notes):
    """Map one schedule row to its event type.

    desc  -- value of the row's "Desc." column
    notes -- value of the row's "Notes" column

    Returns "IMNT" or "Conformal" for "Neutrons TC" rows (depending on whether
    the notes mention "IMNT"), "VSIM" for "Verify Sim", "TBI" for
    "Sim No Charge", and the description itself for anything else.
    """
    if desc == "Neutrons TC":
        return "IMNT" if "IMNT" in notes else "Conformal"
    if desc == "Verify Sim":
        return "VSIM"
    if desc == "Sim No Charge":
        return "TBI"
    #fallback: the original code used `==` here, which compared instead of
    #assigning and silently left the type as an empty string
    return desc


#build the labels by pairing the two columns positionally, so the result does
#not depend on the dataframe index being 0..n-1
types = [_event_type(desc, notes) for desc, notes in zip(df["Desc."], df["Notes"])]

#add column to indicate event type
df["Type"] = types

#drop unnecessary columns, including the hashed patient id
df = df.drop(columns=["Notes", "Desc.", "ID"])

In [7]:
#parse the "Time" column's strings into pandas datetime values
df["Time"] = df["Time"].pipe(pd.to_datetime)

In [8]:
#display the resulting schedule dataframe as the cell output
df

Unnamed: 0,Time,Type
0,2023-02-21 10:15:00,IMNT
1,2023-02-21 11:15:00,IMNT
2,2023-02-21 13:30:00,IMNT
3,2023-02-21 14:30:00,VSIM
4,2023-02-22 10:15:00,IMNT
5,2023-02-22 11:15:00,IMNT
6,2023-02-22 13:00:00,Conformal
7,2023-02-23 10:15:00,IMNT
8,2023-02-23 11:15:00,IMNT
9,2023-02-23 13:00:00,Conformal
