In [2]:
import pandas as pd
import docx

doc_path = 'GA.docx'

doc = docx.Document(doc_path)
all_tables = []

# Turn every table in the document into a DataFrame of stripped cell text.
for table in doc.tables:
    data = [[cell.text.strip() for cell in row.cells] for row in table.rows]

    # Drop the first two rows of each table positionally; they are treated
    # as header rows rather than product rows.
    df = pd.DataFrame(data).iloc[2:]
    all_tables.append(df)

# Stack all tables into one frame with a fresh 0..n-1 index.
# NOTE(review): pd.concat raises ValueError if the document has no tables.
final_df = pd.concat(all_tables, ignore_index=True)

# NOTE(review): this assumes every table yields exactly five columns;
# pandas raises a length-mismatch error otherwise -- confirm against the document.
final_df.columns = ['pdl_name', 'preferred', 'non-preferred', 'pa', 'qll']

# A row is "Preferred" only when its 'preferred' cell is exactly "P"; anything
# else (blank, other markers, missing) is "Non-Preferred". Building the result
# with .assign avoids row-wise iteration and avoids assigning a column on a
# slice of the original frame (SettingWithCopyWarning).
final_df = (
    final_df
    .assign(
        status=lambda frame: frame['preferred'].eq("P").map(
            {True: 'Preferred', False: 'Non-Preferred'}
        ),
        therapeutic_class='',  # placeholder column; not populated in this cell
    )
    [['pdl_name', 'status', 'therapeutic_class']]
)

final_df

Unnamed: 0,pdl_name,status,therapeutic_class
0,abacavir soln. generic,Preferred,
1,abacavir tabs generic,Preferred,
2,abacavir/lamivudine generic,Preferred,
3,abacavir/lamivudine/zidovudine generic,Non-Preferred,
4,ABILIFY ASIMTUFII,Preferred,
...,...,...,...
2271,ZYPITAMAG,Non-Preferred,
2272,ZYPREXA RELPREVV,Preferred,
2273,ZYTIGA 250mg,Preferred,
2274,ZYTIGA 500mg,Non-Preferred,


In [6]:
import pandas as pd
import docx

def is_therapeutic_class(cell):
    """
    Decide whether a table cell should be treated as a therapeutic class header.

    The cell qualifies when at least one of its runs is bold AND at least one
    of its runs is italic. The two flags are collected independently, so they
    do not have to be set on the same run.

    Parameters:
        cell: an object exposing ``paragraphs``, each with ``runs`` whose
            ``font`` has ``bold`` and ``italic`` attributes (e.g. a
            python-docx table cell).

    Returns:
        bool: True if the cell contains both bold and italic text, else False.

    NOTE(review): an earlier description of this helper said "bold and
    non-italic", which contradicts the implementation. The behavior here is
    bold-and-italic; confirm this matches the document's formatting.
    """
    # Flatten every run of every paragraph so both checks scan the same runs.
    runs = [run for paragraph in cell.paragraphs for run in paragraph.runs]

    # Only an explicit truthy flag counts; a falsy value such as None is
    # treated as "not set".
    has_bold = any(run.font.bold for run in runs)
    has_italic = any(run.font.italic for run in runs)
    return has_bold and has_italic

def is_bold(cell):
    """
    Check whether any run in this cell is bold.

    Parameters:
        cell: an object exposing ``paragraphs``, each with ``runs`` whose
            ``font`` has a ``bold`` attribute (e.g. a python-docx table cell).

    Returns:
        bool: True if at least one run has a truthy ``font.bold``, otherwise
        False (including when the cell has no runs at all).
    """
    # Stops at the first bold run; a falsy flag such as None does not count.
    return any(
        run.font.bold
        for paragraph in cell.paragraphs
        for run in paragraph.runs
    )

doc_path = 'GA2.docx'
doc = docx.Document(doc_path)

rows_list = []  # Accumulates [pdl_name, status, therapeutic_class] records
current_therapeutic_class = None  # Most recent class header seen; None until the first one

# Walk every table, carrying the latest therapeutic class header forward
# onto the product rows that follow it.
for table in doc.tables:
    # The first two rows of each table are assumed to be header rows.
    for row in table.rows[2:]:
        first_cell = row.cells[0]
        cell_text = first_cell.text.strip()

        # Rows whose first cell is blank carry no product or header name.
        if not cell_text:
            continue

        # A first cell containing both bold and italic text is a therapeutic
        # class header: remember it and move on without emitting a record.
        if is_therapeutic_class(first_cell):
            current_therapeutic_class = cell_text
            continue

        # Any other row whose first cell contains bold text is not a product
        # row, so it is skipped.
        if is_bold(first_cell):
            continue

        # Everything else is a product row.
        cells = [cell.text.strip() for cell in row.cells]
        pdl_name = cells[0]
        # The second column holds the preferred marker; a missing column
        # is treated as an empty marker.
        preferred_value = cells[1] if len(cells) > 1 else ""
        if preferred_value == "P":
            status = "Preferred"
        else:
            status = "Non-Preferred"

        rows_list.append([pdl_name, status, current_therapeutic_class])

# Build the three-column result in one step from the collected records.
final_df = pd.DataFrame(rows_list, columns=['pdl_name', 'status', 'therapeutic_class'])
final_df

Unnamed: 0,pdl_name,status,therapeutic_class
0,amoxicillin 775mg generic,Non-Preferred,ANTIINFECTIVES
1,"amox/clavulanate IR tabs, susp generic",Preferred,ANTIINFECTIVES
2,amox/clavulanate chew tabs,Non-Preferred,ANTIINFECTIVES
3,amox/clavulanate 250-125mg tabs generic,Non-Preferred,ANTIINFECTIVES
4,amox/clavulanate ER tabs generic,Non-Preferred,ANTIINFECTIVES
...,...,...,...
2269,RAPAFLO,Non-Preferred,UROLOGICAL/RENAL MEDICATIONS
2270,tamsulosin generic,Preferred,UROLOGICAL/RENAL MEDICATIONS
2271,"FREESTYLE LIBRE, -2, -2 Plus, -3, -3 Plus",Preferred,DIABETIC SUPPLIES
2272,"OMNIPOD DASH, -5 (covered 2 - 21 yrs old)",Preferred,DIABETIC SUPPLIES
