In [28]:
from docx import Document
from lxml import etree
from zipfile import ZipFile
import pandas as pd

In [29]:
doc = Document('NH.docx')

# Turn every Word table into a DataFrame. The first row of each table supplies
# the column labels and is then removed from the data rows.
all_data = []
for tbl in doc.tables:
    cell_grid = [[c.text for c in r.cells] for r in tbl.rows]

    frame = pd.DataFrame(cell_grid)
    frame.columns = frame.iloc[0]
    all_data.append(frame.iloc[1:])

In [30]:
def get_textboxes(docx_path):
    """Return the text held inside drawing and picture elements of a .docx file.

    ``word/document.xml`` is read from the archive at ``docx_path`` and parsed.
    The result is a list of strings: one per ``w:drawing`` element, followed by
    one per ``w:pict`` element. Each string is the concatenation of every
    non-empty ``w:t`` node found beneath that element, so an element without
    any text contributes ``''``.
    """
    with ZipFile(docx_path) as archive:
        document_xml = archive.read("word/document.xml")

    root = etree.fromstring(document_xml)
    ns = {
        "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
        "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
        "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
        "pic": "http://schemas.openxmlformats.org/drawingml/2006/picture",
        "v": "urn:schemas-microsoft-com:vml",
        "o": "urn:schemas-microsoft-com:office:office"
    }

    def _joined_text(container):
        # Glue together all text runs nested anywhere under `container`.
        runs = container.xpath(".//w:t", namespaces=ns)
        return "".join(run.text for run in runs if run.text)

    # All drawings come first, then all picts -- the two groups are NOT
    # interleaved in document order.
    # NOTE(review): Word may store one text box as both a w:drawing and a
    # fallback w:pict; if this document does, each box appears twice in the
    # result -- confirm against the file.
    textboxes = []
    for container_query in (".//w:drawing", ".//w:pict"):
        for container in root.xpath(container_query, namespaces=ns):
            textboxes.append(_joined_text(container))

    return textboxes

docx_file = "NH.docx"
boxes = get_textboxes(docx_file)

# Keep every extracted text box except empty ones and the
# 'TABLE OF CONTENTS' entry.
ignored_boxes = ('', 'TABLE OF CONTENTS')
true_boxes = [box for box in boxes if box not in ignored_boxes]

In [40]:
# Flatten every table into long form: one row per product, labelled with the
# table's therapeutic class and its preferred / non-preferred status.
#
# Tables and text-box entries are paired purely by position: the i-th table in
# all_data is labelled with the i-th entry of true_boxes. Fail up front with a
# clear message instead of part-way through the loop when there are not enough
# entries to go round.
if len(true_boxes) < len(all_data):
    raise IndexError(
        f"{len(all_data)} tables but only {len(true_boxes)} text-box entries; "
        "cannot pair them by position"
    )

# (column position, status): the first column lists preferred products, the
# second column lists non-preferred products.
STATUS_BY_POSITION = ((0, 'Preferred'), (1, 'Non-Preferred'))

all_temps = []

for table_num, table in enumerate(all_data):
    tc = true_boxes[table_num]

    records = []
    for _, row in table.iterrows():
        for position, status in STATUS_BY_POSITION:
            # A table narrower than expected simply has no such column.
            if position >= len(row):
                continue

            # Positional access on purpose: looking the cell up by column
            # label returns a Series when two header cells share the same
            # text, which previously made the whole row vanish silently.
            cell_text = row.iloc[position]

            # Best effort, as before: anything that is not text is skipped.
            if not isinstance(cell_text, str):
                continue

            # One cell can list several products, one per line.
            records.extend((tc, name, status) for name in cell_text.split('\n'))

    temp_df = pd.DataFrame(records, columns=['therapeutic_class', 'pdl_name', 'status'])
    all_temps.append(temp_df)
    

    

In [46]:
# Stack the per-table frames into one frame with a fresh 0..n-1 index.
final_df = pd.concat(all_temps, ignore_index=True)
final_df

Unnamed: 0,therapeutic_class,pdl_name,status
0,ANALGESICS – LONG-ACTING OPIOIDS***,buprenorphine patch (generic for Butrans),Preferred
1,ANALGESICS – LONG-ACTING OPIOIDS***,Butrans,Preferred
2,ANALGESICS – LONG-ACTING OPIOIDS***,fentanyl patch (generic for Duragesic),Preferred
3,ANALGESICS – LONG-ACTING OPIOIDS***,hydrocodone bitartrate ER (generic for Hysingla),Preferred
4,ANALGESICS – LONG-ACTING OPIOIDS***,hydrocodone bitartrate ER (generic for Zohydro...,Preferred
...,...,...,...
1557,TOPICAL – TOPICAL AGENTS FOR PSORIASIS,Trial and failure of 1 Preferred product requi...,Non-Preferred
1558,TOPICAL – TOPICAL COMBINATION BENZOYL PEROXIDE...,Myfembree,Preferred
1559,TOPICAL – TOPICAL COMBINATION BENZOYL PEROXIDE...,Oriahnn,Preferred
1560,TOPICAL – TOPICAL COMBINATION BENZOYL PEROXIDE...,Orilissa,Preferred
