In [None]:
%%capture
!pip install python-docx beautifulsoup4

In [1]:
import json

In [3]:
# `os` is not imported by any earlier visible cell, so import it here to keep
# this cell runnable after a kernel restart.
import os

# Copy the Azure OpenAI credentials from the project configuration into the
# process environment.
# NOTE(review): `config` is not defined in any visible cell — presumably
# python-decouple's `config`; confirm and import it before this cell.
os.environ["AZURE_OPENAI_API_KEY"] = config('OPENAI_API_KEY')
os.environ["AZURE_OPENAI_ENDPOINT"] = config('AZURE_ENDPOINT')

In [4]:
from openai import AzureOpenAI
# Only the API version is passed explicitly; the key and endpoint are
# presumably picked up from the AZURE_OPENAI_* environment variables set in
# the previous cell — TODO confirm.
client = AzureOpenAI(api_version=config('AZURE_CHAT_OPENAI_API_VERSION'))

In [None]:
## Alternate Approach
import webcolors

def closest_colour(requested_colour):
    """Return the CSS3 colour name nearest to an RGB triplet.

    Nearness is the squared Euclidean distance in RGB space between
    ``requested_colour`` and every named CSS3 colour.

    Args:
        requested_colour: Sequence whose first three items are the red, green
            and blue components.

    Returns:
        The name of the nearest CSS3 colour. When several colours are equally
        near, the one that appears last in the hex-to-name table wins.
    """
    try:
        hex_to_names = webcolors.CSS3_HEX_TO_NAMES
    except AttributeError:
        # Newer webcolors releases no longer expose the mapping constants;
        # rebuild the same hex -> name table from the public helpers.
        hex_to_names = {}
        for css_name in webcolors.names("css3"):
            hex_value = webcolors.name_to_hex(css_name, spec="css3")
            # Go back through hex_to_name so aliases (e.g. gray/grey) collapse
            # to the library's preferred spelling.
            hex_to_names[hex_value] = webcolors.hex_to_name(hex_value, spec="css3")

    best_name = None
    best_distance = None
    for hex_value, name in hex_to_names.items():
        r_c, g_c, b_c = webcolors.hex_to_rgb(hex_value)
        distance = (
            (r_c - requested_colour[0]) ** 2
            + (g_c - requested_colour[1]) ** 2
            + (b_c - requested_colour[2]) ** 2
        )
        # `<=` keeps the later entry on ties, as the distance-keyed dict did.
        if best_distance is None or distance <= best_distance:
            best_name, best_distance = name, distance
    return best_name

def get_colour_name(requested_colour):
    """Name a colour given as hex digits without the leading ``#``.

    The exact name from ``webcolors.rgb_to_name`` is returned when that lookup
    succeeds; if it raises ``ValueError`` the result of ``closest_colour`` for
    the same RGB triplet is returned instead.
    """
    rgb_triplet = webcolors.hex_to_rgb("#" + requested_colour)
    try:
        return webcolors.rgb_to_name(rgb_triplet)
    except ValueError:
        # No exact name for this triplet: use the nearest named colour.
        return closest_colour(rgb_triplet)

# Quick check of the helper on a light grey.
colour_name = get_colour_name("EEEEEE")
print(f"Colour name: {colour_name}")

In [5]:
from docx import Document
import webcolors
from docx.oxml.ns import qn

def closest_color(hex_color):
    """Return the colour name for a hex fill value, or the hex string itself.

    Despite the function's name, only an exact lookup is performed: no
    nearest-colour search takes place. The previous nested fallback
    (hex -> RGB -> name) repeated the lookup that had just failed and could
    never succeed, so it has been removed.

    Args:
        hex_color: Hex digits without the leading ``#`` (e.g. ``"FFFFFF"``).

    Returns:
        The name given by ``webcolors.hex_to_name`` when the lookup succeeds;
        otherwise the ``#``-prefixed input string.
    """
    hex_color = '#' + hex_color
    try:
        return webcolors.hex_to_name(hex_color)
    except ValueError:
        # The lookup failed: hand back the hex value so the caller still has
        # something to show for the cell.
        return hex_color

def get_cell_color(cell):
    """Return the ``w:fill`` attribute of the first shading element in a cell.

    The XPath ``.//w:shd`` matches any ``w:shd`` descendant of the cell
    element, not only one at a fixed position. ``None`` is returned when the
    cell contains no such element.
    """
    shd_elements = cell._element.xpath('.//w:shd')
    if not shd_elements:
        return None
    first_shd = shd_elements[0]
    return first_shd.get(qn('w:fill'))

def read_docx(file_path):
    """Extract paragraph text and colour-annotated tables from a .docx file.

    Args:
        file_path: Path passed straight to ``Document``.

    Returns:
        A ``(text, tables)`` tuple. ``text`` is every paragraph's text, each
        preceded by a single space. ``tables`` holds one string per table with
        one line per row; cells are introduced by ``' | '`` and each row ends
        with ``'|\\n'``. A cell whose shading resolves to anything other than
        ``'white'`` gets ``' {<colour>} '`` appended to its text.
    """
    doc = Document(file_path)
    text = ''.join(' ' + para.text for para in doc.paragraphs)

    tables = []
    for table in doc.tables:
        row_lines = []
        for row in table.rows:
            rendered_cells = []
            for cell in row.cells:
                cell_text = cell.text.replace('\n', ' ')
                hex_color = get_cell_color(cell)
                color_name = closest_color(hex_color) if hex_color else ""
                # Annotate only cells that really have a non-white fill; cells
                # without shading previously received an empty ' {} ' marker.
                if color_name and color_name != 'white':
                    cell_text += ' {' + color_name + '} '
                rendered_cells.append(' | ' + cell_text)
            row_lines.append(''.join(rendered_cells) + '|\n')
        tables.append(''.join(row_lines))
    return text, tables

In [6]:
# Word document whose tables are summarised below; the path is relative to
# the notebook's working directory.
file_path = './new_data/NLP_ORs_example_results_1.0.docx'

In [7]:
# Keep only the rendered tables; the concatenated paragraph text is discarded.
_, tables = read_docx(file_path)

In [8]:
# Print every extracted table, then ask the chat model for a row-wise summary.
# NOTE(review): `deployment_name` is not defined in any visible cell; it must
# be set before this cell is run.
for table in tables:
    print(table)
    messages = [
        {
            "role": "system",
            "content": " You are an assistant and your job is to summarise and give insights from tables with cell color (in HEX) inside {} in clinical study reports.\
             Insructions:\
             - Only answer questions related to the table.\
             - If you are unsure about an answer, do not make false assumptions.\
             - Give a row-wise explanation in table in paragraphs.\
             - Decode color name from HEX value and use color name insted of the HEX value in the explanation.\
             - Give additional insights based on colored cells. \
             - Do not give overall inference",
        },
        {
            "role": "user",
            "content": 'Here is the table:' + table + 'Also start the answer as the table summarises...',
        },
    ]
    response = client.chat.completions.create(
        messages=messages,
        model=deployment_name,
        temperature=0.5,
        top_p=1,
        max_tokens=1024,
        n=1,
        stop=None,
    )
    print(response.choices[0].message.content, '\n')

 | Treatment {#EEEEEE}  | Median {#EEEEEE}  | 95% CrI lower limit {#EEEEEE}  | 95% CrI upper limit {#EEEEEE} |
 | Placebo  | 	0.09526 | 	0.03717 | 	0.22097|
 | Lasmiditan_100_Oral  | 	0.22197 | 	0.09079 | 	0.44483|
 | Lasmiditan_200_Oral  | 	0.27306 | 	0.11644 | 	0.51285|
 | Sumatriptan_50_Oral  | 	0.24744 | 	0.10570 | 	0.47468|
 | Sumatriptan_100_Oral  | 	0.28356 | 	0.12522 | 	0.51939|
 | Rizatriptan_10_Oral  | 	0.37855 | 	0.17904 | 	0.62595|
 | Eletriptan_40_Oral  | 	0.34408 | 	0.15942 | 	0.58879|
 | Eletriptan_80_Oral  | 	0.40424 | 	0.19605 | 	0.65059|
 | Acetaminophen_1000_Oral  | 	0.17631 | 	0.06866 | 	0.38223|
 | Aspirin_1000_Oral  | 	0.22621 | 	0.09267 | 	0.45219|
 | Ibuprofen_400_Oral  | 	0.19699 | 	0.07705 | 	0.41654|

The table summarises the treatment, median, and 95% credible interval (CrI) lower and upper limits for various oral medications used in the clinical study. The cell color in HEX code {#EEEEEE} indicates that the cells are not significant and do not require any f

In [10]:
# Caption for each table, in the same order as `tables`; paired with the
# tables by position in the zip() loop below.
captions = [
    "Median event rate with 95% CrI – Pain free at 2 hours (random effects model adjusted for baseline risk).\n",
    "Relative Treatment Effect of Pairwise Comparisons Expressed as Posterior Median ORs (with 95 % Credible Intervals) - Pain free at 2 hours (random effects model adjusted for baseline risk).\n"
]

# Footnote text that follows each table, in the same order as `captions`.
after_txt = [
    "CrI: Credible interval. The results presented in this table are a subset of the NMA performed with the full list of treatments.\n",
    "The results presented in this table are a subset of the NMA performed with the full list of treatments.\n"
]

In [13]:
# Summarise each table together with its caption and footnote.
# zip() pairs the three lists by position and stops at the shortest one.
# NOTE(review): `deployment_name` is not defined in any visible cell; it must
# be set before this cell is run.
for caption, table, txt in zip(captions, tables, after_txt):
    # The caption and footnote were previously only printed; the model was
    # sent the bare table. Send exactly what is printed so the summary can
    # use that context.
    table_with_context = caption + table + txt
    print(table_with_context)
    response = client.chat.completions.create(
        model=deployment_name,
        max_tokens=1024,
        n=1,
        stop=None,
        temperature=0.5,
        top_p=1,
        messages=[
            {"role": "system", "content": " You are an assistant and your job is to summarise and give insights from tables with cell color (in HEX) inside {} in clinical study reports.\
             Insructions:\
             - Only answer questions related to the table.\
             - If you are unsure about an answer, do not make false assumptions.\
             - Give a row-wise explanation in table in paragraphs.\
             - Decode color name from HEX value. Use use color name insted of the HEX value in the explanation.\
             - Give explanation based on colored cells.\
             - Do not give overall inference"},
            {"role": "user", "content": 'Here is the table:' + table_with_context + 'Also start the answer as the table summarises...'},
        ],
    )
    print(response.choices[0].message.content, '\n')

Median event rate with 95% CrI – Pain free at 2 hours (random effects model adjusted for baseline risk).
 | Treatment {#EEEEEE}  | Median {#EEEEEE}  | 95% CrI lower limit {#EEEEEE}  | 95% CrI upper limit {#EEEEEE} |
 | Placebo  | 	0.09526 | 	0.03717 | 	0.22097|
 | Lasmiditan_100_Oral  | 	0.22197 | 	0.09079 | 	0.44483|
 | Lasmiditan_200_Oral  | 	0.27306 | 	0.11644 | 	0.51285|
 | Sumatriptan_50_Oral  | 	0.24744 | 	0.10570 | 	0.47468|
 | Sumatriptan_100_Oral  | 	0.28356 | 	0.12522 | 	0.51939|
 | Rizatriptan_10_Oral  | 	0.37855 | 	0.17904 | 	0.62595|
 | Eletriptan_40_Oral  | 	0.34408 | 	0.15942 | 	0.58879|
 | Eletriptan_80_Oral  | 	0.40424 | 	0.19605 | 	0.65059|
 | Acetaminophen_1000_Oral  | 	0.17631 | 	0.06866 | 	0.38223|
 | Aspirin_1000_Oral  | 	0.22621 | 	0.09267 | 	0.45219|
 | Ibuprofen_400_Oral  | 	0.19699 | 	0.07705 | 	0.41654|
CrI: Credible interval. The results presented in this table are a subset of the NMA performed with the full list of treatments.

The table summarises the trea