# CancerCare AI

In [None]:
import io
import os
import re

import ipywidgets as widgets
import joblib
import matplotlib.pyplot as plt
import mlflow.sklearn
import numpy as np
import openai
import pandas as pd
import pytesseract
from docx import Document
from IPython.display import display, clear_output
from keras.models import load_model
from openai import AzureOpenAI
from pdfminer.high_level import extract_text
from PIL import Image
from sklearn.preprocessing import StandardScaler
from tabula import read_pdf


# Azure OpenAI connection settings.
# The AZURE_OPENAI_ENDPOINT / AZURE_OPENAI_API_KEY environment variables take
# precedence so credentials can be supplied without editing this file.
# NOTE(review): the literal fallbacks are kept only so existing runs keep
# working; a key committed to source should be rotated and the fallback removed.
openai_endpoint = os.environ.get(
    "AZURE_OPENAI_ENDPOINT",
    "https://polite-ground-030dc3103.4.azurestaticapps.net/api/v1",
)
openai_api_key = os.environ.get(
    "AZURE_OPENAI_API_KEY",
    "30caa1b3-137b-47e7-9404-8bf39871fff5",
)
openai.api_key = openai_api_key
openai.api_base = openai_endpoint

# Location of the Tesseract binary that pytesseract invokes for OCR.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

# Shared scaler instance applied to the extracted report values in extract_data.
sc = StandardScaler()

# Names of the report variables to pick out of the extracted text.
# This is a set: membership is what matters, the order is not defined.
target_variables = {
    'sex',
    'age',
    'Time',
    'Number_of_Warts',
    'Type',
    'Area',
    'induration_diameter',
    'Result_of_Treatment',
}

# Define your widgets for mode selection
# Text box where the user types the menu option ('1'..'4'); read by select_mode.
mode_input = widgets.Text(
    placeholder='Enter your option:',
    description='Select Mode:',
    disabled=False
)
# Button that submits the typed option; wired to select_mode further down.
mode_button = widgets.Button(
    description="Submit",
    button_style='success',
    disabled=False
)
# Output area shared by the mode selector, the chatbot and the report workflow.
output = widgets.Output()

# Define the chatbot widgets
# Text box holding the user's question; read and cleared by ask_question.
chat_input = widgets.Text(
    placeholder='Type something',
    description='Chat:',
    disabled=False
)
# Button that sends the question; wired to ask_question further down.
chat_send_button = widgets.Button(
    description="Send",
    button_style='success',
    disabled=False
)

# Define function to switch between modes
def select_mode(btn):
    """Show the UI that matches the option typed into ``mode_input``.

    Used as the click handler of ``mode_button``. The shared ``output`` area
    is cleared first, then the helper for the chosen mode is called. Any
    value other than '1'..'4' prints a hint listing the valid options.

    Args:
        btn: The button instance passed in by ipywidgets; not used.
    """
    # Built at call time because the display helpers reference widgets that
    # are defined further down in the file.
    handlers = {
        '1': display_chatbot_ui,
        '2': display_report_ui,
        '3': display_braintumor_ui,
        '4': display_breasttumor_ui,
    }
    with output:
        clear_output()
        mode = mode_input.value.strip()
        handler = handlers.get(mode)
        if handler is None:
            # The hint lists every supported option, not only the first two.
            print(
                "Invalid input. Please enter '1' for Chatbot, "
                "'2' for Report Prediction, '3' for Brain Tumor "
                "or '4' for Breast Tumor."
            )
            return
        handler()

# Helper function to display chatbot UI
def display_chatbot_ui():
    """Render the chat text box, the send button and the shared output area."""
    chatbot_widgets = (chat_input, chat_send_button, output)
    display(*chatbot_widgets)

# Helper function to display report prediction UI
def display_report_ui():
    """Render the report file-path box, its button and the shared output area."""
    report_widgets = (file_path_text, report_extract_button, output)
    display(*report_widgets)

# This function should display the image and analyze it for brain tumor
def display_braintumor_ui():
    """Render the brain image path box, its button and its own output area."""
    brain_widgets = (image_file_path_text, image_display_button, image_output)
    display(*brain_widgets)

def display_breasttumor_ui():
    """Render the breast image path box, its button and its own output area."""
    breast_widgets = (
        breast_image_file_path_text,
        breast_image_display_button,
        breast_image_output,
    )
    display(*breast_widgets)

# Run select_mode every time the mode selection "Submit" button is clicked
mode_button.on_click(select_mode)

# This is the function that will run when the user wants to use the chatbot
def ask_question(btn):
    """Send the text in ``chat_input`` to the chat model and print the reply.

    Used as the click handler of ``chat_send_button``; everything is printed
    into the shared ``output`` area. Typing ``exit`` (any letter case) ends
    the session by closing the chat widgets. Each question is sent on its
    own; no earlier messages are included in the request.

    Args:
        btn: The button instance passed in by ipywidgets; not used.
    """
    with output:
        question = chat_input.value.strip()

        if question.lower() == 'exit':
            print("\nBye. See you again......")
            print("Chatbot session ended.")
            chat_input.close()  # Remove the input widget
            chat_send_button.close()  # Remove the send button
            chat_input.value = ''
            return

        # Nothing typed: do not spend an API call on an empty message.
        if not question:
            print("Please type a question first.")
            return

        print("\nUser:", question)
        try:
            client = AzureOpenAI(
                azure_endpoint=openai_endpoint,
                api_key=openai_api_key,
                api_version="2023-09-01-preview",
            )
            completion = client.chat.completions.create(
                model="gpt-35-turbo",
                messages=[{"role": "user", "content": question}],
                temperature=0.9
            )
        except openai.OpenAIError as e:
            # Report service/credential failures in the widget, like the other
            # handlers in this file do, instead of failing silently in the
            # callback. The typed question is kept so the user can retry.
            print(f"Error contacting the chatbot service: {e}")
            return

        # Display the AI's response
        print("AI:", completion.choices[0].message.content)
        chat_input.value = ''  # Clear the input for the next message

# Run ask_question every time the chat "Send" button is clicked
chat_send_button.on_click(ask_question)

# Report prediction widgets and logic
# Text box for the report file name; extract_data prefixes it with the
# lakehouse Files folder.
file_path_text = widgets.Text(description='File Path:')
# Button that starts extraction and prediction; wired to extract_data at the
# end of the file.
report_extract_button = widgets.Button(description="Predict Immunotherapy", button_style='success')

# Maps a processed file path to the variables extracted from it.
# extract_data clears it at the start of every run.
extracted_data_map = {}

# Define a function to extract text from tables in a PDF using tabula-py
def extract_text_from_pdf_table(file_path):
    """Turn the tables of a PDF into ``Variable: Value`` lines.

    All tables found by tabula are stacked into one DataFrame. For every row
    whose first two cells are both non-null, one ``"<first>: <second>"`` line
    is produced.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The lines joined with newlines, or ``""`` when no table is found, the
        combined table is empty, or it has fewer than two columns.
    """
    tables = read_pdf(file_path, pages='all', multiple_tables=True, stream=True, guess=False)
    if not tables:
        return ""

    combined = pd.concat(tables, ignore_index=True)
    if combined.empty or len(combined.columns) < 2:
        return ""

    pairs = []
    for _, row in combined.iterrows():
        name, value = row.iloc[0], row.iloc[1]
        # Skip rows where either of the first two cells is missing.
        if pd.notnull(name) and pd.notnull(value):
            pairs.append(f"{name}: {value}")
    return "\n".join(pairs)

# Define a function to extract plain text from PDFs
def extract_text_from_pdf(file_path):
    """Return the plain text that pdfminer extracts from the PDF at ``file_path``."""
    pdf_text = extract_text(file_path)
    return pdf_text

# Define a function to extract plain text from DOCX files
def extract_text_from_docx(file_path):
    """Return the text of every paragraph in a DOCX file, one per line."""
    paragraphs = Document(file_path).paragraphs
    return '\n'.join(para.text for para in paragraphs)

# Define a function to extract text from images using OCR
def extract_text_from_image(file_path):
    """Run OCR on the image at ``file_path`` and return the recognised text.

    Args:
        file_path: Path of the image file to read.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # The context manager closes the underlying file once OCR is done, so
    # repeated button clicks do not accumulate open file handles.
    with Image.open(file_path) as image:
        return pytesseract.image_to_string(image)

# Define a function to filter the extracted text and obtain only target variables
def filter_target_variables(text, targets=None):
    """Pick ``key <delimiter> value`` lines whose key is a target variable.

    Each line is split at the first run of characters that are neither word
    characters nor whitespace (for example ``:``, ``=`` or ``-``). The part
    before it is the key, the part after it is the value. Whitespace inside
    the key is collapsed to single spaces before the lookup, and a value that
    is wrapped in double quotes has the quotes removed.

    Args:
        text: The raw text to scan, with one candidate pair per line.
        targets: Collection of accepted keys. Defaults to the module-level
            ``target_variables`` when omitted.

    Returns:
        A dict mapping each matched key to its value as a string. When a key
        appears on several lines, the last occurrence wins.
    """
    if targets is None:
        targets = target_variables

    # Spaces are excluded from the delimiter class so the last word of the
    # key is not swallowed by the delimiter. Compiled once, outside the loop.
    line_pattern = re.compile(r'^(.*?)[^\w\s]+(.*)$')

    information = {}
    for line in text.split('\n'):
        match = line_pattern.match(line)
        if not match:
            continue
        key, value = match.group(1).strip(), match.group(2).strip()
        norm_key = re.sub(r'\s+', ' ', key)  # Normalize spaces in the key
        if norm_key not in targets:
            continue
        if value.startswith('"') and value.endswith('"'):
            value = value[1:-1]  # Strip the surrounding quotes
        information[norm_key] = value
    return information

# This is the function that will run when the user wants to extract data from reports
def extract_data(btn):
    """Extract the target variables from a report and predict suitability.

    Used as the click handler of ``report_extract_button``. The file name
    typed into ``file_path_text`` is resolved under
    ``/lakehouse/default/Files/``. Text is pulled from the file according to
    its extension (PDF, DOCX or image) and filtered down to the target
    variables, which are shown as a table in ``output``. The values are then
    fed to the stored random-forest model and the verdict is printed in the
    same output area.

    Side effects: rebinds the global ``df`` and clears and refills
    ``extracted_data_map``.

    Args:
        btn: The button instance passed in by ipywidgets; not used.
    """
    # Update the global dataframe and extracted data map
    global df, extracted_data_map
    df = pd.DataFrame(columns=["Variable", "Value"])
    extracted_data_map.clear()

    # Validate the user's input before prefixing it: once the lakehouse
    # folder is prepended the path can never be empty.
    relative_path = file_path_text.value.strip()
    if not relative_path:
        with output:
            clear_output(wait=True)
            print("Please enter a valid file path.")
        return
    file_path = "/lakehouse/default/Files/" + relative_path

    # Process the file based on file extension
    file_extension = file_path.split('.')[-1].lower()
    try:
        if file_extension == 'pdf':
            # Prefer table extraction; fall back to plain text when the
            # tables yield nothing.
            text = extract_text_from_pdf_table(file_path)
            if not text.strip():
                text = extract_text_from_pdf(file_path)
        elif file_extension == 'docx':
            text = extract_text_from_docx(file_path)
        elif file_extension in ['jpg', 'jpeg', 'png', 'tiff', 'bmp', 'gif']:
            text = extract_text_from_image(file_path)
        else:
            with output:
                print(f"File type {file_extension} is not supported.")
            return

        # Filter the relevant data based on target variables
        data = filter_target_variables(text)
        extracted_data_map[file_path] = data
        new_rows = pd.DataFrame(list(data.items()), columns=["Variable", "Value"])
        df = pd.concat([df, new_rows], ignore_index=True)
        with output:
            clear_output(wait=True)
            display(df)

        # Without any extracted variable there is nothing to feed the model.
        if not data:
            with output:
                print("\nNo target variables were found in the file, so no prediction can be made.")
            return

        loaded_model = mlflow.sklearn.load_model('runs:/9340cddc-a819-463a-bbd0-c01c27c99681/random-forest-model')
        # NOTE(review): the columns follow the order in which the variables
        # appear in the document - confirm that this matches the feature
        # order the model was trained with.
        new_data_df = pd.DataFrame([data], columns=data.keys())
        new_data_df = new_data_df.astype(float)
        # NOTE(review): fitting the scaler on this single row centres every
        # feature on itself, so the scaled row is all zeros. The scaler that
        # was fitted on the training data should be loaded and used with
        # transform() instead; it is not available in this file, so the
        # original call is kept.
        data_scaled = sc.fit_transform(new_data_df)

        # Use the trained model to make a prediction on the scaled new data
        prediction = loaded_model.predict(data_scaled)

        # Show the verdict in the same output area as the table and the
        # error messages.
        with output:
            if prediction[0] == 0:
                print("\nThe Immunotherapy is not suitable")
            else:
                print("\nThe Immunotherapy is suitable")

    except Exception as e:
        # UI boundary: report any failure to the user instead of losing it
        # inside the widget callback.
        with output:
            clear_output(wait=True)
            print(f"Error processing file {file_path}: {e}")


# Brain tumor mode widgets: image path box, trigger button, and a dedicated
# output area used by analyze_brain_tumor_image.
image_file_path_text = widgets.Text(description='Brain X-ray Image Path:')
image_display_button = widgets.Button(description="Analyze Brain X-ray", button_style='success')
image_output = widgets.Output()

def analyze_brain_tumor_image(btn):
    """Classify the image named in ``image_file_path_text`` with the brain model.

    Used as the click handler of ``image_display_button``. The image is
    converted to RGB, resized to 128x128 and passed to the Keras model loaded
    from the lakehouse. The class with the highest score decides the message;
    class 0 is reported as a tumor. All messages are written to
    ``image_output``.

    Args:
        btn: The button instance passed in by ipywidgets; not used.
    """
    image_file_path = image_file_path_text.value.strip()

    if not image_file_path:
        with image_output:
            clear_output(wait=True)
            print("Please enter a valid image file path.")
        return

    try:
        brain_model = load_model("/lakehouse/default/Files/Models/braintumor.h5")

        # convert('RGB') gives exactly three channels for every input mode
        # (grayscale is replicated, alpha is dropped, palettes are expanded),
        # so the reshape below cannot fail on RGBA or palette images.
        # The context manager closes the file once the pixels are copied.
        with Image.open(image_file_path) as img:
            x = np.array(img.convert('RGB').resize((128, 128)))

        # Add the batch dimension expected by the model.
        x = x.reshape(1, 128, 128, 3)

        # Perform prediction
        res = brain_model.predict_on_batch(x)
        classification = int(np.argmax(res[0]))
        confidence = res[0][classification] * 100

        # Print inside the widget's output area so the result appears next to
        # the brain tumor controls, like the error messages do.
        with image_output:
            clear_output(wait=True)
            if classification == 0:
                print(str(confidence) + '% Chance Its a Tumor. It is recommended to consult a doctor.')
            else:
                print(str(confidence) + '% Chance Its not a Tumor.')

    except Exception as e:
        # UI boundary: report any failure to the user instead of losing it
        # inside the widget callback.
        with image_output:
            clear_output(wait=True)
            print(f"Error processing image {image_file_path}: {e}")

# Run analyze_brain_tumor_image every time the brain analysis button is clicked
image_display_button.on_click(analyze_brain_tumor_image)


# Breast tumor mode widgets: image path box, trigger button, and a dedicated
# output area used by analyze_breast_tumor_image.
breast_image_file_path_text = widgets.Text(description='Breast X-ray Image Path:')
breast_image_display_button = widgets.Button(description="Analyze Breast X-ray", button_style='success')
breast_image_output = widgets.Output()

def analyze_breast_tumor_image(btn):
    """Classify the image named in ``breast_image_file_path_text``.

    Used as the click handler of ``breast_image_display_button``. The image
    is resized to 128x128, reduced to a single channel by averaging its
    channels when it has more than one, and passed to the Keras model loaded
    from the lakehouse. The index of the highest score selects the message:
    0 is reported as normal, 1 as potentially malignant, and anything else as
    malignant. All messages are written to ``breast_image_output``.

    Args:
        btn: The button instance passed in by ipywidgets; not used.
    """
    breast_image_file_path = breast_image_file_path_text.value.strip()

    if not breast_image_file_path:
        with breast_image_output:
            clear_output(wait=True)
            print("Please enter a valid breast X-ray image file path.")
        return

    try:
        breast_model = load_model("/lakehouse/default/Files/Models/breasttumor.h5")

        # Resize with PIL so the picture is actually rescaled. np.resize only
        # repeats or truncates the flattened pixel values and would hand the
        # model a scrambled crop instead of a smaller image.
        # The context manager closes the file once the pixels are copied.
        with Image.open(breast_image_file_path) as img:
            img_array = np.array(img.resize((128, 128)))

        # Collapse colour channels to one by averaging them.
        if img_array.ndim == 3:
            img_array = img_array.mean(axis=2)

        # Add the batch and channel dimensions expected by the model.
        img1 = img_array.reshape(1, 128, 128, 1)

        predict = np.argmax(breast_model.predict(img1))
        with breast_image_output:
            clear_output(wait=True)
            if predict == 0:
                print("It is a normal condition.")
            elif predict == 1:
                print("The cells are not yet cancerous, but they have the potential to become malignant. It is recommended to consult the doctor")
            else:
                print("Malignant tumors are cancerous. It is recommended to consult the doctor")

    except Exception as e:
        # UI boundary: report any failure to the user instead of losing it
        # inside the widget callback.
        with breast_image_output:
            clear_output(wait=True)
            print(f"Error processing breast X-ray image {breast_image_file_path}: {e}")

# Run analyze_breast_tumor_image every time the breast analysis button is clicked
breast_image_display_button.on_click(analyze_breast_tumor_image)



# Print the menu of available modes; the number is typed into mode_input
print("Enter 1 to talk to chatbot")
print("Enter 2 to check whether the immunotherapy is suitable or not")
print("Enter 3 to check for brain tumor")
print("Enter 4 to check for breast cancer\n")
# Add the event to the report extraction button
report_extract_button.on_click(extract_data)

# Display the mode selection widgets
display(mode_input, mode_button, output)

The cells are not yet cancerous, but they have the potential to become malignant. It is recommended to consult the doctor
