### Imports

In [1]:
# Packages
from dotenv import load_dotenv
import os
import pandas as pd
import numpy as np
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.chat_models import AzureChatOpenAI
from langchain.llms import AzureOpenAI

In [2]:
# Load environment variables from .env file
load_dotenv()


def _require_env(name):
    """Return the value of environment variable `name`.

    Raises RuntimeError with an explicit message when the variable is unset
    or empty, instead of the opaque ``TypeError: str expected, not NoneType``
    that assigning ``os.getenv(name)`` (None) into ``os.environ`` produces.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name!r} is not set; define it in the .env file "
            "or in the process environment before running this notebook."
        )
    return value


# Azure OpenAI API
os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_VERSION"] = "2023-03-15-preview"
# The endpoint and key are read from the project-specific names `openai_base`
# and `openai_key` and copied to the OPENAI_API_* names set for the client.
os.environ["OPENAI_API_BASE"] = _require_env('openai_base')
os.environ["OPENAI_API_KEY"] = _require_env('openai_key')

### Topic extraction functions

In [3]:
def gpt_analyze(comment):
    """Ask the chat model which side effects a patient comment mentions.

    A new AzureChatOpenAI client (deployment "gpt-35-turbo", temperature 0.1)
    and LLMChain are built on every call.

    Args:
        comment: The patient's comment; substituted for ``{comment}`` in the prompt.

    Returns:
        The ``'text'`` entry of the chain output. The prompt asks the model to
        answer with a Python-style list of side effects (``[]`` when none);
        the value is returned as-is, without parsing or validation.
    """
    chat_model = AzureChatOpenAI(
        deployment_name="gpt-35-turbo",
        model_name="gpt-35-turbo",
        temperature=0.1,
    )

    prompt_text = """
    You will be given a patient's comment about their drug treatment.
    In the comment, please identify the main categories of side effects listed by the patient, and return them as a Python list (ex: ['chest pain', 'rash']).
    Act step by step. Remember to return a python list.
    If there are no side effects listed in the comment, simply return an empty Python list ([]).
    Act step by step and return a Python list, not a text list.

    COMMENT:

    {comment}

    MAIN SIDE EFFECTS:
    """

    side_effect_prompt = PromptTemplate(input_variables=["comment"], template=prompt_text)
    chain = LLMChain(llm=chat_model, prompt=side_effect_prompt)

    # The chain has a single input variable, so the bare value is passed directly.
    result = chain(comment)
    return result['text']

In [4]:
def topic_extract(analysis):
    """Map a side-effect analysis onto the numbered topic categories.

    A new AzureOpenAI completion client (deployment "text-davinci-003",
    temperature 0.1) and LLMChain are built on every call.

    Args:
        analysis: Text to classify; substituted for ``{analysis}`` in the prompt.

    Returns:
        The ``'text'`` entry of the chain output. The prompt asks the model to
        return only topic numbers (0-14); the value is returned as-is, without
        parsing or validation.
    """
    completion_model = AzureOpenAI(
        deployment_name="text-davinci-003",
        model_name="text-davinci-003",
        temperature=0.1,
    )

    prompt_text = """
    You are an assistant helping to classify topics within the following text: {analysis}
    Your objective is to find the kinds of topics mentionned in the text.
    
    The different kinds of topics are:
    - No side effect: return 0
    - Fatigue (or sleepiness, tiredness, no energy): return 1
    - Diarrhea: return 2
    - Arthralgia (and anything related to joint pain): return 3
    - Headaches: return 4
    - Nausea: return 5
    - Rash (and anything related to skin problems): return 6
    - Hair loss: return 7
    - Constipation: return 8
    - Mental health issues (depression, ...): return 9
    - Leg cramps (and anything related to muscle pain): return 10
    - Heart or blood pressure issues : return 11
    - Liver or kidney pain: return 12
    - Weight loss: return 13
    - Weight gain: return 14

    Act step by step.

    Return only the topics numbers.
    """

    topic_prompt = PromptTemplate(input_variables=["analysis"], template=prompt_text)
    chain = LLMChain(llm=completion_model, prompt=topic_prompt)

    # The chain has a single input variable, so the bare value is passed directly.
    result = chain(analysis)
    return result['text']

### Creating the dataframe

In [5]:
df = pd.read_csv("data/raw_data_healthcare.csv")

# Drop text_index column
df = df.drop(columns="text_index")

# Select only comments for Crohn's Disease and Ulcerative Colitis.
# na=False treats a missing medication as "no match" so the boolean mask
# never contains NaN (which .loc refuses to index with).
df = df.loc[df["medication"].str.contains("Crohn|Ulcerative", na=False)]

# Drop rows where medication is not specified (the value contains a capitalised "For").
# .copy() makes the filtered frame independent, so the column assignments below
# do not raise SettingWithCopyWarning.
df = df.loc[~df["medication"].str.contains("For")].copy()

# Separate drug from disease.
# Split once on the standalone word "for" (surrounded by whitespace) rather than on
# any "for" substring, so a drug name that happens to contain "for" is not truncated,
# and strip the result so the drug name carries no stray whitespace.
df["drug"] = df["medication"].str.split(r"\s+for\s+", n=1).str[0].str.strip()
df["disease"] = np.where(df["medication"].str.contains("Crohn"), "Crohn's Disease",
                         np.where(df["medication"].str.contains("Ulcerative"), "Ulcerative Colitis", ""))
df = df.drop(columns="medication")

# Drop row n°3 (triggering OpenAI's policy)
# NOTE(review): `iloc[3]` is positional (4th remaining row) while `drop(3)` removes the
# row whose index LABEL is 3; after the filters above these are only the same row if
# the first four CSV rows all survived. Behaviour is kept as-is — confirm which row
# is the intended one.
y = df.iloc[3]
df = df.drop(index=3)

# Apply gpt analysis function to each comment
df['gpt_analysis'] = df['comment'].apply(gpt_analyze)

# Apply topic extraction function to each gpt_analysis
df['topics'] = df['gpt_analysis'].apply(topic_extract)

# Derive one boolean column per topic from the topic numbers returned by the model.

# Topic 0 keeps its original heuristic: a " 0" (or the literal "Ex") is present and
# 0 is not part of a comma-separated list.
# NOTE(review): the reason for matching "Ex" is not visible here — presumably a
# recurring model output; confirm before changing.
df["no_side_effects"] = df["topics"].str.contains(" 0|Ex") & ~df["topics"].str.contains(", 0|0,")

# Topic number -> output column, in the order the columns are written.
# NOTE(review): "hair loss" is the only column name containing a space; it is kept
# unchanged so the CSV schema stays the same.
TOPIC_COLUMNS = {
    1: "fatigue",
    2: "diarrhea",
    3: "arthralgia",
    4: "headaches",
    5: "nausea",
    6: "rash",
    7: "hair loss",
    8: "constipation",
    9: "mental_health_issues",
    10: "leg_cramps",
    11: "heart_blood_pressure_issues",
    12: "liver_kidney_pain",
    13: "weight_loss",
    14: "weight_gain",
}


def _mentions_topic(topics, number):
    """Return a boolean Series: True where `topics` contains `number` as a whole number.

    The digit lookarounds stop a number from matching inside a longer one
    (e.g. 1 inside 11, or 2 inside 12). Plain substring checks got this wrong:
    "12, 1" was not flagged as fatigue, and "11, 12" was.
    """
    return topics.str.contains(rf"(?<!\d){number}(?!\d)")


for topic_number, column_name in TOPIC_COLUMNS.items():
    df[column_name] = _mentions_topic(df["topics"], topic_number)

# Save DataFrame to CSV file
df.to_csv("output.csv", index=False)

Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ServiceUnavailableError: The server is overloaded or not ready yet..
