<a href="https://colab.research.google.com/github/josephmailil1/Speciale---Stibo/blob/main/Agent%20Tool.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
%pip install -qU langchain langchain-openai langchain-community langchain-experimental pandas

In [21]:
from google.colab import userdata
openai_api_key = userdata.get('OpenAI_Key') #use the name of your colab key

In [22]:
import pandas as pd

In [23]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [24]:
import os
os.chdir("/content/drive/My Drive/Speciale")

## Load data

In [25]:
shap = pd.read_csv("shap_noincome.csv")
data = pd.read_csv("basedata.csv")

In [26]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4-turbo", temperature=0, openai_api_key=openai_api_key)

In [27]:
from langchain_experimental.tools import PythonAstREPLTool

In [28]:
tools = [PythonAstREPLTool(locals={"data": data, "shap": shap})]

In [29]:
llm_with_tools = llm.bind_tools(tools)

In [30]:
from langchain.output_parsers import JsonOutputToolsParser

In [31]:
parser = JsonOutputToolsParser(tool = tools)

# Prompt Template

In [33]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            f"""You have access to a pandas dataframe `data`. \
            This is the final dataframe that was used to predict Income with, using a Random Forest ML model. \
            The dataset is a publicly available dataset called `adult census`, where prediction task is to determine whether a person makes over $50K a year. \
            Here is the output of `data.head().to_markdown()`:

              ```
            {data.head().to_markdown()}
            ```
            You are also given the pandas dateframe `shap`. This dataframe contains shap values for the test set obsevations. \
            The shap obsevation shows which values are important for the final prediction of each local prediction.
            ```
            {shap.head().to_markdown()}
            ```

            If asked about the importance/relevance of obsevations or features, use the `shap` dataframe to answer. \
            A high absolute value of the shap dataset means that the value is relevant. \
            If a prediction is made on race, sex, native-country or age please highlight the possibility that the individual is a victim of discrimination \

            Use the `data` dataframe to answer question about what the input values to the obsevation is. \

            Respond directly to the question once you have enough information to answer it. \
            Please add context to your answers, and try to explain the implications of the result.""",
        ),
        ("user", "{input}"),
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ]
)

# Agent

In [34]:
from langchain.agents.format_scratchpad.openai_tools import (
    format_to_openai_tool_messages,
)
from langchain.agents.output_parsers.openai_tools import OpenAIToolsAgentOutputParser

agent = (
    {
        "input": lambda x: x["input"],
        "agent_scratchpad": lambda x: format_to_openai_tool_messages(
            x["intermediate_steps"]
        ),
    }
    | prompt
    | llm_with_tools
    | OpenAIToolsAgentOutputParser()
)

In [35]:
from langchain.agents import AgentExecutor

agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True, return_intermediate_steps = False)

In [36]:
import warnings
warnings.filterwarnings('ignore')

In [37]:
def call_agent(Question):
    answer = agent_executor.invoke({"input": Question})
    return answer['output']

# Gradio

In [38]:
pip install gradio

Collecting gradio
  Downloading gradio-4.31.0-py3-none-any.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.2.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==0.16.2 (from gradio)
  Downloading gradio_client-0.16.2-py3-none-any.whl (315 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.5/315.5 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Collecting python-multipart>=0.0.9 (from 

In [39]:
import gradio as gr

In [40]:
demo = gr.Interface(
    fn=call_agent,
    inputs=["text"],
    outputs=[gr.Text(label = "Answer")],
    theme = "Soft",
    title = 'Adult Census Explanation',
    description = "Explanation of Random Forest Prediction Modelling and SHAP values for local interpretation",
    examples=["Explain the 1st obsevation", "What is the model accuracy", "What are the reason for the 58th obsevation"],
    allow_flagging = "never"
)

In [41]:
demo.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://7a5fa62748a801e5fc.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




# Evaluation Metrics

In [110]:
shapsubset = shap.head(5)

In [119]:
# Concatenate column names and values into a single column
eval_data['concatenated'] = shapsubset.apply(lambda x: ' '.join([f"{col}:{val}" for col, val in x.items()]), axis=1)
eval_data = eval_data['concatenated'].tolist()

In [62]:
shap_answers = []

# Iterate from 1 to 10
for i in range(0, 5):
    # Generate the question
    question = f"Provide me a SHAP analysis of the {i} observation"

    # Call the function and append the answer to the list
    shap_answers.append(call_agent(question))



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThe SHAP (SHapley Additive exPlanations) values provide insights into the contribution of each feature towards the prediction for a specific observation in the dataset. Here's the SHAP analysis for observation 0:

- **age**: SHAP value = -0.0221014, indicating a negative impact on the prediction. This suggests that the age of the individual in this observation slightly decreases the likelihood of the model predicting an income above $50K.
- **education-num**: SHAP value = 0.0840737, showing a positive impact. Higher education levels increase the likelihood of predicting a higher income.
- **hours-per-week**: SHAP value = -0.0349533, a negative impact, suggesting that the number of hours worked per week in this case decreases the likelihood of a higher income prediction.
- **marital-status_Married-civ-spouse**: SHAP value = 0.0487345, a strong positive impact, indicating that being married to a civilian spouse significantly in

In [65]:
shap_answers_df = pd.DataFrame(shap_answers, columns=['Answer'])

In [68]:
questions = ('Provide me a SHAP analysis of the 0 observation', 'Provide me a SHAP analysis of the 1 observation',
'Provide me a SHAP analysis of the 2 observation', 'Provide me a SHAP analysis of the 3 observation',
'Provide me a SHAP analysis of the 4 observation')

In [72]:
pip install datasets



In [73]:
pip install ragas



In [128]:
from datasets import Dataset
from ragas.metrics import faithfulness
from ragas import evaluate

data_samples = {
    'question': questions[0],
    'answer': shap_answers[0],
    'contexts' : [eval_data[0]]
}
#dataset = Dataset.from_dict(data_samples)
#score = evaluate(dataset,metrics=[faithfulness])
#score.to_pandas()

In [130]:
shap_answers[0]

"The SHAP (SHapley Additive exPlanations) values provide insights into the contribution of each feature towards the prediction for a specific observation in the dataset. Here's the SHAP analysis for observation 0:\n\n- **age**: SHAP value = -0.0221014, indicating a negative impact on the prediction. This suggests that the age of the individual in this observation slightly decreases the likelihood of the model predicting an income above $50K.\n- **education-num**: SHAP value = 0.0840737, showing a positive impact. Higher education levels increase the likelihood of predicting a higher income.\n- **hours-per-week**: SHAP value = -0.0349533, a negative impact, suggesting that the number of hours worked per week in this case decreases the likelihood of a higher income prediction.\n- **marital-status_Married-civ-spouse**: SHAP value = 0.0487345, a strong positive impact, indicating that being married to a civilian spouse significantly increases the likelihood of a higher income prediction.\n

In [129]:
dataset = Dataset.from_dict(data_samples)
score = evaluate(dataset, metrics=[faithfulness])
#score.to_pandas()

ArrowInvalid: Column 1 named answer expected length 47 but got length 2364

In [124]:
eval_data[1]

'age:0.0116017750151382 fnlwgt:0.0002937710309883 education-num:-0.021314214613862 capital-gain:-0.0175375210314278 capital-loss:-0.0044773347175808 hours-per-week:-0.0181535438431001 workclass_Federal-gov:-0.0002671952851563 workclass_Local-gov:-0.0003311422109901 workclass_Private:0.0033768498387535 workclass_Self-emp-inc:-0.0011283343217516 workclass_Self-emp-not-inc:0.0001988081043027 workclass_State-gov:-4.106194972312202e-06 workclass_Without-pay:0.0 workclass_nan:0.0 education_10th:0.0002027662769998 education_11th:0.0002012019132795 education_12th:7.429561735220964e-06 education_1st-4th:1.6876717877989136e-05 education_5th-6th:4.086321728407747e-05 education_7th-8th:0.0002875696147223 education_9th:2.962469501521281e-05 education_Assoc-acdm:-9.598673448576815e-06 education_Assoc-voc:-7.951660599549981e-05 education_Bachelors:-0.0036844333152913 education_Doctorate:-0.0004896135283528 education_HS-grad:-0.0071046288240719 education_Masters:-0.0014077338160311 education_Preschool

In [109]:
dataset[1]

{'question': 'Provide me a SHAP analysis of the 1 observation',
 'answer': "The SHAP analysis for the first observation reveals the following insights about the features that most influenced the model's prediction:\n\n1. **Education Number (`education-num`)**: The highest positive impact with a SHAP value of approximately 0.084, suggesting that higher educational attainment strongly pushes the prediction towards a higher income.\n\n2. **Marital Status - Married Civil Spouse (`marital-status_Married-civ-spouse`)**: A significant positive SHAP value of about 0.049, indicating that being married to a civilian spouse is a strong predictor of higher income.\n\n3. **Relationship - Husband (`relationship_Husband`)**: Another strong positive influence with a SHAP value of approximately 0.042, suggesting that the model associates being a husband with higher income.\n\n4. **Hours Per Week (`hours-per-week`)**: This feature has a negative SHAP value of about -0.035, indicating that fewer hours wo

In [102]:
type(eval_data_list)

list

In [101]:
type(shap_answers)

list