# LLM: Zero-shot classification through LLMs and prompts

**Models**:

- GPT-4o (OpenAI)
- Gemini (Google)
- Gemma (Google)
- Llama (Meta)
- Claude (Anthropic)
- DeepSeek


## 0 Imports

In [34]:
import os
import pandas as pd
import numpy as np
import time
import re
from openai import OpenAI
from google import genai
from google.genai import types
from sklearn.model_selection import train_test_split
from scipy.spatial.distance import pdist

In [60]:
# Load the stored test-set prompts, one CSV per prompting strategy.
# The first CSV column is used as the index of each frame.
(
    X_test_simple_prompt_df,
    X_test_class_definitions_prompt_df,
    X_test_profiled_simple_prompt_df,
    X_test_few_shot_prompt_df,
    X_test_vignette_prompt_df,
) = (
    pd.read_csv(prompt_csv, sep = ",", index_col = 0)
    for prompt_csv in (
        "../dat/prompts/X_test_simple_prompt.csv",
        "../dat/prompts/X_test_class_definitions_prompt.csv",
        "../dat/prompts/X_test_profiled_simple_prompt.csv",
        "../dat/prompts/X_test_few_shot_prompt.csv",
        "../dat/prompts/X_test_vignette_prompt.csv",
    )
)

In [67]:
# Collapse every prompt frame into a flat 1-D array (a fresh copy of the values)
(
    X_test_simple_prompt,
    X_test_class_definitions_prompt,
    X_test_profiled_simple_prompt,
    X_test_few_shot_prompt,
    X_test_vignette_prompt,
) = (
    prompt_frame.to_numpy().flatten()
    for prompt_frame in (
        X_test_simple_prompt_df,
        X_test_class_definitions_prompt_df,
        X_test_profiled_simple_prompt_df,
        X_test_few_shot_prompt_df,
        X_test_vignette_prompt_df,
    )
)

## 2 Zero-shot classification with LLMs

In this section, I will use the prompts created in the previous section to **classify the test set using different LLMs**. The LLMs will be used to classify whether a person develops a psychological disorder between time points T1 and T2.

### 2.1 GPT-4o (OpenAI)

#### 2.1.1 Testing prompting

In [30]:
# client = OpenAI(
#     api_key = os.environ.get("OPENAI_API_KEY"),
# )
#
# # testing
# response = client.responses.create(
#     model = "gpt-4o-mini",
#     instructions = "You are a coding assistant that talks like a pirate.",
#     input = "How do I check if a Python object is an instance of a class?",
# )
#
# print(response.output_text)

#### 2.1.2 Prompting with GPT-4o

In [32]:
# simple_prompt_array_GPT = []
#
# client = OpenAI(
#     api_key = os.environ.get("OPENAI_API_KEY"),
# )
#
# # measure time in seconds
# start = time.time()
#
# # iterate over the test set and save the response for each prompt in an array
# for prompt in X_test_simple_prompt:
#     response = client.responses.create(
#         model = "gpt-4o",
#         instructions = simple_instruction,
#         input = prompt,
#     )
#     simple_prompt_array_GPT.append(response.output_text)
#     print(response.output_text)
#
# end = time.time()
# print(f"Time taken: {end - start} seconds")
# time_GPT_simple_prompt = end - start
# time_GPT_simple_prompt_df = pd.DataFrame({"time": [time_GPT_simple_prompt]})
# time_GPT_simple_prompt_df.to_csv("../exp/times_LLMs/time_GPT4_simple_prompt.csv", sep = ",", index = False)
#
# # value counts for array
# counts_simple_GPT = pd.Series(simple_prompt_array_GPT).value_counts()
# print(counts_simple_GPT)
#
# # convert YES to 1 and NO to 0; any other response becomes NaN
# simple_prompt_array_GPT = [1 if response == "YES" else 0 if response == "NO" else np.nan for response in simple_prompt_array_GPT]
# simple_prompt_array_GPT
#
# # save the array to a csv file
# simple_prompt_df_GPT = pd.DataFrame(simple_prompt_array_GPT, columns = ["y_pred"])
# simple_prompt_df_GPT.to_csv("../exp/preds_LLMs/y_pred_GPT4_simple_prompt.csv", sep = ",", index = False)

In [33]:
# class_def_array_GPT = []
#
# client = OpenAI(
#     api_key = os.environ.get("OPENAI_API_KEY"),
# )
#
# # measure time in seconds
# start = time.time()
#
# # iterate over the test set and save the response for each prompt in an array
# for prompt in X_test_class_definitions_prompt:
#     response = client.responses.create(
#         model = "gpt-4o",
#         instructions = class_definitions_instruction,
#         input = prompt,
#     )
#     class_def_array_GPT.append(response.output_text)
#     print(response.output_text)
#
# end = time.time()
# print(f"Time taken: {end - start} seconds")
# time_GPT_class_definitions = end - start
# time_GPT_class_definitions_df = pd.DataFrame({"time": [time_GPT_class_definitions]})
# time_GPT_class_definitions_df.to_csv("../exp/times_LLMs/time_GPT4_class_definitions_prompt.csv", sep = ",", index = False)
#
# # value counts for array
# counts_class_def_GPT = pd.Series(class_def_array_GPT).value_counts()
# print(counts_class_def_GPT)
#
# # convert YES to 1; every other response (NO or anything unexpected) becomes 0
# class_def_array_GPT = [1 if response == "YES" else 0 for response in class_def_array_GPT]
# class_def_array_GPT
#
# # save the array to a csv file
# class_def_df_GPT = pd.DataFrame(class_def_array_GPT, columns = ["y_pred"])
# class_def_df_GPT.to_csv("../exp/preds_LLMs/y_pred_GPT4_class_definitions_prompt.csv", sep = ",", index = False)

In [34]:
# profiled_simple_array_GPT = []
#
# client = OpenAI(
#     api_key = os.environ.get("OPENAI_API_KEY"),
# )
#
# # measure time in seconds
# start = time.time()
#
# # iterate over the test set and save the response for each prompt in an array
# for prompt in X_test_profiled_simple_prompt:
#     response = client.responses.create(
#         model = "gpt-4o",
#         instructions = profiled_simple_instruction,
#         input = prompt,
#     )
#     profiled_simple_array_GPT.append(response.output_text)
#     print(response.output_text)
#
# end = time.time()
# print(f"Time taken: {end - start} seconds")
# time_GPT_profiled_simple = end - start
# time_GPT_profiled_simple_df = pd.DataFrame({"time": [time_GPT_profiled_simple]})
# time_GPT_profiled_simple_df.to_csv("../exp/times_LLMs/time_GPT4_profiled_simple_prompt.csv", sep = ",", index = False)
#
# # value counts for array
# counts_profiled_simple_GPT = pd.Series(profiled_simple_array_GPT).value_counts()
# print(counts_profiled_simple_GPT)
#
# # convert YES to 1; every other response (NO or anything unexpected) becomes 0
# profiled_simple_array_GPT_val = [1 if response == "YES" else 0 for response in profiled_simple_array_GPT]
# profiled_simple_array_GPT_val
#
# # save the array to a csv file
# profiled_simple_df_GPT = pd.DataFrame(profiled_simple_array_GPT_val, columns = ["y_pred"])
# profiled_simple_df_GPT.to_csv("../exp/preds_LLMs/y_pred_GPT4_profiled_simple_prompt.csv", sep = ",", index = False)

In [36]:
# few_shot_array_GPT = []
#
# client = OpenAI(
#     api_key = os.environ.get("OPENAI_API_KEY"),
# )
#
# # measure time in seconds
# start = time.time()
#
# # iterate over the test set and save the response for each prompt in an array
# for prompt in X_test_few_shot_prompt:
#     response = client.responses.create(
#         model = "gpt-4o",
#         instructions = few_shot_instruction,
#         input = prompt,
#     )
#     few_shot_array_GPT.append(response.output_text)
#     print(response.output_text)
#
# end = time.time()
# print(f"Time taken: {end - start} seconds")
# time_GPT_few_shot = end - start
# time_GPT_few_shot_df = pd.DataFrame({"time": [time_GPT_few_shot]})
# time_GPT_few_shot_df.to_csv("../exp/times_LLMs/time_GPT4_few_shot_prompt.csv", sep = ",", index = False)
#
# # value counts for array
# counts_few_shot_GPT = pd.Series(few_shot_array_GPT).value_counts()
# print(counts_few_shot_GPT)
#
# # convert YES to 1; every other response (NO or anything unexpected) becomes 0
# few_shot_array_GPT_val = [1 if response == "YES" else 0 for response in few_shot_array_GPT]
# few_shot_array_GPT_val
#
# # save the array to a csv file
# few_shot_df_GPT = pd.DataFrame(few_shot_array_GPT_val, columns = ["y_pred"])
# few_shot_df_GPT.to_csv("../exp/preds_LLMs/y_pred_GPT4_few_shot_prompt.csv", sep = ",", index = False)

In [37]:
# vignette_array_GPT = []
#
# client = OpenAI(
#     api_key = os.environ.get("OPENAI_API_KEY"),
# )
#
# # measure time in seconds
# start = time.time()
#
# # iterate over the test set and save the response for each prompt in an array
# for prompt in X_test_vignette_prompt:
#     response = client.responses.create(
#         model = "gpt-4o",
#         instructions = vignette_instruction,
#         input = prompt,
#     )
#     vignette_array_GPT.append(response.output_text)
#     print(response.output_text)
#
# end = time.time()
# print(f"Time taken: {end - start} seconds")
# time_GPT_vignette = end - start
# time_GPT_vignette_df = pd.DataFrame({"time": [time_GPT_vignette]})
# time_GPT_vignette_df.to_csv("../exp/times_LLMs/time_GPT4_vignette_prompt.csv", sep = ",", index = False)
#
# # value counts for array
# counts_vignette_GPT = pd.Series(vignette_array_GPT).value_counts()
# print(counts_vignette_GPT)
#
# # convert YES to 1; every other response (NO or anything unexpected) becomes 0
# vignette_array_GPT_val = [1 if response == "YES" else 0 for response in vignette_array_GPT]
# vignette_array_GPT_val
#
# # save the array to a csv file
# vignette_df_GPT = pd.DataFrame(vignette_array_GPT_val, columns = ["y_pred"])
# vignette_df_GPT.to_csv("../exp/preds_LLMs/y_pred_GPT4_vignette_prompt.csv", sep = ",", index = False)

#### 2.1.3 Reasons for misclassified cases

In [31]:
def _load_y_pred(csv_path):
    """Read a stored prediction CSV and return its ``y_pred`` column as an array."""
    return pd.read_csv(csv_path, sep = ",")["y_pred"].to_numpy()


# Stored GPT-4o predictions, one array per prompting strategy
y_pred_GPT4_simple_prompt = _load_y_pred("../exp/preds_LLMs/y_pred_GPT4_simple_prompt.csv")
y_pred_GPT4_class_definition_prompt = _load_y_pred("../exp/preds_LLMs/y_pred_GPT4_class_definitions_prompt.csv")
y_pred_GPT4_profiled_simple_prompt = _load_y_pred("../exp/preds_LLMs/y_pred_GPT4_profiled_simple_prompt.csv")
y_pred_GPT4_few_shot_prompt = _load_y_pred("../exp/preds_LLMs/y_pred_GPT4_few_shot_prompt.csv")
y_pred_GPT4_vignette_prompt = _load_y_pred("../exp/preds_LLMs/y_pred_GPT4_vignette_prompt.csv")

In [32]:
def _misclassified_indices(y_pred, y_true):
    """Return the positions at which a prediction differs from the true label.

    Args:
        y_pred: 1-D sequence of predicted labels.
        y_true: pandas object that supports positional ``.iloc`` access.

    Returns:
        list[int]: every position ``i`` with ``y_pred[i] != y_true.iloc[i]``.
        A NaN prediction compares unequal to every label and is therefore
        reported as misclassified.
    """
    return [i for i, pred in enumerate(y_pred) if pred != y_true.iloc[i]]


# Identify misclassified cases by comparing y_pred_GPT4_XXX with y_test and
# keep their positions, together with total / misclassified / correct counts.
# NOTE(review): y_test is not defined in the visible part of this notebook —
# confirm it is loaded before this cell runs.
misclassified_cases_simple = _misclassified_indices(y_pred_GPT4_simple_prompt, y_test)
total_cases_simple = len(y_pred_GPT4_simple_prompt)
misscl_cases_simple = len(misclassified_cases_simple)
correct_clases_simple = total_cases_simple - misscl_cases_simple

misclassified_cases_class_def = _misclassified_indices(y_pred_GPT4_class_definition_prompt, y_test)
total_cases_class_def = len(y_pred_GPT4_class_definition_prompt)
misscl_cases_class_def = len(misclassified_cases_class_def)
correct_clases_class_def = total_cases_class_def - misscl_cases_class_def

misclassified_cases_profiled_simple = _misclassified_indices(y_pred_GPT4_profiled_simple_prompt, y_test)
total_cases_profiled = len(y_pred_GPT4_profiled_simple_prompt)
misscl_cases_profiled = len(misclassified_cases_profiled_simple)
correct_clases_profiled = total_cases_profiled - misscl_cases_profiled

misclassified_cases_few_shot = _misclassified_indices(y_pred_GPT4_few_shot_prompt, y_test)
total_cases_few_shot = len(y_pred_GPT4_few_shot_prompt)
misscl_cases_few_shot = len(misclassified_cases_few_shot)
correct_clases_few_shot = total_cases_few_shot - misscl_cases_few_shot

misclassified_cases_vignette = _misclassified_indices(y_pred_GPT4_vignette_prompt, y_test)
total_cases_vignette = len(y_pred_GPT4_vignette_prompt)
misscl_cases_vignette = len(misclassified_cases_vignette)
correct_clases_vignette = total_cases_vignette - misscl_cases_vignette

In [33]:
def _case_summary(total, correct, wrong):
    """Build a one-row frame holding the total, correct and misclassified counts."""
    return pd.DataFrame({"total": [total], "correct": [correct], "missclassified": [wrong]})


# One summary frame per prompting strategy, each written to its own CSV
# (the row index is written as well)
simple_cases_df = _case_summary(total_cases_simple, correct_clases_simple, misscl_cases_simple)
simple_cases_df.to_csv("../exp/reasons_misclassifications_LLMs/simple_cases_GPT_df.csv", sep = ",", index = True)

class_def_cases_df = _case_summary(total_cases_class_def, correct_clases_class_def, misscl_cases_class_def)
class_def_cases_df.to_csv("../exp/reasons_misclassifications_LLMs/class_def_cases_GPT_df.csv", sep = ",", index = True)

profiled_cases_df = _case_summary(total_cases_profiled, correct_clases_profiled, misscl_cases_profiled)
profiled_cases_df.to_csv("../exp/reasons_misclassifications_LLMs/profiled_cases_GPT_df.csv", sep = ",", index = True)

few_shot_cases_df = _case_summary(total_cases_few_shot, correct_clases_few_shot, misscl_cases_few_shot)
few_shot_cases_df.to_csv("../exp/reasons_misclassifications_LLMs/few_shot_cases_GPT_df.csv", sep = ",", index = True)

vignette_cases_df = _case_summary(total_cases_vignette, correct_clases_vignette, misscl_cases_vignette)
vignette_cases_df.to_csv("../exp/reasons_misclassifications_LLMs/vignette_cases_GPT_df.csv", sep = ",", index = True)


In [40]:
# simple_prompt_reasons = []
# class_def_prompt_reasons = []
# profiled_simple_prompt_reasons = []
# few_shot_prompt_reasons = []
# vignette_prompt_reasons = []
#
# client = OpenAI(
#     api_key = os.environ.get("OPENAI_API_KEY"),
# )
#
# instruction_reason = "Please categorize why you misclassified the data. Respond only with the following categories as reasons for the misclassification in order to improve prompting. Possible categories are: \nLack of context (emphasize or indicate the context of the query), \nLack of examples (few-shot prompting with several examples of appropriate responses are shown before posing the actual question missing), \nLack of feedback (interactive refining the prompt), \nLack of counterfactual demonstrations (instances containing false facts to improve faithfulness in knowledge conflict situations), \nLack of opinion-based information (reframe the context as a narrator’s statement and inquire about the narrator’s opinions), \nKnowledge conflicts (memorized facts became outdated and counterfactual facts), \nPrediction with Abstention (model is uncertain about their predictions) \n \n Do not mention specific change (e.g., increase or decrease) in predictors, do not go into detail of this specific case and do not repeat the question. Only respond with one or multiple of the categories as reasons for the misclassification, separated by ','. Mention the most important category first."
#
# # iterate over the misclassified cases and save the response for each prompt in an array
# print("Simple prompt: \n \n")
# for i in misclassified_cases_simple:
#     response = client.responses.create(
#         model = "gpt-4o",
#         instructions = instruction_reason,
#         input = f"Misclassified case {i}: Prompt: {X_test_simple_prompt[i]} Response: {y_pred_GPT4_simple_prompt[i]} True label: {y_test.iloc[i]}"
#     )
#     simple_prompt_reasons.append(response.output_text)
#     print(response.output_text)
#
# print("\n \n Class definition prompt: \n \n")
# for i in misclassified_cases_class_def:
#     response = client.responses.create(
#         model = "gpt-4o",
#         instructions = instruction_reason,
#         input = f"Misclassified case {i}: Prompt: {X_test_class_definitions_prompt[i]} Response: {y_pred_GPT4_class_definition_prompt[i]} True label: {y_test.iloc[i]}"
#     )
#     class_def_prompt_reasons.append(response.output_text)
#     print(response.output_text)
#
# print("\n \n Profiled simple prompt: \n \n")
# for i in misclassified_cases_profiled_simple:
#     response = client.responses.create(
#         model = "gpt-4o",
#         instructions = instruction_reason,
#         input = f"Misclassified case {i}: Prompt: {X_test_profiled_simple_prompt[i]} Response: {y_pred_GPT4_profiled_simple_prompt[i]} True label: {y_test.iloc[i]}"
#     )
#     profiled_simple_prompt_reasons.append(response.output_text)
#     print(response.output_text)
#
# print("\n \n Few shot prompt: \n \n")
# for i in misclassified_cases_few_shot:
#     response = client.responses.create(
#         model = "gpt-4o",
#         instructions = instruction_reason,
#         input = f"Misclassified case {i}: Prompt: {X_test_few_shot_prompt[i]} Response: {y_pred_GPT4_few_shot_prompt[i]} True label: {y_test.iloc[i]}"
#     )
#     few_shot_prompt_reasons.append(response.output_text)
#     print(response.output_text)
#
# print("\n \n Vignette prompt: \n \n")
# for i in misclassified_cases_vignette:
#     response = client.responses.create(
#         model = "gpt-4o",
#         instructions = instruction_reason,
#         input = f"Misclassified case {i}: Prompt: {X_test_vignette_prompt[i]} Response: {y_pred_GPT4_vignette_prompt[i]} True label: {y_test.iloc[i]}"
#     )
#     vignette_prompt_reasons.append(response.output_text)
#     print(response.output_text)

Simple prompt: 
 

Lack of context, Lack of examples, Lack of feedback
Lack of context, Lack of feedback.
Lack of context
Lack of context, Lack of examples.
Lack of context, Lack of examples
Lack of context, Lack of examples
Lack of context, Lack of examples
Lack of context, Prediction with Abstention
Lack of context
Lack of context, Lack of examples, Prediction with Abstention
Lack of examples, Lack of context
Lack of context, Lack of examples, Prediction with Abstention
Lack of context
Lack of context, Lack of examples, Prediction with Abstention
Lack of context, Lack of examples.
Lack of context, Lack of examples, Prediction with Abstention
Knowledge conflicts
Lack of context
Lack of context, Lack of examples.
Lack of context, Lack of examples, Knowledge conflicts
Lack of context
Lack of context
Lack of context, Lack of examples.
Lack of context, Lack of examples
Lack of context, Lack of feedback
Lack of context
Lack of context
Lack of context, Lack of feedback
Lack of context, Lack

In [48]:
def _split_reasons(response):
    """Split one model response into a list of cleaned category names.

    The response is split on ',' — with or without a following space — so that
    neighbouring categories written as "A,B" are not fused into one string.
    Every character that is neither an ASCII letter nor whitespace (e.g. a
    trailing '.') is removed, surrounding whitespace is stripped, and parts
    that end up empty (empty response, trailing comma) are dropped.

    Args:
        response: raw text returned by the model for one misclassified case.

    Returns:
        list[str]: the cleaned category names in their original order.
    """
    cleaned = (re.sub(r'[^A-Za-z\s]', '', part).strip() for part in response.split(","))
    return [category for category in cleaned if category]


# One list of cleaned categories per misclassified case, for each prompting strategy.
# NOTE(review): the *_prompt_reasons lists are only created by a cell that is
# currently commented out — they must already exist in the session.
all_reasons_simple = [_split_reasons(response) for response in simple_prompt_reasons]
all_reasons_class_def = [_split_reasons(response) for response in class_def_prompt_reasons]
all_reasons_profiled_simple = [_split_reasons(response) for response in profiled_simple_prompt_reasons]
all_reasons_few_shot = [_split_reasons(response) for response in few_shot_prompt_reasons]
all_reasons_vignette = [_split_reasons(response) for response in vignette_prompt_reasons]

In [51]:
def _tally_reasons(reason_lists):
    """Count how often each category occurs across all per-case reason lists.

    Keys appear in the order in which a category is first encountered.
    """
    tally = {}
    for case_reasons in reason_lists:
        for category in case_reasons:
            tally[category] = tally.get(category, 0) + 1
    return tally


# Category counts per prompting strategy, each also wrapped in a one-column
# ('count') frame indexed by category name
simple_prompt_reasons_dict = _tally_reasons(all_reasons_simple)
simple_prompt_reasons_df = pd.DataFrame.from_dict(simple_prompt_reasons_dict, orient='index', columns=['count'])
# simple_prompt_reasons_df.to_csv("../exp/reasons_misclassifications_LLMs/simple_prompt_reasons.csv", sep = ",", index = True)

class_def_prompt_reasons_dict = _tally_reasons(all_reasons_class_def)
class_def_prompt_reasons_df = pd.DataFrame.from_dict(class_def_prompt_reasons_dict, orient='index', columns=['count'])
# class_def_prompt_reasons_df.to_csv("../exp/reasons_misclassifications_LLMs/class_def_prompt_reasons.csv", sep = ",", index = True)

profiled_simple_prompt_reasons_dict = _tally_reasons(all_reasons_profiled_simple)
profiled_simple_prompt_reasons_df = pd.DataFrame.from_dict(profiled_simple_prompt_reasons_dict, orient='index', columns=['count'])
# profiled_simple_prompt_reasons_df.to_csv("../exp/reasons_misclassifications_LLMs/profiled_simple_prompt_reasons.csv", sep = ",", index = True)

few_shot_prompt_reasons_dict = _tally_reasons(all_reasons_few_shot)
few_shot_prompt_reasons_df = pd.DataFrame.from_dict(few_shot_prompt_reasons_dict, orient='index', columns=['count'])
# few_shot_prompt_reasons_df.to_csv("../exp/reasons_misclassifications_LLMs/few_shot_prompt_reasons.csv", sep = ",", index = True)

vignette_prompt_reasons_dict = _tally_reasons(all_reasons_vignette)
vignette_prompt_reasons_df = pd.DataFrame.from_dict(vignette_prompt_reasons_dict, orient='index', columns=['count'])
# vignette_prompt_reasons_df.to_csv("../exp/reasons_misclassifications_LLMs/vignette_prompt_reasons.csv", sep = ",", index = True)

In [53]:
# Display the category counts tallied for the vignette prompt
vignette_prompt_reasons_df

Unnamed: 0,count
Lack of context,69
Prediction with Abstention,17
Lack of examples,20
Knowledge conflicts,18
Lack of counterfactual demonstrations,5
Lack of feedback,4


In [52]:
# Print the category counts of every prompting strategy; all but the last
# dictionary are followed by the "\n \n" separator
for reasons_dict in (
    simple_prompt_reasons_dict,
    class_def_prompt_reasons_dict,
    profiled_simple_prompt_reasons_dict,
    few_shot_prompt_reasons_dict,
):
    print(reasons_dict, "\n \n")
print(vignette_prompt_reasons_dict)

{'Lack of context': 85, 'Lack of examples': 48, 'Lack of feedback': 6, 'Prediction with Abstention': 18, 'Knowledge conflicts': 5, 'Lack of counterfactual demonstrations': 1} 
 

{'Lack of context': 85, 'Lack of examples': 17, 'Prediction with Abstention': 7, 'Lack of feedback': 4, 'Knowledge conflicts': 1} 
 

{'Lack of context': 78, 'Lack of examples': 46, 'Lack of feedback': 5, 'Prediction with Abstention': 5, 'Knowledge conflicts': 1, 'Lack of counterfactual demonstrations': 1} 
 

{'Lack of context': 83, 'Lack of examples': 84, 'Lack of feedback': 7, 'Knowledge conflicts': 1, 'Prediction with Abstention': 5} 
 

{'Lack of context': 69, 'Prediction with Abstention': 17, 'Lack of examples': 20, 'Knowledge conflicts': 18, 'Lack of counterfactual demonstrations': 5, 'Lack of feedback': 4}


### 2.2 Gemini (Google)

In [45]:
# client = genai.Client(api_key = os.environ.get("GEMINI_API_KEY"))
#
# response = client.models.generate_content(
#     model = "gemini-2.0-flash",
#     contents = "Explain how AI works in a few words",
# )
#
# print(response.text)

In [46]:
# client = genai.Client(api_key = os.environ.get("GEMINI_API_KEY"))
#
# response = client.models.generate_content(
#     model = "gemini-2.0-flash",
#     config = types.GenerateContentConfig(
#         system_instruction = simple_instruction),
#     contents = simple_prompt
# )

# # gemini-2.5-pro-preview-05-06

### 2.3 Gemma (Google)

### 2.4 Llama (Meta)

### 2.5 Claude (Anthropic)

### 2.6 DeepSeek

In [47]:
# client = OpenAI(api_key = os.environ.get("DeepSeek_API_Key"), base_url = "https://api.deepseek.com")
#
# response = client.chat.completions.create(
#     model="deepseek-chat",
#     messages=[
#         {"role": "system", "content": "You are a helpful assistant"},
#         {"role": "user", "content": "Hello"},
#     ],
#     stream=False
# )
#
# print(response.choices[0].message.content)