# LLM: Zero-shot classification through LLMs and prompts

**Models**:

- GPT-4o (OpenAI)
- Gemini (Google)
- Gemma (Google)
- Llama (Meta)
- Claude (Anthropic)
- DeepSeek


## 0 Imports

In [2]:
import os
import pandas as pd
import anthropic
import numpy as np
import time
import re
from openai import OpenAI
from google import genai
from google.genai import types
from sklearn.model_selection import train_test_split
from scipy.spatial.distance import pdist

In [3]:
X_test_simple_prompt_df = pd.read_csv("../dat/prompts/X_test_simple_prompt.csv", sep = ",", index_col = 0)
X_test_class_definitions_prompt_df = pd.read_csv("../dat/prompts/X_test_class_definitions_prompt.csv", sep = ",", index_col = 0)
X_test_profiled_simple_prompt_df = pd.read_csv("../dat/prompts/X_test_profiled_simple_prompt.csv", sep = ",", index_col = 0)
X_test_few_shot_prompt_df = pd.read_csv("../dat/prompts/X_test_few_shot_prompt.csv", sep = ",", index_col = 0)
X_test_vignette_prompt_df = pd.read_csv("../dat/prompts/X_test_vignette_prompt.csv", sep = ",", index_col = 0)
X_test_claude_prompt_df = pd.read_csv("../dat/prompts/X_test_claude_prompt.csv", sep = ",", index_col = 0)

In [4]:
# convert to arrays
X_test_simple_prompt = X_test_simple_prompt_df.values.flatten()
X_test_class_definitions_prompt = X_test_class_definitions_prompt_df.values.flatten()
X_test_profiled_simple_prompt = X_test_profiled_simple_prompt_df.values.flatten()
X_test_few_shot_prompt = X_test_few_shot_prompt_df.values.flatten()
X_test_vignette_prompt = X_test_vignette_prompt_df.values.flatten()
X_test_claude_prompt = X_test_claude_prompt_df.values.flatten()

In [6]:
simple_instruction = "Respond only with YES or NO."
class_definitions_instruction = "Respond only with YES or NO."
profiled_simple_instruction = "Respond only with YES or NO."
few_shot_instruction = "Respond only with YES or NO."
vignette_instruction = "Respond only with YES or NO."
claude_instruction = "You are an expert psychologist tasked with predicting whether an individual will develop a psychological disorder between two time points (T1 and T2) based on various psychological measures and demographic information. Your goal is to provide an accurate YES or NO prediction, supported by a brief explanation of your reasoning. Example output format: \n Prediction: [YES/NO] \n Explanation: [Brief explanation supporting your prediction]."


## 1 Zero-shot classification with LLMs

In this section, I will use the prompts created in the previous section to **classify the test set using different LLMs**. The LLMs will be used to classify whether a person develops a psychological disorder between time point T1 and T2.

### 1.1 ChatGPT-4o (OpenAI)

#### 1.1.1 Testing prompting

In [71]:
# client = OpenAI(
#     api_key = os.environ.get("OPENAI_API_KEY"),
# )
#
# # testing
# response = client.responses.create(
#     model = "gpt-4o-mini",
#     instructions = "You are a coding assistant that talks like a pirate.",
#     input = "How do I check if a Python object is an instance of a class?",
# )
#
# print(response.output_text)

#### 1.1.2. Prompting with ChatGPT-4o

In [72]:
# simple_prompt_array_GPT = []
#
# client = OpenAI(
#     api_key = os.environ.get("OPENAI_API_KEY"),
# )
#
# # measure time in seconds
# start = time.time()
#
# # iterate over the test set and save the response for each prompt in an array
# for prompt in X_test_simple_prompt:
#     response = client.responses.create(
#         model = "gpt-4o",
#         instructions = simple_instruction,
#         input = prompt,
#     )
#     simple_prompt_array_GPT.append(response.output_text)
#     print(response.output_text)
#
# end = time.time()
# print(f"Time taken: {end - start} seconds")
# time_GPT_simple_prompt = end - start
# time_GPT_simple_prompt_df = pd.DataFrame({"time": [time_GPT_simple_prompt]})
# time_GPT_simple_prompt_df.to_csv("../exp/times_LLMs/GPT4/time_GPT4_simple_prompt.csv", sep = ",", index = False)
#
# # value counts for array
# counts_simple_GPT = pd.Series(simple_prompt_array_GPT).value_counts()
# print(counts_simple_GPT)
#
# # convert YES to 1 and NO to 0
# simple_prompt_array_GPT = [1 if response == "YES" else 0 if response == "NO" else np.nan for response in simple_prompt_array_GPT]
# simple_prompt_array_GPT
#
# # save the array to a csv file
# simple_prompt_df_GPT = pd.DataFrame(simple_prompt_array_GPT, columns = ["y_pred"])
# simple_prompt_df_GPT.to_csv("../exp/preds_LLMs/GPT4/y_pred_GPT4_simple_prompt.csv", sep = ",", index = False)

In [73]:
# class_def_array_GPT = []
#
# client = OpenAI(
#     api_key = os.environ.get("OPENAI_API_KEY"),
# )
#
# # measure time in seconds
# start = time.time()
#
# # iterate over the test set and save the response for each prompt in an array
# for prompt in X_test_class_definitions_prompt:
#     response = client.responses.create(
#         model = "gpt-4o",
#         instructions = class_definitions_instruction,
#         input = prompt,
#     )
#     class_def_array_GPT.append(response.output_text)
#     print(response.output_text)
#
# end = time.time()
# print(f"Time taken: {end - start} seconds")
# time_GPT_class_definitions = end - start
# time_GPT_class_definitions_df = pd.DataFrame({"time": [time_GPT_class_definitions]})
# time_GPT_class_definitions_df.to_csv("../exp/times_LLMs/GPT4/time_GPT4_class_definitions_prompt.csv", sep = ",", index = False)
#
# # value counts for array
# counts_class_def_GPT = pd.Series(class_def_array_GPT).value_counts()
# print(counts_class_def_GPT)
#
# # convert YES to 1 and NO to 0
# class_def_array_GPT = [1 if response == "YES" else 0 for response in class_def_array_GPT]
# class_def_array_GPT
#
# # save the array to a csv file
# class_def_df_GPT = pd.DataFrame(class_def_array_GPT, columns = ["y_pred"])
# class_def_df_GPT.to_csv("../exp/preds_LLMs/GPT4/y_pred_GPT4_class_definitions_prompt.csv", sep = ",", index = False)

In [74]:
# profiled_simple_array_GPT = []
#
# client = OpenAI(
#     api_key = os.environ.get("OPENAI_API_KEY"),
# )
#
# # measure time in seconds
# start = time.time()
#
# # iterate over the test set and save the response for each prompt in an array
# for prompt in X_test_profiled_simple_prompt:
#     response = client.responses.create(
#         model = "gpt-4o",
#         instructions = profiled_simple_instruction,
#         input = prompt,
#     )
#     profiled_simple_array_GPT.append(response.output_text)
#     print(response.output_text)
#
# end = time.time()
# print(f"Time taken: {end - start} seconds")
# time_GPT_profiled_simple = end - start
# time_GPT_profiled_simple_df = pd.DataFrame({"time": [time_GPT_profiled_simple]})
# time_GPT_profiled_simple_df.to_csv("../exp/times_LLMs/GPT4/time_GPT4_profiled_simple_prompt.csv", sep = ",", index = False)
#
# # value counts for array
# counts_profiled_simple_GPT = pd.Series(profiled_simple_array_GPT).value_counts()
# print(counts_profiled_simple_GPT)
#
# # convert YES to 1 and NO to 0
# profiled_simple_array_GPT_val = [1 if response == "YES" else 0 for response in profiled_simple_array_GPT]
# profiled_simple_array_GPT_val
#
# # save the array to a csv file
# profiled_simple_df_GPT = pd.DataFrame(profiled_simple_array_GPT_val, columns = ["y_pred"])
# profiled_simple_df_GPT.to_csv("../exp/preds_LLMs/GPT4/y_pred_GPT4_profiled_simple_prompt.csv", sep = ",", index = False)

In [75]:
# few_shot_array_GPT = []
#
# client = OpenAI(
#     api_key = os.environ.get("OPENAI_API_KEY"),
# )
#
# # measure time in seconds
# start = time.time()
#
# # iterate over the test set and save the response for each prompt in an array
# for prompt in X_test_few_shot_prompt:
#     response = client.responses.create(
#         model = "gpt-4o",
#         instructions = few_shot_instruction,
#         input = prompt,
#     )
#     few_shot_array_GPT.append(response.output_text)
#     print(response.output_text)
#
# end = time.time()
# print(f"Time taken: {end - start} seconds")
# time_GPT_few_shot = end - start
# time_GPT_few_shot_df = pd.DataFrame({"time": [time_GPT_few_shot]})
# time_GPT_few_shot_df.to_csv("../exp/times_LLMs/GPT4/time_GPT4_few_shot_prompt.csv", sep = ",", index = False)
#
# # value counts for array
# counts_few_shot_GPT = pd.Series(few_shot_array_GPT).value_counts()
# print(counts_few_shot_GPT)
#
# # convert YES to 1 and NO to 0
# few_shot_array_GPT_val = [1 if response == "YES" else 0 for response in few_shot_array_GPT]
# few_shot_array_GPT_val
#
# # save the array to a csv file
# few_shot_df_GPT = pd.DataFrame(few_shot_array_GPT_val, columns = ["y_pred"])
# few_shot_df_GPT.to_csv("../exp/preds_LLMs/GPT4/y_pred_GPT4_few_shot_prompt.csv", sep = ",", index = False)

In [76]:
# vignette_array_GPT = []
#
# client = OpenAI(
#     api_key = os.environ.get("OPENAI_API_KEY"),
# )
#
# # measure time in seconds
# start = time.time()
#
# # iterate over the test set and save the response for each prompt in an array
# for prompt in X_test_vignette_prompt:
#     response = client.responses.create(
#         model = "gpt-4o",
#         instructions = vignette_instruction,
#         input = prompt,
#     )
#     vignette_array_GPT.append(response.output_text)
#     print(response.output_text)
#
# end = time.time()
# print(f"Time taken: {end - start} seconds")
# time_GPT_vignette = end - start
# time_GPT_vignette_df = pd.DataFrame({"time": [time_GPT_vignette]})
# time_GPT_vignette_df.to_csv("../exp/times_LLMs/GPT4/time_GPT4_vignette_prompt.csv", sep = ",", index = False)
#
# # value counts for array
# counts_vignette_GPT = pd.Series(vignette_array_GPT).value_counts()
# print(counts_vignette_GPT)
#
# # convert YES to 1 and NO to 0
# vignette_array_GPT_val = [1 if response == "YES" else 0 for response in vignette_array_GPT]
# vignette_array_GPT_val
#
# # save the array to a csv file
# vignette_df_GPT = pd.DataFrame(vignette_array_GPT_val, columns = ["y_pred"])
# vignette_df_GPT.to_csv("../exp/preds_LLMs/GPT4/y_pred_GPT4_vignette_prompt.csv", sep = ",", index = False)

#### 1.1.3 Misclassified cases reasons

In [77]:
# y_pred_GPT4_simple_prompt = pd.read_csv("../exp/preds_LLMs/y_pred_GPT4_simple_prompt.csv", sep = ",")
# y_pred_GPT4_class_definition_prompt = pd.read_csv("../exp/preds_LLMs/y_pred_GPT4_class_definitions_prompt.csv", sep = ",")
# y_pred_GPT4_profiled_simple_prompt = pd.read_csv("../exp/preds_LLMs/y_pred_GPT4_profiled_simple_prompt.csv", sep = ",")
# y_pred_GPT4_few_shot_prompt = pd.read_csv("../exp/preds_LLMs/y_pred_GPT4_few_shot_prompt.csv", sep = ",")
# y_pred_GPT4_vignette_prompt = pd.read_csv("../exp/preds_LLMs/y_pred_GPT4_vignette_prompt.csv", sep = ",")
#
# # convert to array
# y_pred_GPT4_simple_prompt = y_pred_GPT4_simple_prompt["y_pred"].to_numpy()
# y_pred_GPT4_class_definition_prompt = y_pred_GPT4_class_definition_prompt["y_pred"].to_numpy()
# y_pred_GPT4_profiled_simple_prompt = y_pred_GPT4_profiled_simple_prompt["y_pred"].to_numpy()
# y_pred_GPT4_few_shot_prompt = y_pred_GPT4_few_shot_prompt["y_pred"].to_numpy()
# y_pred_GPT4_vignette_prompt = y_pred_GPT4_vignette_prompt["y_pred"].to_numpy()

In [78]:
# # indentify misclassified cases by comparing y_pred_GPT4_XXX and y_test, save index
# misclassified_cases_simple = []
# misclassified_cases_class_def = []
# misclassified_cases_profiled_simple = []
# misclassified_cases_few_shot = []
# misclassified_cases_vignette = []
#
# for i in range(len(y_pred_GPT4_simple_prompt)):
#     if y_pred_GPT4_simple_prompt[i] != y_test.iloc[i]:
#         misclassified_cases_simple.append(i)
# total_cases_simple = len(y_pred_GPT4_simple_prompt)
# misscl_cases_simple = len(misclassified_cases_simple)
# correct_clases_simple = total_cases_simple - misscl_cases_simple
#
# for i in range(len(y_pred_GPT4_class_definition_prompt)):
#     if y_pred_GPT4_class_definition_prompt[i] != y_test.iloc[i]:
#         misclassified_cases_class_def.append(i)
# total_cases_class_def = len(y_pred_GPT4_class_definition_prompt)
# misscl_cases_class_def = len(misclassified_cases_class_def)
# correct_clases_class_def = total_cases_class_def - misscl_cases_class_def
#
# for i in range(len(y_pred_GPT4_profiled_simple_prompt)):
#     if y_pred_GPT4_profiled_simple_prompt[i] != y_test.iloc[i]:
#         misclassified_cases_profiled_simple.append(i)
# total_cases_profiled = len(y_pred_GPT4_profiled_simple_prompt)
# misscl_cases_profiled = len(misclassified_cases_profiled_simple)
# correct_clases_profiled = total_cases_profiled - misscl_cases_profiled
#
# for i in range(len(y_pred_GPT4_few_shot_prompt)):
#     if y_pred_GPT4_few_shot_prompt[i] != y_test.iloc[i]:
#         misclassified_cases_few_shot.append(i)
# total_cases_few_shot = len(y_pred_GPT4_few_shot_prompt)
# misscl_cases_few_shot = len(misclassified_cases_few_shot)
# correct_clases_few_shot = total_cases_few_shot - misscl_cases_few_shot
#
# for i in range(len(y_pred_GPT4_vignette_prompt)):
#     if y_pred_GPT4_vignette_prompt[i] != y_test.iloc[i]:
#         misclassified_cases_vignette.append(i)
# total_cases_vignette = len(y_pred_GPT4_vignette_prompt)
# misscl_cases_vignette = len(misclassified_cases_vignette)
# correct_clases_vignette = total_cases_vignette - misscl_cases_vignette

In [79]:
# # save as df with total, correct and missclassified cases
# simple_cases_df = pd.DataFrame({"total": [total_cases_simple], "correct": [correct_clases_simple], "missclassified": [misscl_cases_simple]})
# simple_cases_df.to_csv("../exp/reasons_misclassifications_LLMs/GPT4/simple_cases_GPT_df.csv", sep = ",", index = True)
#
# class_def_cases_df = pd.DataFrame({"total": [total_cases_class_def], "correct": [correct_clases_class_def], "missclassified": [misscl_cases_class_def]})
# class_def_cases_df.to_csv("../exp/reasons_misclassifications_LLMs/GPT4/class_def_cases_GPT_df.csv", sep = ",", index = True)
#
# profiled_cases_df = pd.DataFrame({"total": [total_cases_profiled], "correct": [correct_clases_profiled], "missclassified": [misscl_cases_profiled]})
# profiled_cases_df.to_csv("../exp/reasons_misclassifications_LLMs/GPT4/profiled_cases_GPT_df.csv", sep = ",", index = True)
#
# few_shot_cases_df = pd.DataFrame({"total": [total_cases_few_shot], "correct": [correct_clases_few_shot], "missclassified": [misscl_cases_few_shot]})
# few_shot_cases_df.to_csv("../exp/reasons_misclassifications_LLMs/GPT4/few_shot_cases_GPT_df.csv", sep = ",", index = True)
#
# vignette_cases_df = pd.DataFrame({"total": [total_cases_vignette], "correct": [correct_clases_vignette], "missclassified": [misscl_cases_vignette]})
# vignette_cases_df.to_csv("../exp/reasons_misclassifications_LLMs/GPT4/vignette_cases_GPT_df.csv", sep = ",", index = True)


In [80]:
# simple_prompt_reasons = []
# class_def_prompt_reasons = []
# profiled_simple_prompt_reasons = []
# few_shot_prompt_reasons = []
# vignette_prompt_reasons = []
#
# client = OpenAI(
#     api_key = os.environ.get("OPENAI_API_KEY"),
# )
#
# instruction_reason = "Please categorize why you misclassified the data. Respond only with the following categories as reasons for the misclassification in order to improve prompting. Possible categories are: \nLack of context (emphasize or indicate the context of the query), \nLack of examples (few-shot prompting with several examples of appropriate responses are shown before posing the actual question missing), \nLack of feedback (interactive refining the prompt), \nLack of counterfactual demonstrations (instances containing false facts to improve faithfulness in knowledge conflict situations), \nLack of opinion-based information (reframe the context as a narrator’s statement and inquire about the narrator’s opinions), \nKnowledge conflicts (memorized facts became outdated and counterfactual facts), \nPrediction with Abstention (model is uncertain about their predictions) \n \n Do not mention specific change (e.g., increase or decrease) in predictors, do not go into detail of this specific case and do not repeat the question. Only respond with one or multiple of the categories as reasons for the misclassification, separated by ','. Mention the most important category first."
#
# # iterate over the misclassified cases and save the response for each prompt in an array
# print("Simple prompt: \n \n")
# for i in misclassified_cases_simple:
#     response = client.responses.create(
#         model = "gpt-4o",
#         instructions = instruction_reason,
#         input = f"Misclassified case {i}: Prompt: {X_test_simple_prompt[i]} Response: {y_pred_GPT4_simple_prompt[i]} True label: {y_test.iloc[i]}"
#     )
#     simple_prompt_reasons.append(response.output_text)
#     print(response.output_text)
#
# print("\n \n Class definition prompt: \n \n")
# for i in misclassified_cases_class_def:
#     response = client.responses.create(
#         model = "gpt-4o",
#         instructions = instruction_reason,
#         input = f"Misclassified case {i}: Prompt: {X_test_class_definitions_prompt[i]} Response: {y_pred_GPT4_class_definition_prompt[i]} True label: {y_test.iloc[i]}"
#     )
#     class_def_prompt_reasons.append(response.output_text)
#     print(response.output_text)
#
# print("\n \n Profiled simple prompt: \n \n")
# for i in misclassified_cases_profiled_simple:
#     response = client.responses.create(
#         model = "gpt-4o",
#         instructions = instruction_reason,
#         input = f"Misclassified case {i}: Prompt: {X_test_profiled_simple_prompt[i]} Response: {y_pred_GPT4_profiled_simple_prompt[i]} True label: {y_test.iloc[i]}"
#     )
#     profiled_simple_prompt_reasons.append(response.output_text)
#     print(response.output_text)
#
# print("\n \n Few shot prompt: \n \n")
# for i in misclassified_cases_few_shot:
#     response = client.responses.create(
#         model = "gpt-4o",
#         instructions = instruction_reason,
#         input = f"Misclassified case {i}: Prompt: {X_test_few_shot_prompt[i]} Response: {y_pred_GPT4_few_shot_prompt[i]} True label: {y_test.iloc[i]}"
#     )
#     few_shot_prompt_reasons.append(response.output_text)
#     print(response.output_text)
#
# print("\n \n Vignette prompt: \n \n")
# for i in misclassified_cases_vignette:
#     response = client.responses.create(
#         model = "gpt-4o",
#         instructions = instruction_reason,
#         input = f"Misclassified case {i}: Prompt: {X_test_vignette_prompt[i]} Response: {y_pred_GPT4_vignette_prompt[i]} True label: {y_test.iloc[i]}"
#     )
#     vignette_prompt_reasons.append(response.output_text)
#     print(response.output_text)

In [81]:
# all_reasons_simple = []
# all_reasons_class_def = []
# all_reasons_profiled_simple = []
# all_reasons_few_shot = []
# all_reasons_vignette = []
#
# for reason in simple_prompt_reasons:
#     reason = reason.split(", ")
#     reason = [re.sub(r'[^A-Za-z\s]', '', r).strip() for r in reason]
#     all_reasons_simple.append(reason)
#
# for reason in class_def_prompt_reasons:
#     reason = reason.split(", ")
#     reason = [re.sub(r'[^A-Za-z\s]', '', r).strip() for r in reason]
#     all_reasons_class_def.append(reason)
#
# for reason in profiled_simple_prompt_reasons:
#     reason = reason.split(", ")
#     reason = [re.sub(r'[^A-Za-z\s]', '', r).strip() for r in reason]
#     all_reasons_profiled_simple.append(reason)
#
# for reason in few_shot_prompt_reasons:
#     reason = reason.split(", ")
#     reason = [re.sub(r'[^A-Za-z\s]', '', r).strip() for r in reason]
#     all_reasons_few_shot.append(reason)
#
# for reason in vignette_prompt_reasons:
#     reason = reason.split(", ")
#     reason = [re.sub(r'[^A-Za-z\s]', '', r).strip() for r in reason]
#     all_reasons_vignette.append(reason)

In [82]:
# simple_prompt_reasons_dict = {}
# class_def_prompt_reasons_dict = {}
# profiled_simple_prompt_reasons_dict = {}
# few_shot_prompt_reasons_dict = {}
# vignette_prompt_reasons_dict = {}
#
# for i in all_reasons_simple:
#     for j in i:
#         # count the occurrences of each reason
#         if j in simple_prompt_reasons_dict:
#             simple_prompt_reasons_dict[j] += 1
#         else:
#             simple_prompt_reasons_dict[j] = 1
# simple_prompt_reasons_df = pd.DataFrame.from_dict(simple_prompt_reasons_dict, orient='index', columns=['count'])
# # simple_prompt_reasons_df.to_csv("../exp/reasons_misclassifications_LLMs/GPT4/simple_prompt_reasons.csv", sep = ",", index = True)
#
#
# for i in all_reasons_class_def:
#     for j in i:
#         # count the occurrences of each reason
#         if j in class_def_prompt_reasons_dict:
#             class_def_prompt_reasons_dict[j] += 1
#         else:
#             class_def_prompt_reasons_dict[j] = 1
# class_def_prompt_reasons_df = pd.DataFrame.from_dict(class_def_prompt_reasons_dict, orient='index', columns=['count'])
# # class_def_prompt_reasons_df.to_csv("../exp/reasons_misclassifications_LLMs/GPT4/class_def_prompt_reasons.csv", sep = ",", index = True)
#
# for i in all_reasons_profiled_simple:
#     for j in i:
#         # count the occurrences of each reason
#         if j in profiled_simple_prompt_reasons_dict:
#             profiled_simple_prompt_reasons_dict[j] += 1
#         else:
#             profiled_simple_prompt_reasons_dict[j] = 1
# profiled_simple_prompt_reasons_df = pd.DataFrame.from_dict(profiled_simple_prompt_reasons_dict, orient='index', columns=['count'])
# # profiled_simple_prompt_reasons_df.to_csv("../exp/reasons_misclassifications_LLMs/GPT4/profiled_simple_prompt_reasons.csv", sep = ",", index = True)
#
#
# for i in all_reasons_few_shot:
#     for j in i:
#         # count the occurrences of each reason
#         if j in few_shot_prompt_reasons_dict:
#             few_shot_prompt_reasons_dict[j] += 1
#         else:
#             few_shot_prompt_reasons_dict[j] = 1
# few_shot_prompt_reasons_df = pd.DataFrame.from_dict(few_shot_prompt_reasons_dict, orient='index', columns=['count'])
# # few_shot_prompt_reasons_df.to_csv("../exp/reasons_misclassifications_LLMs/GPT4/few_shot_prompt_reasons.csv", sep = ",", index = True)
#
# for i in all_reasons_vignette:
#     for j in i:
#         # count the occurrences of each reason
#         if j in vignette_prompt_reasons_dict:
#             vignette_prompt_reasons_dict[j] += 1
#         else:
#             vignette_prompt_reasons_dict[j] = 1
# vignette_prompt_reasons_df = pd.DataFrame.from_dict(vignette_prompt_reasons_dict, orient='index', columns=['count'])
# # vignette_prompt_reasons_df.to_csv("../exp/reasons_misclassifications_LLMs/GPT4/vignette_prompt_reasons.csv", sep = ",", index = True)

In [83]:
# print(simple_prompt_reasons_dict, "\n \n")
# print(class_def_prompt_reasons_dict, "\n \n")
# print(profiled_simple_prompt_reasons_dict, "\n \n")
# print(few_shot_prompt_reasons_dict, "\n \n")
# print(vignette_prompt_reasons_dict)

### 1.2 Gemini (Google)

#### 1.2.1 Testing prompting

In [84]:
# client = genai.Client(api_key = os.environ.get("GEMINI_API_KEY"))
#
# response = client.models.generate_content(
#     model = "gemini-2.0-flash",
#     contents = "Explain how AI works in a few words",
# )
#
# print(response.text)

In [85]:
# client = genai.Client(api_key = os.environ.get("GEMINI_API_KEY"))
#
# response = client.models.generate_content(
#     model = "gemini-2.0-flash",
#     config = types.GenerateContentConfig(
#         system_instruction = simple_instruction),
#     contents = simple_prompt
# )
#
# # gemini-2.5-pro-preview-05-06

### 1.3 Gemma (Google)

### 1.4 Llama (Meta)

### 1.5 Claude (Anthropic)

#### 1.5.1 Testing prompting

In [21]:
# client = anthropic.Anthropic(api_key = os.environ.get("ANTHROPIC_API_KEY"))
#
# message = client.messages.create(
#     model = "claude-3-7-sonnet-20250219",
#     max_tokens = 20000,
#     temperature = 1,
#     thinking = {
#         "type": "enabled",
#         "budget_tokens": 16000
#     },
#     system = claude_instruction,
#     messages = [
#         {
#             "role": "user",
#             "content": [
#                 {
#                     "type": "text",
#                     "text": X_test_claude_prompt[0]
#                 }
#             ]
#         }
#     ]
# )
# print(message.content)

[ThinkingBlock(signature='ErUBCkYIAxgCIkDffTxBOdm1xHuuprm6mOE0hHT/lSo/htgiGsrgKTObUT63OVxdYuLoOiRRT/F9rkti/Tj/X9GS3uPmBHFpJsqfEgyylE8MaEDQcWZshF4aDMDoSXKovP4KDsAtniIwvAzWRfPaCRykRCslJUMmbaAfvuDhxom5rPN4ppwl+/h5Q3/FvdgH5UIn1bevv49gKh3EP6HsuEP6sjr3HKelTCOkvkEaD5wDrvWha8+n7hgC', thinking="Let me conduct a thorough psychological assessment based on the provided variables.\n\n<psychological_assessment>\nI'll first categorize the variables into risk factors and protective factors at baseline (T1), then analyze the changes between T1 and T2, and finally make an overall assessment.\n\n**Baseline (T1) Assessment:**\n\n**Protective Factors at T1:**\n- T1 Social support: 0.1421 (positive value indicates good social support)\n- T1 General self-efficacy: 0.3650 (positive value indicates good self-efficacy)\n- T1 Life satisfaction: 0.3373 (positive value indicates good life satisfaction)\n- T1 Problem-focused coping: 1.7319 (strong positive value indicates good coping skills)\n- T1 Emotion-focused c

In [18]:
# print(message.content[0].thinking)

Let me analyze the available data to determine if this person develops a psychological disorder between time points T1 and T2.

First, I'll look at the baseline (T1) measures:
- T1 Positive mental health: -0.0279170753483525 (slightly below average)
- T1 Social support: 0.1421238143169474 (slightly above average)
- T1 General self-efficacy: 0.3649793457412237 (above average)
- T1 Life satisfaction: 0.3372886835461141 (above average)
- T1 Stress: 0.4419361727222826 (above average)
- T1 Problem-focused coping: 1.7319368683783989 (well above average)
- T1 Emotion-focused coping: 0.2078300133169115 (slightly above average)
- T1 Anxiety sensitivity: 0.1594156886399411 (slightly above average)
- T1 Fear of bodily sensations: 0.2863750811390516 (above average)
- T1 Dysfunctional attitudes: 0.2750686254386546 (above average)
- T1 General psychopathology (GSI): 0.0172227087467131 (very slightly above average)
- Education: 2.0
- T1 BMI: 1.0
- Socioeconomic status: 2.0

Now I'll look at the chang

In [30]:
prediction = re.findall(r'Prediction: (.*)', message.content[1].text)
prediction[0]

IndexError: list index out of range

In [23]:
# extract what comes after Explanation:
explanation = re.findall(r'Explanation: (.*)', message.content[1].text)
explanation[0]

'Despite some concerning trends like decreased positive mental health (-0.75) and increased dysfunctional attitudes (0.48), the substantial reduction in general psychopathology (GSI: -0.83) and anxiety-related measures combined with improved social support (0.71) strongly indicate the individual did not develop a psychological disorder between timepoints.'

#### 1.5.2 Prompting with Claude 3.7 Sonnet

In [39]:
# simple_prompt_array_claude = []
# simple_prompt_thinking_claude = []
#
# client = anthropic.Anthropic(api_key = os.environ.get("ANTHROPIC_API_KEY"))
#
# # measure time in seconds
# start = time.time()
#
# # iterate over the test set and save the response for each prompt in an array
# for prompt in X_test_simple_prompt:
#     message = client.messages.create(
#         model = "claude-3-7-sonnet-20250219",
#         max_tokens = 20000,
#         temperature = 1,
#         thinking = {
#             "type": "enabled",
#             "budget_tokens": 16000
#         },
#         system = simple_instruction,
#         messages = [
#             {
#                 "role": "user",
#                 "content": [
#                     {
#                         "type": "text",
#                         "text": prompt
#                     }
#                 ]
#             }
#         ]
#     )
#     simple_prompt_array_claude.append(message.content[1].text)
#     simple_prompt_thinking_claude.append(message.content[0].thinking)
#     print(message.content[1].text)
#
# end = time.time()
# print(f"Time taken: {end - start} seconds")
# time_claude_simple_prompt = end - start
# time_claude_simple_prompt_df = pd.DataFrame({"time": [time_claude_simple_prompt]})
# time_claude_simple_prompt_df.to_csv("../exp/times_LLMs/Claude/time_claude_simple_prompt.csv", sep = ",", index = False)
#
# # value counts for array
# counts_simple_claude = pd.Series(simple_prompt_array_claude).value_counts()
# print(counts_simple_claude)
#
# # convert YES to 1 and NO to 0
# simple_prompt_array_claude = [1 if response == "YES" else 0 if response == "NO" else np.nan for response in simple_prompt_array_claude]
#
# # save the array to a csv file
# simple_prompt_df_claude = pd.DataFrame(simple_prompt_array_claude, columns = ["y_pred"])
# simple_prompt_df_claude.to_csv("../exp/preds_LLMs/Claude/y_pred_claude_simple_prompt.csv", sep = ",", index = False)
#
# simple_prompt_df_thinking_claude = pd.DataFrame(simple_prompt_thinking_claude, columns = ["thinking"])
# simple_prompt_df_thinking_claude.to_csv("../exp/preds_LLMs/Claude/Thinking/thinking_claude_simple_prompt.csv", sep = ",", index = False)

In [40]:
# class_def_array_claude = []
# class_def_thinking_claude = []
#
# client = anthropic.Anthropic(api_key = os.environ.get("ANTHROPIC_API_KEY"))
#
# # measure time in seconds
# start = time.time()
#
# # iterate over the test set and save the response for each prompt in an array
# for prompt in X_test_class_definitions_prompt:
#     message = client.messages.create(
#         model = "claude-3-7-sonnet-20250219",
#         max_tokens = 20000,
#         temperature = 1,
#         thinking = {
#             "type": "enabled",
#             "budget_tokens": 16000
#         },
#         system = class_definitions_instruction,
#         messages = [
#             {
#                 "role": "user",
#                 "content": [
#                     {
#                         "type": "text",
#                         "text": prompt
#                     }
#                 ]
#             }
#         ]
#     )
#     class_def_array_claude.append(message.content[1].text)
#     class_def_thinking_claude.append(message.content[0].thinking)
#     print(message.content[1].text)
#
# end = time.time()
# print(f"Time taken: {end - start} seconds")
# time_claude_class_definitions = end - start
# time_claude_class_definitions_df = pd.DataFrame({"time": [time_claude_class_definitions]})
# time_claude_class_definitions_df.to_csv("../exp/times_LLMs/Claude/time_claude_class_definitions_prompt.csv", sep = ",", index = False)
#
# # value counts for array
# counts_class_def_claude = pd.Series(class_def_array_claude).value_counts()
# print(counts_class_def_claude)
#
# # convert YES to 1 and NO to 0
# class_def_array_claude = [1 if response == "YES" else 0 for response in class_def_array_claude]
#
# # save the array to a csv file
# class_def_df_claude = pd.DataFrame(class_def_array_claude, columns = ["y_pred"])
# class_def_df_claude.to_csv("../exp/preds_LLMs/Claude/y_pred_claude_class_definitions_prompt.csv", sep = ",", index = False)
#
# class_def_prompt_df_thinking_claude = pd.DataFrame(class_def_thinking_claude, columns = ["thinking"])
# class_def_prompt_df_thinking_claude.to_csv("../exp/preds_LLMs/Claude/Thinking/thinking_claude_class_def_prompt.csv", sep = ",", index = False)

In [41]:
# profiled_simple_array_claude = []
# profiled_simple_thinking_claude = []
#
# client = anthropic.Anthropic(api_key = os.environ.get("ANTHROPIC_API_KEY"))
#
# # measure time in seconds
# start = time.time()
#
# # iterate over the test set and save the response for each prompt in an array
# for prompt in X_test_profiled_simple_prompt:
#     message = client.messages.create(
#         model = "claude-3-7-sonnet-20250219",
#         max_tokens = 20000,
#         temperature = 1,
#         thinking = {
#             "type": "enabled",
#             "budget_tokens": 16000
#         },
#         system = profiled_simple_instruction,
#         messages = [
#             {
#                 "role": "user",
#                 "content": [
#                     {
#                         "type": "text",
#                         "text": prompt
#                     }
#                 ]
#             }
#         ]
#     )
#     profiled_simple_array_claude.append(message.content[1].text)
#     profiled_simple_thinking_claude.append(message.content[0].thinking)
#     print(message.content[1].text)
#
# end = time.time()
# print(f"Time taken: {end - start} seconds")
# time_claude_profiled_simple = end - start
# time_claude_profiled_simple_df = pd.DataFrame({"time": [time_claude_profiled_simple]})
# time_claude_profiled_simple_df.to_csv("../exp/times_LLMs/Claude/time_claude_profiled_simple_prompt.csv", sep = ",", index = False)
#
# # value counts for array
# counts_profiled_simple_claude = pd.Series(profiled_simple_array_claude).value_counts()
# print(counts_profiled_simple_claude)
#
# # convert YES to 1 and NO to 0
# profiled_simple_array_claude_val = [1 if response == "YES" else 0 for response in profiled_simple_array_claude]
#
# # save the array to a csv file
# profiled_simple_df_claude = pd.DataFrame(profiled_simple_array_claude_val, columns = ["y_pred"])
# profiled_simple_df_claude.to_csv("../exp/preds_LLMs/Claude/y_pred_claude_profiled_simple_prompt.csv", sep = ",", index = False)
#
# profiled_simple_prompt_df_thinking_claude = pd.DataFrame(profiled_simple_thinking_claude, columns = ["thinking"])
# profiled_simple_prompt_df_thinking_claude.to_csv("../exp/preds_LLMs/Claude/Thinking/thinking_claude_profiled_simple_prompt.csv", sep = ",", index = False)

In [42]:
# few_shot_array_claude = []
# few_shot_thinking_claude = []
#
# client = anthropic.Anthropic(api_key = os.environ.get("ANTHROPIC_API_KEY"))
#
# # measure time in seconds
# start = time.time()
#
# # iterate over the test set and save the response for each prompt in an array
# for prompt in X_test_few_shot_prompt:
#     message = client.messages.create(
#         model = "claude-3-7-sonnet-20250219",
#         max_tokens = 20000,
#         temperature = 1,
#         thinking = {
#             "type": "enabled",
#             "budget_tokens": 16000
#         },
#         system = few_shot_instruction,
#         messages = [
#             {
#                 "role": "user",
#                 "content": [
#                     {
#                         "type": "text",
#                         "text": prompt
#                     }
#                 ]
#             }
#         ]
#     )
#     few_shot_array_claude.append(message.content[1].text)
#     few_shot_thinking_claude.append(message.content[0].thinking)
#     print(message.content[1].text)
#
# end = time.time()
# print(f"Time taken: {end - start} seconds")
# time_claude_few_shot = end - start
# time_claude_few_shot_df = pd.DataFrame({"time": [time_claude_few_shot]})
# time_claude_few_shot_df.to_csv("../exp/times_LLMs/Claude/time_claude_few_shot_prompt.csv", sep = ",", index = False)
#
# # value counts for array
# counts_few_shot_claude = pd.Series(few_shot_array_claude).value_counts()
# print(counts_few_shot_claude)
#
# # convert YES to 1 and NO to 0
# few_shot_array_claude_val = [1 if response == "YES" else 0 for response in few_shot_array_claude]
#
# # save the array to a csv file
# few_shot_df_claude = pd.DataFrame(few_shot_array_claude_val, columns = ["y_pred"])
# few_shot_df_claude.to_csv("../exp/preds_LLMs/Claude/y_pred_claude_few_shot_prompt.csv", sep = ",", index = False)
#
# few_shot_prompt_df_thinking_claude = pd.DataFrame(few_shot_thinking_claude, columns = ["thinking"])
# few_shot_prompt_df_thinking_claude.to_csv("../exp/preds_LLMs/Claude/Thinking/thinking_claude_few_shot_prompt.csv", sep = ",", index = False)

In [43]:
# vignette_array_claude = []
# vignette_thinking_claude = []
#
# client = anthropic.Anthropic(api_key = os.environ.get("ANTHROPIC_API_KEY"))
#
# # measure time in seconds
# start = time.time()
#
# # iterate over the test set and save the response for each prompt in an array
# for prompt in X_test_vignette_prompt:
#     message = client.messages.create(
#         model = "claude-3-7-sonnet-20250219",
#         max_tokens = 20000,
#         temperature = 1,
#         thinking = {
#             "type": "enabled",
#             "budget_tokens": 16000
#         },
#         system = vignette_instruction,
#         messages = [
#             {
#                 "role": "user",
#                 "content": [
#                     {
#                         "type": "text",
#                         "text": prompt
#                     }
#                 ]
#             }
#         ]
#     )
#     vignette_array_claude.append(message.content[1].text)
#     vignette_thinking_claude.append(message.content[0].thinking)
#     print(message.content[1].text)
#
# end = time.time()
# print(f"Time taken: {end - start} seconds")
# time_claude_vignette = end - start
# time_claude_vignette_df = pd.DataFrame({"time": [time_claude_vignette]})
# time_claude_vignette_df.to_csv("../exp/times_LLMs/Claude/time_claude_vignette_prompt.csv", sep = ",", index = False)
#
# # value counts for array
# counts_vignette_claude = pd.Series(vignette_array_claude).value_counts()
# print(counts_vignette_claude)
#
# # convert YES to 1 and NO to 0
# vignette_array_claude_val = [1 if response == "YES" else 0 for response in vignette_array_claude]
#
# # save the array to a csv file
# vignette_df_claude = pd.DataFrame(vignette_array_claude_val, columns = ["y_pred"])
# vignette_df_claude.to_csv("../exp/preds_LLMs/Claude/y_pred_claude_vignette_prompt.csv", sep = ",", index = False)
#
# vignette_prompt_df_thinking_claude = pd.DataFrame(vignette_thinking_claude, columns = ["thinking"])
# vignette_prompt_df_thinking_claude.to_csv("../exp/preds_LLMs/Claude/Thinking/thinking_claude_vignette_prompt.csv", sep = ",", index = False)

In [54]:
claude_prompt_array_claude = []
claude_prompt_explanation_claude = []
claude_prompt_thinking_claude = []

client = anthropic.Anthropic(api_key = os.environ.get("ANTHROPIC_API_KEY"))

# measure time in seconds
start = time.time()

# iterate over the test set and save the response for each prompt in an array
for prompt in X_test_claude_prompt:
    message = client.messages.create(
        model = "claude-3-7-sonnet-20250219",
        max_tokens = 20000,
        temperature = 1,
        thinking = {
            "type": "enabled",
            "budget_tokens": 16000
        },
        system = claude_instruction,
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    }
                ]
            }
        ]
    )
    try:
        prediction = re.findall(r'Prediction: (.*)', message.content[1].text)[0]
        explanation = re.findall(r'Explanation: (.*)', message.content[1].text)[0]
        claude_prompt_array_claude.append(prediction)
        claude_prompt_explanation_claude.append(explanation)
        claude_prompt_thinking_claude.append(message.content[0].thinking)
        print(prediction)
    except IndexError:
        print("IndexError")
        claude_prompt_array_claude.append("IndexError")
        claude_prompt_explanation_claude.append("IndexError")
        claude_prompt_thinking_claude.append("IndexError")


end = time.time()
print(f"Time taken: {end - start} seconds")
time_claude_claude_prompt = end - start
time_claude_claude_prompt_df = pd.DataFrame({"time": [time_claude_claude_prompt]})
time_claude_claude_prompt_df.to_csv("../exp/times_LLMs/Claude/time_claude_claude_prompt.csv", sep = ",", index = False)

# value counts for array
counts_claude_prompt_claude = pd.Series(claude_prompt_array_claude).value_counts()
print(counts_claude_prompt_claude)

# convert YES to 1 and NO to 0
claude_prompt_array_claude_val = [1 if response == "YES" else 0 for response in claude_prompt_array_claude]

# save the array to a csv file
claude_prompt_df_claude = pd.DataFrame(claude_prompt_array_claude_val, columns = ["y_pred"])
claude_prompt_df_claude.to_csv("../exp/preds_LLMs/Claude/y_pred_claude_claude_prompt_prompt.csv", sep = ",", index = False)

claude_prompt_prompt_df_thinking_claude = pd.DataFrame(claude_prompt_thinking_claude, columns = ["thinking"])
claude_prompt_prompt_df_thinking_claude.to_csv("../exp/preds_LLMs/Claude/Thinking/thinking_claude_claude_prompt.csv", sep = ",", index = False)

claude_prompt_prompt_df_explanation_claude = pd.DataFrame(claude_prompt_explanation_claude, columns = ["thinking"])
claude_prompt_prompt_df_explanation_claude.to_csv("../exp/preds_LLMs/Claude/Thinking/explanation_claude_claude_prompt.csv", sep = ",", index = False)


NO
YES
Time taken: 49.63070583343506 seconds
NO     1
YES    1
Name: count, dtype: int64


### 1.6 DeepSeek

In [86]:
# client = OpenAI(api_key = os.environ.get("DeepSeek_API_Key"), base_url = "https://api.deepseek.com")
#
# response = client.chat.completions.create(
#     model="deepseek-chat",
#     messages=[
#         {"role": "system", "content": "You are a helpful assistant"},
#         {"role": "user", "content": "Hello"},
#     ],
#     stream=False
# )
#
# print(response.choices[0].message.content)