# Library Imports

In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import random
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from openai import OpenAI

# Data Pre-pocessing

In [3]:
iris = load_iris()

# Features and target
X = iris.data
y = iris.target

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)

Training set size: (120, 4)
Test set size: (30, 4)


# Fine-tuned SVM

In [4]:
# Hyperparameter tuning
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'degree': [2, 3, 4]  # only used for 'poly' kernel
}
grid_search = GridSearchCV(SVC(probability=True), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'C': 1, 'degree': 2, 'gamma': 0.1, 'kernel': 'rbf'}


In [8]:
# Train SVM model with best parameters
svm_model = grid_search.best_estimator_
svm_model.fit(X_train, y_train)

# Predict
y_pred = svm_model.predict(X_test)

print(f'Ground Truth labels: {y_test}')
print(f'SVM Prediction:      {y_pred}')

Ground Truth labels: [0 2 1 1 0 1 0 0 2 1 2 2 2 1 0 0 0 1 1 2 0 2 1 2 2 1 1 0 2 0]
SVM Prediction:      [0 2 1 1 0 1 0 0 2 1 2 2 2 1 0 0 0 1 1 2 0 2 1 2 2 2 1 0 2 0]


In [9]:
# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=iris.target_names))

Accuracy: 0.9666666666666667
Classification Report:
               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      0.90      0.95        10
   virginica       0.91      1.00      0.95        10

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30



# ChatGPT-4o-mini


In [64]:
# API_KEY = "OpenAI API Key"
client = OpenAI(api_key="OPENAI_API_KEY", base_url="https://api.openai.com/v1")
N = len(X_test)

## Zero-shot Prompting

In [67]:
true_label = []
pred_label = []
prompt = """The input contains 4 elements, which are the length and the width of the sepals and petals of an iris flower, in centimeters. Base on the combination of these four features, help me predict the Output value, i.e., the exact spicy of the iris flower(0 = setosa, 1 = versicolor, 2 = virginica).**
    Your response should only contain the Output value in the format of #your prediction label#.\n"""
for n in range(N):
    print("Predicting Test Example", n)


    # Here we construct the prompt for querying the LLM

    s = f"Input: " + str(X_test[n]) + "\n"

    prompt = s + prompt
    # print(prompt)
    # Sometimes the LLM may not return our desired results. So, we try we try querying the LLM up to max_tries times. If still unsuccessful, we return a random label as prediction.
    max_tries = 5
    err_counter = 0
    while err_counter < max_tries:
        try:
            completion = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "user",
                           "content": prompt},
                          ],
                temperature=0.7,  # controls randomness
                max_tokens=150,   # controls response length
            )
            response = completion.choices[0].message.content
            pred = int(response.replace("#", ""))
            break

        except Exception as e:
            print(f"Error encountered: {e}. Retrying...")
            err_counter += 1

    if err_counter == max_tries:
        # if still unsuccessful after "max_tries" tries, return a random label
        print("max number of tries exceeded")
        pred = random.randint(0, 2)

    true_label.append(y_test[n])
    pred_label.append(pred)

Predicting Test Example 0
Predicting Test Example 1
Predicting Test Example 2
Predicting Test Example 3
Predicting Test Example 4
Predicting Test Example 5
Predicting Test Example 6
Predicting Test Example 7
Predicting Test Example 8
Predicting Test Example 9
Predicting Test Example 10
Predicting Test Example 11
Predicting Test Example 12
Predicting Test Example 13
Predicting Test Example 14
Predicting Test Example 15
Predicting Test Example 16
Predicting Test Example 17
Predicting Test Example 18
Predicting Test Example 19
Predicting Test Example 20
Predicting Test Example 21
Predicting Test Example 22
Predicting Test Example 23
Predicting Test Example 24
Predicting Test Example 25
Predicting Test Example 26
Predicting Test Example 27
Predicting Test Example 28
Predicting Test Example 29


In [68]:
# Print the results
print("Groundtrugh labels:")
print(list(map(int,true_label)))
print("Predicted labels by ICL:")
print(pred_label)

accuracy = accuracy_score(true_label, pred_label)
print("\nICL Accuracy:", accuracy)
print("Classification Report:\n", classification_report(true_label, pred_label, target_names=iris.target_names))

Groundtrugh labels:
[0, 2, 1, 1, 0, 1, 0, 0, 2, 1, 2, 2, 2, 1, 0, 0, 0, 1, 1, 2, 0, 2, 1, 2, 2, 1, 1, 0, 2, 0]
Predicted labels by ICL:
[1, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2]

ICL Accuracy: 0.36666666666666664
Classification Report:
               precision    recall  f1-score   support

      setosa       0.00      0.00      0.00        10
  versicolor       0.38      0.90      0.53        10
   virginica       0.40      0.20      0.27        10

    accuracy                           0.37        30
   macro avg       0.26      0.37      0.27        30
weighted avg       0.26      0.37      0.27        30



## Few-shot Prompting

In [65]:
true_label = []
pred_label = []
for n in range(N):
    print("Predicting Test Example", n)


    # Here we construct the prompt for querying the LLM
    prompt = "Help me predict the Output value for the last Input. Your response should only contain the Output value in the format of #Output value#.\n"

    s = ""
    for i in np.arange(len(X_train)):
        s += f"Input: {X_train[i]}, Output: {y_train[i]}\n"
    s += f"Input: " + str(X_test[n]) + ", Output: "

    prompt += s
    # print(prompt)
    # Sometimes the LLM may not return our desired results. So, we try we try querying the LLM up to max_tries times. If still unsuccessful, we return a random label as prediction.
    max_tries = 5
    err_counter = 0
    while err_counter < max_tries:
        try:
            completion = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "user",
                           "content": prompt},
                          ],
                temperature=0.7,  # controls randomness
                max_tokens=150,   # controls response length
            )
            response = completion.choices[0].message.content
            pred = int(response.replace("#", ""))
            break

        except Exception as e:
            print(f"Error encountered: {e}. Retrying...")
            err_counter += 1

    if err_counter == max_tries:
        # if still unsuccessful after "max_tries" tries, return a random label
        print("max number of tries exceeded")
        pred = random.randint(0, 2)

    true_label.append(y_test[n])
    pred_label.append(pred)

Predicting Test Example 0
Predicting Test Example 1
Predicting Test Example 2
Predicting Test Example 3
Predicting Test Example 4
Predicting Test Example 5
Predicting Test Example 6
Predicting Test Example 7
Predicting Test Example 8
Predicting Test Example 9
Predicting Test Example 10
Predicting Test Example 11
Predicting Test Example 12
Predicting Test Example 13
Predicting Test Example 14
Predicting Test Example 15
Predicting Test Example 16
Predicting Test Example 17
Predicting Test Example 18
Predicting Test Example 19
Predicting Test Example 20
Predicting Test Example 21
Predicting Test Example 22
Predicting Test Example 23
Predicting Test Example 24
Predicting Test Example 25
Predicting Test Example 26
Predicting Test Example 27
Predicting Test Example 28
Predicting Test Example 29


In [66]:
# Print the results
print("Groundtrugh labels:")
print(list(map(int,true_label)))
print("Predicted labels by ICL:")
print(pred_label)

accuracy = accuracy_score(true_label, pred_label)
print("\nICL Accuracy:", accuracy)
print("Classification Report:\n", classification_report(true_label, pred_label, target_names=iris.target_names))

Groundtrugh labels:
[0, 2, 1, 1, 0, 1, 0, 0, 2, 1, 2, 2, 2, 1, 0, 0, 0, 1, 1, 2, 0, 2, 1, 2, 2, 1, 1, 0, 2, 0]
Predicted labels by ICL:
[0, 1, 1, 1, 0, 2, 0, 0, 2, 1, 2, 2, 2, 1, 0, 0, 0, 1, 1, 1, 0, 2, 1, 2, 2, 1, 1, 0, 2, 0]

ICL Accuracy: 0.9
Classification Report:
               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       0.82      0.90      0.86        10
   virginica       0.89      0.80      0.84        10

    accuracy                           0.90        30
   macro avg       0.90      0.90      0.90        30
weighted avg       0.90      0.90      0.90        30



## Self-consistency CoT Prompting


In [69]:
prompt = """contains 4 features: the length and the width of the sepals and petals of an iris flower, in centimeters. Base on the combination of these four features, help me predict the Output value, i.e., the exact spicy of the iris flower(0 = setosa, 1 = versicolor, 2 = virginica).**\n\
Your response should only contain the Output value in the format of #your prediction label#.
**Examples**:\n"""

s = ""
for i in np.arange(len(X_train)):
  s += f" Input: {X_train[i]}, Output: {y_train[i]}\n"

problem_str = f"**Problem: Input: " + str(X_test[0]) + ", "

task_str = """**Your tasks:**
  - The given 120 examples can be considered as labeled training pairs, try to solve the problem as a 3-class classification task.\n"""

consistency = """**Consistency Check:**
  - Sample several reasoning paths. i.e., for each test example, sample several different reasoning paths or try different methods to predict the label.
  - Compare the answers and select the most frequently occurring result.\n"""

answer = """**Final Answer:**
  - After verifying consistency across samples, conclude with the most consistent answer."""

# print(problem_str + prompt + s + task_str + consistency + answer)

In [52]:
# Testing, ignore this chunk
# completion = client.chat.completions.create(
#                 model="gpt-4o-mini",
#                 messages=[{"role": "user",
#                            "content": problem_str + prompt + s + task_str + reasoning + answer},
#                           ],
#                 temperature=0.7,  # controls randomness
#                 max_tokens=150,   # controls response length
#             )
# response = completion.choices[0].message.content
# print(response)
# pred = int(response.replace("#", ""))
# print(pred)

#2#
2


In [71]:
true_label = []
pred_label = []

for n in range(N):
    print("Predicting Test Example", n)


    # Here we construct the prompt for querying the LLM
    problem_str = f"**Problem: Input: " + str(X_test[n]) + ", "
    prompt = problem_str + prompt
    prompt += s
    prompt += task_str
    prompt += reasoning
    prompt += answer
    # print(prompt)
    # Sometimes the LLM may not return our desired results. So, we try we try querying the LLM up to max_tries times. If still unsuccessful, we return a random label as prediction.
    max_tries = 5
    err_counter = 0
    while err_counter < max_tries:
        try:
          completion = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "user",
                           "content": prompt},
                          ],
                temperature=0.7,  # controls randomness
                max_tokens=150,   # controls response length
            )
          response = completion.choices[0].message.content
          pred = int(response.replace("#", ""))
          break

        except Exception as e:
            print(f"Error encountered: {e}. Retrying...")
            err_counter += 1

    if err_counter == max_tries:
        # if still unsuccessful after "max_tries" tries, return a random label
        print("max number of tries exceeded")
        pred = random.randint(0, 2)

    true_label.append(y_test[n])
    pred_label.append(pred)

Predicting Test Example 0
Predicting Test Example 1
Predicting Test Example 2
Predicting Test Example 3
Predicting Test Example 4
Predicting Test Example 5
Predicting Test Example 6
Predicting Test Example 7
Predicting Test Example 8
Predicting Test Example 9
Predicting Test Example 10
Predicting Test Example 11
Predicting Test Example 12
Predicting Test Example 13
Predicting Test Example 14
Predicting Test Example 15
Predicting Test Example 16
Predicting Test Example 17
Predicting Test Example 18
Predicting Test Example 19
Predicting Test Example 20
Predicting Test Example 21
Predicting Test Example 22
Predicting Test Example 23
Predicting Test Example 24
Predicting Test Example 25
Predicting Test Example 26
Predicting Test Example 27
Predicting Test Example 28
Predicting Test Example 29


In [72]:
# Print the results
print("Groundtrugh labels:")
print(list(map(int,true_label)))
print("Predicted labels by ICL:")
print(pred_label)

accuracy = accuracy_score(true_label, pred_label)
print("\nICL Accuracy:", accuracy)
print("Classification Report:\n", classification_report(true_label, pred_label, target_names=iris.target_names))

Groundtrugh labels:
[0, 2, 1, 1, 0, 1, 0, 0, 2, 1, 2, 2, 2, 1, 0, 0, 0, 1, 1, 2, 0, 2, 1, 2, 2, 1, 1, 0, 2, 0]
Predicted labels by ICL:
[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

ICL Accuracy: 0.3
Classification Report:
               precision    recall  f1-score   support

      setosa       0.35      0.90      0.50        10
  versicolor       0.00      0.00      0.00        10
   virginica       0.00      0.00      0.00        10

    accuracy                           0.30        30
   macro avg       0.12      0.30      0.17        30
weighted avg       0.12      0.30      0.17        30



# ChatGPT-4o

In [73]:
true_label = []
pred_label = []
for n in range(N):
    print("Predicting Test Example", n)


    # Here we construct the prompt for querying the LLM
    prompt = "Help me predict the Output value for the last Input. Your response should only contain the Output value in the format of #Output value#.\n"

    s = ""
    for i in np.arange(len(X_train)):
        s += f"Input: {X_train[i]}, Output: {y_train[i]}\n"
    s += f"Input: " + str(X_test[n]) + ", Output: "

    prompt += s
    # print(prompt)
    # Sometimes the LLM may not return our desired results. So, we try we try querying the LLM up to max_tries times. If still unsuccessful, we return a random label as prediction.
    max_tries = 5
    err_counter = 0
    while err_counter < max_tries:
        try:
            completion = client.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user",
                           "content": prompt},
                          ],
                temperature=0.7,  # controls randomness
                max_tokens=150,   # controls response length
            )
            response = completion.choices[0].message.content
            pred = int(response.replace("#", ""))
            break

        except Exception as e:
            print(f"Error encountered: {e}. Retrying...")
            err_counter += 1

    if err_counter == max_tries:
        # if still unsuccessful after "max_tries" tries, return a random label
        print("max number of tries exceeded")
        pred = random.randint(0, 2)

    true_label.append(y_test[n])
    pred_label.append(pred)

Predicting Test Example 0
Predicting Test Example 1
Predicting Test Example 2
Predicting Test Example 3
Predicting Test Example 4
Predicting Test Example 5
Predicting Test Example 6
Predicting Test Example 7
Predicting Test Example 8
Predicting Test Example 9
Predicting Test Example 10
Predicting Test Example 11
Predicting Test Example 12
Predicting Test Example 13
Predicting Test Example 14
Predicting Test Example 15
Predicting Test Example 16
Predicting Test Example 17
Predicting Test Example 18
Predicting Test Example 19
Predicting Test Example 20
Predicting Test Example 21
Predicting Test Example 22
Predicting Test Example 23
Predicting Test Example 24
Predicting Test Example 25
Predicting Test Example 26
Predicting Test Example 27
Predicting Test Example 28
Predicting Test Example 29


In [75]:
# Print the results
print("Groundtrugh labels:")
print(list(map(int,true_label)))
print("Predicted labels by ICL:")
print(pred_label)

accuracy = accuracy_score(true_label, pred_label)
print("\nICL Accuracy:", accuracy)
print("Classification Report:\n", classification_report(true_label, pred_label, target_names=iris.target_names))

Groundtrugh labels:
[0, 2, 1, 1, 0, 1, 0, 0, 2, 1, 2, 2, 2, 1, 0, 0, 0, 1, 1, 2, 0, 2, 1, 2, 2, 1, 1, 0, 2, 0]
Predicted labels by ICL:
[0, 2, 1, 1, 0, 1, 0, 0, 2, 1, 2, 2, 2, 1, 0, 0, 0, 1, 1, 2, 0, 2, 1, 2, 2, 2, 1, 0, 2, 0]

ICL Accuracy: 0.9666666666666667
Classification Report:
               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      0.90      0.95        10
   virginica       0.91      1.00      0.95        10

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30

