# TruthfulQA continuations

This notebook checks whether the model intends to correct a false statement later on when it is asked to continue a truncated incorrect answer.

In [None]:
import pandas as pd
from dotenv import load_dotenv
import os
import sys
from pprint import pprint as pp
import requests
import json
load_dotenv()

In [None]:
# Relative path to the TruthfulQA CSV file.
data_path: str = '../linear-probes/data/TruthfulQA/TruthfulQA.csv'
# Full dataset as a DataFrame; the 'Question' and 'Best Incorrect Answer'
# columns are read further down in this notebook.
tqa_df = pd.read_csv(data_path)

In [None]:
def get_model_continuation(user_content, assistant_content, max_tokens=10, timeout=60.0):
    """Request a completion for a conversation ending in a partial assistant turn.

    Posts a chat completion request for ``microsoft/phi-4`` to OpenRouter. The
    message list is the user turn followed by an assistant turn holding
    ``assistant_content``, presumably so that the model continues that text
    (assistant prefill) rather than starting a fresh answer.

    Args:
        user_content: Text of the user message.
        assistant_content: Text placed in the trailing assistant message.
        max_tokens: Upper bound on generated tokens, forwarded as ``max_tokens``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``content`` string of the first choice in the response.

    Raises:
        RuntimeError: If ``OPENROUTER_API_KEY`` is not set, or the response
            body contains no choices (e.g. an API-level error payload).
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the request exceeds ``timeout``.
    """
    api_key = os.getenv("OPENROUTER_API_KEY")
    if not api_key:
        # Fail early instead of sending "Bearer None" and decoding the
        # resulting error body.
        raise RuntimeError("OPENROUTER_API_KEY is not set in the environment")

    payload = {
        "model": "microsoft/phi-4",
        "messages": [
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": assistant_content},
        ],
        "top_p": 1,
        "temperature": 0.2,
        "frequency_penalty": 0.5,
        "presence_penalty": 0,
        "repetition_penalty": 2,
        "top_k": 0,
        "max_tokens": max_tokens,
    }

    # `json=` serializes the payload and sets the JSON Content-Type header;
    # `timeout` keeps a stalled connection from blocking the notebook forever.
    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={"Authorization": f"Bearer {api_key}"},
        json=payload,
        timeout=timeout,
    )
    response.raise_for_status()

    body = response.json()
    choices = body.get("choices")
    if not choices:
        # A successful status can still carry an error payload with no choices.
        raise RuntimeError(
            f"OpenRouter response contained no choices: {body.get('error', body)}"
        )
    return choices[0]["message"]["content"]

In [None]:
import random

# Number of rows to probe. Capped at the dataset size below, because
# random.sample raises ValueError when asked for more items than exist.
num_samples = 10
# Positional row indices (used with tqa_df.iloc), drawn without replacement.
# NOTE(review): the sample is unseeded, so re-running this cell draws
# different rows.
random_rows = random.sample(range(len(tqa_df)), min(num_samples, len(tqa_df)))
# How many trailing words are cut from each incorrect answer before the
# model is asked to continue it.
words_to_truncate = 3

In [None]:
# Collect one record per sampled question; turned into a dataframe below
results_data = []

for idx in random_rows:
    row = tqa_df.iloc[idx]
    question = row['Question']
    best_incorrect_answer = row['Best Incorrect Answer']
    words = best_incorrect_answer.split()
    # Keep everything except the last `words_to_truncate` words. An explicit
    # end index is used because `words[:-0]` would be empty rather than the
    # full list; answers with too few words collapse to ''.
    keep = max(len(words) - words_to_truncate, 0)
    truncated_answer = ' '.join(words[:keep])
    # The truncated answer (plus a trailing space) is sent as the assistant
    # turn and the model's reply is appended to it.
    continuation = get_model_continuation(question, truncated_answer + ' ', max_tokens=50)
    full_output = f"{truncated_answer} {continuation}"
    results_data.append({
        'Question': question,
        'Incorrect Answer': best_incorrect_answer,
        'Full Output': full_output
    })
    # Progress marker, flushed so it appears as each request completes
    print('.', end='', flush=True)

# Create the dataframe from collected data
continuations_df = pd.DataFrame(results_data)

In [None]:
# Remove the column-width cap so long text cells are shown in full.
pd.set_option('display.max_colwidth', None)
# Trailing bare expression: rendered as the cell output when run in a notebook.
continuations_df

In [None]:
# Remove the column-width cap so long text cells are shown in full.
pd.set_option('display.max_colwidth', None)
# Positional indices into continuations_df (used with .iloc below).
# NOTE(review): presumably hand-picked after inspecting one particular run;
# they refer to whichever rows were sampled in the current run - confirm.
rows_to_show = [0, 1, 3, 6, 7, 9]
# Trailing bare expression: rendered as the cell output when run in a notebook.
continuations_df.iloc[rows_to_show]