### Testing different prompting techniques for the ScaDS.AI Llama Model with a small sample from our dataset

In [2]:
import os
from openai import OpenAI
import json


my_api_key = os.getenv("MY_API_KEY")
if not my_api_key:
    print("Error: API key not found. Ensure it's set in the environment.")
    exit(1)
else: print("API KEY successfully stored")

API KEY successfully stored


### !! connect to Dresden VPN first

In [3]:
client = OpenAI(base_url="https://llm.scads.ai/v1",api_key=my_api_key)
# Find model with "llama" in name
for model in client.models.list().data:
    model_name = model.id
    if "llama" in model_name:
        break

acm_file = 'HCI_DB.ACM_paper_filter2.json'

with open(acm_file, 'r', encoding='utf-8') as file:
    data = json.load(file)
    print("JSON data loaded successfully.")

JSON data loaded successfully.


In [4]:
# Reduce the dataset by filtering only relevant entries
filtered_data_100 = [
    {key: entry.get(key, None) for key in ['_id', 'abstract', 'title', 'fullText']}
    for entry in data[:100]  # Process only the first 40 entries as an example
]

# Specify the output JSON file name
output_file = 'filtered_data_100.json'

# Save to a JSON file
with open(output_file, 'w', encoding='utf-8') as file:
    json.dump(filtered_data_100, file, indent=4)
print(f"Filtered data successfully saved to {output_file}")

Filtered data successfully saved to filtered_data_100.json


In [5]:
# Load the JSON file
with open('mapping.json', 'r') as f:
    mapping = json.load(f)

# Extract the 'Taxonomy' section
taxonomy = mapping.get('Taxonomy', None)

#### Testing whether it is working with other Mappings like the Contribution Type

In [5]:
# Load the JSON file
with open('mapping.json', 'r') as f:
    mapping = json.load(f)

# Extract the 'Taxonomy' section
contribution = mapping.get('Contribution', None)

In [7]:
# Parameters
max_attempts = 5  # Maximum number of retries for a single entry

# Extract the keys from the taxonomy JSON
contribution_keys = set(contribution.keys())

# Loop through each entry in the data
for i, entry in enumerate(filtered_data_40):
    # Serialize the entry and taxonomy JSON
    entry_json = json.dumps(entry, indent=2, ensure_ascii=False)
    contribution_json_str = json.dumps(contribution, indent=2, ensure_ascii=False)

    # Retry mechanism for generating the response
    attempts = 0
    while attempts < max_attempts:
        # Construct the prompt
        prompt = f"""
        You are a helpful assistant. I have the following entry as JSON:
        {entry_json}

        Find out which contribution each research paper makes with their work. You can find different kinds of Contributions here:
        {contribution_json_str}

        Go through the Abstracts, Full Texts, and output the ocintribution type a specific paper most likely belongs to. 
        The output structure should ONLY include the corresponding contribution type (must be listed in the contribution file). 
        Your answer should contain a MAXIMUM of 2 words in any case!

        For example:
        Opinion

        or:
        
        Empirical Research
        
        Don't write any other text or code for each entry, just the contribution type! This is very important.
        If more than one contribution type is matching, choose the best one. If a contribution type is matching best, that is not in the given list
        of contribution types, find a type from the list that also fits quite well. Again, don't mention anything more than ONE contribution type.
        """

        # Send the prompt to the model
        response = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=model_name
        )

        # Extract the response content
        result = response.choices[0].message.content.strip()
        
        # Check if the response matches one of the taxonomy keys
        if result in contribution_keys:
            print(f"Valid result for entry {i}, after attempt {attempts + 1}: {result}")
            break  # Valid response, exit the retry loop
        else:
            attempts += 1
    
    # If max attempts are reached, log the final result even if invalid
    if attempts == max_attempts:
        print(f"Invalid (!) result for entry {i}, after 5 attempts: {result}")


Valid result for entry 0, after attempt 1: Opinion
Valid result for entry 1, after attempt 1: Opinion
Valid result for entry 2, after attempt 1: Opinion
Valid result for entry 3, after attempt 1: Empirical Research
Valid result for entry 4, after attempt 1: Opinion
Valid result for entry 5, after attempt 1: Opinion
Valid result for entry 6, after attempt 1: Opinion
Valid result for entry 7, after attempt 1: Theoretical
Valid result for entry 8, after attempt 1: Opinion
Valid result for entry 9, after attempt 1: Theoretical
Valid result for entry 10, after attempt 1: Opinion
Valid result for entry 11, after attempt 1: Theoretical
Valid result for entry 12, after attempt 1: Survey
Valid result for entry 13, after attempt 1: Survey
Valid result for entry 14, after attempt 1: Survey
Valid result for entry 15, after attempt 1: Survey
Valid result for entry 16, after attempt 1: Survey
Valid result for entry 17, after attempt 1: Survey
Valid result for entry 18, after attempt 1: Survey
Valid 

### 1. 'Vanilla' Prompt for predefined categories (again with the Taxonomy of Topics)

In [9]:
# Parameters
max_attempts = 5  # Maximum number of retries for a single entry

# Extract the keys from the taxonomy JSON
taxonomy_keys = set(taxonomy.keys())

invalid_results = 0

# Loop through each entry in the data
for i, entry in enumerate(filtered_data_100):
    # Serialize the entry and taxonomy JSON
    entry_json = json.dumps(entry, indent=2, ensure_ascii=False)
    taxonomy_json_str = json.dumps(taxonomy, indent=2, ensure_ascii=False)

    # Retry mechanism for generating the response
    attempts = 0
    while attempts < max_attempts:
        # Construct the prompt
        prompt = f"""
        You are a helpful assistant. I have the following entry as JSON:
        {entry_json}

        Find out to what field / topic this research paper belongs. Topics can be found as keys with their corresponding keywords as values in here:
        {taxonomy_json_str}

        Go through the Abstracts, Full Texts, and output the topic a specific paper most likely belongs to. The output structure should ONLY include the corresponding 
        topic (must be a listed topic in the taxonomy file). Your answer should contain a MAXIMUM of 3 words in any case!

        For example:
        User Interface Design

        Don't write any other text or code for each entry, just the topic! This is very important.
        If more than one topic is matching, choose the best one. If a topic is matching best, that is not in the given list of topics, find a topic from the list
        that also fits quite well. Again, don't mention anything more than ONE topic name.
        """

        # Send the prompt to the model
        response = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=model_name
        )

        # Extract the response content
        result = response.choices[0].message.content.strip()           
            
        # Check if the response matches one of the taxonomy keys
        if result in taxonomy_keys:
            #print(f"Valid result for entry {i}, after attempt {attempts + 1}: {result}")
            break  # Valid response, exit the retry loop
        else:
            attempts += 1
    
    # If max attempts are reached, log the final result even if invalid
    if attempts == max_attempts:
        invalid_results += 1
        #print(f"Invalid (!) result for entry {i}, after 5 attempts: {result}")
print(f'Invalid Results: {invalid_results}')
    

Invalid Results: 8


### 2. Rephrase and Respond Prompting (see Deng et al. 2023,  https://arxiv.org/abs/2311.04205)
Rephrasing or restating the original input to ensure clarity and understanding before providing a response. 

In [9]:
prompt = f"""
        Only REPHRASE the following task to make it clearer for processing. Don't skip any details. Be as precise as possible.
        "You are a helpful assistant. I have an entry in JSON format.
        Find out to what field/topic this research paper belongs. Topics can be found as keys with their corresponding keywords as values in another
        JSON file.
        Go through the Abstracts, Full Texts, and output the topic a specific paper most likely belongs to. The output structure should ONLY 
        include the corresponding topic (must be a listed topic in the taxonomy file). Your answer should contain a MAXIMUM of 3 words in any case!

        For example:
        User Interface Design

        Don't write any other text or code for each entry, just the topic! This is very important.
        If more than one topic is matching, choose the best one. If a topic is matching best, that is not in the given list of topics, find a topic from the list
        that also fits quite well. Again, don't mention anything more than ONE topic name."
        """
response = client.chat.completions.create(
    messages=[{"role": "user", "content": prompt}],
    model=model_name)

# Append the response content to results
result = response.choices[0].message.content
print(result)

You are a helpful assistant tasked with categorizing a research paper into a specific field/topic based on its abstract and full text. The topics and their corresponding keywords are provided in a JSON file. Your task is to analyze the abstract and full text of the paper, match the content with the keywords in the JSON file, and output the most suitable topic that the paper belongs to, choosing the best match from the provided list. The output should be a single topic, in any case, with a maximum of 3 words, without any additional text or explanation. If multiple topics match, select the most relevant one, and if the best match is not in the list, choose a topic from the list that also fits relatively well, ensuring the output is a single topic name only.


In [10]:
# Parameters
max_attempts = 5  # Maximum number of retries for a single entry

# Extract the keys from the taxonomy JSON
taxonomy_keys = set(taxonomy.keys())

invalid_results = 0

# Loop through each entry in the data
for i, entry in enumerate(filtered_data_100):
    # Serialize the entry and taxonomy JSON
    entry_json = json.dumps(entry, indent=2, ensure_ascii=False)
    taxonomy_json_str = json.dumps(taxonomy, indent=2, ensure_ascii=False)

    # Retry mechanism for generating the response
    attempts = 0
    while attempts < max_attempts:
        # Construct the prompt
        prompt = f"""
        You are a helpful assistant tasked with categorizing a research paper into a specific field/topic based on its abstract and full text found in
        here: {entry_json}. 
        The topics and their corresponding keywords are provided in a JSON file: {taxonomy_json_str}. Your task is to analyze the abstract 
        and full text of the paper, match the content with the keywords in the JSON file, and output the most suitable topic that the paper belongs to, 
        choosing the best match from the provided list. The output should be a single topic, in any case, with a maximum of 3 words, 
        without any additional text or explanation. If multiple topics match, select the most relevant one, and if the best match is not in the list,
        choose a topic from the list that also fits relatively well, ensuring the output is a single topic name only.
        """

        # Send the prompt to the model
        response = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=model_name
        )

        # Extract the response content
        result = response.choices[0].message.content.strip()
        
        # Check if the response matches one of the taxonomy keys
        if result in taxonomy_keys:
        #    print(f"Valid result for entry {i}, after attempt {attempts + 1}: {result}")
            break  # Valid response, exit the retry loop
        else:
            attempts += 1
    
    # If max attempts are reached, log the final result even if invalid
    if attempts == max_attempts:
        invalid_results += 1
    #    print(f"Invalid (!) result for entry {i}, after 5 attempts: {result}")
print(f'Invalid Results: {invalid_results}')


Invalid Results: 8


### 2. Emotion Prompting (see Li et al. 2023, https://arxiv.org/abs/2307.11760)
Respond by acknowledging and addressing the emotional context of the input.

In [11]:
# Parameters
max_attempts = 5  # Maximum number of retries for a single entry

# Extract the keys from the taxonomy JSON
taxonomy_keys = set(taxonomy.keys())

invalid_results = 0

# Loop through each entry in the data
for i, entry in enumerate(filtered_data_100):
    # Serialize the entry and taxonomy JSON
    entry_json = json.dumps(entry, indent=2, ensure_ascii=False)
    taxonomy_json_str = json.dumps(taxonomy, indent=2, ensure_ascii=False)

    # Retry mechanism for generating the response
    attempts = 0
    while attempts < max_attempts:
        # Construct the prompt
        prompt = f"""
        You are a helpful assistant. Your help determines the outcome of our project results and is therefore very important to my career.
        I have the following entry as JSON: {entry_json}

        Find out to what field / topic this research paper belongs. Topics can be found as keys with their corresponding keywords as values in here:
        {taxonomy_json_str}

        Go through the Abstracts, Full Texts, and output the topic a specific paper most likely belongs to. The output structure should ONLY include the corresponding 
        topic (must be a listed topic in the taxonomy file). Your answer should contain a MAXIMUM of 3 words in any case!

        For example:
        User Interface Design

        Don't write any other text or code for each entry, just the topic! This is very important.
        If more than one topic is matching, choose the best one. If a topic is matching best, that is not in the given list of topics, find a topic from the list
        that also fits quite well. Again, don't mention anything more than ONE topic name.
        """

        # Send the prompt to the model
        response = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=model_name
        )

        # Extract the response content
        result = response.choices[0].message.content.strip()
        
        # Check if the response matches one of the taxonomy keys
        if result in taxonomy_keys:
        #    print(f"Valid result for entry {i}, after attempt {attempts + 1}: {result}")
            break  # Valid response, exit the retry loop
        else:
            attempts += 1
    
    # If max attempts are reached, log the final result even if invalid
    if attempts == max_attempts:
        invalid_results += 1
    #    print(f"Invalid (!) result for entry {i}, after 5 attempts: {result}")
print(f'Invalid Results: {invalid_results}')


Invalid Results: 7


### 3. Chain of Thought Prompting (see Wei et al. 2022, https://arxiv.org/abs/2201.11903)
Generate step-by-step reasoning to solve a problem or answer a complex question.

In [13]:
# Parameters
max_attempts = 5  # Maximum number of retries for a single entry

# Extract the keys from the taxonomy JSON
taxonomy_keys = set(taxonomy.keys())

invalid_results = 0

# Loop through each entry in the data
for i, entry in enumerate(filtered_data_100):
    # Serialize the entry and taxonomy JSON
    entry_json = json.dumps(entry, indent=2, ensure_ascii=False)
    taxonomy_json_str = json.dumps(taxonomy, indent=2, ensure_ascii=False)

    # Retry mechanism for generating the response
    attempts = 0
    while attempts < max_attempts:
        # Construct the prompt
        prompt = f"""
        Your anwser should be a MAXIMUM of 3 words under eyery circumstances!!
        You are a helpful assistant. I have the following entry as JSON: {entry_json}
        
        Find out to what field or topic this research paper belongs. Topics can be found as keys with their corresponding keywords as values in here:
        {taxonomy_json_str}
        
        Think step by step:
        1. Carefully analyze the Abstracts and Full Texts in the entry.
        2. Identify the key concepts, phrases, or keywords from the text that describe the primary focus of the paper.
        3. Compare these key concepts with the topics and their keywords provided in the taxonomy file.
        4. Determine which topic in the taxonomy file best matches the content and purpose of the paper.
        5. If more than one topic seems relevant, choose the one that fits best or is most closely aligned.
        6. Output only the topic name from the taxonomy, ensuring it is no more than 3 words.
        
        Example:
        Step-by-step reasoning:
        - The Abstract mentions "designing user interfaces," "usability testing," and "interaction design."
        - These align with the keywords "design," "interface," and "usability" under the topic "User Interface Design."
        Final Output:
        User Interface Design
        
        Now, do the same for the given entry. Output the topic name based on your reasoning process. Ensure your final output is ONLY the topic name.
        """

        # Send the prompt to the model
        response = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=model_name
        )

        # Extract the response content
        result = response.choices[0].message.content.strip()
        
        # Check if the response matches one of the taxonomy keys
        if result in taxonomy_keys:
        #    print(f"Valid result for entry {i}, after attempt {attempts + 1}: {result}")
            break  # Valid response, exit the retry loop
        else:
            attempts += 1
    
    # If max attempts are reached, log the final result even if invalid
    if attempts == max_attempts:
        invalid_results += 1
    #    print(f"Invalid (!) result for entry {i}, after 5 attempts: {result}")
print(f'Invalid Results: {invalid_results}')


Invalid Results: 23


### 4. Self Consistency (see Zang et al. 2022, https://arxiv.org/abs/2203.11171)
Generate multiple responses to ensure consistent and reliable answers.

In [6]:
from collections import Counter

# Parameters
num_attempts = 3  # Number of times each entry is sent to the LLM

# Extract the keys from the taxonomy JSON
taxonomy_keys = set(taxonomy.keys())

invalid_results = 0

# Loop through each entry in the data
for i, entry in enumerate(filtered_data_100):
    # Serialize the entry and taxonomy JSON
    entry_json = json.dumps(entry, indent=2, ensure_ascii=False)
    taxonomy_json_str = json.dumps(taxonomy, indent=2, ensure_ascii=False)

    # Store responses for the current entry
    responses = []

    # Send the same entry to the LLM multiple times
    for attempt in range(num_attempts):
        # Construct the prompt
        prompt = f"""
        You are a helpful assistant. I have the following entry as JSON:
        {entry_json}

        Find out to what field / topic this research paper belongs. Topics can be found as keys with their corresponding keywords as values in here:
        {taxonomy_json_str}

        Go through the Abstracts, Full Texts, and output the topic a specific paper most likely belongs to. The output structure should ONLY include the corresponding 
        topic (must be a listed topic in the taxonomy file). Your answer should contain a MAXIMUM of 3 words in any case!

        For example:
        User Interface Design

        Don't write any other text or code for each entry, just the topic! This is very important.
        If more than one topic is matching, choose the best one. If a topic is matching best, that is not in the given list of topics, find a topic from the list
        that also fits quite well. Again, don't mention anything more than ONE topic name.
        """

        # Send the prompt to the model
        response = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=model_name
        )

        # Extract the response content
        result = response.choices[0].message.content.strip()

        # Add the result to the responses list
        responses.append(result)

    # Determine the most frequent response
    response_counts = Counter(responses)
    most_common_response, frequency = response_counts.most_common(1)[0]

    # Check if the most common response is valid
    if most_common_response in taxonomy_keys:
        print(f"Valid result for entry {i}: {most_common_response} (appeared {frequency} times)")
        #continue
    else:
        print(f"Invalid (!) result for entry {i}: {most_common_response} (appeared {frequency} times)")
        invalid_results += 1

print(f'Invalid Results: {invalid_results}')

Valid result for entry 0: Education (appeared 3 times)
Valid result for entry 1: Education (appeared 3 times)
Valid result for entry 2: User Interface Design (appeared 3 times)
Valid result for entry 3: User Interface Design (appeared 3 times)
Valid result for entry 4: User Interface Design (appeared 3 times)
Valid result for entry 5: User Interface Design (appeared 3 times)
Valid result for entry 6: User Interface Design (appeared 3 times)
Valid result for entry 7: Information Retrieval (appeared 3 times)
Valid result for entry 8: User Interface Design (appeared 3 times)
Valid result for entry 9: Information Retrieval (appeared 3 times)
Valid result for entry 10: Information Retrieval (appeared 3 times)
Valid result for entry 11: Human-Machine Systems (appeared 3 times)
Valid result for entry 12: Machine Control Systems (appeared 2 times)
Valid result for entry 13: Human-Machine Systems (appeared 2 times)
Valid result for entry 14: Machine Control Systems (appeared 3 times)
Valid resu


KeyboardInterrupt



### 5. ChatGPT- aided Prompt Design
Given the vanilla prompt, we asked ChatGPT-4o to rewrite the code to improve the performance given the problems that are occuring.
The prompt we gave ChatGPT was the vanilla prompt and:
"We have this prompt for our project. We want the model to do exactly what we wrote down in the prompt. Unfortunately, the model sometimes outputs results, that don't fit the format. for example is longer than 3 words (because it is unsure which topic to pick instead of picking the most fitting one) or sometimes it creates topics on its own. We are using the Llama 3 model. Rewrite the code to improve the models performance. Write it in a format that I can directly copy and paste in my Python script."

In [19]:
# Parameters
max_attempts = 5  # Maximum number of retries for a single entry

# Extract the keys from the taxonomy JSON
taxonomy_keys = set(taxonomy.keys())

invalid_results = 0

# Loop through each entry in the data
for i, entry in enumerate(filtered_data_100):
    # Serialize the entry and taxonomy JSON
    entry_json = json.dumps(entry, indent=2, ensure_ascii=False)
    taxonomy_json_str = json.dumps(taxonomy, indent=2, ensure_ascii=False)

    # Retry mechanism for generating the response
    attempts = 0
    while attempts < max_attempts:
        # Construct the prompt
        prompt = f"""
        You are a highly focused assistant trained to classify research papers based on their content. 
        Your task is to determine the most appropriate topic from the provided taxonomy for each research paper.
        
        Here is the research paper entry as JSON:
        {entry_json}
        
        Here is the taxonomy with topics as keys and their associated keywords as values:
        {taxonomy_json_str}
        
        Your task:
        1. Read the Abstract and Full Text provided in the research paper JSON.
        2. Compare the content with the keywords in the taxonomy.
        3. Select the ONE best-matching topic from the taxonomy.
           - The output must be EXACTLY one topic from the taxonomy, with a maximum of 3 words.
           - Do NOT create new topics or deviate from the taxonomy list.
           - If more than one topic matches, choose the most relevant one.
        4. Output ONLY the topic name (no explanations, no additional words, no code).
        
        Examples of correct outputs:
        - User Interface Design
        - Machine Learning
        - Data Security
        
        Examples of incorrect outputs:
        - User Interface Design in Applications (too long)
        - Novel Topic XYZ (not in taxonomy)
        - This paper belongs to Machine Learning (extra explanation)
        
        Now, analyze the given research paper and return the most relevant topic:
        """

        # Send the prompt to the model
        response = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=model_name
        )

        # Extract the response content
        result = response.choices[0].message.content.strip()           
            
        # Check if the response matches one of the taxonomy keys
        if result in taxonomy_keys:
            #print(f"Valid result for entry {i}, after attempt {attempts + 1}: {result}")
            break  # Valid response, exit the retry loop
        else:
            attempts += 1
    
    # If max attempts are reached, log the final result even if invalid
    if attempts == max_attempts:
        invalid_results += 1
        print(f"Invalid (!) result for entry {i}, after 5 attempts: {result}")
print(f'Invalid Results: {invalid_results}')
    

Invalid (!) result for entry 22, after 5 attempts: Information Systems Security DesignMethods seems closely related, but "Information systems security design methods" is the title and the closest topic in the taxonomy that matches this description would be related to systems, design, and security, which aligns with "User Interface Design" not being the best fit but rather something more closely related to system security. 

Given the context and the title "Information systems security design methods," a more suitable topic seems to be one that encompasses systems and security. However, since "Information Systems Security" is not explicitly listed but we have design and security-related terms, we look into the provided taxonomy for the closest match which involves'system' and'security' or related concepts. 

Human-Machine Systems could be considered due to its broad coverage of systems, but it doesn't directly address security as its primary focus. Since we need a topic that closely ali