# Read file from RQ1

In [None]:
import sys, pickle, os, json, re, time, random, logging, pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns, scipy, sklearn, networkx as nx, importlib
import tools
importlib.reload(tools)

# Load the RQ1 spreadsheet; the path is relative to the working directory
excel_path = 'df.xlsx'
df = pd.read_excel(excel_path)

# Show the available column names before the later cells rely on them
print(df.columns)

# Generate questions

In [None]:
def generate_question_prompt(input_details):
    """
    Build the chat messages asking a model for 10 single-choice questions.

    The questions must be answerable from the supplied knowledge graph or
    full-text content only; external background knowledge is forbidden by the
    prompt wording.

    Args:
        input_details: Text handed to the model as its only source. If it
            contains a fragment of the form ``entities are [a, b, ...]`` the
            listed names are used to tailor the closing requirement.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts (system message,
        then user message).
    """
    import re

    # Extract entity names from the input for setting the question target.
    # NOTE(review): only the literal wording "entities are [...]" is recognised.
    # The caller in this file writes "Topic entity of the content are: ...",
    # which does not match, so the generic requirement is used for that input.
    entity_match = re.search(r'entities are \[(.*?)\]', input_details)
    raw_entities = entity_match.group(1).split(',') if entity_match else []
    # Trim padding and drop empty fragments so "[]" or "[a, , b]" never produce
    # a blank entity name (and "a, b" is not rendered as "a,  b").
    entities = [fragment.strip() for fragment in raw_entities if fragment.strip()]

    if len(entities) > 1:
        entity_requirement = (
            f"The questions must cover all of the listed cyber threat-related entities: {', '.join(entities)}. "
            "Questions should reflect each entity's specific roles, actions, and relationships as shown in the provided content."
        )
    elif len(entities) == 1:
        entity_requirement = (
            f"The questions must focus on the specified cyber threat entity: {entities[0]}. "
            "The questions should analyze its behaviors, characteristics, and how it interacts with other entities based solely on the input."
        )
    else:
        entity_requirement = (
            "The questions must target cyber threat-related entities discussed in the content. "
            "Each question should focus on an identifiable threat-related behavior or relationship."
        )

    prompt_message = [
        {
            "role": "system",
            "content": (
                "You are a large language model that must rely solely on the provided threat intelligence report or structured knowledge graph as your only source of information. "
                "You do NOT have access to any external world knowledge, pretrained threat data, or background about the entities. "
                "You are strictly forbidden from assuming any facts not explicitly mentioned or inferable from the provided input. "
                "If certain information is not found within the input, you must not guess, generalize, or rely on external cybersecurity knowledge."
            )
        },
        {
            "role": "user",
            "content": (
                "The following content contains threat intelligence data (either full-text article or structured knowledge graph). "
                "All questions and answers must be exclusively answerable based on this content. "
                "You MUST NOT incorporate any outside knowledge about cyber threats or general attack techniques. "
                "The correct answers must be supported by specific evidence in the input content.\n\n"
                "Here is the input:\n"
                + input_details +
                "\n\n"
                "Now, please generate 10 CHALLENGING single-choice questions, strictly satisfying the following conditions:\n"
                "0. Each question should mainly focus on the topic entity of the content that I provided. "
                "The question text should contain the topic entity name and inquire about its behavior on computer systems, networks, or data, its built-in functions/characteristics, or its relationship with other entities.\n"
                "1. The answer should not be guessable by only reading the question and the four answer choices. \n"
                "2. Skip any questions about who discovered the topic, which company discovered it first, or which solution/software can solve the topic.\n"
                "3. The correct answer must be clearly supported by the content.\n"
                "4. Each question must have four answer choices: A, B, C, and D.\n"
                "   - One answer should be a 'wrong name' that appears in the content but is not supported as the correct answer. It should not be the same as the 'correct answer' or 'similar entity name'.\n"
                "   - Another answer should be a 'similar entity name' modified from the 'correct answer' or the 'wrong name' using synonyms/antonyms/meaningful word changes (for example, changing 'Expatriate Pakistanis' to 'Local Pakistanis'). Avoid trivial letter or spelling changes (such as 'Expatriate Pakistans' or 'Expatriate Pakistonis').\n"
                "   - One answer is the correct answer.\n"
                "   - The last answer should be a 'random name' that does not appear in the content and appears plausible if a reader only has access to the question and its four answer choices.\n"
                "5. You must return a JSON array of **exactly 10** question objects with no extra explanations or commentary.\n"
                "6. Each question must be in the following format:\n"
                "{\n"
                '  "raw_question": "<One challenging question that requires understanding of the input content>\\nA. ...\\nB. ...\\nC. ...\\nD. ...",\n'
                '  "correct_answer": "C"\n'
                "}\n\n"
                "If the input lacks sufficient details to create 10 valid questions, state so explicitly and return an empty JSON array: []\n\n"
                + entity_requirement
            )
        }
    ]

    return prompt_message


# One input string per DataFrame row: the topic entity followed by the full article text
alltext = [
    'Topic entity of the content are: ' + str(topic) + ' and the content is:' + str(body)
    for topic, body in zip(df["Topic Threat"], df['text'])
]

print(f"Number of generated input content: {len(alltext)}")

# Turn every input string into a chat prompt via prompt_list_maker from the tools module.
# NOTE(review): the length limits and sample_size are interpreted by tools.prompt_list_maker;
# if it drops or samples inputs, the prompts no longer line up one-to-one with df rows — confirm.
prompts = tools.prompt_list_maker(
    alltext,
    prompt_maker_fun=generate_question_prompt,
    max_length_inK_of_prompt=128,
    min_length_of_prompt=128,
    count_in_tokenlen=True,
    sample_size=100,
)

# Send the whole prompt list to the question-generating model
question_request_options = dict(
    token=89 * 1024,
    temp=1,
    model="gpto3mini",
    streamprint=False,
    max_workers=32,
    weight=2,
    forcegpt=True,
)
ans_questions = tools.ask_group_link(prompts, **question_request_options)


# Create df_ai for RQ2

In [None]:
import pandas as pd
import json
import re
from json_repair import repair_json
from json_repair import loads as json_loads

def simplify_kg_str(inputstring: str) -> str:
    """
    Condense a knowledge-graph dump into a short textual description.

    The entity list and relationship list are located between the
    ``#Final_Entity_List_Start#`` / ``#Final_Entity_List_End#`` and
    ``#Final_Relationship_List_Start#`` / ``#Final_Relationship_List_End#``
    markers (each followed by the word ``json`` and a JSON array).

      - Entities: only entities with a non-empty alias or mother entity are
        kept, rendered as ``name(alias:[...],mother:[...])``.
      - Relationships: rendered as ``[sub,rel,obj]`` triples; triples with an
        empty field are skipped.

    Field values are normalised before use, so ``null``, a bare string instead
    of a list, or a non-string scalar no longer raise.

    Args:
        inputstring: Raw text containing the marked sections.

    Returns:
        The simplified description, or ``inputstring`` unchanged when the
        joined relationship text is shorter than 10 characters (e.g. nothing
        could be parsed).
    """
    import json
    import re

    try:
        from json_repair import loads as json_loads
    except ImportError:
        # Strict stdlib parser as a fallback; malformed JSON then raises and is
        # turned into an empty list by _parse_section.
        json_loads = json.loads

    def _as_text(value):
        # Coerce any JSON value to stripped text; null becomes an empty string
        if value is None:
            return ""
        if isinstance(value, str):
            return value.strip()
        if isinstance(value, (list, tuple)):
            return ",".join(str(item).strip() for item in value)
        return str(value).strip()

    def _as_items(value):
        # Normalise an alias/mother field to a list of non-empty strings
        if value is None:
            return []
        if not isinstance(value, (list, tuple)):
            # A bare scalar (typically a single string) is one item, not a sequence of characters
            value = [value]
        cleaned = (_as_text(item) for item in value)
        return [item for item in cleaned if item and item != "None"]

    def _parse_section(pattern):
        # Return the JSON array captured by `pattern`, or [] when absent/unparseable
        match = re.search(pattern, inputstring)
        if not match:
            return []
        try:
            parsed = json_loads(match.group(1))
        except Exception:
            # Best effort: an unparseable section is treated as empty
            return []
        return parsed if isinstance(parsed, list) else []

    # Define regex patterns to extract entity and relationship sections
    entity_pattern = r"#Final_Entity_List_Start#\s*json\s*(\[[\s\S]*?\])\s*#Final_Entity_List_End#"
    relationship_pattern = r"#Final_Relationship_List_Start#\s*json\s*(\[[\s\S]*?\])\s*#Final_Relationship_List_End#"

    EntityList = _parse_section(entity_pattern)
    RelationshipList = _parse_section(relationship_pattern)

    # Simplify entities: keep only those with non-empty alias or mother entity
    simplified_entities = []
    for entity in EntityList:
        if not isinstance(entity, dict):
            continue
        name = _as_text(entity.get("name"))
        alias_list = _as_items(entity.get("alias"))
        mother_list = _as_items(entity.get("mother entity"))
        if not name or not (alias_list or mother_list):
            continue
        extras = []
        if alias_list:
            extras.append("alias:[" + ",".join(alias_list) + "]")
        if mother_list:
            extras.append("mother:[" + ",".join(mother_list) + "]")
        simplified_entities.append(name + "(" + ",".join(extras) + ")")

    # Simplify relationships: format as [sub, rel, obj] triple
    simplified_relationships = []
    for rel in RelationshipList:
        if not isinstance(rel, dict):
            continue
        sub = _as_text(rel.get("sub"))
        relation = _as_text(rel.get("rel"))
        obj = _as_text(rel.get("obj"))
        if sub and relation and obj:
            simplified_relationships.append("[" + ",".join([sub, relation, obj]) + "]")

    # Combine the relationships into a single string
    relationship_str = ",".join(simplified_relationships)

    # If the relationship string is too short, return the original input string
    if len(relationship_str) < 10:
        return inputstring

    return (
        "Special node with alias and mother entity: " + ",".join(simplified_entities)
        + "\nFull knowledge graph: " + relationship_str
    )
    
def simplify_kg_str_for_4o_and_o3(inputstring: str) -> str:
    """
    Condense a knowledge-graph dump in the 4o / o3 output format.

      - The entity list and relationship list are read from between the markers
            #Entity_List_Start# ... #Entity_List_End#
            #Relationship_List_Start# ... #Relationship_List_End#
        An optional markdown code fence around the JSON array is tolerated.
      - Entities: every named entity is emitted, with non-empty alias and
        mother entity information appended as ``(alias:[...],mother:[...])``.
      - Relationships: rendered as ``[sub,rel,obj]`` triples; non-string
        fields are converted to text and triples with an empty field are skipped.

    ``null`` fields are treated as missing, and an alias / mother entity given
    as a bare string is treated as a single item.

    Args:
        inputstring: Raw text containing the marked sections.

    Returns:
        The descriptive string; both parts are empty when nothing was parsed.
    """
    import json
    import re

    try:
        from json_repair import loads as json_loads
    except ImportError:
        # Strict stdlib parser as a fallback; malformed JSON then raises and is
        # turned into an empty list by _parse_section.
        json_loads = json.loads

    def safe_strip(value):
        # Coerce any JSON value to stripped text; null becomes an empty string
        if value is None:
            return ""
        if isinstance(value, str):
            return value.strip()
        if isinstance(value, (list, tuple)):
            # Convert each element in the list to string and join with commas
            return ",".join(str(item).strip() for item in value)
        return str(value).strip()

    def _as_items(value):
        # Normalise an alias/mother field to a list of non-empty strings
        if value is None:
            return []
        if not isinstance(value, (list, tuple)):
            # A bare scalar (typically a single string) is one item, not a sequence of characters
            value = [value]
        cleaned = (safe_strip(item) for item in value)
        return [item for item in cleaned if item and item != "None"]

    def _parse_section(pattern):
        # Return the JSON array captured by `pattern`, or [] when absent/unparseable
        match = re.search(pattern, inputstring)
        if not match:
            return []
        try:
            parsed = json_loads(match.group(1))
        except Exception:
            # Best effort: an unparseable section is treated as empty
            return []
        return parsed if isinstance(parsed, list) else []

    # Regex patterns supporting markdown code block wrappers
    entity_pattern = r"#Entity_List_Start#\s*(?:```json\s*)?(\[[\s\S]*?\])(?:\s*```)?\s*#Entity_List_End#"
    relationship_pattern = r"#Relationship_List_Start#\s*(?:```json\s*)?(\[[\s\S]*?\])(?:\s*```)?\s*#Relationship_List_End#"

    EntityList = _parse_section(entity_pattern)
    RelationshipList = _parse_section(relationship_pattern)

    # Simplify entities: output name with non-empty alias and mother entity information
    simplified_entities = []
    for entity in EntityList:
        if not isinstance(entity, dict):
            continue
        name = safe_strip(entity.get("name"))
        if not name:
            continue
        alias_list = _as_items(entity.get("alias"))
        mother_list = _as_items(entity.get("mother entity"))
        extras = []
        if alias_list:
            extras.append("alias:[" + ",".join(alias_list) + "]")
        if mother_list:
            extras.append("mother:[" + ",".join(mother_list) + "]")
        extra = "(" + ",".join(extras) + ")" if extras else ""
        simplified_entities.append(name + extra)

    # Simplify relationships: format as [sub, rel, obj] triple
    simplified_relationships = []
    for rel in RelationshipList:
        if not isinstance(rel, dict):
            continue
        sub = safe_strip(rel.get("sub"))
        relation = safe_strip(rel.get("rel"))
        obj = safe_strip(rel.get("obj"))
        if sub and relation and obj:
            simplified_relationships.append("[" + ",".join([sub, relation, obj]) + "]")

    return (
        "Special node with alias and mother entity: " + ", ".join(simplified_entities) +
        "\nFull knowledge graph: " + ", ".join(simplified_relationships)
    )

# Build rows for the final DataFrame
rows = []
# Methods whose stored output is offered to the answering model as context
methods = ['CyberDoc', 'GPT4o', 'CTIKG']

for i in range(len(df)):
    # Fetch the row once; 'text' is the article body and 'idorurl' its identifier
    source_row = df.iloc[i]
    text = source_row['text']
    idorurl = source_row['idorurl']

    # The generated answers may be fewer than the DataFrame rows; skip rows without one
    # instead of aborting the whole build with an IndexError.
    try:
        raw_answer = ans_questions[i]
    except (IndexError, KeyError):
        print(f"No generated questions for row {i}; skipping")
        continue

    # Parse the generated answer (a string). Try json.loads first; if it fails, repair with repair_json.
    try:
        questions = json.loads(raw_answer)
    except Exception:
        try:
            fixed = repair_json(raw_answer)
            questions = json.loads(fixed)
        except Exception as e:
            print(f"Error parsing ans_questions[{i}]: {e}")
            continue

    if not isinstance(questions, list):
        print(f"ans_questions[{i}] did not parse as a list")
        continue

    # For each question, emit one row per method plus one "No Content" baseline row.
    for q in questions:
        if isinstance(q, dict) and 'raw_question' in q and 'correct_answer' in q:
            raw_q = q['raw_question']
            # Retain the complete raw_question (including ABCD options), remove numbering at the beginning.
            # str() guards against a non-string value (e.g. null) coming back from the model.
            question_text = re.sub(r'^\d+\.\s*', '', str(raw_q).strip())
            answer_text = q['correct_answer']
            if isinstance(answer_text, str):
                # Padding around the letter would otherwise make a correct answer compare unequal later
                answer_text = answer_text.strip()
        else:
            # Malformed entry: keep its text so it stays visible, with no known answer
            question_text = str(q).strip()
            answer_text = ""

        for method in methods:
            # Choose the KG simplification by method: 'CyberDoc' uses simplify_kg_str,
            # 'GPT4o' (and 'Correct', if it is ever added to `methods`) use
            # simplify_kg_str_for_4o_and_o3; any other method's cell is passed through unchanged.
            method_cell = source_row[f'{method}结果']
            if method == 'CyberDoc':
                method_result = simplify_kg_str(method_cell)
            elif method in ('GPT4o', 'Correct'):
                method_result = simplify_kg_str_for_4o_and_o3(method_cell)
            else:
                method_result = method_cell
            rows.append([
                text,             # Article
                question_text,    # Question
                answer_text,      # Correct Answer
                method,           # Method
                method_result,    # Context provided by the method
                idorurl           # idorurl
            ])
        # Baseline row: the same question asked with no supporting context
        rows.append([
            text,             # Article
            question_text,    # Question
            answer_text,      # Correct Answer
            "No Content",     # Method
            "Sorry, No Context is available for this question, you should provide a final answer by your own",  # Fixed prompt
            idorurl           # idorurl
        ])

# Create the DataFrame with English column names
df_ai = pd.DataFrame(rows, columns=['Article', 'Question', 'Correct Answer', 'Method', 'Method Context', 'idorurl'])
print(f"Created {len(df_ai)} rows of data")
df_ai.head()


# Ask the questions using each method's context

In [None]:
def generate_prompt(article, original_question):
    """
    Wrap a context and a question into a single-turn chat prompt.

    Args:
        article: Context text (article or knowledge-graph description).
        original_question: The question text, including its answer options.

    Returns:
        A one-element list holding a ``{"role": "user", "content": ...}`` dict
        whose content is the context, the question and the answer-format
        instruction, in that order.
    """
    context_part = "The article or the related knowledge graph is provided below:\n" + article + "\n\n"
    question_part = "Here is the question:\n" + original_question + "\n\n"
    # Fixed instruction describing the expected "<think>...</think>LETTER" reply shape
    format_part = "Your answer should be in this format: <think>How to answer the question</think>Answer (one character A/B/C/D), such as '<think>The question is about the behavior of the entity, so I find that the content ''xxx'' may solve the problem</think>A'.\n"

    user_message = {"role": "user", "content": context_part + question_part + format_part}
    return [user_message]

# Prompts are split into two groups (CyberDoc rows vs. every other method); the matching
# df_ai index labels are recorded alongside so the answers can be written back afterwards.
qwq_prompts, qwq_indices = [], []
fouro_prompts, fouro_indices = [], []

# Walk df_ai row by row: the method's context plays the role of the article,
# and the "Question" column supplies the original question.
row_fields = zip(df_ai.index, df_ai['Method Context'], df_ai['Question'], df_ai['Method'])
for idx, context_text, question_text_for_prompt, method_name in row_fields:
    prompt = generate_prompt(context_text, question_text_for_prompt)
    if method_name == 'CyberDoc':
        target_prompts, target_indices = qwq_prompts, qwq_indices
    else:
        target_prompts, target_indices = fouro_prompts, fouro_indices
    target_prompts.append(prompt)
    target_indices.append(idx)

# Both groups are sent through tools.ask_batch with identical settings
batch_settings = dict(
    token=16 * 1024,
    temp=0.7,
    model="gpt4omini",
    streamprint=False,
    max_workers=12,
    weight='auto',
    forcegpt=True,
)
answer_qwq = tools.ask_batch(qwq_prompts, **batch_settings)
answer_gpt = tools.ask_batch(fouro_prompts, **batch_settings)

# Post-process the raw replies with tools.cleanthinkans before the option letter is extracted
answer_qwq_cleaned = tools.cleanthinkans(answer_qwq)
answer_gpt_cleaned = tools.cleanthinkans(answer_gpt)

import re
# Define a function to extract the first option letter (A/B/C/D) from an answer.
def extract_first_option(answer):
    """
    Return the first stand-alone option letter (A/B/C/D) found in an answer.

    The letter must not be part of a longer word, so the "A" in "Answer: B"
    is not mistaken for option A.

    Args:
        answer: The model's reply, normally a string.

    Returns:
        The matched letter, or ``answer`` unchanged when it is not a string or
        contains no stand-alone option letter.
    """
    if not isinstance(answer, str):
        # Missing or failed replies are passed through rather than raising in re.search
        return answer
    match = re.search(r'\b[ABCD]\b', answer)
    return match.group(0) if match else answer

# Write the extracted option letter back to the df_ai row each prompt was built from.
# The first pair covers the CyberDoc prompts, the second pair every other method.
answer_groups = (
    (qwq_indices, answer_qwq_cleaned),
    (fouro_indices, answer_gpt_cleaned),
)
for group_indices, group_answers in answer_groups:
    for idx, ans in zip(group_indices, group_answers):
        df_ai.at[idx, 'Context-based Answer'] = extract_first_option(ans)

print("Q&A complete, and the context-based answers (options A/B/C/D) have been mapped to df_ai.")

# Compare "Correct Answer" and "Context-based Answer" to generate a new column "Is Correct"
df_ai['Is Correct'] = df_ai['Correct Answer'] == df_ai['Context-based Answer']

# Group by "Method" to compute total counts and correct counts, then calculate accuracy.
accuracy_by_method = df_ai.groupby('Method')['Is Correct'].agg(Total='count', Correct='sum')
accuracy_by_method['Accuracy'] = accuracy_by_method['Correct'] / accuracy_by_method['Total']

print(accuracy_by_method)

# Create a mapping from 'idorurl' to the 'article type' value of the original df.
classification_map = df.set_index('idorurl')['article type'].to_dict()

# Add a new column "Major Category" to df_ai using the mapping from idorurl.
df_ai['Major Category'] = df_ai['idorurl'].map(classification_map)

# Group by "Method" and "Major Category" to calculate accuracy.
accuracy_by_method_and_class = df_ai.groupby(['Method', 'Major Category'])['Is Correct'].agg(
    Total='count', Correct='sum'
)
accuracy_by_method_and_class['Accuracy'] = accuracy_by_method_and_class['Correct'] / accuracy_by_method_and_class['Total']

# Pivot so that methods are rows and categories are columns.
pivot_accuracy = accuracy_by_method_and_class['Accuracy'].unstack('Major Category')

# Overall accuracy per method, computed over all of that method's rows.
pivot_accuracy['Average Accuracy'] = df_ai.groupby('Method')['Is Correct'].mean()

# Label the axes for presentation.
pivot_accuracy.rename_axis("Threat Category", axis="columns", inplace=True)
pivot_accuracy.rename_axis("Method", axis="index", inplace=True)


def _format_accuracy(value):
    # Render a fraction as a percentage with two decimals; missing cells become "N/A"
    return f"{value*100:.2f}%" if pd.notnull(value) else "N/A"


# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# use whichever element-wise method the installed pandas provides.
if hasattr(pd.DataFrame, "map"):
    pivot_accuracy = pivot_accuracy.map(_format_accuracy)
else:
    pivot_accuracy = pivot_accuracy.applymap(_format_accuracy)

# Reorder methods as specified (methods not listed here are dropped from the table).
method_order = ['CyberDoc', 'CTIKG', 'GPT4o', 'No Content']
pivot_accuracy = pivot_accuracy.reindex(method_order)

# Display the final results.
print("\nAccuracy for Different Threat Categories:")
print(pivot_accuracy)