# Key Prompts

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter;
from tqdm import tqdm

# Prompt fragment listing every entity type (name, definition, note, example),
# grouped under "###Main Topic:...###" headings.
# NOTE: this must be a raw string. The examples embed Windows paths such as
# C:\Windows\System32\ntdll.dll; in a normal literal the "\n" in "\ntdll.dll"
# is turned into a newline and sequences like "\S" / "\W" are invalid escapes
# (SyntaxWarning on recent Python versions).
entity_type_definition = r'''
###Main Topic:Actors and Identities###
Name:'user-account'
Definition:Represents a user account that can be used to log in and access system resources.
Note:This object is used to describe account information involved in an attack. Its key properties include the user ID (e.g., SID for Windows or UID for Linux), login name, and display name. This object can represent both human user accounts (e.g., 'user123') and system or service accounts (e.g., 'SYSTEM' or 'root'), which is crucial for analyzing attack behaviors like privilege escalation and lateral movement.
Example:NT AUTHORITY\SYSTEM

Name:'identity'
Definition:An entity that represents a person, organization, or group.
Note:This is a broad category for any formally identified entity. It must be distinguished from 'threat-actor-or-intrusion-set'. An 'identity' can be a victim, a reporting party, a researcher, or a benign company. Only when an 'identity' is determined to be malicious should it be classified as a 'threat-actor-or-intrusion-set'.
Example:Google Threat Analysis Group, FireEye

Name:'threat-actor-or-intrusion-set'
Definition:A combined entity that can represent an individual, group, or organization conducting cyberattacks (i.e., a 'Threat Actor'), or a set of attack activities with common goals, tactics, and infrastructure (i.e., an 'Intrusion Set').
Note:This type merges the attacker itself with its cluster of activity. When a threat actor has a clear name (e.g., APT28), use its name directly. If there is no clear name, but its activity can be identified by the tool or malware used, uniformly use the format 'Attacker(using: CVE-2021-44228)'.
Example:APT28, Sandworm Team, Attacker(using: CVE-2021-44228)

###Main Topic:Malicious Code and Tools###
Name:'detailed-part-of-malware-or-hackertool'
Definition:A specific component, function, module, or configuration block that is an integral part of a 'malware' or 'hacker-tool'.
Note:This describes the internal, often custom-built, workings of malicious software. It is the component-level counterpart to 'malware' and 'hacker-tool'. For components of legitimate software, use 'detailed-part-of-general-software'.
Example:The EternalBlue exploit module in WannaCry; a specific function named keylog_routine().

Name:'malware'
Definition:Software designed to be executed on a victim's system to cause harm, steal data, or establish unauthorized control. It functions as the malicious payload or "ammunition" in an attack.
Note:This entity represents the part of the attack that runs within the victim's environment, often automatically or silently. While some systems (like RATs) have components of both, 'malware' specifically refers to the client-side agent. Examples include ransomware (WannaCry), trojans (Emotet), and spyware (Agent Tesla).
Example: WannaCry, Emotet, Agent Tesla

Name:'hacker-tool'
Definition:Software used by an attacker or security professional to orchestrate, control, or facilitate a cyberattack. It functions as the "workbench" or "control panel" for the operation.
Note:This entity represents the part of the attack that typically runs within the attacker's environment. It is often interactive and used to create payloads, manage infrastructure, or control malware on victim systems. For the server/control component of a RAT, this type should be used. Examples include exploitation frameworks (Metasploit), C2 platforms (Cobalt Strike), and network scanners (Nmap). Only the name should be used.
Example: Cobalt Strike, Metasploit Framework, Mimikatz.

###Main Topic:Legitimate Software###
Name:'detailed-part-of-general-software'
Definition:A specific component, function, library, or command that is part of a 'general-software' entity and is being leveraged in a malicious context.
Note:This allows for a granular description of how legitimate software is abused. It is the component-level counterpart to 'general-software'.
Example:The Invoke-Expression cmdlet in PowerShell; the CreateRemoteThread Windows API function.

Name:'general-software'
Definition:Legitimate software with a non-malicious primary purpose that is either exploited as a target of an attack (e.g., via a vulnerability), serves as a host for malicious code, or is abused by threat actors to facilitate an attack.
Note:This is the counterpart to 'hacker-tool' and is central to Living off the Land (LotL) attacks. For its specific components, use 'detailed-part-of-general-software'.
Example: Microsoft Office, PowerShell, curl, OpenSSH

###Main Topic:Attack Actions and Campaigns###
Name:'vulnerability'
Definition:A flaw or weakness in software, hardware, or procedure that a threat actor can exploit to cause harm.
Note:This entity represents the weakness itself, not the act of exploiting it. An 'attack-pattern' is what 'exploits' a 'vulnerability'. It is most often identified by a formal tracking number like a CVE identifier.
Example:CVE-2021-44228

Name:'attack-pattern'
Definition:A specific attack method or technique described verbatim in the source text that an adversary uses to achieve a malicious objective.
Note:Indicates the exact attack method or technique mentioned in the source text, without mapping to any predefined TTP framework identifiers; retains the terminology used by the original author.
Example:Phishing email delivery, SQL injection attempt, Brute-force login operation

Name:'campaign'
Definition:Refers to a series of attack actions with specific objectives and a timeframe, typically launched by one or more threat actors to achieve a strategic goal.
Note:A campaign usually has a well-known name (e.g., 'Operation Shady RAT'). If an attack campaign does not have a formal or known name, but can be referred to by the key tool or malware it uses, the format 'Attacking(using: CVE-2021-44228)' should be used. 
Example:Operation Shady RAT, Attacking(using: CVE-2021-44228)

###Main Topic:Host-based Observables###
Name:'windows-registry-key'
Definition:Represents a key in the Windows Registry, which is a hierarchical database used for storing settings for the operating system and applications.
Note:The registry key is critical for analyzing malicious activity on Windows systems. Malware often achieves persistence, stores configuration, or disables security software by modifying or creating registry keys. The key properties of this object include the hive, the key path, and one or more values contained within the key, each of which has a name, data, and type.
Example:HKEY_CURRENT_USER\Software\Microsoft\Windows\CurrentVersion\Run

Name:'process'
Definition:An instance of a computer program that is being executed. It contains all the state information of the program at runtime.
Note:This is a core object for host-based analysis. Its key properties include the Process ID (PID), the process name (typically the executable file path), its command-line arguments, and a reference to its parent process (parent_ref). By analyzing processes and their parent-child relationships, a clear execution chain can be constructed, such as a Word document process launching a PowerShell process.
Example:powershell.exe:3104

Name:'file'
Definition:A computer file, which is a resource for storing information in a computer system.
Note:This entity should be distinguished from 'malware'. A 'file' is the technical object (a sequence of bytes with a name and path), while 'malware' is a classification of that file's malicious purpose. Any 'malware' entity is represented by a 'file' object, but not all 'file' objects are malware (e.g., a benign document targeted by an exploit). It is also different from a 'process', which is the running instance of a file.
Example:C:\Windows\System32\ntdll.dll

###Main Topic:Network Observables and Infrastructure###
Name:'url'
Definition:A Uniform Resource Locator, providing a complete, specific address for a resource on the World Wide Web.
Note:The key distinction is with 'domain-name'. A URL is more specific, containing the scheme (e.g., http, ftp), the domain, and often a port, path, and query string. Many different URLs can exist on a single domain.
Example:http://phishing-site.com/login.php?user=victim

Name:'domain-name'
Definition:A human-readable name that corresponds to a network resource, typically an IP address, as part of the Domain Name System (DNS).
Note:This entity should be distinguished from a 'url'. A 'domain-name' is only the host part (e.g., example.com), while a 'url' is the full address including the protocol and path (e.g., https://example.com/login). It is also distinct from the IP address(es) it 'resolves-to'.
Example:evil-c2-server.net

Name:'ipv4-addr'
Definition:An Internet Protocol version 4 (IPv4) address, which is a numerical label assigned to each device participating in a computer network that uses the Internet Protocol for communication.
Note:This is a fundamental network observable. It must be distinguished from a 'mac-address' (Layer 3 logical address vs. Layer 2 physical address) and a 'domain-name' (machine-routable address vs. human-readable name). An IP address by itself is just an observable; it only becomes an 'indicator' when context about its maliciousness is added.
Example:198.51.100.10

Name:'ipv6-addr'
Definition:An Internet Protocol version 6 (IPv6) address, representing a logical network address for modern internet protocols.
Note:Functionally similar to 'ipv4-addr' but with a much larger address space. All distinctions that apply to 'ipv4-addr' (e.g., vs. 'mac-address' or 'domain-name') also apply here.
Example:2001:0db8:85a3:0000:0000:8a2e:0370:7334

Name:'network-traffic'
Definition:Represents an aggregation of network traffic that flows between two or more network endpoints. It can be a single connection or a series of related network packets.
Note:This object does not represent a single packet but describes a "flow". It uses references to other objects to fully define this flow, such as source and destination IP addresses (src_ref, dst_ref), source and destination ports (src_port, dst_port), and the protocols used (protocols). It is often used to describe network-level activities like C2 communications, data exfiltration, or network scanning. In a graph, a traffic node is often named using a 5-tuple.
Example:tcp:192.168.1.100:51234-203.0.113.10:443

Name:'mac-address'
Definition:Refers to the Media Access Control Address, a unique physical address assigned to a network interface controller (NIC) for communication at the data link layer.
Note:A MAC address is typically a 48-bit address, represented as six groups of hexadecimal digits separated by colons or hyphens. It operates at Layer 2 (Data Link Layer) of the OSI model, which is different from an IP address that operates at Layer 3 (Network Layer). An IP address is a logical address that can be changed, while a MAC address is a physical address burned into the hardware. It is used to deliver data frames to the correct device on a local network.
Example:00:1A:2B:3C:4D:5E

Name:'email-address'
Definition:A specific address used to send and receive electronic mail messages.
Note:This is a fundamental observable object. It is a key component in phishing 'attack-patterns' and can be associated with an 'identity' or used as a login for a 'user-account'.
Example:phisher@example.com

Name:'infrastructure'
Definition:Hardware or software resources used to facilitate an activity, particularly an attack.
Note:This is a generic, fallback category. As per its original definition, if a more specific type like 'url', 'ipv4-addr', or 'domain-name' can be used, it should be preferred. 'infrastructure' is for cases where the type is ambiguous or is a higher-level concept (e.g., describing "the attacker's C2 network" as a whole).
Example:Attacker C2 Server

###Main Topic:Data and Credentials###
Name:'credential-value'
Definition:A piece of secret information used for authentication, such as a password, access token, or API key.
Note:This is a specific data type, distinct from 'user-account', which represents the account entity itself. For security reasons, the actual secret value should rarely be stored directly; this entity often represents the existence or type of credential that was compromised.
Example:Stolen Kerberos Ticket

Name:'x509-certificate'
Definition:Refers to a digital certificate that conforms to the X.509 standard, which verifies the authenticity of a public key by binding it to an identity (such as an individual, an organization, or a domain name).
Note:This object is very important when describing cybersecurity incidents, especially in TLS/SSL encrypted communications and code signing. Attackers may use self-signed, stolen, or forged certificates to encrypt C2 communications or to make malware appear legitimate. Its key properties include the issuer, subject, serial number, and various hashes of the certificate (e.g., SHA-1, SHA-256). The hash is often used as the unique identifier for the node.
Example:50D4583B1B35391AA89E86148B267974937447BF

###Main Topic:Intelligence, Defense, and Analysis###
Name:'indicator'
Definition:A pattern of observables or properties that signifies malicious or suspicious activity, serving as a forensic or detection artifact.
Note:An indicator is not the raw data itself, but rather an analytical conclusion or pattern derived from it. It's a critical distinction: An IP address is just an 'ipv4-addr' observable; that same IP address, when known to be malicious and packaged with context (e.g., "C2 server for Emotet"), becomes an 'indicator'. It's the bridge between raw data and actionable intelligence.
Example:[file:hashes.'MD5' = 'd41d8cd98f00b204e9800998ecf8427e']

Name:'course-of-action'
Definition:A recommended step or set of steps to mitigate a threat or remediate an incident.
Note:This entity represents a recommended defensive action, not a technical object itself. It is the answer to "What should we do about this threat?". It is the direct counterpoint to an 'attack-pattern', and is often linked to a threat via the 'mitigates' relationship.
Example:Apply patch for CVE-2021-44228

Name:'security-product'
Definition:A commercial or open-source software, hardware, platform, or service designed to provide cybersecurity functions.
Note:This should be distinguished from 'hacker-tool' (which is typically offensive) and 'general-software' (whose primary purpose is not security). Some tools, like network analyzers, can be dual-use, but are categorized here when used for defense.
Example:CrowdStrike Falcon

Name:'malware-analysis-document-or-publication-or-conference'
Definition:A specifically named or titled source of cybersecurity information, which can be a specific document, a formal publication, or an event.
Note:The key characteristic of this type is that the entity must have a specific, unique name or title. This rule is what distinguishes it from a 'generic-noun' like "Threat Report". For example, "Mandiant APT1 Report" is a specific instance and belongs here, while the term "Threat Report" itself is a 'generic-noun'. Intelligence is often derived from or 'based-on' entities of this type.
Example: 'Mandiant APT1 Report','Kaspersky Lab's "Operation Aurora" White Paper',

###Main Topic:Geographic and Contextual Information###
Name:'location'
Definition:A geographical location, which can range from a country to a specific street address.
Note:This entity is used to provide geographical context to other entities. It can be linked to a 'threat-actor-or-intrusion-set' via the 'originates-from' relationship or to an 'infrastructure' object via the 'located-at' relationship to specify its physical presence.
Example:Beijing, China

###Main Topic:Abstract and Fallback Categories###
Name:'abstract-concept'
Definition:Describes high-level, typically uncountable, ideas, fields of study, principles, or broad categories of activity that do not represent a specific, individual entity.
Note:This type is specifically for uncountable nouns that represent broad ideas. It serves as a high-level fallback category, following the priority: a specific entity type (e.g., 'malware') -> 'abstract-concept' -> 'other' -> 'noise'. This distinguishes abstract ideas from both concrete entities and countable categories of things ('generic-noun').
Example:Cyber Crime, Ransomware Ecosystem, Incident Response, Geopolitics.

Name:'generic-noun'
Definition:A term that refers to a class or type of entity, typically a countable noun, rather than a specific, named instance of that entity.
Note:This type is specifically for countable nouns that represent a class of objects. It helps differentiate between the name of a category and a specific member of that category. It functions within the classification fallback hierarchy: a specific entity type (e.g., 'malware') -> 'generic-noun' -> 'other'-> 'noise'.
Example:Threat Report (as a word), white paper (as a word), Malware (as a word), Vulnerability (as a word).

Name:'other'
Definition:Any other valid entities related to cyber threats that do not fit into any of the other defined categories.
Note:This category functions as a catch-all for meaningful data that is currently unclassifiable. It serves as a source for identifying potential new entity types as the classification schema evolves.
Example:Anti-Ransomware Day, MITRE ATT&CK

Name:'noise'
Definition:Data that is unidentifiable, incorrectly formatted, nonsensical, or otherwise irrelevant junk.
Note:This type is used to flag and isolate poor quality or corrupted data, preventing it from polluting analysis. It is the final classification for data that has no informational value.
Example:'quality#@!', ' (9/1718) [Japanese]'.
'''

rel_type_definition='''
###Main Topic:Attack and Compromise Relationships###
Name:'exploits'
Definition:Represents an entity leveraging a specific flaw or weakness within another entity (typically a Vulnerability) to achieve a malicious objective.
Note:This is a more specific form of 'uses'. When the action involves taking advantage of a known vulnerability, 'exploits' should be used instead of 'uses'.
Example:The malware exploits the CVE-2021-44228 vulnerability.

Name:'bypasses'
Definition:Indicates that an offensive entity (e.g., malware, exploit) successfully evades or circumvents a defensive measure. If 'mitigates' (such as patching to mitigate vulnerabilities) are successful actions for defenders, then 'bypasses' (such as using obfuscation techniques to bypass sandboxes) are successful actions for attackers. It is specifically used to describe the behavior of circumventing and bypassing defensive measures.
Example:The malware's obfuscation technique bypasses sandbox analysis.

Name:'malicious-investigates-track-detects'
Definition:Represents a malicious action where one entity (typically malware or a tool) performs either a discrete investigation, continuous tracking, or active detection of another entity to gather information or for evasive purposes.
Note:This relationship now covers three types of malicious information gathering and reconnaissance: Investigating: One-time reconnaissance of an entity (e.g., a system scan). Tracking: Long-term, continuous surveillance of an entity (e.g., keystroke logging). Detecting (Malicious): Evasion-focused discovery, such as identifying a sandbox, debugger, or specific security tool to alter behavior.
Example:Example 1 (Investigates): A malware implant malicious-investigates-track-detects local system configuration files. Example 2 (Tracks): A spyware module malicious-investigates-track-detects the user's web Browse history. Example 3 (Detecting): The malware malicious-investigates-track-detects the presence of a virtual machine environment.

Name:'impersonates'
Definition:Indicates that one entity actively masquerades as another, distinct entity to deceive or gain trust.
Note:This is distinct from 'alias-of'. 'impersonates' is a deceptive action between two separate entities. In contrast, 'alias-of' links two different names for the very same entity. For example, a hacker 'impersonates' the CEO in an email, whereas "APT28" is an 'alias-of' "Fancy Bear."
Example:A threat actor impersonates a trusted IT administrator to trick users.

Name:'targets'
Definition:Describes an offensive entity directing its actions against another entity. It expresses the intent and direction of an attack.
Note:'targets' describes intent, while 'compromises' describes a successful outcome. An actor might 'target' the financial industry for years but 'compromise' a specific bank in a single operation. 'targets' is also broader than 'exploits'; an actor can 'target' an organization, whereas they 'exploit' a specific vulnerability within that organization's systems.
Example:A phishing campaign targets employees in the financial sector.

Name:'compromises'
Definition:Represents that an offensive entity has successfully violated the confidentiality, integrity, or availability of a target, achieving some form of unauthorized access or control.
Note:See the note under 'targets' for a direct comparison.
Example:The threat actor compromised the company's domain controller.

Name:'leads-to'
Definition:Describes a causal relationship where one entity or event directly results in another outcome or state, often used in attack chains. Relationships such as 'exploits', 'delivers', and 'executes' are all “points” in the attack chain, and 'leads-to' is the “line” connecting these points, clearly showing the logic of “vulnerability exploit leads to remote code execution”. When a relationship meets the subdivision relationship of 'exploits', 'delivers', and 'executes', choose them instead of 'leads-to'.
Example:Exploitation of a vulnerability leads-to remote code execution.

###Main Topic:Data and Payload Movement###
Name:'drops'
Definition:Represents an entity creating a new file on the local filesystem from its own embedded or internal resources.
Note:This relationship exclusively describes the action of Local -> Local file creation, with no network communication involved. This is distinct from 'downloads', which is an External -> Local action.
Example:The installer drops a malicious DLL file into the System32 folder.

Name:'downloads'
Definition:Represents an entity retrieving a file or data from an external, remote source and saving it to the local system.
Note:This relationship exclusively describes the action of External -> Local data transfer. It is the direct opposite of 'drops', which involves no network communication.
Example:The dropper downloads a second-stage payload from a malicious URL.

Name:'executes'
Definition:Signifies that one entity (e.g., a loader, script) runs or initiates another entity (e.g., a malicious executable).
Example:A dropper executes a second-stage payload.

Name:'delivers'
Definition:Represents a higher-level, abstract relationship where one attack component is responsible for 'bringing' a malicious payload or tool to the target environment.
Note:This describes the abstract "bringing" action within an attack chain, answering "How did the payload get here?" at a tactical level. For example, a phishing email 'delivers' malware; this delivery might be achieved through the user 'downloads' an attachment, which then 'drops' an executable.
Example:A phishing campaign delivers the Ursnif malware.

Name:'beacons-to'
Definition:Specifically indicates that malware or an implant periodically sends 'beacon' or 'heartbeat' signals to its Command and Control (C2) server.
Example:Malware beacons-to (beacons-to) Command and Control URL.

Name:'exfiltrate-to'
Definition:Specifically describes the act of stealing data from a compromised system and transmitting it outward to a target location specified by the attacker, such as a server or IP address.
Note:The core of this relationship is purposeful, outbound data transmission. Its distinction from other network relationships lies in intent and direction: (1) Versus 'communicates-with': 'exfiltrate-to' is a specific type of 'communicates-with'. If the purpose of the communication is confirmed to be data theft, 'exfiltrate-to' should be preferred for more precise semantics. If the purpose is unknown, the more general 'communicates-with' should be used. (2) Versus 'downloads': The data flow direction is the opposite of 'downloads'. 'downloads' refers to fetching files from an external source into the victim system, while 'exfiltrate-to' refers to uploading data from the victim system to an external source. (3) Versus 'leaks': 'exfiltrate-to' typically describes a targeted, covert transfer from a victim to an attacker. In contrast, 'leaks' (if used as a custom relationship) usually refers to a broader, potentially public or semi-public data disclosure.
Example:A spyware implant (Malware) exfiltrate-to a specific FTP server (Infrastructure) to upload stolen documents.

Name:'leaks'
Definition:Represents the unauthorized disclosure or public release of sensitive resources. This includes confidential data (e.g., documents, credentials) as well as operational assets like malware source code or vulnerability details. 
Note:'exfiltrate-to' (malware steals data to a server) describes the directed transfer of data from the victim to the attacker. 'leaks' (internal threat actors leak company documents) describes the unauthorized public or semi-public disclosure of sensitive resources (data, source code, etc.). The core difference between the two lies in the direction of information flow and the degree of disclosure.
Example:An insider threat leaks confidential corporate documents online. The source code for a prominent banking trojan leaks onto a public repository.

Name:'communicates-with'
Definition:Describes the occurrence of network communication between two entities. It is a general relationship for network interactions.
Note:'beacons-to', 'downloads', and 'exfiltrate-to' are all specific types of 'communicates-with'. If the traffic is a periodic heartbeat, 'beacons-to' is more precise. If the purpose is to retrieve a file, use 'downloads'. If it is to send data out, use 'exfiltrate-to'. Use 'communicates-with' for general descriptions or when the specific purpose is unknown.
Example:The implant communicates-with a C2 server every hour.

###Main Topic:Infrastructure and Provisioning###
Name:'resolves-to'
Definition:A specific technical relationship describing a domain name being resolved to one or more IP addresses via the Domain Name System (DNS).
Note:See the note under 'hosts'. This relationship is a core technical link for establishing network infrastructure associations.
Example:The malicious domain https://www.google.com/search?q=evil-phishing.com resolves-to the IP address 198.51.100.10.

Name:'hosts'
Definition:Indicates that an infrastructure entity 'carries' or provides the runtime environment for another object, such as a malicious payload, website, or C2 service.
Note:This relationship describes 'carrying' at the infrastructure level. It is distinct from 'delivers', which describes a tactical action, and 'provides', which is more general. For example, a server ('hosts') a malware file, which is then ('downloads') by a victim after being ('delivers') by a phishing link.
Example:A bulletproof hosting provider hosts malware command and control servers.

Name:'provides'
Definition:A general relationship where one entity supplies another with a resource, service, or capability.
Note:This is the most abstract supply relationship and should be used when a more specific term is not applicable. Follow the priority: use 'delivers' for tactical delivery or 'hosts' for infrastructure hosting first. Use 'provides' only when the relationship is more general than these options.
Example:A bulletproof hosting service provides infrastructure for a phishing campaign.

###Main Topic:Attribution and Association###
Name:'authored-by'
Definition:Defines the creator or development source of an entity, such as malware, a tool, a report, or an attack pattern. It is used to trace the provenance of an object.
Note:The core of this relationship is to clarify "who created it". It has key distinctions from other relationships: (1) Versus 'attributed-to': 'authored-by' focuses on the act of creation itself, while 'attributed-to' focuses on assigning responsibility for an attack campaign. An organization can have 'authored-by' a tool, while the campaign that uses the tool is 'attributed-to' another group. (2) Versus 'owns': 'owns' describes the state of ownership over infrastructure or tools, while 'authored-by' describes their creation source.
Example:The Lazarus Group (Identity) authored-by a custom backdoor malware (Malware).

Name:'owns'
Definition:Describes a real-world entity (e.g., an organization, team, or an individual) having ownership or de facto dominion over another entity (e.g., infrastructure, a domain name, or a tool).
Note:The core of this relationship is ownership by a real-world entity. Its distinction from 'controls' lies in the nature of the subject: the subject of 'owns' is a real-world entity (a team, an individual), while the subject of 'controls' is software. This is a critical distinction as it separates the real-world actor from their digital-world proxy tools.
Example:The APT41 group (Identity) owns the domain name evil-domain.com and the C2 server.

Name:'controls'
Definition:Specifically describes the relationship where one software entity (e.g., a trojan, backdoor, RAT) commands and controls another software entity (e.g., a hijacked process, a browser plugin).
Note:The core of this relationship is software-level control. Its key distinction from 'owns' is the level of the controller: the subject of 'controls' is a piece of software (e.g., a RAT), while the subject of 'owns' is a real-world entity (e.g., a team). For example, a team can 'own' a domain name, and the RAT program on the C2 server pointed to by that domain then 'controls' another process on the victim host.
Example:A Remote Access Trojan (RAT) controls a compromised browser process to steal cookies.

Name:'attributed-to'
Definition:Formally assigns the responsibility for a threat activity, such as an Intrusion Set or Campaign, to one or more Threat Actors. This is typically the conclusion derived from intelligence analysis and attribution efforts.
Note:It differs from 'authored-by' and 'affiliated-with'. 'attributed-to' focuses on the responsibility for an attack, while 'authored-by' pertains to the creation of an entity, like malware. An organization might 'author' a tool, but if another affiliated group uses it in an attack, the attack activity is 'attributed-to' the latter. 'affiliated-with' describes a broader organizational or social connection (e.g., membership, employment), whereas 'attributed-to' is a specific assignment of culpability for an action.
Example:Intrusion Set "Sandworm" is attributed-to Russian GRU Unit 74455.

Name:'affiliated-with'
Definition:Describes an affiliation, employment, or membership relationship between individuals and organizations. 'authored-by' refers to the creation relationship, 'attributed-to' refers to the responsibility for the attack, and 'owns' refers to the ownership of the infrastructure. 'affiliated-with' describes an 'affiliation' relationship at the organizational or social level, which is not necessarily creation, attack or ownership. When a relationship meets the subdivision relationship of 'attributed-to' and 'owns', choose them instead of 'affiliated-with'.
Example:A security researcher is affiliated-with a university.

Name:'cooperates-with'
Definition:Describes active, non-hierarchical collaboration between two or more peer entities, such as threat groups working together. 'affiliated-with' describes an affiliation. 'cooperates-with' (threat A cooperates with threat B) describes a collaborative relationship between peer entities.
Example:Threat Actor A cooperates-with Threat Actor B in a joint operation.

###Main Topic:Composition, Capability and State###
Name:'is-part-of'
Definition:Used when one entity is a component, member, or constituent of a larger entity. It is the inverse of ''consists-of''.
Example:A malicious module is-part-of a larger malware family.

Name:'consists-of'
Definition:Describes the compositional relationship where a complex entity is made up of its structural subcomponents.
Note:This relationship should be used to detail an object's "bill of materials" or internal architecture. It is distinct from 'has', which is used to attribute abstract features or capabilities rather than constituent parts. Use 'consists-of' to answer the question, "What is it made of?"
Example:The TrickBot malware framework consists-of numerous distinct modules, such as a password grabber and a VNC module.

Name:'has'
Definition:Indicates that an entity possesses a specific feature, function, or capability, which may be abstract in nature.
Note:This relationship is best used for attributing characteristics or functions to an object. It differs from 'consists-of', which is used for deconstructing an object into its physical or logical components. Use 'has' to answer the question, "What can it do?" or "What properties does it possess?"
Example:A backdoor Trojan has a persistence capability.

Name:'depends-on'
Definition:Signifies that one entity requires another entity to exist or function correctly.
Note:This describes a state of prerequisite or dependency. It differs from uses, which describes an action. For example, malware uses PowerShell to execute commands, but it depends-on a specific library to run. It covers terms like requires and is required for.
Example:A malware depends-on a specific version of the .NET Framework.

Name:'creates-or-generates'
Definition:An entity dynamically creates or generates another entity, such as a file, process, or data.
Note:This is more general than authored-by (which is about original creation by an identity) and drops (which is specific to malware placing a file). It describes the runtime action of creation. It covers terms like create, creates, and generates. If the relationship is more concise, such as a malware creating a file, use 'drops' instead, or if it is about the original creation by an identity, use 'authored-by'. Otherwise, use 'creates-or-generates' to capture the action of creation or generation in a broader sense.
Example:A malware creates-or-generates a new registry key. A malware creates-or-generates notification popups.

Name:'modifies-or-removes-or-replaces'
Definition:Indicates that an entity alters, replaces, or removes another entity or its components, such as changing a registry key.
Example:A ransomware modifies(modifies-or-removes-or-replaces) the Master Boot Record.

Name:'uses'
Definition:Represents that an entity employs or leverages another entity to achieve its objectives. It is a highly general, active relationship describing "A uses B to do something."
Note:Differentiated from 'depends-on' and 'exploits'. 'uses' is an active behavior (e.g., malware uses PowerShell to execute commands), while 'depends-on' is a static, prerequisite state (e.g., the malware's execution depends-on the .NET Framework). 'exploits' is a special case of 'uses' that specifically involves leveraging a 'vulnerability'; if a vulnerability is leveraged, 'exploits' should be preferred.
Example:Threat Actor APT41 uses the Cobalt Strike framework.

###Main Topic:Classification and Lineage###
Name:'variant-of'
Definition:Indicates that one entity is a direct evolutionary version of another, typically sharing a lineage in code or core functionality.
Note:This is distinct from 'derived-from' and 'compares-to'. 'variant-of' implies direct derivation, often at the code level (e.g., the Zeus malware has countless 'variants'). 'derived-from' is more abstract, signifying conceptual or technical inspiration without direct code reuse. 'compares-to' is for a general comparison of attributes without implying any lineage.
Example:The Gootkit malware is a variant-of the earlier Gozi trojan.

Name:'derived-from'
Definition:Indicates that an entity is conceptually, technically, or philosophically inspired by or based on another, but is not a direct code-level evolution.
Note:See the note under 'variant-of'. 'derived-from' represents a more abstract, "intellectual lineage" relationship.
Example:The techniques used in the Triton malware were derived-from the know-how developed for the Stuxnet attack.

Name:'alias-of'
Definition:Indicates that one entity is an alternative name or identifier for another.
Note:This provides a direct and explicit way to link known aliases, which is more specific than the broader compares-to relationship. It is a bidirectional relationship. This covers terms like has alias and is alias of.
Example:APT28 alias-of Fancy Bear.

Name:'compares-to'
Definition:Indicates a comparative relationship between two entities based on their features, behavior, complexity, or other attributes. 'variant-of' means two entities have a direct evolution or code-derived variant relationship, while 'compares-to' is broader and can include any form of comparison. When a relationship meets both criteria, 'variant-of' should be used instead.
Example:Malware A compares-to Malware B in its propagation method.

Name:'categorized-as'
Definition:Links an entity to its formal classification or type within a given taxonomy. 'variant-of' is a specific evolutionary classification. 'categorized-as' is a more formal, ontological classification relationship, for example, used to link an instance to a category in a taxonomy.
Example:The threat activity is categorized-as a form of ransomware attack.

###Main Topic:Geographic Relationships###
Name:'located-at'
Definition:Specifies the current or known geographic location of an entity.
Note:This is distinct from 'originates-from'. 'located-at' refers to the present location, while 'originates-from' refers to the place of origin or provenance. For example, a threat actor may 'originates-from' Iran, but the server they use is 'located-at' a data center in the Netherlands.
Example:A command and control server is located-at a data center in Germany.

Name:'originates-from'
Definition:Specifies the place of origin or provenance of an entity.
Note:See the note under 'located-at' for a direct comparison.
Example:The Stuxnet malware is believed to originate-from the United States and Israel.

###Main Topic:Analysis and Defense Relationships###
Name:'indicates'
Definition:Represents an inferential relationship where the presence of one entity (typically an Indicator) serves as evidence or a sign of another threat entity. It expresses that "if A is observed, it likely signifies that B exists or is occurring."
Note:The core of this relationship is analytical inference. It is distinct from the 'detecting' function within other relationships (e.g., 'research-describes-analysis-of-characterizes-detects'). The 'detecting' function represents an active, confirmed discovery, whereas 'indicates' represents a probabilistic link ("this likely means that"). This relationship is fundamental for operationalizing threat intelligence, as it directly connects a detectable artifact (the IOC) to the threat it helps to identify.
Example:An IP address (indicator) indicates a malware.

Name:'mitigates'
Definition:Indicates that a defensive measure or Course of Action effectively counters, reduces, or remediates the threat posed by an Attack Pattern, Vulnerability, or Malware.
Note:This is the inverse of 'bypasses'. 'mitigates' is a successful action for the defender (e.g., a patch mitigates a vulnerability), whereas 'bypasses' is a successful action for the attacker (e.g., an obfuscation technique bypasses a sandbox).
Example:Applying the MS17-010 patch mitigates the EternalBlue exploit.

Name:'based-on'
Definition:Indicates that an object (e.g., report, indicator, signature) is derived from or based on the information or analysis of another object (e.g., observed data, another report, malware sample).
Example:Indicator based-on (based-on) Observed Data.

Name:'research-describes-analysis-of-characterizes-detects'
Definition:A comprehensive research and defense relationship that signifies a document describing a subject, an actor analyzing a subject, a formal analysis object characterizing a subject's behavior, or a defensive tool identifying a threat.
Note:This consolidated relationship serves four primary purposes:Describing: Linking a textual document or publication to the entity it is about.Analyzing: Linking an analytical actor (e.g., a researcher or organization) to the subject of their investigation.Characterizing: Linking a formal analysis object (e.g., a Malware Analysis run) to the entity it was performed on.Detecting (Defensive): Linking a defensive tool, signature, or security product to the threat it successfully identifies.
Example:Example 1 (Describing): A Mandiant report research-describes-analysis-of-characterizes-detects the APT1 group.Example 2 (Analyzing): A security researcher research-describes-analysis-of-characterizes-detects a new malware sample. Example 3 (Characterizing): A sandbox analysis run research-describes-analysis-of-characterizes-detects the WannaCry malware. Example 4 (Detecting): An antivirus signature research-describes-analysis-of-characterizes-detects a specific malware file.

###Main Topic:Meta and Fallback Relationships###
Name:'negation'
Definition:Represents the confirmed absence of a relationship, link, characteristic, or action between entities.
Note:This type is used to explicitly state that a suspected or potential relationship does not exist. It is crucial for refuting claims or clarifying the scope of an entity's attributes. It should be used for phrases like does not contain, has no links to, is not affected by.
Example:A threat report states that Malware X negation (is not affected by) Vulnerability Y.

Name:'other'
Definition:If a relationship exists but does not fit into the categories above, and write down the value of 'rel' as the original text of the relationship.
Example: Not available, Not Applicable, Unknown, etc.
'''

def LLM4CTI_chunk_prompt_maker(text_pair):
    """Build the chat prompt that asks an LLM to extract a CTI knowledge graph from one chunk.

    The prompt is assembled from fixed instruction segments interleaved with the
    module-level ``entity_type_definition`` and ``rel_type_definition`` strings,
    followed by the full article text and the chunk currently being processed.

    Args:
        text_pair: A two-item iterable ``(full_text, chunk_text)``. Both items are
            passed through ``str()`` before being embedded in the prompt.

    Returns:
        A list holding a single ``{'role': 'user', 'content': ...}`` message dict.
    """
    document_text, segment_text = text_pair

    # Segment 1: task description and the entity-extraction stages.
    stage_one_text = '''
    You are an NLP model specialized in threat intelligence extraction. Your task is to extract a knowledge graph related to cybersecurity threats from a given threat intelligence report or blog article, including entities (nodes) and relationships (edges) among those entities, and output the results in a specified format.
    [Entities (Nodes)]
    What's an entity:

    In cyber threat intelligence, 'entity' refers to any unit of information that can be independently identified, described, and analyzed, and it forms a fundamental component of each link in threat activity. It is important to note that entities here are not limited to those verified indicators (IOCs) specifically used for detection, such as specific IP addresses, domain names, or file hashes, but a broader concept. Entities can be objects with clear names and characteristics (such as 'get-logon-history.ps1'), or data objects such as 'RAR file', even if it has no fixed naming rules.

    The main characteristics of entities are:

        Independence: Each entity exists as an independent unit of information and can be extracted and analyzed separately;

        Relevance: There may be inherent connections between entities, and by associating this information, a complete attack chain or threat portrait can be constructed;

        Diversity: Entities can cover various forms of information such as files, scripts, configuration files, registry entries, network traffic data, log records, etc.;

        Contextual significance: Even if an entity (such as a downloaded 'RAR file') does not have a specific name, as long as it has contextual significance and analytical value in threat intelligence analysis, it is still a valid entity.

    Important Note

        Only extract entities from named entities that directly appear in the current chunk, i.e., only pay attention to the entities explicitly mentioned in the current chunk.

        In the process of relationship extraction, if it is discovered that an entity in the current chunk has a relationship with an entity that does not directly appear in the chunk (for example, through indefinite pronouns, chapter hints, or contextual implications), then add that outside-chunk entity to the entity list and extract the corresponding relationship.

    Entity Extraction Steps
    Stage 1 – Entity Extraction and Classification

    Stage 1.1 Fully scan the #current chunk's text# to identify all entities related to cybersecurity threats, including:

        Explicitly named indicators (IPs, domains, filenames, hash values, etc.)

        Implicit threat components (attack stages, undocumented tools, generic file types)

        Contextually significant objects ('RAR file', 'registry entry') even without specific names

    Stage 1.2 – Type Assignment

        Assign one predefined category to each entity.

        Use threat-actor/intrusion-set formatted names for unnamed attackers/attacks.

        Apply other category only when no predefined type matches.

    Stage 1.3 – Alias & Lineage Handling

        Record all aliases or alternative names for entities.

        Identify evolutionary relationships for the 'mother entity' field (e.g., malware variants).

    Stage 1.4 – Recheck (Self-Reflection on Completeness)
    Before finalizing entity extraction, pause and critically reflect on whether any relevant entities may have been overlooked.
    First, check if any explicitly named entities were unintentionally ignored.
    Then, examine whether non-explicitly named entities have been missed—these may be referenced using generic terms like file, image, or script, but actually point to specific entities in context.

    '''

    # Segment 2: relationship-extraction stages (placed after the entity type definitions).
    stage_two_text = '''

Stage 2 – Relationship Extraction and Classification

   Stage 2.1 To determine the relationship between entities, you MUST follow a strict, prioritized decision process to generate the 'rel' and 'rel_type' values. Evaluate the following cases in order and stop as soon as one case is met.

   Case 1: An Explicit Relationship Phrase Exists in the Text (Highest Priority)
   First, check if the text contains an explicit phrase (a verb, prepositional phrase, or descriptive clause) that directly links the entities.
   - If YES:
     - The 'rel' value MUST BE this explicit phrase from the text (simplified for conciseness if necessary).
     - Then, for the 'rel_type', find the best-matching category from your predefined list. If no predefined type is a good fit, you MUST use ['other'].
   - Example A (rel_type match found):
     - Text: "LazyFox 'deployed' a custom version of Cobalt Strike."
     - Output: { 'sub': 'LazyFox', 'rel': 'deployed a custom version of', 'rel_type': ['uses'], 'obj': 'Cobalt Strike' }
   - Example B (no rel_type match):
     - Text: "The malware is 'designed to evade' sandbox environments."
     - Output: { 'sub': 'malware', 'rel': 'is designed to evade', 'rel_type': ['other'], 'obj': 'sandbox environments' }

   Case 2: No Explicit Phrase, but an Implied Relationship Matches a Predefined rel_type
   If Case 1 does not apply, then analyze the implied relationship. Check if this implied action perfectly matches one of the categories in your predefined 'rel_type' list.
   - If YES:
     - Both the 'rel' value and the 'rel_type' value MUST BE the name of that matching predefined category. They will be identical in this case.
   - Example:
     - Text: "...analysis revealed network 'traffic between' the infected host and the domain evil.com."
     - Implied action is 'communication'. Assuming 'communicates-with' is a predefined 'rel_type'.
     - Output: { 'sub': 'infected host', 'rel': 'communicates-with', 'rel_type': ['communicates-with'], 'obj': 'evil.com' }

   Case 3: No Explicit Phrase and No Predefined rel_type Fits (Fallback)
   If neither Case 1 nor Case 2 applies, this is your final option.
   - Action: You MUST infer the underlying relationship, summarize it into a concise verb phrase yourself, and use this summary as the 'rel' value.
   - The 'rel_type' in this case MUST BE ['other'].
   - Example:
     - Text: "The attacker stored the stolen credentials 'inside a base64-encoded string within a JPEG image's metadata'."
     - Implied action is complex: hiding/embedding data. No predefined type fits.
     - Output: { 'sub': 'attacker', 'rel': 'hides credentials in image metadata', 'rel_type': ['other'], 'obj': 'stolen credentials' }
     
    Stage 2.2 If an entity in the current chunk has a relationship with an entity not directly mentioned in the chunk, but in the full text part(e.g., through indefinite pronouns, chapter references, or context hints), add that outside-chunk entity to the entity list and extract the corresponding relationship. A common example is current chunk has a list of multiple urls/hashs/filename/domains with out any specific relationship, but you should think if the list of entities has a relationship with the topic threat entity. And the listing means the those 'entities' are related to the topic threat entity with 'indicates' or 'characterizes' relationship.

    Stage 2.3 – Recheck
    Ensure that every entity extracted in Stage 1 has at least one corresponding relationship description. There must not be an entity listed without any relationship.
    If necessary, infer that the entity might have some indirect relationship with another entity in the chunk. Otherwise, that entity might be related to a main entity in the broader text (e.g., an APT, Malware, Threat Actor, Vulnerability, etc.) that was introduced outside the current chunk. In that case, add the main entity (from outside the chunk) to the current chunk’s entity list and extract their relationship.

   '''

    # Segment 3: output format, worked example, and the header that precedes the full text.
    stage_three_text = '''
    Stage 3 – Output Generation
    [Normalization of Obfuscated URLs, IPs, and Emails]

    Obfuscated URLs, IP addresses, and email addresses must be converted to their original format:

        Replace [.] with . in URLs and IPs (e.g., 192[.]168[.]1[.]1 → 192.168.1.1).

        Replace # with @ and [.] with . in emails (e.g., contact#example[.]com → contact@example.com).

        Only apply this to URLs, IPs, and emails—leave other obfuscations unchanged.

    Both the entity list and the relationship list must strictly follow the formats below. Ensure the entity names are consistent in both the entity list and the relationships, and use correct JSON formatting:
    Part 1: Entity List

        The entire JSON array must be strictly enclosed between #Entity_List_Start# and #Entity_List_End#.

        Each entity node must include the following attributes (all attribute values should be strings or string arrays):

            'name': The specific name of the entity.

            'type': The category of the entity (each entity node must have only one type value).

            'alias': The alias name of the entity mentioned in the text.

                If multiple aliases exist, format as ['Alias1', 'Alias2'].

                If there is only one, format as ['Actual_Value'].

                If none, use ['None'].

            'mother entity': If the entity is a variant or evolution of another entity, provide the name of its parent entity; otherwise, use ['None'].

    Part 2: Entity Relationships

        Extract relationship descriptions between entities from the text and output them as a JSON array of objects, with keys sub, rel, rel_type, and obj.

        The entire JSON array must be strictly enclosed between #Relationship_List_Start# and #Relationship_List_End#.

        Each relationship object must follow this format:

        {
            'sub': '<Source Entity>',
            'rel': '<Relationship Text>',
            'rel_type': ['<Relationship Type Category>'],
            'obj': '<Target Entity>'
        }

            'sub': Must exactly match the source entity name extracted in Part 1.

            'rel': A verb or phrase summarizing the relationship as described in the text (if the original text is long, it can be simplified).

            'rel_type': An array listing one or more of the predefined relationship types (e.g., 'uses', 'targets'). Even if there is only one, it should still be formatted as an array.

            'obj': Must exactly match the target entity name extracted in Part 1.

    Below is an example:

    #Entity_List_Start#
    ```json
    [
    { 'name': 'exampleAPT', 'type': 'threat-actor', 'alias': ['exampleAPTnickname'], 'mother entity': ['None'] },
    { 'name': 'exampleTool', 'type': 'hacker-tool', 'alias': ['None'], 'mother entity': ['None'] },
    { 'name': 'exampleCVE', 'type': 'vulnerability', 'alias': ['None'], 'mother entity': ['None'] }
    ]
    #Entity_List_End#

    #Relationship_List_Start#

    JSON

    [
    {
        'sub': 'exampleAPT',
        'rel': 'utilized',
        'rel_type': ['uses'],
        'obj': 'exampleTool'
    },
    {
        'sub': 'exampleTool',
        'rel': 'is using',
        'rel_type': ['exploits'],
        'obj': 'exampleCVE'
    },
    {
        'sub': 'exampleAPT',
        'rel': 'leverages vulnerability',
        'rel_type': ['exploits'],
        'obj': 'exampleCVE'
    }
    ]
    #Relationship_List_End#
    【Start of the full text】
    '''

    # Delimiters wrapped around the chunk that is being processed.
    chunk_open_text = '''
    【Start of the current chunk】
    '''
    chunk_close_text = '''
    【End of the current chunk】
    '''

    # Concatenate in the fixed order: instructions, type catalogues, full text, chunk.
    user_content = (
        stage_one_text
        + entity_type_definition
        + stage_two_text
        + rel_type_definition
        + stage_three_text
        + str(document_text)
        + chunk_open_text
        + str(segment_text)
        + chunk_close_text
    )

    user_message = {'role': 'user', 'content': user_content}
    return [user_message]

def LLM4CTI_merger_prompt_maker(merged_text):
    """Build the chat prompt that asks an LLM to merge per-chunk extraction results.

    The prompt contains fixed merging rules and the required output format,
    followed by the concatenated per-chunk entity/relationship output.

    Args:
        merged_text: The per-chunk extraction results to merge. It is passed
            through ``str()`` and appended verbatim to the end of the prompt.

    Returns:
        A list holding a single ``{'role': 'user', 'content': ...}`` message dict.
    """
    # NOTE: the alias example in rule 1 uses the 'alias-of' relationship type.
    # The relationship taxonomy given to the extraction prompt defines 'alias-of'
    # for alternative names (with this same APT28 / Fancy Bear example) and
    # reserves 'variant-of' for evolutionary/code lineage, so asking for
    # 'variant-of' here would add false lineage edges to the merged graph.
    merge_instructions = '''
You are working on merging results from a distributed knowledge graph construction task for cybersecurity threat intelligence. Before your work, a single article was split into multiple chunks, and each chunk has gone through entity and relationship extraction. Now, you need to merge the processing results of these chunks to form a complete knowledge graph.

Your core task: Please merge the results of multiple chunks according to the following strict rules:

### Merging Rules
1. Entity Merging:
    - Consider two entities the same if either:
      a) Their names are identical or semantically equivalent (including case differences).
    - Merging Strategy:
      * Keep the simplest naming format (e.g., 'APT28' and 'APT28 (Fancy Bear)' merge into the former, while 'APT28' adds an attribute `'alias': ['Fancy Bear']`), and add a relationship:
        `'sub': 'APT28', 'rel': 'alias of', 'rel_type': ['alias-of'], 'obj': 'Fancy Bear'`.
      * Merge all alias lists (removing duplicates).
      * Merge all mother entity relationships (keep the most complete evolution chain).

2. Relationship Merging:
    - Only merge relationships if all the following conditions are met:
      a) The subject (sub) is the same after entity merging.
      b) The object (obj) is the same after entity merging.
      c) The relationship description (rel) is exactly the same text.
      d) Differences in 'rel_type' are ignored for the merging condition.
    - Merging Strategy:
      * Relationships with different rel_type remain as separate entries or, if merged, preserve their union.
      * Maintain the complete original relationship description.

3. Special Handling:
    - Only perform merging; all unique entities and relationships must be retained.
    - Cross-chunk implied relationships (inferred through mother entities) must be explicitly created.
    - Ensure the final result contains all original information, only eliminating redundant expressions.

### Output Requirements
Maintain the same JSON structure as the original, but:
1. Streamline the entity list according to the merging rules.
2. Streamline the relationship list according to the merging rules.
3. Retain all aliases and mother entity relationships.
4. Keep all distinct relationship descriptions.
5. The rel_type field should preserve all unique values.

Please strictly output the final merged results in the following format:

### [Final Entity List]
#Final_Entity_List_Start#
json
[
  {
    'name': '<standardized name>',
    'type': '<best classification>',
    'alias': ['<original name 1>', '<alias 2>', ...],
    'mother entity': ['<complete mother entity chain>']
  },
  ...
]
#Final_Entity_List_End#

### [Final Relationship List]
#Final_Relationship_List_Start#
json
[
  {
    'sub': '<merged subject>',
    'rel': '<original relationship description>',
    'rel_type': ['<deduplicated type list>'],
    'obj': '<merged object>'
  },
  ...
]
#Final_Relationship_List_End#

Below are the entities and relationships I extracted from multiple chunks:

'''
    return [
        {
            'role': 'user',
            'content': merge_instructions + str(merged_text),
        }
    ]



# Key functions

In [None]:

import os
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Dict, Any, Union
from openai import OpenAI, OpenAIError
from langchain_text_splitters import RecursiveCharacterTextSplitter
try:
    from tqdm import tqdm
except ImportError:
    tqdm = None

def _describe_openai_error(error: Exception) -> str:
    """Return a printable description of an SDK error without assuming its shape.

    Not every ``OpenAIError`` carries a ``body`` attribute, and when one is
    present it is not guaranteed to be a mapping (an OpenAI-compatible gateway
    may return plain text), so the body is inspected defensively.
    """
    body = getattr(error, "body", None)
    if isinstance(body, dict) and body.get("message"):
        return str(body["message"])
    # Fall back to the exception text (e.g. connection errors have no body).
    return str(error) or "No message"


def ask_group_link(
    prompt_list: List[List[Dict[str, str]]],
    model: str,
    token: int,
    temp: float,
    max_workers: int = 64,
    api_key: Union[str, None] = None,
    api_base: Union[str, None] = None
) -> List[Union[str, None]]:
    """Send a batch of chat prompts to an OpenAI-compatible API concurrently.

    Args:
        prompt_list: One chat-message list per request.
        model: Model name forwarded to the chat-completions endpoint.
        token: Forwarded as ``max_tokens`` for every request.
        temp: Forwarded as ``temperature`` for every request.
        max_workers: Size of the thread pool used to issue requests.
        api_key: API key; when falsy, the ``OPENAI_API_KEY`` environment
            variable is used instead.
        api_base: Optional base URL passed to the client as ``base_url``.

    Returns:
        A list the same length as ``prompt_list``; position ``i`` holds the
        response text for ``prompt_list[i]`` or ``None`` if that request failed.

    Raises:
        ValueError: If no API key is supplied and ``OPENAI_API_KEY`` is unset.
    """
    effective_api_key = api_key or os.getenv("OPENAI_API_KEY")
    if not effective_api_key:
        raise ValueError("API key must be provided either as an argument or as an OPENAI_API_KEY environment variable.")

    # Nothing to send: skip creating the thread pool and the progress bar.
    if not prompt_list:
        return []

    def _call_openai_api(prompt: List[Dict[str, str]]) -> Union[str, None]:
        """Call the API for a single prompt; return the text or None on failure."""
        try:
            # The context manager closes the client's HTTP resources as soon
            # as the request finishes instead of waiting for garbage collection.
            with OpenAI(api_key=effective_api_key, base_url=api_base) as client:
                response = client.chat.completions.create(
                    model=model,
                    messages=prompt,
                    max_tokens=token,
                    temperature=temp
                )
                return response.choices[0].message.content
        except OpenAIError as e:
            print(f"An API error occurred: {e.__class__.__name__} - {_describe_openai_error(e)}")
            return None
        except Exception as e:
            # Best-effort batch: one bad request must not abort the others.
            print(f"An unexpected error occurred during an API call: {e}")
            return None

    results: List[Union[str, None]] = [None] * len(prompt_list)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Remember each future's position so results keep the input order
        # even though futures complete in arbitrary order.
        future_to_index = {
            executor.submit(_call_openai_api, prompt): i
            for i, prompt in enumerate(prompt_list)
        }

        iterable = as_completed(future_to_index)
        if tqdm:
            # tqdm is optional in this file; it is None when not installed.
            iterable = tqdm(iterable, total=len(prompt_list), desc="Processing API Requests")

        for future in iterable:
            index = future_to_index[future]
            try:
                results[index] = future.result()
            except Exception as e:
                print(f"Request for prompt at index {index} generated an exception: {e}")
                results[index] = None

    return results

def _strip_reasoning_and_cut(response: str, markers: List[str], force_cut: bool) -> str:
    """
    Returns the answer part of a raw LLM response.

    Everything up to and including the first "</think>" tag is discarded. When
    `force_cut` is True, the text is additionally trimmed so that it starts at
    the first marker from `markers` (tried in list order) that occurs in it.
    """
    content = response.split("</think>", 1)[-1].strip()
    if force_cut:
        for marker in markers:
            start_pos = content.find(marker)
            if start_pos != -1:
                content = content[start_pos:]
                break
    return content.strip()


def process_texts_to_kg_strings_in_batch(
    text_list: List[str],
    token: int,
    temp: float,
    model: str,
    max_workers_for_api: int = 64,
    api_key: str = None,
    api_base: str = None,
    replace_prompt_text: List[Dict[str, str]] = None,
    chunk_size: int = 400,
    chunk_overlap: int = 40,
    alt_tokenlen_for_merge: int = None,
    force_cut: bool = False,
) -> List[Union[str, None]]:
    """
    Processes a batch of article texts through a multi-stage LLM pipeline
    to convert them into knowledge graph strings.

    The stages are: split every article into chunks and build one prompt per
    chunk, send all chunk prompts in one batch, clean the chunk answers, build
    one merger prompt per article from its chunk answers, send all merger
    prompts in one batch, and clean the merged answers.

    Args:
        text_list (List[str]): A list containing the raw text of multiple articles.
        token (int): The maximum number of tokens to generate for each LLM call.
        temp (float): The temperature parameter for the LLM.
        model (str): The model name to be used (e.g., 'gpt-4o').
        max_workers_for_api (int): Number of concurrent workers for API calls.
        api_key (str, optional): The API key. Passed to the underlying API call function.
        api_base (str, optional): The API endpoint URL. Passed to the underlying API call function.
        replace_prompt_text (List[Dict[str, str]], optional): A list of replacement rules for prompts.
                                                              Each dict should have "to_find_text" and "place_as_text".
                                                              Rules that are malformed or have an empty
                                                              "to_find_text" are ignored. Only chunk prompts
                                                              are rewritten. Defaults to None.
        chunk_size (int, optional): The size of text chunks. Defaults to 400.
        chunk_overlap (int, optional): The overlap between text chunks. Defaults to 40.
        alt_tokenlen_for_merge (int, optional): An alternative token limit for the final merge step.
                                                Defaults to None.
        force_cut (bool, optional): If True, forces the response to be trimmed starting from specific markers
                                    like #Entity_List_Start# after the initial </think> tag is processed.
                                    Defaults to False.

    Returns:
        List[Union[str, None]]: A list where each element is the final knowledge graph string
                               for the corresponding article. Failed processes will result in None.
    """
    cut_markers = ["#Entity_List_Start#", "Entity_List_Start#", "Entity_List_Start"]
    # "Entity_List_Start#" also occurs inside "#Final_Entity_List_Start#", so
    # cutting a merged answer at it would strip the "#Final_" prefix that
    # build_knowledge_graph's regex requires. Try the full final marker first.
    final_cut_markers = ["#Final_Entity_List_Start#"] + cut_markers

    # --- Stage 1: Generate Chunk-Level Prompts for All Articles ---
    print("\n--- Stage 1: Chunking texts and generating chunk prompts ---")
    all_chunk_prompts = []
    # One (start, end) slice into all_chunk_prompts per article; start == end
    # marks an article that produced no prompts.
    article_chunk_indices = []
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        model_name="gpt-4", chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )

    iterable_stage1 = text_list
    if tqdm:
        iterable_stage1 = tqdm(text_list, desc="Stage 1 Progress")

    for article_idx, full_text in enumerate(iterable_stage1):
        article_start_chunk_idx = len(all_chunk_prompts)
        try:
            chunks = text_splitter.split_text(str(full_text))
            # Build the article's prompts in a local list first so a failure
            # half-way through does not leave orphan prompts in the batch.
            article_prompts = [LLM4CTI_chunk_prompt_maker((full_text, chunk)) for chunk in chunks]
        except Exception as e:
            print(f"\nError: Failed to chunk article {article_idx}: {e}")
            article_chunk_indices.append((article_start_chunk_idx, article_start_chunk_idx))
            continue

        if not article_prompts:
            print(f"\nWarning: Article {article_idx} yielded no content after splitting and will be skipped.")
            article_chunk_indices.append((article_start_chunk_idx, article_start_chunk_idx))
            continue

        all_chunk_prompts.extend(article_prompts)
        article_chunk_indices.append((article_start_chunk_idx, len(all_chunk_prompts)))

    if not all_chunk_prompts:
        print("Error: No valid prompts were generated. Terminating process.")
        return [None] * len(text_list)
    print(f"Generated a total of {len(all_chunk_prompts)} chunk prompts for {len(text_list)} articles.")

    # --- Optional: Apply text replacement to prompts ---
    if replace_prompt_text and isinstance(replace_prompt_text, list):
        print("\n--- Applying prompt replacement rules (Stage 1: Chunks) ---")
        for replacement_rule in replace_prompt_text:
            if not isinstance(replacement_rule, dict):
                continue
            to_find = replacement_rule.get("to_find_text")
            to_place = replacement_rule.get("place_as_text")
            # An empty search string would match between every character, so it
            # is treated as an invalid rule just like a missing one.
            if not isinstance(to_find, str) or not isinstance(to_place, str) or not to_find:
                continue

            total_replacements = 0
            for prompt in all_chunk_prompts:
                if not isinstance(prompt, list):
                    continue
                for message in prompt:
                    if isinstance(message, dict) and isinstance(message.get("content"), str):
                        count = message["content"].count(to_find)
                        if count > 0:
                            message["content"] = message["content"].replace(to_find, to_place)
                            total_replacements += count
            print(f"Found and replaced {total_replacements} occurrences of '{to_find}' in chunk prompts.")

    # --- Stage 2: Process All Chunk-Level Prompts in Batch ---
    print("\n--- Stage 2: Batch processing all chunks ---")
    raw_chunk_responses = ask_group_link(
        prompt_list=all_chunk_prompts,
        model=model,
        token=token,
        temp=temp,
        max_workers=max_workers_for_api,
        api_key=api_key,
        api_base=api_base
    )

    # --- Stage 3: Post-process All Chunk Responses ---
    print("\n--- Stage 3: Post-processing chunk responses ---")
    processed_all_chunk_answers = []

    iterable_stage3 = raw_chunk_responses
    if tqdm:
        iterable_stage3 = tqdm(raw_chunk_responses, desc="Stage 3 Progress")

    for response in iterable_stage3:
        # A failed (None) or non-text response becomes an empty chunk answer so
        # the positions still line up with article_chunk_indices.
        if not isinstance(response, str):
            processed_all_chunk_answers.append("")
            continue
        processed_all_chunk_answers.append(_strip_reasoning_and_cut(response, cut_markers, force_cut))

    # --- Stage 4: Generate Merger Prompts for Each Article ---
    print("\n--- Stage 4: Generating merger prompts ---")
    all_merger_prompts = []

    iterable_stage4 = article_chunk_indices
    if tqdm:
        iterable_stage4 = tqdm(article_chunk_indices, desc="Stage 4 Progress")

    for article_idx, (start_idx, end_idx) in enumerate(iterable_stage4):
        if start_idx == end_idx:
            all_merger_prompts.append(None)
            continue

        article_chunks_processed = processed_all_chunk_answers[start_idx:end_idx]
        formatted_chunks = [f"\n\n[Chunk{i}_START]\n{content}\n[Chunk{i}_END]\n" for i, content in enumerate(article_chunks_processed)]

        merged_chunks_text = (
            f"/* TOTAL {len(formatted_chunks)} CHUNKS START FOR ARTICLE {article_idx}: */"
            + "".join(formatted_chunks)
            + f"\n/* END OF MERGED CHUNKS FOR ARTICLE {article_idx} */"
        )
        all_merger_prompts.append(LLM4CTI_merger_prompt_maker(merged_chunks_text))

    valid_merger_data = [(i, p) for i, p in enumerate(all_merger_prompts) if p is not None]
    if not valid_merger_data:
        print("Error: No valid merger prompts were generated. Terminating process.")
        return [None] * len(text_list)

    valid_merger_indices, valid_merger_prompts = zip(*valid_merger_data)
    print(f"Generated valid merger prompts for {len(valid_merger_prompts)} articles.")

    # --- Stage 5: Process All Merger Prompts in Batch ---
    print("\n--- Stage 5: Batch processing merger requests ---")
    merge_token = alt_tokenlen_for_merge if alt_tokenlen_for_merge is not None else token
    if alt_tokenlen_for_merge is not None:
        print(f"Using alternative token length of {merge_token} for merger requests.")

    final_kg_responses_raw = ask_group_link(
        prompt_list=list(valid_merger_prompts),
        model=model,
        token=merge_token,
        temp=temp,
        max_workers=max_workers_for_api,
        api_key=api_key,
        api_base=api_base
    )

    # --- Stage 6: Finalize Results ---
    print("\n--- Stage 6: Finalizing results ---")
    final_knowledge_graphs = [None] * len(text_list)

    iterable_stage6 = final_kg_responses_raw
    if tqdm:
        iterable_stage6 = tqdm(final_kg_responses_raw, desc="Stage 6 Progress")

    for i, raw_ans in enumerate(iterable_stage6):
        # Map the position in the merger batch back to the article position.
        original_article_idx = valid_merger_indices[i]
        if not isinstance(raw_ans, str):
            continue
        final_knowledge_graphs[original_article_idx] = _strip_reasoning_and_cut(
            raw_ans, final_cut_markers, force_cut
        )

    print("\n--- Knowledge graph generation for all articles is complete ---")
    return final_knowledge_graphs

# Main Usage

In [None]:
import sys
import pandas as pd
import importlib
# Load the articles; the spreadsheet is expected to provide a 'content' column.
df = pd.read_excel('Articles.xlsx')
Articlecontent=df['content'].tolist()
# NOTE(review): KnowledgeGraph is not referenced again in the visible code — confirm it is still needed.
KnowledgeGraph={}

# Run the chunk -> merge LLM pipeline; kg_text holds one string (or None on failure) per article.
# NOTE(review): 'sk-xxxx' is a placeholder. ask_group_link falls back to the
# OPENAI_API_KEY environment variable when api_key is None, which avoids
# keeping a real key in the notebook.
kg_text=process_texts_to_kg_strings_in_batch(Articlecontent, temp=0.1, token=4*1024, model='gpt-4o', api_key='sk-xxxx', api_base="https://api.openai.com/v1")

# Convert to Knowledge Graph

In [None]:
import networkx as nx
import spacy
import json
import re
import json_repair
from collections import Counter
import ast
import random

def aggregate_knowledge_graph(df, knowledge_graph_column, id_column):
    """
    Aggregates knowledge graphs from multiple rows of a DataFrame into a single graph.
    This process involves:
    1. Merging individual graphs from each row.
    2. Lemmatizing node names to group similar entities.
    3. Merging nodes that become identical after lemmatization.

    Args:
        df: DataFrame with one knowledge graph string per row.
        knowledge_graph_column: Name of the column holding the knowledge graph strings.
        id_column: Name of the column whose value is recorded as the 'source' of
                   every node and edge built from that row.

    Returns:
        The aggregated nx.MultiDiGraph, or None when the spaCy model
        'en_core_web_sm' is not installed.
    """
    # Attempt to load the spaCy model, provide a hint if it fails
    try:
        nlp = spacy.load("en_core_web_sm")
    except OSError:
        print("Error: spaCy model 'en_core_web_sm' not found.")
        print("Please run: python -m spacy download en_core_web_sm")
        return None

    def spacy_lemmatize(text):
        """
        Lemmatizes and lowercases a string using spaCy, returning a space-joined string.
        E.g., 'Social Engineering Attacks' -> 'social engineering attack'
        """
        doc = nlp(text.lower())
        return " ".join(token.lemma_ for token in doc if not token.is_punct and not token.is_space)

    # Helper function: merge two attribute values into a 1D list (unique, preserving order)
    def merge_values(val1, val2):
        def to_list(x):
            if x is None:
                return []
            if isinstance(x, list):
                return x
            return [x]
        combined = to_list(val1) + to_list(val2)
        try:
            # Fast path: merge and deduplicate based on order of appearance
            return list(dict.fromkeys(combined))
        except TypeError:
            # Attribute values parsed from LLM JSON can be nested dicts/lists,
            # which are unhashable; fall back to equality-based deduplication.
            unique = []
            for item in combined:
                if item not in unique:
                    unique.append(item)
            return unique

    def merge_attributes_into(target_attrs, extra_attrs):
        """Merges every attribute of extra_attrs into target_attrs in place."""
        for key in set(target_attrs) | set(extra_attrs):
            target_attrs[key] = merge_values(target_attrs.get(key), extra_attrs.get(key))

    # -------------------------------
    # 1) First, merge the knowledge graphs from all rows
    G_aggregated = nx.MultiDiGraph()
    total_skipped_rels_count = 0  # Initialize the total counter for skipped relations
    for index in df.index:
        # The graph building function returns the graph and the count of skipped relations
        G, skipped_count = build_knowledge_graph(df, index, knowledge_graph_column, id_column)
        total_skipped_rels_count += skipped_count  # Accumulate the number of skipped items

        if G is None:  # If graph building fails, skip it
            continue

        # Merge nodes: if a node already exists, merge its attributes
        for node, attr in G.nodes(data=True):
            if node not in G_aggregated:
                G_aggregated.add_node(node, **attr)
            else:
                merge_attributes_into(G_aggregated.nodes[node], attr)

        # Merge all edges (allowing multi-edges)
        for u, v, edge_attr in G.edges(data=True):
            G_aggregated.add_edge(u, v, **edge_attr)

    # After processing all rows, print the total count of skipped relations
    if total_skipped_rels_count > 0:
        print(f"\nAggregation complete. A total of {total_skipped_rels_count} malformed relationship entries were skipped across all files.")

    # -------------------------------
    # 2) Lemmatize node names using spaCy and group them by the same lemma
    lemma_to_nodes = {}
    for node in list(G_aggregated.nodes()):
        # If the node is a string, lemmatize it; otherwise (e.g., number, None), use the node itself as the key
        if isinstance(node, str):
            # A name made only of punctuation/whitespace lemmatizes to "";
            # key it by its own name so unrelated such nodes are not merged.
            lemma = spacy_lemmatize(node) or node
        else:
            lemma = node
        lemma_to_nodes.setdefault(lemma, []).append(node)

    # Count and print how many lemma groups contain multiple nodes
    duplicate_groups_count = sum(1 for group in lemma_to_nodes.values() if len(group) > 1)
    print(f"Found {duplicate_groups_count} groups of nodes that are identical after lemmatization and need to be merged")

    # -------------------------------
    # 3) For each lemma group with multiple nodes, merge them into one
    #    Merge logic: keep the node with the highest total degree (in + out),
    #    and merge the attributes and edges of the others into it.
    for nodes_same_lemma in lemma_to_nodes.values():
        if len(nodes_same_lemma) < 2:
            continue

        # Calculate the sum of in-degree and out-degree for each node
        degree_dict = {node: G_aggregated.in_degree(node) + G_aggregated.out_degree(node)
                       for node in nodes_same_lemma}
        # Select the node with the highest degree as the one to keep
        node_to_keep = max(degree_dict, key=degree_dict.get)

        # Sequentially merge other nodes into node_to_keep
        for node_to_merge in nodes_same_lemma:
            if node_to_merge == node_to_keep or node_to_merge not in G_aggregated:
                continue

            # 1) Merge node attributes
            merge_attributes_into(G_aggregated.nodes[node_to_keep], G_aggregated.nodes[node_to_merge])

            # 2) Redirect all incoming edges from node_to_merge
            in_edges = list(G_aggregated.in_edges(node_to_merge, keys=True, data=True))
            for u, _, _, data_edge in in_edges:
                G_aggregated.add_edge(u, node_to_keep, **data_edge)

            # 3) Redirect all outgoing edges from node_to_merge. This list is
            #    taken after step 2 on purpose: a self-loop on node_to_merge was
            #    re-added there as (node_to_merge -> node_to_keep) and ends up
            #    here as a single self-loop on node_to_keep.
            out_edges = list(G_aggregated.out_edges(node_to_merge, keys=True, data=True))
            for _, v, _, data_edge in out_edges:
                G_aggregated.add_edge(node_to_keep, v, **data_edge)

            # 4) Remove the merged node (this also drops its remaining edges)
            G_aggregated.remove_node(node_to_merge)

    return G_aggregated

def build_knowledge_graph(df, index, knowledge_graph_column, id_column):
    """
    Builds a NetworkX MultiDiGraph from a string containing entity and relationship JSON lists.

    The string is read from df.loc[index, knowledge_graph_column]. Entities are
    taken from the JSON list between '#Final_Entity_List_Start#' and
    '#Final_Entity_List_End#', relationships from the list between
    '#Final_Relationship_List_Start#' and '#Final_Relationship_List_End#'.
    Every node and edge gets a 'source' attribute set to df.loc[index, id_column].

    Returns:
        A tuple (graph, skipped_rels_count). The graph is empty when the cell
        does not hold a string (e.g. None/NaN for an article whose generation
        failed). skipped_rels_count is the number of relationship entries that
        could not be turned into an edge.
    """
    # Initialize a counter for skipped relations for this specific function call
    skipped_rels_count = 0

    # Read the knowledge graph string from the specified row in the DataFrame
    data_str = df.loc[index, knowledge_graph_column]
    source_id = df.loc[index, id_column]

    G = nx.MultiDiGraph()

    # Failed articles are stored as None/NaN; re.search would raise on those.
    if not isinstance(data_str, str):
        return G, skipped_rels_count

    def extract_json_list(section, label):
        """Returns the parsed JSON list of one '#Final_<section>_List_*#' section, or []."""
        pattern = rf'#Final_{section}_List_Start#\s*json\s*(\[[\s\S]*?\])\s*#Final_{section}_List_End#'
        match = re.search(pattern, data_str)
        if not match:
            return []
        try:
            return json_repair.loads(match.group(1))
        except Exception as e:
            print(f"Error parsing {label} JSON for ID {source_id}: {e}")
            return []

    def is_hashable(value):
        """Graph nodes must be hashable; LLM output may put a list/dict there."""
        try:
            hash(value)
        except TypeError:
            return False
        return True

    # 1./2. Extract the entity and relationship lists
    entities = extract_json_list("Entity", "entity")
    relationships = extract_json_list("Relationship", "relationship")

    # 3. Add entities as nodes
    for i, entity in enumerate(entities):
        if not isinstance(entity, dict):
            print(f"Warning: Entity at index {i} for ID {source_id} is not a dictionary, skipping. Content: {entity}")
            continue
        node_id = entity.get("name")
        if node_id is None:
            print(f"Warning: Entity at index {i} for ID {source_id} is missing 'name' field, skipping. Content: {entity}")
            continue
        if not is_hashable(node_id):
            print(f"Warning: Entity at index {i} for ID {source_id} has an unhashable 'name' field, skipping. Content: {entity}")
            continue
        entity['source'] = source_id
        G.add_node(node_id, **entity)

    # 4. Iterate through relationships and add them as edges
    # Create a flattened list of relationships to handle nested lists
    flat_relationships = []
    for item in relationships:
        if isinstance(item, list):
            flat_relationships.extend(item)
        else:
            flat_relationships.append(item)

    for rel in flat_relationships:
        # If the relationship is a string, try to convert it to a dictionary
        if isinstance(rel, str):
            try:
                # Use ast.literal_eval for safer string evaluation
                rel = ast.literal_eval(rel)
            except (ValueError, SyntaxError, MemoryError, TypeError):
                skipped_rels_count += 1
                continue

        # Check if 'rel' is a dictionary; if not, skip and count it
        if not isinstance(rel, dict):
            skipped_rels_count += 1
            continue

        source = rel.get("sub")
        target = rel.get("obj")
        if source is None or target is None:
            skipped_rels_count += 1
            continue
        if not (is_hashable(source) and is_hashable(target)):
            skipped_rels_count += 1
            continue

        # Ensure head and tail nodes exist in the graph (fault tolerance)
        if not G.has_node(source):
            G.add_node(source, source=source_id, name=source, type='unknown')
        if not G.has_node(target):
            G.add_node(target, source=source_id, name=target, type='unknown')

        edge_attr = {key: value for key, value in rel.items() if key not in ["sub", "obj"]}
        edge_attr['source'] = source_id
        G.add_edge(source, target, **edge_attr)

    # Return the graph and the number of skipped relations for this call
    return G, skipped_rels_count

def analyze_graph_structure(graph):
    """
    Prints a basic structural summary of the graph: how many nodes there are
    per type, and for every type the node with the highest total degree
    (in-degree + out-degree). Prints a notice and returns if graph is None.
    """
    if graph is None:
        print("Graph is None, cannot perform structure analysis.")
        return

    def primary_type(attrs):
        # 'type' may be a plain value or a list; a list is represented by its
        # first element, and a missing/empty value counts as 'unknown'.
        raw_type = attrs.get('type')
        if isinstance(raw_type, list):
            return raw_type[0] if raw_type else 'unknown'
        return raw_type or 'unknown'

    # Resolve the representative type of every node once, in node order.
    typed_nodes = [(node, primary_type(attrs)) for node, attrs in graph.nodes(data=True)]

    # Distribution of node types
    type_counter = Counter(kind for _, kind in typed_nodes)

    print("Node Type Distribution:")
    for kind, how_many in type_counter.items():
        print(f"{kind}: {how_many}")

    # Per type, remember the first node that reaches the highest total degree.
    best_per_type = {}
    for node, kind in typed_nodes:
        total_degree = graph.in_degree(node) + graph.out_degree(node)
        current_best = best_per_type.get(kind)
        if current_best is None or total_degree > current_best[1]:
            best_per_type[kind] = (node, total_degree)

    print("\nTop Node by Total Degree for Each Type:")
    for kind, (node, total_degree) in best_per_type.items():
        print(f"Type: {kind}, Node: {node}, Total Degree: {total_degree}")

def check_aggregated_graph(graph, top_n=5):
    """
    Given an aggregated graph, return the nodes with the most unique sources.

    Args:
        graph: A graph whose nodes(data=True) yields (node, attributes) pairs.
               The 'source' attribute may be a single value or a list of values.
        top_n (int, optional): Maximum number of nodes to return. Defaults to 5.
                               None returns every node.

    Returns:
        A list of tuples (node, number_of_unique_sources, list_of_sources),
        sorted by number_of_unique_sources in descending order; nodes with the
        same count keep their graph order. list_of_sources holds the unique
        sources in order of first appearance. Returns [] if graph is None.
    """
    if graph is None:
        print("Graph is None, cannot perform check.")
        return []

    results = []
    for node, attr in graph.nodes(data=True):
        sources = attr.get("source", [])
        if not isinstance(sources, list):
            sources = [sources]
        # dict.fromkeys deduplicates like a set but keeps first-seen order, so
        # the returned source list is the same on every run.
        unique_sources = list(dict.fromkeys(sources))
        results.append((node, len(unique_sources), unique_sources))

    # sorted() is stable, so ties stay in node iteration order.
    results_sorted = sorted(results, key=lambda entry: entry[1], reverse=True)
    return results_sorted[:top_n]

# Attach the generated knowledge graph strings to their articles and aggregate them.
# NOTE(review): assumes the spreadsheet has an 'ArticleIndex' column — confirm.
df['KnowledgeGraph'] = kg_text
G_aggregated = aggregate_knowledge_graph(df, 'KnowledgeGraph', 'ArticleIndex')

if G_aggregated is None:
    # aggregate_knowledge_graph returns None when the spaCy model is missing.
    print("Knowledge graph aggregation failed; nothing to sample.")
else:
    # random.choice raises IndexError on an empty sequence, so sample only
    # when the graph actually has nodes / edges.
    aggregated_nodes = list(G_aggregated.nodes(data=True))
    aggregated_edges = list(G_aggregated.edges(data=True))
    if aggregated_nodes:
        print("Random node and its attributes:", random.choice(aggregated_nodes))
    else:
        print("The aggregated graph contains no nodes.")
    if aggregated_edges:
        print("Random edge and its attributes:", random.choice(aggregated_edges))
    else:
        print("The aggregated graph contains no edges.")
