In [1]:
# gen data

import os
import re
import json
import copy
import time
import logging
import random
import pickle

from utils.chat2DeepSeek import Chat2DeepSeek
from utils.gen_prompt import Prompt
from utils.util import *

In [2]:
API_KEY = 'sk-7015d98e24c9431f9fb7fb2a4454186e'
event_sen = {}
event_sub_list = ['TRANSPORT', 'ELECT', 'START-POSITION', 'ATTACK', 'END-POSITION', 'MEET', 'MARRY', 'PHONE-WRITE', 'TRANSFER-MONEY', 'SUE']#

# event_sen = load_json('./data_5_ATTACK_6_15_18_53.json')
event_dict = load_json('./meta_data/event_dict_full.json') # defin from doc
trigger_dict = load_json('./meta_data/trigger_pool_augmented_5_16.json')
arguments_dict = load_json('./meta_data/argument_pool_augmented_5_16.json')

fp = './meta_data/arg_roles/'

arg_role_definition = {}
for fn in os.listdir(fp):
    if not fn.endswith('.txt'):
        continue
    with open(fp + fn, 'r', encoding = 'utf-8') as f:
        s = f.read()
    j = json.loads(s)
    arg_role_definition.update(j)

In [3]:
n_trigger = 100
n_argument = 20
sentence_per_event = 50
complex_score = [5, 10] # not implemented
max_argument = 2
max_event = 5
max_refine = 3

weight_dict = {0:[1],
                1:[1],
                2:[0.5, 0.5],
                3:[0.2, 0.5, 0.3],
                4:[0.1, 0.2, 0.3, 0.4],
                5:[0.1, 0.1, 0.3, 0.3, 0.2]
                  }

In [4]:
def get_n_event(max_event, weight_dict):
    if max_event == 0:
        return 0
    return random.choices([ i for i in range(1, max_event + 1)], weights=weight_dict[max_event])

def get_local_time():
    return str(time.localtime().tm_mon) + '_' + str(time.localtime().tm_mday) + '_' + str(time.localtime().tm_hour) + '_' + str(time.localtime().tm_min)
    

In [5]:
refine = '''
### **Event Extraction passage Validation Protocol**  
**Objective**: Ensure generated passage strictly comply with event structure, argument roles, and tense constraints.  

#### **1. Core Validation Checks**  
**1.1 Trigger Compliance**  
- ☑ **Explicit markup**: Triggers wrapped in `<EVENT><trigger>...</trigger></EVENT>`.  
- ☑ **Only one trigger per event instance**

**1.2 Argument Role Alignment**  
- ☑ **Role definitions**: Arguments must act *exactly* as their abstract role requires
- ☑ **Role presence**: Arguments not in the Input(Task Execution -> Your Input of Event Extraction Passage Generation Task) must not appear
- ☑ **Strict argument boundaries:
    Arguments must consist of exactly the words/phrases provided in input
    No additional words may be included within argument tags (e.g., "<Attacker>Israeli forces</Attacker>" is invalid if input only specifies "Israeli" as Attacker)
    No partial matches (must use complete argument phrases as given)
- ☑ **Noun-only arguments**:  
  - Arguments must function as standalone nouns/noun phrases  
  - Never allow arguments to modify other words as adjectives  
  - Example violation:  
    *"<Attacker>Israeli</Attacker> forces launched..."* (❌ "Israeli" modifies "forces")  
  - Correct form:  
    *"<Attacker>Israeli</Attacker> launched..."* (✅ "Israeli" as standalone agent)  
  - Exception: Multi-word proper nouns (e.g., "Saddam Hussein") remain intact
- ☑ **No role leakage**: Arguments must not function as adjectives (e.g., "<Target>Governor</Target>'s office" is invalid).
- ☑ **Explicit markup**: Arguments wrapped in `<ROLE>...</ROLE>`.  

**1.3 Semantic Coherence**  
- ☑ **Event chain intact**: 
  - relations of events descrided in **Natural language form of Event Record** must be followed. 
- ☑ **Arguments logically fit roles**:  
  - Example: `Entity` must be capable of voting (e.g., "government" ✅, "tree" ❌).  
- ☑ **No Need for Additional Arguments**:
  - Arguments in the INPUT can make Passage Logical(No Need for Additional Arguments to make Passage vaild)
- ☑ **No factual checks**: Unrealistic scenarios are allowed (e.g., "AI was elected in 1200 BC").  

---

#### **2. Revision Rules**  
**2.1 Mandatory Fixes**  
- ❌ **Misaligned arguments**: 
    Rewrite to ensure arguments *act* as their role.  
    Before: "<Attacker>Israeli</Attacker>-led blitz" (invalid: "-led" modifies argument).
    After: "The <Attacker>Israeli</Attacker> conducted a blitz".
- ❌ Adjective-Modified Arguments:
    Before: "<Artifact>Saddam</Artifact>'s convoy".
    After: "<Artifact>Saddam Hussein</Artifact> was transported".


**2.2 Prohibited Actions**  
- ⛔ **Never modify original arguments/triggers** (e.g., cannot change `Entity="american"` to `Entity="Americans"`).  
        Including no additions/removals to argument phrases
- ⛔ **No role swaps** (e.g., cannot repurpose `Place` as `Entity`).  

**2.3 Unfixable Errors**  
- If constraints cannot be met without violating rules, return  `{"sentences": []}`.  

---

#### **3. Examples**  
**Valid**:  
> *"<Person>AI</Person> was <ELECT><trigger>elected</trigger></ELECT> by <Entity>robots</Entity> in <Place>Mars</Place>."*  
- ✅ `Entity="robots"` acts as voter.  

**Invalid → Revision**:  
> *"The <Entity>government</Entity> <ELECT><trigger>elected</trigger></ELECT> <Person>Chiluba</Person> in <Place>Florida</Place>."*  
- ❌ `Entity="government"` should vote *for* someone, not act as elector.  
- **Fix**: *"<Person>Chiluba</Person> was <ELECT><trigger>elected</trigger></ELECT> by <Entity>government</Entity> in <Place>Florida</Place>."*  
> *"<Attacker>Israeli</Attacker> forces conducted..."
- ❌ by definition, "Israeli forces" would be the argument mention of Attacker rather than "Israeli"
- **Fix**:"<Attacker>Israeli</Attacker> conducted..." 

---

#### **4. Final Workflow**  
(print out the process step by step)
1. Run **all checks** (trigger, argument roles, coherence).  
2. Generate Revised passage from INPUT and previous conversation.  
'''

In [35]:
def chat4data(gen_prompt, main_event, max_refine=None):
    prompt, record = gen_prompt.gen_data_prompt(main_event=main_event, n_event=get_n_event(max_event, weight_dict)[0])# n_event=get_n_event(max_event, weight_dict)[0]
    record = json.loads(record)
    target_sen = {}
    valid_output = '''
Valid Output(JSON format):
         {
          "sentences": [
            "In yesterday's special election, the <Entity>district</Entity> of <Place>Ohio</Place> successfully <ELECT><trigger>voted</trigger></ELECT> to fill the congressional seat vacancy."
          ]
        }
Note: when rewriting passage, just change what the prompt demands, DO NOT change event mentions which belong to other event. 
'''
    s_time = time.time()
    chat = Chat2DeepSeek(api_key=API_KEY, model='deepseek-chat')
    response = chat.prompt2chat(prompt).strip()
    gen_sen = re.findall('```json(.*?)```', response, flags=re.S)
    print('\tgenerated in(s):', time.time() - s_time)
    for e_id in record:
        ss_time = time.time()
        event_data = record[e_id]
        event_type = event_data['event']
        trigger = event_data['trigger']
        head = 'For the passage(in json) your just generated/revised, and for event {} triggered by {}:\n'.format(event_type, trigger)
    
        check = generate_questions(event_data, arg_role_definition)
        response = chat.prompt2chat(head + check + '\nNote:when checking, exam whether mentions are semantically appropriate rather than merely check if they are tagged properly.\n Please check the questions step by step, print out the process, then return your answer in json:').strip()
        a_d = re.findall('```json(.*?)```', response, flags=re.S)
        
        for sen in a_d[::-1]:
            try:
                sen = json.loads(sen)
            except:
                print('error checking on {}:  {}'.format(main_event, sen))
            if 'mention_exist' in sen.keys():
                result = sen
                break
        
        problem_statements = generate_problem_statements(result, event_data, arg_role_definition)
        if problem_statements:
            response = chat.prompt2chat(head + '\n'.join(problem_statements) + valid_output).strip()
            gen_sen = re.findall('```json(.*?)```', response, flags=re.S)
        
        for sen in gen_sen[::-1]:
            try:
                sen = json.loads(sen)
            except:
                print('error checking on {}:  {}'.format(main_event, sen))
            if 'sentences' in sen.keys():
                if sen['sentences']:
                    target_sen = sen
                    break
        print('\trevised for {} in(s): {}'.format(event_type, time.time() - s_time))
    print('\tpassage done in(s):', time.time() - s_time)
    if target_sen:
        return [target_sen, record]
    else:
        return []

In [7]:
def chat4data_all(event_dict, gen_prompt, max_refine):
    for i, event in enumerate(event_dict, 1):
        if not event in event_sub_list:
           continue

        if event_sen.get(event) and len(event_sen[event]) >= 50:
            print('---EVENT {} chated'.format(event))
            continue

        s_start_time = time.time()
        print('TIME {} -- chating for EVENT {}'.format(get_local_time(), event))
        if not event_sen.get(event):
            event_sen[event] = []
        while len(event_sen[event]) < 50: 
            start_time = time.time()
            sen_record = chat4data(gen_prompt, event, max_refine)
            end_time = time.time()
            print('TIME {} -- done {}/50, in {} s'.format(get_local_time(), len(event_sen[event]), end_time - start_time))
            try:
                if sen_record[0]['sentences']:
                    event_sen[event].append({'sentence':sen_record[0], 'record':sen_record[1]})
                else:
                    print('error ', event)
                    try:
                        with open('./log/error/' + event + '_arg.txt', 'w') as f:
                            f.write(sen_record)
                    except:
                        print('---can not save:', sen_record)
                save_json(event_sen, './history_data/data_{}_{}_{}.json'.format(i, event, get_local_time()))
                print('TIME {} -- done save'.format(get_local_time()))
            except:
                print('empty:', sen_record)
                continue
    
        save_json(event_sen, './data_{}_{}_{}.json'.format(i, event, get_local_time()))
        print('TIME {} -- done save'.format(get_local_time()))
        print('Done processing EVENT: {}, in {} s'.format(event, end_time - s_start_time))
        time.sleep(0.55)
        


In [8]:
gen_prompt = Prompt(
    event_dict = event_dict, 
    trigger_dict = trigger_dict, 
    arguments_dict = arguments_dict, 
    n_event = get_n_event(max_event, weight_dict)[0], 
    max_argument = max_argument,
    complex_score = complex_score)

no orignal record


In [9]:
# Is ‘Syria’ a DESTINATION argument describing the event triggered by ‘fee’?
# The passage contains a hallucinated argument CRIME incorrectly, remove CRIME information for event triggered by ‘jailed’.”

prompt, record = gen_prompt.gen_data_prompt(main_event='ATTACK', n_event=5)# n_event=get_n_event(max_event, weight_dict)[0]

In [10]:
print(prompt)


Event Extraction Passage Generation Task

Core Definitions
    1. **Event**: An objective fact describing interactions between participants at specific time/location  
    2. **Event Type**: Classification label (e.g., ELECT) determining argument structure   
    3. **Arguments**: Core elements with semantic roles (e.g., Person/Entity/Time/Place)
Trigger Annotation Rules
    Key Requirements:
     Quantitative Limitation: Only one trigger per event instance
     Trigger Patterns:
      a) Main verb: "Protesters <EVENT_TYPE><trigger>clashed</trigger></EVENT_TYPE> with police"
      b) Adjective/Participle: "The <EVENT_TYPE><trigger>convicted</trigger></EVENT_TYPE> politician resigned"
      c) Nominalized: "New <EVENT_TYPE><trigger>explosions</trigger></EVENT_TYPE> rock Baghdad after dark ."
      d) Simple Present Tense verb: "Protesters <EVENT_TYPE><trigger>clash</trigger></EVENT_TYPE> with police"
Annotation Protocol:
     1.Event Trigger Tagging:
            <EVENT_TYPE><trigger>tr

In [37]:
chat4data(gen_prompt, main_event='ATTACK', max_refine=None)

	generated in(s): 13.621853351593018
	revised for ARREST-JAIL in(s): 46.77440404891968
	revised for SENTENCE in(s): 73.66459012031555
	revised for RELEASE-PAROLE in(s): 104.59808015823364
	revised for ATTACK in(s): 128.41718769073486
	revised for ARREST-JAIL in(s): 157.2265853881836
	passage done in(s): 157.2265853881836


[{'sentences': ['In a dramatic legal sequence at the <Place>Courthouse</Place>, the <Agent>NSA</Agent> <ARREST-JAIL><trigger>immobilized</trigger></ARREST-JAIL> the notorious <Person>Murderer</Person>, while authorities separately <SENTENCE><trigger>arrested</trigger></SENTENCE> the <Defendant>brother</Defendant> for sentencing. Later, <Person>Andrew Harris</Person> <RELEASE-PAROLE><trigger>surrendered</trigger></RELEASE-PAROLE> to a <Entity>lieutenant</Entity> at the same location. Amidst these proceedings, an <ATTACK><trigger>explosion</trigger></ATTACK> erupted nearby. Finally, security <Agent>forces</Agent> <ARREST-JAIL><trigger>nabbed</trigger></ARREST-JAIL> the <Person>Murderer</Person> again within the <Place>Courthouse</Place> premises.']},
 {'event_1': {'event': 'ARREST-JAIL',
   'trigger': 'immobilize',
   'argument': {'Place': ['Courthouse'],
    'Person': ['Murderer'],
    'Agent': ['NSA']}},
  'event_2': {'event': 'SENTENCE',
   'trigger': 'arrested',
   'argument': {'Defe

In [12]:
'''The <Entity>chief</Entity> <PHONE-WRITE><trigger>faxed</trigger></PHONE-WRITE> a critical warning, 
which <DEMONSTRATE><trigger>incited</trigger></DEMONSTRATE> <Entity>demonstrators</Entity> across the <Place>World</Place>. 
This unrest escalated into an <ATTACK><trigger>threat</trigger></ATTACK> against <Target>Achille lauro</Target> using an <Instrument>Bomb</Instrument>, 
culminating in the <EXECUTE><trigger>bombed</trigger></EXECUTE> execution of <Person>Mata Hari</Person> by <Agent>Mossad</Agent> and <Agent>China</Agent>
. Subsequently, the <Agent>government</Agent>'s actions led to <DIE><trigger>lost</trigger></DIE> the life of <Victim>Achille lauro</Victim> 
by <Instrument>Bomb</Instrument> in the <Place>west bank</Place>.'''



In [29]:
valid_output = '''
Valid Output(JSON format):
         {
          "sentences": [
            "In yesterday's special election, the <Entity>district</Entity> of <Place>Ohio</Place> successfully <ELECT><trigger>voted</trigger></ELECT> to fill the congressional seat vacancy."
          ]
        }
Note: when rewriting passage, just change what the prompt demands, DO NOT change event mentions which belong to other event. 
'''
s_time = time.time()
print('*'*50)
chat = Chat2DeepSeek(api_key=API_KEY, model='deepseek-chat')
response = chat.prompt2chat(prompt).strip()
gen_sen = re.findall('```json(.*?)```', response, flags=re.S)
print(time.time() - s_time)
print(response)
for e_id in record:
    ss_time = time.time()
    event_data = record[e_id]
    event_type = event_data['event']
    trigger = event_data['trigger']
    head = 'for event {} triggered by {}:\n'.format(event_type, trigger)
    print('*'*50)

    check = generate_questions(event_data, arg_role_definition)
    print(head + check)
    response = chat.prompt2chat(head + check + '\nNote:when checking, exam whether mentions are semantically appropriate rather than merely check if they are tagged properly.\n Please check the questions step by step, print out the process, then return your answer in json:').strip()
    print(response)
    print('*'*50)
    a_d = re.findall('```json(.*?)```', response, flags=re.S)
    
    for sen in a_d[::-1]:
        try:
            sen = json.loads(sen)
        except:
            print('error checking on {}:  {}'.format(main_event, sen))
        if 'mention_exist' in sen.keys():
            result = sen
            break
    
    problem_statements = generate_problem_statements(result, event_data, arg_role_definition)
    if problem_statements:
        print(head + '\n'.join(problem_statements) + valid_output)
        response = chat.prompt2chat(head + '\n'.join(problem_statements) + valid_output).strip()
        gen_sen = re.findall('```json(.*?)```', response, flags=re.S)
    target_sen = {}
    
    for sen in gen_sen[::-1]:
        try:
            sen = json.loads(sen)
        except:
            print('error checking on {}:  {}'.format(main_event, sen))
        if 'sentences' in sen.keys():
            if sen['sentences']:
                target_sen = sen
                break
    print(target_sen)
    print(time.time() - s_time)
    
    print('+'*50)
print(time.time() - s_time)

**************************************************
12.741190433502197
```json
{
  "sentences": [
    "In the cramped confines of <Place>Shelter</Place>, <Person>mary</Person> was <BE-BORN><trigger>pressured</trigger></BE-BORN> into existence, her first cries echoing through the makeshift delivery room. Years later, in that same <Place>Shelter</Place>, the <Entity>council</Entity> would orchestrate a historic transition, <ELECT><trigger>becoming</trigger></ELECT> <Person>winner</Person> as their new leader amid rising tensions. Those tensions soon erupted into <ATTACK><trigger>violent</trigger></ATTACK> unrest, though details of perpetrators and targets remained unclear. Meanwhile, a desperate evacuation unfolded as <Artifact>kids</Artifact> began <TRANSPORT><trigger>sailing</trigger></TRANSPORT> from the war-torn <Origin>outskirts</Origin> toward presumed safety in <Destination>Al Kut</Destination>. This chaotic exodus coincided with the final <END-ORG><trigger>shattered</trigger></END

In [25]:
print('-------------START CHARTING------------')
chat4data_all(event_dict, gen_prompt, max_refine)
save_json(event_sen, './data_sen_{}.json'.format(get_local_time()))

-------------START CHARTING------------
TIME 6_17_2_46 -- chating for EVENT ATTACK
TIME 6_17_2_47 -- done 0/50, in 49.011321783065796 s
TIME 6_17_2_47 -- done save
TIME 6_17_2_49 -- done 1/50, in 84.26647186279297 s
TIME 6_17_2_49 -- done save
TIME 6_17_2_50 -- done 2/50, in 51.803590059280396 s
TIME 6_17_2_50 -- done save
TIME 6_17_2_50 -- done 3/50, in 57.122848987579346 s
TIME 6_17_2_50 -- done save
TIME 6_17_2_51 -- done 4/50, in 47.57805371284485 s
TIME 6_17_2_51 -- done save
TIME 6_17_2_52 -- done 5/50, in 37.92355155944824 s
TIME 6_17_2_52 -- done save
TIME 6_17_2_53 -- done 6/50, in 67.26207041740417 s
TIME 6_17_2_53 -- done save
TIME 6_17_2_54 -- done 7/50, in 66.11436200141907 s
TIME 6_17_2_54 -- done save
TIME 6_17_2_55 -- done 8/50, in 67.1168429851532 s
TIME 6_17_2_55 -- done save
TIME 6_17_2_56 -- done 9/50, in 63.71771216392517 s
TIME 6_17_2_56 -- done save
TIME 6_17_2_57 -- done 10/50, in 63.76286840438843 s
TIME 6_17_2_57 -- done save
TIME 6_17_2_58 -- done 11/50, in 5

In [12]:
# pipline DEGREE -> "event is ___, trigger is ___" -> "event record" -- loss
# faiss 搜素

In [45]:
time.localtime().tm_sec

45