In [4]:
# gen data

import os
import re
import json
import copy
import time
import logging
import random
import pickle

from utils.chat2DeepSeek import Chat2DeepSeek
from utils.gen_prompt import Prompt
from utils.util import parse_trigger_answer, parse_argument_answer, save_json, load_json

In [6]:
API_KEY = 'sk-7015d98e24c9431f9fb7fb2a4454186e'
event_sen = {}
# event_sub_list = ['TRANSPORT', 'ELECT', 'START-POSITION', 'ATTACK', 'END-POSITION', 'MEET', 'MARRY', 'PHONE-WRITE', 'TRANSFER-MONEY', 'SUE']

event_sen = load_json('./data_sen_5_28_11_20.json')
event_dict = load_json('./meta_data/event_dict_full.json') # defin from doc
trigger_dict = load_json('./meta_data/trigger_pool_ori.json')
arguments_dict = load_json('./meta_data/argument_pool_ori.json')



In [8]:
n_trigger = 100
n_argument = 20
sentence_per_event = 50
complex_score = [5, 10] # not implemented
max_argument = 2
max_event = 1
max_refine = 3

weight_dict = {0:[1],
                1:[1],
                2:[0.9, 0.1],
                3:[0.8, 0.15, 0.05],
                4:[0.5, 0.3, 0.1, 0.1]
                  }

In [10]:
def get_n_event(max_event, weight_dict):
    if max_event == 0:
        return 0
    return random.choices([ i for i in range(1, max_event + 1)], weights=weight_dict[max_event])

def get_local_time():
    return str(time.localtime().tm_mon) + '_' + str(time.localtime().tm_mday) + '_' + str(time.localtime().tm_hour) + '_' + str(time.localtime().tm_min)
    

In [12]:
refine = '''
### **Event Extraction Sentence Validation Protocol**  
**Objective**: Ensure generated sentences strictly comply with event structure, argument roles, and tense constraints.  

#### **1. Core Validation Checks**  
**1.1 Trigger Compliance**  
- ☑ **Explicit markup**: Triggers wrapped in `<EVENT><trigger>...</trigger></EVENT>`.  
- ☑ **Only past tense verb/nominalized verb/simple present tense verb for triggers**(e.g.,"elect", "meeting", "talks", "explosion" are valid)
- ☑ **No future/conditional/hypothetical triggers** (e.g., "will elect" is invalid).  
- ☑ **Only one trigger per sentence**

**1.2 Argument Role Alignment**  
- ☑ **Role definitions**: Arguments must act *exactly* as their abstract role requires
- ☑ **Role presence**: Arguments not in the Input(Task Execution -> Your Input of Event Extraction Sentence Generation Task) must not appear
- ☑ **No role leakage**: Arguments cannot modify other words.  
- ☑ **No TIME role**: If not provided in the INPUT, time information should not appear in the sentences(e.g., "aghdad witnessed explosions yesterday." ->"aghdad witnessed explosions." if TIME argument is not in the INPUT). 
- ☑ **Explicit markup**: Arguments wrapped in `<EVENT><ROLE>...</ROLE></EVENT>`.  

**1.3 Semantic Coherence**  
- ☑ **Arguments logically fit roles**:  
  - Example: `Entity` must be capable of voting (e.g., "government" ✅, "tree" ❌).  
- ☑ **No Need for Additional Arguments**:
  - Arguments in the INPUT can make sentence Logical(No Need for Additional Arguments to make sentence vaild)
- ☑ **No factual checks**: Unrealistic scenarios are allowed (e.g., "AI was elected in 1200 BC").  

---

#### **2. Revision Rules**  
**2.1 Mandatory Fixes**  
- ❌ **Misaligned arguments**: Rewrite to ensure arguments *act* as their role (e.g., delete "voters" to let `Entity="american"` function as the voter).  
- ❌ **Tense errors**: Replace future/conditional/hypothetical triggers (e.g., "will elect" → "elect"/"elected").  

**2.2 Prohibited Actions**  
- ⛔ **Never modify original arguments/triggers** (e.g., cannot change `Entity="american"` to `Entity="Americans"`).  
- ⛔ **No role swaps** (e.g., cannot repurpose `Place` as `Entity`).  

**2.3 Unfixable Errors**  
- If constraints cannot be met without violating rules, return  `{"sentences": []}`.  

---

#### **3. Examples**  
**Valid**:  
> *"<ELECT><Person>AI</Person></ELECT> was <ELECT><trigger>elected</trigger></ELECT> by <ELECT><Entity>robots</Entity></ELECT> in <ELECT><Place>Mars</Place></ELECT>."*  
- ✅ Past tense trigger.  
- ✅ `Entity="robots"` acts as voter.  

**Invalid → Revision**:  
> *"The <ELECT><Entity>government</Entity></ELECT> <ELECT><trigger>elected</trigger></ELECT> <ELECT><Person>Chiluba</Person></ELECT> in <ELECT><Place>Florida</Place></ELECT>."*  
- ❌ `Entity="government"` should vote *for* someone, not act as elector.  
- **Fix**: *"<ELECT><Person>Chiluba</Person></ELECT> was <ELECT><trigger>elected</trigger></ELECT> by <ELECT><Entity>government</Entity></ELECT> in <ELECT><Place>Florida</Place></ELECT>."*  

**Unfixable → Empty Output**:  
> Input: `{"Entity": "stone"}` (stones cannot vote).  
> Output: `{"sentences": []}`  

---

#### **4. Final Workflow**  

1. Run **all checks** (trigger, argument roles, coherence).  
2. if possible, Generate Revised sentence from INPUT and previous conversation.  
3. else, return empty.  


This protocol ensures strict adherence to linguistic and structural constraints while maintaining reproducibility.
'''

In [14]:
def chat4data(gen_prompt, main_event, max_refine=None):
    prompt, record = gen_prompt.gen_data_prompt(main_event=main_event, n_event=1)# n_event=get_n_event(max_event, weight_dict)[0]
    record = json.loads(record)
    chat = Chat2DeepSeek(api_key=API_KEY, model='deepseek-chat')
    response = chat.prompt2chat(prompt).strip()
    a_d = re.findall('```json(.*?)```', response, flags=re.S)[0]
    
    response = chat.prompt2chat(refine).strip()
    a_d = re.findall('```json(.*?)```', response, flags=re.S)
    target_sen = {}
    
    for sen in a_d[::-1]:
        try:
            sen = json.loads(sen)
        except:
            print('error checking on {}:  {}'.format(main_event, sen))
            return []
        if 'sentences' in sen.keys():
            target_sen = sen
            break
    if target_sen:
        return [target_sen, record]
    else:
        return []

In [16]:
def chat4data_all(event_dict, gen_prompt, max_refine):
    for i, event in enumerate(event_dict, 1):
        # if not event in event_sub_list:
        #    continue
        # event_dict = {'ATTACK':[event_def, role_dict, example, exam_trigger]}
        if event_sen.get(event) and len(event_sen[event]) >= 50:
            print('---EVENT {} chated'.format(event))
            continue

        s_start_time = time.time()
        print('TIME {} -- chating for EVENT {}'.format(get_local_time(), event))
        if not event_sen.get(event):
            event_sen[event] = []
        while len(event_sen[event]) < 50: 
            try:
                start_time = time.time()
                sen_record = chat4data(gen_prompt, event, max_refine)
                end_time = time.time()
                print('TIME {} -- done {}/50, in {} s'.format(get_local_time(), len(event_sen[event]), end_time - start_time))
                if sen_record[0]['sentences']:
                    event_sen[event].append({'sentence':sen_record[0], 'record':sen_record[1]})
                else:
                    print('error ', event)
                    try:
                        with open('./log/error/' + event + '_arg.txt', 'w') as f:
                            f.write(sen_record)
                    except:
                        print('---can not save:', sen_record)
                save_json(event_sen, './history_data/data_{}_{}_{}.json'.format(i, event, get_local_time()))
                print('TIME {} -- done save'.format(get_local_time()))
            except:
                continue
    
        save_json(event_sen, './data_{}_{}_{}.json'.format(i, event, get_local_time()))
        print('TIME {} -- done save'.format(get_local_time()))
        print('Done processing EVENT: {}, in {} s'.format(event, end_time - s_start_time))
        time.sleep(0.55)
        


In [18]:
gen_prompt = Prompt(
    event_dict = event_dict, 
    trigger_dict = trigger_dict, 
    arguments_dict = arguments_dict, 
    n_event = get_n_event(max_event, weight_dict)[0], 
    max_argument = max_argument,
    complex_score = complex_score)

In [40]:
prompt, record = gen_prompt.gen_data_prompt(main_event='ATTACK', n_event=1)# n_event=get_n_event(max_event, weight_dict)[0]

getting orignal record...


In [42]:
print(record)

{"event_1": {"event": "ATTACK", "trigger": "invasion", "argument": {"Attacker": ["US"]}}}


In [57]:
# record = json.loads(record)
chat = Chat2DeepSeek(api_key=API_KEY, model='deepseek-chat')
response = chat.prompt2chat(prompt).strip()
a_d = re.findall('```json(.*?)```', response, flags=re.S)[0]

In [58]:
response = chat.prompt2chat(refine).strip()
a_d = re.findall('```json(.*?)```', response, flags=re.S)
target_sen = {}
print(a_d)

['\n{\n  "sentences": [\n    "The <ATTACK><Attacker>US</Attacker></ATTACK> conducted a military <ATTACK><trigger>invasion</trigger></ATTACK>."\n  ]\n}\n']


In [59]:
print(response)

Given the input and strict validation rules, here is the compliant sentence:  

```json
{
  "sentences": [
    "The <ATTACK><Attacker>US</Attacker></ATTACK> conducted a military <ATTACK><trigger>invasion</trigger></ATTACK>."
  ]
}
```

### **Validation Breakdown**  
1. **Trigger Compliance** ✅  
   - `<ATTACK><trigger>invasion</trigger></ATTACK>` is a valid **nominalized trigger** (past action implied by context).  
   - No future/conditional/hypothetical wording.  
   - Only one trigger per sentence.  

2. **Argument Role Alignment** ✅  
   - `<ATTACK><Attacker>US</Attacker></ATTACK>` strictly matches the input-provided role.  
   - No extra arguments (e.g., `Target`, `Instrument`, `Time`, `Place`) are introduced.  

3. **Semantic Coherence** ✅  
   - "US" logically fits the `Attacker` role.  
   - No factual checks needed (even if unrealistic).  

---

### **Key Decisions**  
- **No TIME/Place**: Omitted since not in input.  
- **Nominalized Trigger**: "Invasion" is treated as a comp

In [60]:
for sen in a_d[::-1]:
    print(sen)
    sen = json.loads(sen)
    if 'sentences' in sen.keys():
        target_sen = sen
        break
if target_sen:
    print([target_sen, record])


{
  "sentences": [
    "The <ATTACK><Attacker>US</Attacker></ATTACK> conducted a military <ATTACK><trigger>invasion</trigger></ATTACK>."
  ]
}

[{'sentences': ['The <ATTACK><Attacker>US</Attacker></ATTACK> conducted a military <ATTACK><trigger>invasion</trigger></ATTACK>.']}, {'event_1': {'event': 'ATTACK', 'trigger': 'invasion', 'argument': {'Attacker': ['US']}}}]


In [20]:
print('-------------START CHARTING------------')
chat4data_all(event_dict, gen_prompt, max_refine)
save_json(event_sen, './data_sen_{}.json'.format(get_local_time()))

-------------START CHARTING------------
TIME 5_28_22_5 -- chating for EVENT START-ORG
getting orignal record...
TIME 5_28_22_5 -- done 0/50, in 31.694997549057007 s
error  START-ORG
---can not save: [{'sentences': []}, {'event_1': {'event': 'START-ORG', 'trigger': 'founded', 'argument': {'Org': ['China Polar Museum Foundation']}}}]
TIME 5_28_22_5 -- done save
getting orignal record...
TIME 5_28_22_6 -- done 0/50, in 43.53117108345032 s
TIME 5_28_22_6 -- done save
getting orignal record...
TIME 5_28_22_6 -- done 1/50, in 32.67620086669922 s
TIME 5_28_22_6 -- done save
getting orignal record...
TIME 5_28_22_7 -- done 2/50, in 47.118796586990356 s
TIME 5_28_22_7 -- done save
getting orignal record...
TIME 5_28_22_8 -- done 3/50, in 38.98767375946045 s
TIME 5_28_22_8 -- done save
getting orignal record...
TIME 5_28_22_9 -- done 4/50, in 40.45914149284363 s
TIME 5_28_22_9 -- done save
getting orignal record...
TIME 5_28_22_9 -- done 5/50, in 34.58124780654907 s
TIME 5_28_22_9 -- done save
g