In [None]:
# Development convenience: with mode 2, IPython re-imports modules before
# each cell execution so edits to imported source are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import torch
from tqdm import tqdm
from OmniEvent.infer import infer, AttrDict, get_model, get_tokenizer

# Checkpoint directory (relative to this notebook) that is handed to both
# get_model and get_tokenizer when the model is loaded.
MODEL_PATH = "../models/s2s-mt5-ed/"

In [None]:
# Target device for the model: CUDA when available, otherwise CPU.
# Always a torch.device, so `device` has the same type on every machine
# (previously it was a torch.device on CPU but a plain str on GPU).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def get_model_and_tokenizer(model_name_or_path):
    """Load a seq2seq mT5 model and its tokenizer from the same location.

    Args:
        model_name_or_path: value forwarded unchanged to both ``get_model``
            and ``get_tokenizer``.

    Returns:
        A ``(model, tokenizer)`` tuple; the model has already been moved to
        the notebook-level ``device``.
    """
    mt5_seq2seq_args = AttrDict({"paradigm": "seq2seq", "model_type": "mt5"})
    # `device` is read from the notebook's global namespace at call time.
    model = get_model(mt5_seq2seq_args, model_name_or_path).to(device)
    return model, get_tokenizer(model_name_or_path)

In [None]:
# Read the article table and keep a fixed 50-row random subset; the explicit
# random_state makes the same rows come back on every run.
df = pd.read_csv("../data/data.csv").sample(n=50, random_state=42)

In [None]:
# Load the model and tokenizer once; the inference cells below reuse both.
model, tokenizer = get_model_and_tokenizer(MODEL_PATH)

In [None]:
# Smoke test: run event detection on 10 article titles.
# random_state pins the selection so the same titles are used on every run
# (the sample was previously unseeded and changed with each execution).
# NOTE(review): the return value of infer is discarded here, so this cell
# only shows something if infer prints its results itself - confirm.
for row in df.sample(n=10, random_state=42).itertuples(index=False):
    infer(text=row.Title, task="ED", model=model, tokenizer=tokenizer)

In [None]:
# Run event detection over every article body, one sentence at a time.
# ed_results receives exactly one list per row of df (possibly empty), in
# iteration order, so it can later be attached to df as a column.
ed_results = []
for row in tqdm(df.itertuples(index=False), total=len(df)):
    # read_csv represents an empty cell as NaN (a float), which has no
    # .split(); treat any non-string Content as an article with no text.
    content = row.Content if isinstance(row.Content, str) else ""
    # Naive sentence split on "."; drop the empty fragments it leaves behind
    # (after the final period, between consecutive periods) so infer is never
    # called on an empty string.
    sentences = [s.strip() for s in content.split(".") if s.strip()]
    ed_article = []
    for sentence in sentences:
        # Only the first element of what infer returns is kept.
        event = infer(text=sentence, task="ED", model=model, tokenizer=tokenizer)[0]
        # Keep only sentences in which at least one event was detected.
        if len(event['events']) > 0:
            ed_article.append(event)
    ed_results.append(ed_article)

In [None]:
# Attach the per-article detections as a new column; ed_results must hold
# exactly one entry per row of df, in row order, for this to line up.
df['ed_results'] = ed_results

In [None]:
# Spot-check: display the first row (by position) of the frame.
df.iloc[0]

In [None]:
# Write the frame, including the ed_results column, to Parquet via pyarrow.
# NOTE(review): ed_results cells are lists of whatever infer returns
# (indexed with ['events'] earlier, so presumably dicts) - confirm pyarrow
# can serialise that nested structure for all rows, including empty lists.
df.to_parquet("../data/ed_output.parquet", engine="pyarrow")