In [None]:
import spacy
from spacy import util
import pandas as pd
import os
import json
from dotenv import load_dotenv

In [None]:
load_dotenv()

# Environment variables
# CUDA_DEVICE_ORDER and CUDA_DEVICE_NUM are read from the process environment
# (populated from a .env file by load_dotenv above, when one is present).
cuda_order = os.getenv("CUDA_DEVICE_ORDER")
cuda_device = os.getenv("CUDA_DEVICE_NUM")

# os.environ only accepts string values: assigning the None that os.getenv
# returns for an unset variable raises TypeError. Export each setting only
# when it is actually configured so the notebook still runs without them.
if cuda_order is not None:
    os.environ["CUDA_DEVICE_ORDER"] = cuda_order
if cuda_device is not None:
    # The project-specific CUDA_DEVICE_NUM is re-exported under the name
    # CUDA_VISIBLE_DEVICES.
    os.environ["CUDA_VISIBLE_DEVICES"] = cuda_device

In [None]:
# Ask spaCy to use the GPU when one is available. This cell runs before the
# pipeline is loaded in EntityExtractor.__init__ further down.
spacy.prefer_gpu()

In [None]:
class EntityExtractor:
    """Named-entity extractor backed by a spaCy pipeline.

    The pipeline is loaded once in the constructor and reused for every call.
    """

    def __init__(self, model_name="en_core_web_trf"):
        """Load the spaCy pipeline.

        Args:
            model_name: Name or path handed to ``spacy.load``. Defaults to
                ``"en_core_web_trf"``.
        """
        self.nlp = spacy.load(model_name)

    def extract_entities(
        self, df, text_column="text", entities_column="entities", batch_size=None
    ):
        """Add a column of extracted entities to ``df``.

        Texts are streamed through ``nlp.pipe`` so the pipeline can process
        them in batches instead of one call per row.

        Args:
            df: DataFrame holding the texts. It is modified in place.
            text_column: Name of the column containing the input text.
            entities_column: Name of the column that receives the results.
            batch_size: Optional batch size forwarded to ``nlp.pipe``;
                ``None`` leaves the choice to the pipeline.

        Returns:
            The same ``df`` object, with ``entities_column`` set to one list
            of ``{"text": ..., "label": ...}`` dicts per row. Rows whose text
            is missing or not a string get an empty list.
        """
        # Missing cells (e.g. NaN from read_csv) are not valid pipeline input;
        # an empty string yields a document without entities instead.
        texts = [value if isinstance(value, str) else "" for value in df[text_column]]
        docs = self.nlp.pipe(texts, batch_size=batch_size)
        entities = [self._doc_to_entities(doc) for doc in docs]
        # Build the column on df's own index so rows stay aligned even when the
        # index is not the default 0..n-1 range.
        df[entities_column] = pd.Series(entities, index=df.index, dtype=object)
        return df

    def _get_entities(self, text):
        """Return the entities found in a single text.

        Args:
            text: The text to analyse. Non-string values (``None``, NaN, ...)
                produce an empty list rather than an error.

        Returns:
            A list of ``{"text": ..., "label": ...}`` dicts, one per entity.
        """
        if not isinstance(text, str):
            return []
        return self._doc_to_entities(self.nlp(text))

    @staticmethod
    def _doc_to_entities(doc):
        """Convert the entities of a processed document into plain dicts."""
        return [{"text": ent.text, "label": ent.label_} for ent in doc.ents]


In [None]:
# Instantiate once: the constructor loads the spaCy pipeline, and the same
# instance is reused for every dataset processed below.
extractor = EntityExtractor()

In [None]:
# Annotate each processed dataset with its named entities and write the result
# back over the same CSV file.
for dataset in ("cidii", "climate_fever", "covid", "euvsdisinfo"):
    print("Extracting NEs for", dataset)
    csv_path = f"../datasets/processed/{dataset}.csv"

    dataset_df = extractor.extract_entities(pd.read_csv(csv_path))

    # Store each entity list as a JSON string so it fits in a single CSV cell;
    # ensure_ascii=False keeps non-ASCII characters unescaped.
    dataset_df["entities"] = dataset_df["entities"].apply(
        lambda entity_list: json.dumps(entity_list, ensure_ascii=False)
    )

    dataset_df.to_csv(csv_path, index=False)