Skip to content

Commit

Permalink
add hipe-2022 edition option (to prevent spurious normalization on ME…
Browse files Browse the repository at this point in the history
…TO cols)
  • Loading branch information
simon-clematide committed May 12, 2022
1 parent dffa502 commit 6605770
Showing 1 changed file with 14 additions and 4 deletions.
18 changes: 14 additions & 4 deletions normalize_linking.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
Normalize entity linking by remapping links according to an external file
Usage:
normalize_linking.py -i=<fpath> -o=<fpath> [--norm-time (--norm-histo --map=<fpath>) --union-meto-lit]
normalize_linking.py -i=<fpath> -o=<fpath> [--norm-time (--norm-histo --map=<fpath>) --union-meto-lit] [--hipe_edition=<str>]
normalize_linking.py -h | --help
Options:
Expand All @@ -16,6 +16,8 @@
--norm-time Normalize NEL for time mentions by linking to NIL.
--norm-histo Normalize NEL for historical entities
--union-meto-lit Unionize literal and metonymic columns (apply on both columns).
-e --hipe_edition=<str> Specify the HIPE edition. Ignores METO columns if set to hipe-2022. Possible values: hipe-2020, hipe-2022 [default: hipe-2020]
All file paths can be local paths or remote URLs.
Expand All @@ -26,6 +28,7 @@
import pandas as pd
from docopt import docopt

HIPE_EDITIONS = ["HIPE-2020", "HIPE-2022"]

def get_mappings(f_map):
df_mapping = pd.read_csv(f_map, delimiter="\t")
Expand Down Expand Up @@ -120,10 +123,11 @@ def union(list1, list2):
return df


def remove_time_linking(df, replacement="NIL"):
def remove_time_linking(df, replacement="NIL",map_meto=True):
try:
df.loc[df["NE-COARSE-LIT"].str.contains("time"), "NEL-LIT"] = replacement
df.loc[df["NE-COARSE-LIT"].str.contains("time"), "NEL-METO"] = replacement
if map_meto:
df.loc[df["NE-COARSE-LIT"].str.contains("time"), "NEL-METO"] = replacement
except KeyError:
pass

Expand All @@ -138,6 +142,12 @@ def main(args):
norm_time = args["--norm-time"]
norm_histo = args["--norm-histo"]
unionize = args["--union-meto-lit"]
hipe_edition = args["--hipe_edition"].upper() # optional on the CLI; the Options section declares hipe-2020 as its default

if hipe_edition not in HIPE_EDITIONS:
msg = f"HIPE edition was not set or was set incorrectly. Use --hipe_edition=hipe-2020 or --hipe_edition=hipe-2022."
logging.error(msg)
sys.exit(1)

df = pd.read_csv(f_in, sep="\t", quoting=csv.QUOTE_NONE, quotechar="", skip_blank_lines=False)
df = df.fillna(value={"NE-COARSE-LIT": "", "NEL-LIT": "", "NEL-METO": ""})
Expand All @@ -147,7 +157,7 @@ def main(args):
df = normalize_n_to_n(df, mappings)

if norm_time:
df = remove_time_linking(df)
df = remove_time_linking(df,map_meto=hipe_edition == 'HIPE-2020')

if unionize:
df = unionize_meto_lit(df)
Expand Down

0 comments on commit 6605770

Please sign in to comment.