In [48]:
import argparse
from evaluation.rouge_evaluator import RougeEvaluator
import json
import tqdm

# Single scorer instance; later cells in this notebook reuse it.
evaluator = RougeEvaluator()

# Score the model's predicted summaries against the reference summaries,
# one data split at a time, and print precision / recall / F1 per metric.
for data_type in ["train", "validation", "test"]:
    print("\n\n\n", data_type, " set")
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default (assumes the JSON files are UTF-8 — confirm).
    with open(f"./data/{data_type}.json", 'r', encoding="utf-8") as f:
        eval_data = json.load(f)

    with open(f"./models/{data_type}_pred_data.json", 'r', encoding="utf-8") as f:
        pred_data = json.load(f)

    # Entries are paired by position below, so both files must line up.
    assert len(eval_data) == len(pred_data), (
        f"{data_type}: {len(eval_data)} reference entries vs "
        f"{len(pred_data)} predicted entries"
    )

    pred_sums = []
    eval_sums = []
    # Named `reference` rather than `eval`: a top-level loop variable called
    # `eval` would shadow the builtin for the rest of the kernel session.
    for reference, pred in tqdm.tqdm(zip(eval_data, pred_data), total=len(eval_data)):
        pred_sums.append(pred['summary'])
        eval_sums.append(reference['summary'])

    scores = evaluator.batch_score(pred_sums, eval_sums)

    # `scores` maps each metric name to a mapping with "p", "r" and "f" entries.
    for k, v in scores.items():
        print(k)
        print("\tPrecision:\t", v["p"])
        print("\tRecall:\t\t", v["r"])
        print("\tF1:\t\t", v["f"])




 train  set


100%|███████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 1109281.43it/s]


rouge-1
	Precision:	 0.2714084271091003
	Recall:		 0.35620294032443606
	F1:		 0.3080775294444611
rouge-2
	Precision:	 0.10551382384766454
	Recall:		 0.14038935860727692
	F1:		 0.12047845746680498
rouge-4
	Precision:	 0.03930735395248246
	Recall:		 0.052242200107432885
	F1:		 0.04486100826958371
rouge-l
	Precision:	 0.25137779860101084
	Recall:		 0.3297205458133045
	F1:		 0.28526815041474884



 validation  set


100%|█████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 1049100.55it/s]


rouge-1
	Precision:	 0.3136344490928959
	Recall:		 0.38441259161845703
	F1:		 0.345435262568448
rouge-2
	Precision:	 0.12817218618728507
	Recall:		 0.15930864328114958
	F1:		 0.1420542519348566
rouge-4
	Precision:	 0.04829823623763358
	Recall:		 0.059919139467185495
	F1:		 0.05348473356137698
rouge-l
	Precision:	 0.2895356076879822
	Recall:		 0.35504441637642126
	F1:		 0.3189611747617577



 test  set


100%|█████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 1006552.44it/s]


rouge-1
	Precision:	 0.3051640128265358
	Recall:		 0.3895018900973398
	F1:		 0.34221331228531576
rouge-2
	Precision:	 0.12652766723667142
	Recall:		 0.16089493083217069
	F1:		 0.14165664359852587
rouge-4
	Precision:	 0.04644833653052124
	Recall:		 0.05665898250357198
	F1:		 0.05104808293837277
rouge-l
	Precision:	 0.28223028618179574
	Recall:		 0.36002475342139845
	F1:		 0.316416012098339


# Baseline 1 (three random sentences)

In [25]:
# Score the Baseline 1 predictions against the test-set reference summaries.
# Relies on `json`, `tqdm` and `evaluator` from the first cell of the notebook,
# so that cell must have been executed before this one.

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# locale default (assumes the JSON files are UTF-8 — confirm).
with open("./data/test.json", 'r', encoding="utf-8") as f:
    eval_data = json.load(f)

with open("./models/test_data_baseline.json", 'r', encoding="utf-8") as f:
    pred_data = json.load(f)

# Entries are paired by position below, so both files must line up.
assert len(eval_data) == len(pred_data), (
    f"{len(eval_data)} reference entries vs {len(pred_data)} predicted entries"
)

pred_sums = []
eval_sums = []
# Named `reference` rather than `eval`: a top-level loop variable called
# `eval` would shadow the builtin for the rest of the kernel session.
for reference, pred in tqdm.tqdm(zip(eval_data, pred_data), total=len(eval_data)):
    pred_sums.append(pred['summary'])
    eval_sums.append(reference['summary'])

scores = evaluator.batch_score(pred_sums, eval_sums)

# `scores` maps each metric name to a mapping with "p", "r" and "f" entries.
for k, v in scores.items():
    print(k)
    print("\tPrecision:\t", v["p"])
    print("\tRecall:\t\t", v["r"])
    print("\tF1:\t\t", v["f"])

100%|██████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 982963.21it/s]


rouge-1
	Precision:	 0.23100286990632624
	Recall:		 0.24776795743078206
	F1:		 0.23909188266827794
rouge-2
	Precision:	 0.06332632996844961
	Recall:		 0.06857402872031029
	F1:		 0.06584578864194349
rouge-4
	Precision:	 0.019636750129786787
	Recall:		 0.02076334848861271
	F1:		 0.020184341131428977
rouge-l
	Precision:	 0.21054500801958181
	Recall:		 0.22532294442134868
	F1:		 0.21768345607661016


# Baseline 2 (greedy search)

In [27]:
# Score the Baseline 2 (greedy) predictions against the reference summaries.
# Relies on `json`, `tqdm` and `evaluator` from the first cell of the notebook,
# so that cell must have been executed before this one.
# NOTE(review): this cell scores the train split, whereas the Baseline 1 cell
# scores the test split — confirm this is intended before comparing numbers.

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# locale default (assumes the JSON files are UTF-8 — confirm).
with open("./data/train.json", 'r', encoding="utf-8") as f:
    eval_data = json.load(f)

with open("./models/train_greedy_pred_data.json", 'r', encoding="utf-8") as f:
    pred_data = json.load(f)

# Entries are paired by position below, so both files must line up.
assert len(eval_data) == len(pred_data), (
    f"{len(eval_data)} reference entries vs {len(pred_data)} predicted entries"
)

pred_sums = []
eval_sums = []
# Named `reference` rather than `eval`: a top-level loop variable called
# `eval` would shadow the builtin for the rest of the kernel session.
for reference, pred in tqdm.tqdm(zip(eval_data, pred_data), total=len(eval_data)):
    pred_sums.append(pred['summary'])
    eval_sums.append(reference['summary'])

scores = evaluator.batch_score(pred_sums, eval_sums)

# `scores` maps each metric name to a mapping with "p", "r" and "f" entries.
for k, v in scores.items():
    print(k)
    print("\tPrecision:\t", v["p"])
    print("\tRecall:\t\t", v["r"])
    print("\tF1:\t\t", v["f"])

100%|███████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 1803226.14it/s]


rouge-1
	Precision:	 0.4785132510398555
	Recall:		 0.469473039810747
	F1:		 0.4739500406779775
rouge-2
	Precision:	 0.2683624508316805
	Recall:		 0.2604720586466614
	F1:		 0.26435839106091474
rouge-4
	Precision:	 0.12957751651443403
	Recall:		 0.12238090188768092
	F1:		 0.12587643180148803
rouge-l
	Precision:	 0.45812417387875837
	Recall:		 0.44864043109582874
	F1:		 0.4533327078203429
