In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell executes, so edits to the imported project modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [13]:
import pickle
from utills import get_files_in_dir, get_file_nums, load_ground_truth_from_nums, load_preprocessed_paras
from predict import make_predictions
import json
import os
import numpy as np
import math
from tqdm.auto import tqdm, trange
from features import prepare_entry, get_nltk_pos_tag_based_ml_chunker, perceptron_tagger, tokenize
import evaluator # From PAN github
import re

In [3]:
# Pickle file holding the (transformer, primary_scaler, secondary_scaler, clf) tuple.
MODEL_PATH = 'temp_data/model.p'
# Directory containing one 'problem-<num>.jsonl' file per validation problem.
PREPROCESSED_VALIDATION_DIR = 'preprocessed_data/2021/validation/'
# Directory handed to evaluator.read_ground_truth_files when scoring.
GROUND_TRUTH_DIR = 'data/pan2021/validation/'

In [4]:
# Restore the four fitted pipeline pieces that were pickled together as one tuple.
# NOTE: unpickling executes arbitrary code; only load model files you trust.
with open(MODEL_PATH, 'rb') as f:
    transformer, primary_scaler, secondary_scaler, clf = pickle.load(f)

In [5]:
# Discover the validation problems, then parse each problem's JSONL file.
file_names = get_files_in_dir(PREPROCESSED_VALIDATION_DIR)
file_nums = get_file_nums(file_names)

# all_preprocessed_paras[k] holds the parsed records of problem file_nums[k].
all_preprocessed_paras = []
for n in file_nums:
    fp = os.path.join(PREPROCESSED_VALIDATION_DIR, 'problem-' + n + '.jsonl')
    with open(fp, 'r') as f:
        # Stream the file line by line and skip blank lines (for example an
        # empty line at the end of the file) instead of letting json.loads
        # raise JSONDecodeError on them.
        paras = [json.loads(line) for line in f if line.strip()]
    all_preprocessed_paras.append(paras)

In [11]:
# Run the loaded pipeline on every problem and store the three outputs of
# make_predictions under the key 'problem-<num>'.
solutions = {}
for idx in trange(len(file_nums)):
    prediction = make_predictions(
        all_preprocessed_paras[idx],
        transformer,
        primary_scaler,
        secondary_scaler,
        clf,
        clust_thresh=0.5,
    )
    problem_key = 'problem-' + file_nums[idx]
    solutions[problem_key] = {
        "multi-author": prediction[0],
        "changes": prediction[1],
        "paragraph-authors": prediction[2],
    }






In [12]:
# Score the freshly computed `solutions` against the validation ground truth.
truth = evaluator.read_ground_truth_files(GROUND_TRUTH_DIR)

task1_results = evaluator.compute_score_single_predictions(
    truth, solutions, 'multi-author'
)
task2_results = evaluator.compute_score_multiple_predictions(
    truth, solutions, 'changes', labels=[0, 1]
)
# Author labels are restricted to 1-4 here (the archived cells below use 1-5).
task3_results = evaluator.compute_score_multiple_predictions(
    truth, solutions, 'paragraph-authors', labels=[1, 2, 3, 4]
)

# One "<name> <score>" line per task.
for score_name, score_value in (
    ("task1_score", task1_results),
    ("task2_score", task2_results),
    ("task3_score", task3_results),
):
    print(score_name, score_value)

task1_score 0.649138367110405
task2_score 0.6641936772118469
task3_score 0.4275907844092534


Old Experiment Results
====

In [7]:
# Experiment: 2020 Narrow, 2020 Wide, 2021 with additional training data, Logistic Regression - parameter tuned
# Scores whatever `solutions` currently holds in the kernel.
truth = evaluator.read_ground_truth_files(GROUND_TRUTH_DIR)

task1_results = evaluator.compute_score_single_predictions(
    truth, solutions, 'multi-author'
)
task2_results = evaluator.compute_score_multiple_predictions(
    truth, solutions, 'changes', labels=[0, 1]
)
# NOTE(review): author labels are 1-5 here, unlike the current evaluation cell
# (1-4), so task3 scores may not be directly comparable - confirm.
task3_results = evaluator.compute_score_multiple_predictions(
    truth, solutions, 'paragraph-authors', labels=[1, 2, 3, 4, 5]
)

for score_name, score_value in (
    ("task1_score", task1_results),
    ("task2_score", task2_results),
    ("task3_score", task3_results),
):
    print(score_name, score_value)

task1_score 0.6620657016243532
task2_score 0.6304176617838073
task3_score 0.3019430534744456


In [32]:
# Experiment: 2020 Narrow, 2020 Wide, 2021 - parameter tuned
# Scores whatever `solutions` currently holds in the kernel.
truth = evaluator.read_ground_truth_files(GROUND_TRUTH_DIR)

task1_results = evaluator.compute_score_single_predictions(
    truth, solutions, 'multi-author'
)
task2_results = evaluator.compute_score_multiple_predictions(
    truth, solutions, 'changes', labels=[0, 1]
)
# NOTE(review): author labels are 1-5 here, unlike the current evaluation cell
# (1-4), so task3 scores may not be directly comparable - confirm.
task3_results = evaluator.compute_score_multiple_predictions(
    truth, solutions, 'paragraph-authors', labels=[1, 2, 3, 4, 5]
)

for score_name, score_value in (
    ("task1_score", task1_results),
    ("task2_score", task2_results),
    ("task3_score", task3_results),
):
    print(score_name, score_value)

task1_score 0.20062193598125155
task2_score 0.3213416265639423
task3_score 0.1489315603643603


  average, "true nor predicted", 'F-score is', len(true_sum)


In [15]:
# Experiment: 2020 Narrow, 2020 Wide, 2021
# Scores whatever `solutions` currently holds in the kernel.
truth = evaluator.read_ground_truth_files(GROUND_TRUTH_DIR)

task1_results = evaluator.compute_score_single_predictions(
    truth, solutions, 'multi-author'
)
task2_results = evaluator.compute_score_multiple_predictions(
    truth, solutions, 'changes', labels=[0, 1]
)
# NOTE(review): author labels are 1-5 here, unlike the current evaluation cell
# (1-4), so task3 scores may not be directly comparable - confirm.
task3_results = evaluator.compute_score_multiple_predictions(
    truth, solutions, 'paragraph-authors', labels=[1, 2, 3, 4, 5]
)

for score_name, score_value in (
    ("task1_score", task1_results),
    ("task2_score", task2_results),
    ("task3_score", task3_results),
):
    print(score_name, score_value)

task1_score 0.6192071134503623
task2_score 0.5950326977258598
task3_score 0.2899713781706845


In [57]:
# Experiment: 2020 Narrow, 2021 Param tuned, Logistic regression
# Scores whatever `solutions` currently holds in the kernel.
truth = evaluator.read_ground_truth_files(GROUND_TRUTH_DIR)

task1_results = evaluator.compute_score_single_predictions(
    truth, solutions, 'multi-author'
)
task2_results = evaluator.compute_score_multiple_predictions(
    truth, solutions, 'changes', labels=[0, 1]
)
# NOTE(review): author labels are 1-5 here, unlike the current evaluation cell
# (1-4), so task3 scores may not be directly comparable - confirm.
task3_results = evaluator.compute_score_multiple_predictions(
    truth, solutions, 'paragraph-authors', labels=[1, 2, 3, 4, 5]
)

for score_name, score_value in (
    ("task1_score", task1_results),
    ("task2_score", task2_results),
    ("task3_score", task3_results),
):
    print(score_name, score_value)

task1_score 0.5913422946130158
task2_score 0.5614510717780228
task3_score 0.2858810338111458


In [47]:
# Experiment: 2020 Narrow, 2021 Param tuned
# Scores whatever `solutions` currently holds in the kernel.
truth = evaluator.read_ground_truth_files(GROUND_TRUTH_DIR)

task1_results = evaluator.compute_score_single_predictions(
    truth, solutions, 'multi-author'
)
task2_results = evaluator.compute_score_multiple_predictions(
    truth, solutions, 'changes', labels=[0, 1]
)
# NOTE(review): author labels are 1-5 here, unlike the current evaluation cell
# (1-4), so task3 scores may not be directly comparable - confirm.
task3_results = evaluator.compute_score_multiple_predictions(
    truth, solutions, 'paragraph-authors', labels=[1, 2, 3, 4, 5]
)

for score_name, score_value in (
    ("task1_score", task1_results),
    ("task2_score", task2_results),
    ("task3_score", task3_results),
):
    print(score_name, score_value)

task1_score 0.585147614272933
task2_score 0.542232968242265
task3_score 0.2809123970141366


In [18]:
# Experiment: 2020 Narrow, 2021
# Scores whatever `solutions` currently holds in the kernel.
truth = evaluator.read_ground_truth_files(GROUND_TRUTH_DIR)

task1_results = evaluator.compute_score_single_predictions(
    truth, solutions, 'multi-author'
)
task2_results = evaluator.compute_score_multiple_predictions(
    truth, solutions, 'changes', labels=[0, 1]
)
# NOTE(review): author labels are 1-5 here, unlike the current evaluation cell
# (1-4), so task3 scores may not be directly comparable - confirm.
task3_results = evaluator.compute_score_multiple_predictions(
    truth, solutions, 'paragraph-authors', labels=[1, 2, 3, 4, 5]
)

for score_name, score_value in (
    ("task1_score", task1_results),
    ("task2_score", task2_results),
    ("task3_score", task3_results),
):
    print(score_name, score_value)

task1_score 0.5756817056981113
task2_score 0.6107056452974297
task3_score 0.30518762256514587


In [24]:
# Experiment: 2020 Wide, 2021
# Scores whatever `solutions` currently holds in the kernel.
truth = evaluator.read_ground_truth_files(GROUND_TRUTH_DIR)

task1_results = evaluator.compute_score_single_predictions(
    truth, solutions, 'multi-author'
)
task2_results = evaluator.compute_score_multiple_predictions(
    truth, solutions, 'changes', labels=[0, 1]
)
# NOTE(review): author labels are 1-5 here, unlike the current evaluation cell
# (1-4), so task3 scores may not be directly comparable - confirm.
task3_results = evaluator.compute_score_multiple_predictions(
    truth, solutions, 'paragraph-authors', labels=[1, 2, 3, 4, 5]
)

for score_name, score_value in (
    ("task1_score", task1_results),
    ("task2_score", task2_results),
    ("task3_score", task3_results),
):
    print(score_name, score_value)

task1_score 0.6315126800050974
task2_score 0.6080145538527165
task3_score 0.299864687412713


In [60]:
# Experiment: 2021 with additional training data, Logistic Regression, Tuned
# Scores whatever `solutions` currently holds in the kernel.
truth = evaluator.read_ground_truth_files(GROUND_TRUTH_DIR)

task1_results = evaluator.compute_score_single_predictions(
    truth, solutions, 'multi-author'
)
task2_results = evaluator.compute_score_multiple_predictions(
    truth, solutions, 'changes', labels=[0, 1]
)
# NOTE(review): author labels are 1-5 here, unlike the current evaluation cell
# (1-4), so task3 scores may not be directly comparable - confirm.
task3_results = evaluator.compute_score_multiple_predictions(
    truth, solutions, 'paragraph-authors', labels=[1, 2, 3, 4, 5]
)

for score_name, score_value in (
    ("task1_score", task1_results),
    ("task2_score", task2_results),
    ("task3_score", task3_results),
):
    print(score_name, score_value)

task1_score 0.6515297375658958
task2_score 0.6746050576341451
task3_score 0.33715115409122587


In [54]:
# Experiment: 2021 Logistic Regression, Tuned
# Scores whatever `solutions` currently holds in the kernel.
truth = evaluator.read_ground_truth_files(GROUND_TRUTH_DIR)

task1_results = evaluator.compute_score_single_predictions(
    truth, solutions, 'multi-author'
)
task2_results = evaluator.compute_score_multiple_predictions(
    truth, solutions, 'changes', labels=[0, 1]
)
# NOTE(review): author labels are 1-5 here, unlike the current evaluation cell
# (1-4), so task3 scores may not be directly comparable - confirm.
task3_results = evaluator.compute_score_multiple_predictions(
    truth, solutions, 'paragraph-authors', labels=[1, 2, 3, 4, 5]
)

for score_name, score_value in (
    ("task1_score", task1_results),
    ("task2_score", task2_results),
    ("task3_score", task3_results),
):
    print(score_name, score_value)

task1_score 0.6543042733019168
task2_score 0.6726840425088245
task3_score 0.3326809901827926


In [50]:
# Experiment: 2021 Tuned
# Scores whatever `solutions` currently holds in the kernel.
truth = evaluator.read_ground_truth_files(GROUND_TRUTH_DIR)

task1_results = evaluator.compute_score_single_predictions(
    truth, solutions, 'multi-author'
)
task2_results = evaluator.compute_score_multiple_predictions(
    truth, solutions, 'changes', labels=[0, 1]
)
# NOTE(review): author labels are 1-5 here, unlike the current evaluation cell
# (1-4), so task3 scores may not be directly comparable - confirm.
task3_results = evaluator.compute_score_multiple_predictions(
    truth, solutions, 'paragraph-authors', labels=[1, 2, 3, 4, 5]
)

for score_name, score_value in (
    ("task1_score", task1_results),
    ("task2_score", task2_results),
    ("task3_score", task3_results),
):
    print(score_name, score_value)

task1_score 0.6455188481389458
task2_score 0.668116804548034
task3_score 0.33306900696144565


In [21]:
# Experiment: 2021
# Scores whatever `solutions` currently holds in the kernel.
truth = evaluator.read_ground_truth_files(GROUND_TRUTH_DIR)

task1_results = evaluator.compute_score_single_predictions(
    truth, solutions, 'multi-author'
)
task2_results = evaluator.compute_score_multiple_predictions(
    truth, solutions, 'changes', labels=[0, 1]
)
# NOTE(review): author labels are 1-5 here, unlike the current evaluation cell
# (1-4), so task3 scores may not be directly comparable - confirm.
task3_results = evaluator.compute_score_multiple_predictions(
    truth, solutions, 'paragraph-authors', labels=[1, 2, 3, 4, 5]
)

for score_name, score_value in (
    ("task1_score", task1_results),
    ("task2_score", task2_results),
    ("task3_score", task3_results),
):
    print(score_name, score_value)

task1_score 0.6450167408562897
task2_score 0.6698177624071655
task3_score 0.32884445494938486


In [10]:
# Experiment: With new vocab richness features, re-sampled training data, and abbrev features
# Prints task*_results as left in the kernel by a previously run evaluation cell.
for score_name, score_value in (
    ("task1_score", task1_results),
    ("task2_score", task2_results),
    ("task3_score", task3_results),
):
    print(score_name, score_value)

task1_score 0.6381345429606189
task2_score 0.6596201150544162
task3_score 0.33332838489114


In [42]:
# Experiment: With new vocab richness features, re-sampled training data, and abbrev features
# Prints task*_results as left in the kernel by a previously run evaluation cell.
for score_name, score_value in (
    ("task1_score", task1_results),
    ("task2_score", task2_results),
    ("task3_score", task3_results),
):
    print(score_name, score_value)

task1_score 0.6307818231963915
task2_score 0.6618821073958295
task3_score 0.33100871404004595


In [58]:
# Experiment: With new vocab richness features and re-sampled training data
# Prints task*_results as left in the kernel by a previously run evaluation cell.
for score_name, score_value in (
    ("task1_score", task1_results),
    ("task2_score", task2_results),
    ("task3_score", task3_results),
):
    print(score_name, score_value)

task1_score 0.6368675438367883
task2_score 0.667185893120964
task3_score 0.33187120141369514


In [17]:
# Experiment: With new vocab richness features and misspelling features
# Prints task*_results as left in the kernel by a previously run evaluation cell.
for score_name, score_value in (
    ("task1_score", task1_results),
    ("task2_score", task2_results),
    ("task3_score", task3_results),
):
    print(score_name, score_value)

task1_score 0.6499788163794786
task2_score 0.6722909734510989
task3_score 0.3306673714495353


In [45]:
# Experiment: With new vocab richness features
# Prints task*_results as left in the kernel by a previously run evaluation cell.
for score_name, score_value in (
    ("task1_score", task1_results),
    ("task2_score", task2_results),
    ("task3_score", task3_results),
):
    print(score_name, score_value)

task1_score 0.6489929440418344
task2_score 0.6630730084014798
task3_score 0.3264218118122444


In [249]:
# Experiment: (no description was recorded for this run)
# Prints task*_results as left in the kernel by a previously run evaluation cell.
for score_name, score_value in (
    ("task1_score", task1_results),
    ("task2_score", task2_results),
    ("task3_score", task3_results),
):
    print(score_name, score_value)

task1_score 0.6342856461586952
task2_score 0.6526037259064174
task3_score 0.3308351036739131


In [38]:
# Experiment: trying different cluster parameters
# Prints task*_results as left in the kernel by a previously run evaluation cell.
for score_name, score_value in (
    ("task1_score", task1_results),
    ("task2_score", task2_results),
    ("task3_score", task3_results),
):
    print(score_name, score_value)

task1_score 0.6381345429606189
task2_score 0.6596201150544162
task3_score 0.2861795037704706
