# Performance Evaluation on SOTA dataset

This notebook runs AxCell on the **PWCLeaderboards** dataset.

For the pipeline to work we need a running elasticsearch instance. Run `docker-compose up -d` from the `axcell` repository to start a new instance.

Due to the docker permision issue, run the `extract_sota.py` file after running `docker-compose up -d` from the `axcell` repository

In [21]:
from pathlib import Path
import json
import pandas as pd

PROJECT_ROOT = Path('/home/jakub.suran/netstore1/COS470/project/sota')
VALIDATION_DATA_ROOT = PROJECT_ROOT / 'dataset' / 'validation'
TRAIN_DATA_ROOT = PROJECT_ROOT / 'dataset' / 'train'

validation_ids = [dir.name for dir in VALIDATION_DATA_ROOT.iterdir() if dir.is_dir()]
train_ids = [dir.name for dir in TRAIN_DATA_ROOT.iterdir() if dir.is_dir()]

In [9]:
def fix_json_quotes(json_string):
    fixed_json = ""
    inside_double_quotes = False
    for char in json_string:
        if char == "'":
            if inside_double_quotes:
                fixed_json += char
            else:
                fixed_json += '"'
        else:
            fixed_json += char
        if char == '"':
            inside_double_quotes = not inside_double_quotes
    return fixed_json

In [18]:
from json import JSONDecodeError
from tqdm import tqdm
import pandas as pd

tables = []
arxiv_ids = []
for arxiv_id in tqdm(train_ids):
    annotations_file = TRAIN_DATA_ROOT / arxiv_id / 'annotations.json'
    table = {}
    table['index'] = 0
    table['records'] = []
    try:
        with open(annotations_file, 'r') as f:
            content = f.read()
            if content.strip() != 'unanswerable':
                content = fix_json_quotes(content)
                annotations = json.loads(content)
                for leaderboard in annotations:
                    leaderboard = leaderboard['LEADERBOARD']
                    record = {
                        'task': leaderboard['Task'],
                        'dataset': leaderboard['Dataset'],
                        'metric': leaderboard['Metric'],
                        'value': leaderboard['Score']
                    }
                    table['records'].append(record)
        tables.append([table])
        arxiv_ids.append(arxiv_id)
    except JSONDecodeError as e:
        print(f"Error parsing {annotations_file}: {e}")
    
    
sota_leaderboards = pd.DataFrame({'arxiv_id': arxiv_ids, 'tables': tables})

sota_leaderboards.tail()

 77%|███████▋  | 9512/12288 [00:07<00:02, 1013.38it/s]

Error parsing /home/jakub.suran/netstore1/COS470/project/sota/dataset/train/2110.00976v4/annotations.json: Invalid \escape: line 1 column 1026 (char 1025)


 96%|█████████▌| 11750/12288 [00:09<00:00, 978.56it/s] 

Error parsing /home/jakub.suran/netstore1/COS470/project/sota/dataset/train/2303.16886v1/annotations.json: Expecting ',' delimiter: line 1 column 141 (char 140)


100%|██████████| 12288/12288 [00:09<00:00, 1236.81it/s]


Unnamed: 0,arxiv_id,tables
12281,2312.02139v1,"[{'index': 0, 'records': [{'task': 'Image Gene..."
12282,2312.02185v1,"[{'index': 0, 'records': [{'task': 'Human Acti..."
12283,2312.03288v1,"[{'index': 0, 'records': [{'task': 'Skeleton B..."
12284,2312.03430v1,"[{'index': 0, 'records': [{'task': 'Semantic S..."
12285,2312.03701v1,"[{'index': 0, 'records': [{'task': 'Image Gene..."


Download and unpack the archive with trained models (table type classifier, table segmentation), taxonomy and abbreviations.

In [19]:
V1_URL = 'https://github.com/paperswithcode/axcell/releases/download/v1.0/'
MODELS_URL = V1_URL + 'models.tar.xz'
MODELS_ARCHIVE = 'models.tar.xz'
MODELS_PATH = Path('models')

from fastai.core import download_url
import tarfile

download_url(MODELS_URL, MODELS_ARCHIVE)
with tarfile.open(MODELS_ARCHIVE, 'r:*') as archive:
    archive.extractall()

from axcell.helpers.results_extractor import ResultsExtractor
extract_results = ResultsExtractor(MODELS_PATH)

[PID 1377489] Load model table-structure-classifier.pth


In [20]:
import pandas as pd

papers = []
our_taxonomy = set(extract_results.taxonomy.taxonomy)
gold_records = []
for _, paper in sota_leaderboards.iterrows():
    for table in paper.tables:
        for record in table['records']:
            r = dict(record)
            r['arxiv_id'] = paper.arxiv_id
            tdm = (record['task'], record['dataset'], record['metric'])
            if tdm in our_taxonomy:
                gold_records.append(r)
                papers.append(paper.arxiv_id)
gold_records = pd.DataFrame(gold_records)
papers = sorted(set(papers))

print(f"Found {len(gold_records)} records in {len(papers)} papers")


Found 13515 records in 3257 papers


In [None]:
AXCELL_SOTA_ROOT_PATH = Path('/home/jakub.suran/netstore1/COS470/axcell') / 'data_sota'
# SOURCES_PATH = AXCELL_SOTA_ROOT_PATH / 'sources'
PAPERS_PATH = AXCELL_SOTA_ROOT_PATH / 'papers'

extracted_papers = [dir.name for dir in PAPERS_PATH.iterdir() if dir.is_dir()]
print(f"Successfullly extracted {len(extracted_papers)} papers")

sota_leaderboards = sota_leaderboards[sota_leaderboards['arxiv_id'].isin(extracted_papers)]

assert len(sota_leaderboards) == len(extracted_papers)

Successfullly extracted 95 papers


In [68]:
from axcell.data.paper_collection import PaperCollection
pc = PaperCollection.from_files(PAPERS_PATH)
pc = PaperCollection([pc.get_by_id(p) for p in papers])

In [72]:
%%time
from tqdm import tqdm
from joblib import delayed, Parallel

def process_single(index):
    extract_results = ResultsExtractor(MODELS_PATH)
    return extract_results(pc[index])

results = []
for index in tqdm(range(len(pc)), "Processing papers"):
    results.append(process_single(index))

Processing papers: 100%|██████████| 16/16 [13:24<00:00, 50.26s/it]

CPU times: user 1h 19min 30s, sys: 2min 47s, total: 1h 22min 18s
Wall time: 13min 24s





In [76]:
predicted_records = []
# print(pd.Series(gold_records.arxiv_id).value_counts())

for paper, records in zip(pc, results):
    r = records.copy()
    r['arxiv_id'] = paper.arxiv_no_version
    predicted_records.append(r)
predicted_records = pd.concat(predicted_records)
predicted_records.to_json('axcell-predictions-on-sota.json.xz', orient='records')

1904.09408v2    9
1803.09454v1    8
1905.10295v6    4
2006.06936v2    4
2107.12028v2    3
2106.01223v1    3
1702.01691v2    2
1807.04067v1    2
2006.09264v3    2
2308.16775v3    1
2008.05770v1    1
1712.06113v3    1
1411.1091v1     1
1803.10683v3    1
1805.03779v3    1
2004.05343v1    1
Name: arxiv_id, dtype: int64


In [77]:
from axcell.helpers.evaluate import evaluate
print(len(predicted_records), len(gold_records))

# eval = evaluate(predicted_records, gold_records).style.format('{:.2%}')
eval = evaluate(predicted_records, gold_records)
print(eval)

25 44
   Micro Precision  Micro Recall  Micro F1  Macro Precision  Macro Recall  \
0         0.000000      0.000000  0.000000         0.000000      0.000000   
1         0.440000      0.275000  0.338462         0.302083      0.276042   
2         0.636364      0.388889  0.482759         0.437500      0.437500   
3         0.473684      0.360000  0.409091         0.333333      0.406250   
4         0.526316      0.400000  0.454545         0.416667      0.406250   

   Macro F1  
0  0.000000  
1  0.276190  
2  0.437500  
3  0.358333  
4  0.389881  
