In [1]:
import json
from pathlib import Path
import pandas as pd
import re

In [78]:
acc_pattern = re.compile(": (\d*[.]\d*).*")
f1_pattern = re.compile("F1 score: (\d*[.]\d*).*")

metrics = []

for nb_file in Path("../torch_cnn_experiments/").glob("*.ipynb"):
    with open(nb_file, "r") as fr:
        print(nb_file)
        nb_json = json.load(fr)
        print(nb_json['cells'][-1]['outputs'][-2]['text'][-2])
        
        acc_text = acc_pattern.search(
                nb_json['cells'][-1]['outputs'][-2]['text'][-2]
            ).group(1)
        acc = float(acc_text)

        f1_text = f1_pattern.search(
                nb_json['cells'][-1]['outputs'][-2]['text'][-2]
            ).group(1)
        f1 = float(f1_text)

        metrics.append((str(nb_file.stem), round(acc*100,1), round(f1*100,1)))

metrics

../torch_cnn_experiments_old/demo_coding_vs_intergenomic_seqs_kmer_2.ipynb
 Accuracy: 0.866200, F1 score: 0.857742, Avg loss: 0.563593 

../torch_cnn_experiments_old/human_nontata_promoters_subword_128.ipynb
 Accuracy: 0.806066, F1 score: 0.775001, Avg loss: 0.610672 

../torch_cnn_experiments_old/demo_coding_vs_intergenomic_seqs_subword_512.ipynb
 Accuracy: 0.841240, F1 score: 0.832634, Avg loss: 0.576374 

../torch_cnn_experiments_old/demo_coding_vs_intergenomic_seqs_kmer_3.ipynb
 Accuracy: 0.830840, F1 score: 0.818175, Avg loss: 0.578670 

../torch_cnn_experiments_old/human_nontata_promoters_subword_256.ipynb
 Accuracy: 0.792783, F1 score: 0.749054, Avg loss: 0.613728 

../torch_cnn_experiments_old/demo_coding_vs_intergenomic_seqs_subword_64.ipynb
 Accuracy: 0.846520, F1 score: 0.842101, Avg loss: 0.576331 

../torch_cnn_experiments_old/human_nontata_promoters_kmer_4.ipynb
 Accuracy: 0.794333, F1 score: 0.774636, Avg loss: 0.622656 

../torch_cnn_experiments_old/demo_coding_vs_inter

[('demo_coding_vs_intergenomic_seqs_kmer_2', 86.6, 85.8),
 ('human_nontata_promoters_subword_128', 80.6, 77.5),
 ('demo_coding_vs_intergenomic_seqs_subword_512', 84.1, 83.3),
 ('demo_coding_vs_intergenomic_seqs_kmer_3', 83.1, 81.8),
 ('human_nontata_promoters_subword_256', 79.3, 74.9),
 ('demo_coding_vs_intergenomic_seqs_subword_64', 84.7, 84.2),
 ('human_nontata_promoters_kmer_4', 79.4, 77.5),
 ('demo_coding_vs_intergenomic_seqs_subword_128', 83.5, 82.2),
 ('human_nontata_promoters_character', 80.6, 76.9),
 ('human_nontata_promoters_subword_512', 82.4, 80.2),
 ('human_nontata_promoters_kmer_2', 80.8, 77.5),
 ('human_nontata_promoters_subword_64', 81.2, 79.2),
 ('demo_coding_vs_intergenomic_seqs_character', 86.5, 85.9),
 ('demo_coding_vs_intergenomic_seqs_subword_256', 83.6, 83.2),
 ('human_nontata_promoters_kmer_3', 79.4, 76.4)]

In [68]:
def split_experiment(ex):
    if ex.endswith('character'):
        return (
            ex[:ex.index('character') - 1], 
            'character',
            ''
        )

    if 'kmer' in ex:
        return (
            ex[:ex.index('kmer') - 1], 
            'kmer', 
            ex[ex.rindex('_') + 1:]
        )

    if 'subword' in ex:
        return (
            ex[:ex.index('subword') - 1], 
            'subword', 
            ex[ex.rindex('_') + 1:]
        )

In [79]:
for index, row in enumerate(metrics):
    metrics[index] = split_experiment(row[0]) + metrics[index][1:]
metrics


[('demo_coding_vs_intergenomic_seqs', 'kmer', '2', 86.6, 85.8),
 ('human_nontata_promoters', 'subword', '128', 80.6, 77.5),
 ('demo_coding_vs_intergenomic_seqs', 'subword', '512', 84.1, 83.3),
 ('demo_coding_vs_intergenomic_seqs', 'kmer', '3', 83.1, 81.8),
 ('human_nontata_promoters', 'subword', '256', 79.3, 74.9),
 ('demo_coding_vs_intergenomic_seqs', 'subword', '64', 84.7, 84.2),
 ('human_nontata_promoters', 'kmer', '4', 79.4, 77.5),
 ('demo_coding_vs_intergenomic_seqs', 'subword', '128', 83.5, 82.2),
 ('human_nontata_promoters', 'character', 'NaN', 80.6, 76.9),
 ('human_nontata_promoters', 'subword', '512', 82.4, 80.2),
 ('human_nontata_promoters', 'kmer', '2', 80.8, 77.5),
 ('human_nontata_promoters', 'subword', '64', 81.2, 79.2),
 ('demo_coding_vs_intergenomic_seqs', 'character', 'NaN', 86.5, 85.9),
 ('demo_coding_vs_intergenomic_seqs', 'subword', '256', 83.6, 83.2),
 ('human_nontata_promoters', 'kmer', '3', 79.4, 76.4)]

In [80]:
tab = pd.DataFrame.from_records(metrics, columns = ["Dataset", "Tokenization", "Parameters", "Accuracy", "F1 score"])
tab = tab.sort_values(['Dataset'])
tab

Unnamed: 0,Dataset,Tokenization,Parameters,Accuracy,F1 score
0,demo_coding_vs_intergenomic_seqs,kmer,2.0,86.6,85.8
2,demo_coding_vs_intergenomic_seqs,subword,512.0,84.1,83.3
3,demo_coding_vs_intergenomic_seqs,kmer,3.0,83.1,81.8
5,demo_coding_vs_intergenomic_seqs,subword,64.0,84.7,84.2
7,demo_coding_vs_intergenomic_seqs,subword,128.0,83.5,82.2
12,demo_coding_vs_intergenomic_seqs,character,,86.5,85.9
13,demo_coding_vs_intergenomic_seqs,subword,256.0,83.6,83.2
1,human_nontata_promoters,subword,128.0,80.6,77.5
4,human_nontata_promoters,subword,256.0,79.3,74.9
6,human_nontata_promoters,kmer,4.0,79.4,77.5
