# Score weak classifiers
## Import stuff

In [1]:
import pandas as pd
import re
from pathlib import Path
from constants import PROJECT_PATH, TOKENIZER_SUFFIXES
from IPython.display import display

  from .autonotebook import tqdm as notebook_tqdm
Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/home/jgut/miniconda3/envs/SMILES_OR_SELFIES/lib/python3.10/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'pytorch_lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


## Define useful functions

In [2]:
def get_cells(line:str)->list[str]:
    """Split one line of text into its non-empty, whitespace-trimmed cells.

    Cells are separated by runs of two or more spaces, so a label that
    contains a single space (e.g. "macro avg") stays in one cell.

    Args:
        line (str): a single line of text

    Returns:
        list[str]: trimmed cells in their original order; pieces that contain
            only whitespace are dropped
    """
    # Trim BEFORE filtering: a piece made only of whitespace (for example the
    # newline left over after trailing spaces) is not empty yet, so filtering
    # first would let it through and return it as an empty-string cell.
    trimmed_pieces = (piece.strip() for piece in re.split("  ", line))
    return [piece for piece in trimmed_pieces if piece]

def get_report(path:Path):
    """Parse the head of a report text file into a flat dictionary of scores.

    Only the first 8 lines of the file are considered and blank lines are
    dropped. Of the remaining rows, row 0 supplies the estimator name and
    rows 1-5 supply the numeric scores, read from columns 1-3 (column 1 only
    for the accuracy row).
    NOTE(review): the row layout looks like scikit-learn's classification
    report for a binary task - confirm against the code writing these files.

    Args:
        path (Path): path of the report file to read

    Returns:
        dict: "estimator" (str) followed by the float scores "0p", "0r", "0f",
            "1p", "1r", "1f", "accuracy", "Precision", "Recall", "F1", "wp",
            "wr", "wf"
    """
    with open(path, "r") as handle:
        head = handle.readlines()[:8]
    rows = [get_cells(raw_line) for raw_line in head if len(raw_line.strip())>0]

    # (row index, keys filled from columns 1, 2, ... of that row)
    layout = (
        (1, ("0p", "0r", "0f")),
        (2, ("1p", "1r", "1f")),
        (3, ("accuracy",)),
        (4, ("Precision", "Recall", "F1")),
        (5, ("wp", "wr", "wf")),
    )
    report = {"estimator": rows[0][0]}
    for row_index, keys in layout:
        for column_index, key in enumerate(keys, start=1):
            report[key] = float(rows[row_index][column_index])
    return report

def parse_tokenizer(tokenizer_string:str):
    """Break an underscore-separated tokenizer identifier into named settings.

    The first three parts are the embedding, the tokenizer and the dataset.
    A fourth part, when present, is the architecture; otherwise the
    architecture is "bart".

    Args:
        tokenizer_string (str): identifier of the form
            "<embedding>_<tokenizer>_<dataset>[_<architecture>]"

    Returns:
        Dict[str, str]: settings keyed by "embedding", "tokenizer", "dataset"
            and "architecture"
    """
    fields = tokenizer_string.split("_")
    settings = {
        name: fields[position]
        for position, name in enumerate(("embedding", "tokenizer", "dataset"))
    }
    # Identifiers without an explicit architecture part fall back to "bart".
    settings["architecture"] = "bart" if len(fields) <= 3 else fields[3]
    return settings

## Compute dataframe

In [3]:
# Build one row per report file: descriptor name + tokenizer settings + parsed scores.
entries = []
for descriptor_name in ("Heterocycles", "NumHDonors", "NumAromaticRings"):
    descriptor_dir = PROJECT_PATH / "embeddings" / descriptor_name
    for tokenizer_suffix in TOKENIZER_SUFFIXES:
        report_parent_path = descriptor_dir / tokenizer_suffix / "reports"
        # Combinations whose reports directory does not exist contribute no rows.
        if report_parent_path.exists():
            for estimators_file in report_parent_path.glob("*.txt"):
                entry = {"descriptor": descriptor_name}
                entry.update(parse_tokenizer(tokenizer_suffix))
                entry.update(get_report(estimators_file))
                entries.append(entry)
df = pd.DataFrame(entries)
df

Unnamed: 0,descriptor,embedding,tokenizer,dataset,architecture,estimator,0p,0r,0f,1p,1r,1f,accuracy,Precision,Recall,F1,wp,wr,wf
0,Heterocycles,smiles,atom,isomers,bart,RBF SVC,0.96,0.98,0.97,0.98,0.96,0.97,0.97,0.97,0.97,0.97,0.97,0.97,0.97
1,Heterocycles,smiles,atom,isomers,bart,Logistic Regression,0.93,0.95,0.94,0.95,0.93,0.94,0.94,0.94,0.94,0.94,0.94,0.94,0.94
2,Heterocycles,smiles,atom,isomers,bart,Linear SVC,0.93,0.95,0.94,0.95,0.93,0.94,0.94,0.94,0.94,0.94,0.94,0.94,0.94
3,Heterocycles,smiles,atom,isomers,bart,KNN,0.83,0.94,0.88,0.93,0.8,0.86,0.87,0.88,0.87,0.87,0.88,0.87,0.87
4,Heterocycles,smiles,atom,standard,bart,RBF SVC,0.96,0.98,0.97,0.98,0.96,0.97,0.97,0.97,0.97,0.97,0.97,0.97,0.97
5,Heterocycles,smiles,atom,standard,bart,Logistic Regression,0.93,0.95,0.94,0.95,0.93,0.94,0.94,0.94,0.94,0.94,0.94,0.94,0.94
6,Heterocycles,smiles,atom,standard,bart,Linear SVC,0.93,0.95,0.94,0.95,0.93,0.94,0.94,0.94,0.94,0.94,0.94,0.94,0.94
7,Heterocycles,smiles,atom,standard,bart,KNN,0.82,0.94,0.88,0.93,0.8,0.86,0.87,0.88,0.87,0.87,0.88,0.87,0.87
8,Heterocycles,smiles,trained,isomers,bart,RBF SVC,0.95,0.97,0.96,0.97,0.95,0.96,0.96,0.96,0.96,0.96,0.96,0.96,0.96
9,Heterocycles,smiles,trained,isomers,bart,Logistic Regression,0.91,0.93,0.92,0.93,0.91,0.92,0.92,0.92,0.92,0.92,0.92,0.92,0.92


## Diced by estimator

In [4]:
# For each estimator, show its rows ordered from lowest to highest accuracy.
summary_columns = ["descriptor","embedding","tokenizer","dataset","estimator", "accuracy"]
for estimator in df.estimator.unique():
    print(estimator)
    estimator_rows = df.loc[df.estimator==estimator, summary_columns]
    display(estimator_rows.sort_values("accuracy"))

RBF SVC


Unnamed: 0,descriptor,embedding,tokenizer,dataset,estimator,accuracy
28,Heterocycles,selfies,trained,standard,RBF SVC,0.91
24,Heterocycles,selfies,trained,isomers,RBF SVC,0.93
16,Heterocycles,selfies,atom,isomers,RBF SVC,0.94
20,Heterocycles,selfies,atom,standard,RBF SVC,0.95
8,Heterocycles,smiles,trained,isomers,RBF SVC,0.96
12,Heterocycles,smiles,trained,standard,RBF SVC,0.96
0,Heterocycles,smiles,atom,isomers,RBF SVC,0.97
4,Heterocycles,smiles,atom,standard,RBF SVC,0.97


Logistic Regression


Unnamed: 0,descriptor,embedding,tokenizer,dataset,estimator,accuracy
29,Heterocycles,selfies,trained,standard,Logistic Regression,0.86
25,Heterocycles,selfies,trained,isomers,Logistic Regression,0.87
17,Heterocycles,selfies,atom,isomers,Logistic Regression,0.9
21,Heterocycles,selfies,atom,standard,Logistic Regression,0.91
9,Heterocycles,smiles,trained,isomers,Logistic Regression,0.92
13,Heterocycles,smiles,trained,standard,Logistic Regression,0.92
1,Heterocycles,smiles,atom,isomers,Logistic Regression,0.94
5,Heterocycles,smiles,atom,standard,Logistic Regression,0.94


Linear SVC


Unnamed: 0,descriptor,embedding,tokenizer,dataset,estimator,accuracy
30,Heterocycles,selfies,trained,standard,Linear SVC,0.86
26,Heterocycles,selfies,trained,isomers,Linear SVC,0.87
18,Heterocycles,selfies,atom,isomers,Linear SVC,0.9
22,Heterocycles,selfies,atom,standard,Linear SVC,0.91
10,Heterocycles,smiles,trained,isomers,Linear SVC,0.92
14,Heterocycles,smiles,trained,standard,Linear SVC,0.92
2,Heterocycles,smiles,atom,isomers,Linear SVC,0.94
6,Heterocycles,smiles,atom,standard,Linear SVC,0.94


KNN


Unnamed: 0,descriptor,embedding,tokenizer,dataset,estimator,accuracy
31,Heterocycles,selfies,trained,standard,KNN,0.79
27,Heterocycles,selfies,trained,isomers,KNN,0.82
19,Heterocycles,selfies,atom,isomers,KNN,0.84
23,Heterocycles,selfies,atom,standard,KNN,0.84
11,Heterocycles,smiles,trained,isomers,KNN,0.86
15,Heterocycles,smiles,trained,standard,KNN,0.86
3,Heterocycles,smiles,atom,isomers,KNN,0.87
7,Heterocycles,smiles,atom,standard,KNN,0.87


We see that, for every estimator, the "atom" tokenizer performs better than the "trained" (SentencePiece) tokenizer and, with an even bigger impact, SMILES performs better than SELFIES.