In [None]:
import pandas as pd
import os
from constants import TASK_MODEL_PATH, TOKENIZER_SUFFIXES, MOLNET_DIRECTORY, PROJECT_PATH, REACTION_PREDICTION_DIRECTORY, DESCRIPTORS

from glob import glob
from pathlib import Path
# Show up to 500 rows so the score tables further down are not truncated.
pd.set_option('display.max_rows', 500)

# Per-run score tables, routed by the task_type recorded in each scores.csv.
classification_scores = []
regression_scores = []
# Build a local, de-duplicated suffix list instead of extending the imported
# TOKENIZER_SUFFIXES with `+=`: that mutated the shared list in place, so every
# re-run of this cell appended the isomer suffixes again and each isomer
# scores.csv was loaded one more time.
tokenizer_suffixes = list(dict.fromkeys(
    list(TOKENIZER_SUFFIXES)
    + ["smiles_isomers_atom", "smiles_isomers_sentencepiece", "selfies_isomers_atom", "selfies_isomers_sentencepiece"]
))
for tokenizer_suffix in tokenizer_suffixes:
    for task in MOLNET_DIRECTORY.keys():
        task_tokenizer_path = TASK_MODEL_PATH/task/tokenizer_suffix
        # Non-isomer bace runs are read from a separate location.
        # NOTE(review): absolute, machine-specific path -- consider moving it to constants.
        if task.startswith("bace") and "isomer" not in tokenizer_suffix:
            task_tokenizer_path = Path("/data/jgut/SoS_models/task")/task/tokenizer_suffix
        print(task_tokenizer_path)
        if not task_tokenizer_path.exists():
            continue
        # Every direct child is treated as one hyperparameter run. Sorted so the
        # concatenation order does not depend on filesystem listing order
        # (`recursive=True` was dropped: it has no effect without `**`).
        for hyperparameter_path in sorted(glob(str(task_tokenizer_path) + "/*")):
            scores_path = hyperparameter_path+"/scores.csv"
            print(scores_path)
            if not Path(scores_path).is_file():
                continue
            new_score_df = pd.read_csv(scores_path)
            if new_score_df.empty:
                # A header-only file has no task_type to route on; skip it
                # instead of failing with an IndexError.
                continue
            # The first row's task_type decides which table the file goes to.
            if new_score_df.task_type.iloc[0] == "classification":
                classification_scores.append(new_score_df)
            else:
                regression_scores.append(new_score_df)

# Collapse the per-run frames into one table per task type.
regression_scores = pd.concat(regression_scores, axis = 0, sort = False)
classification_scores = pd.concat(classification_scores, axis = 0, sort = False)

In [None]:
# Column order with the last three columns of the regression table moved to the front.
columns = [*regression_scores.columns[-3:], *regression_scores.columns[:-3]]

# Order rows by ascending mean_absolute_error, drop the bookkeeping columns and
# take the first entry of every (task, tokenizer) group.
(
    regression_scores
    .sort_values(["task", "tokenizer"])[columns]
    .sort_values(["mean_absolute_error"])
    .drop(["model_size", "Unnamed: 0", "task_type"], axis="columns")
    .groupby(["task", "tokenizer"])
    .first()
)


In [None]:
from plotting import plot_scores
# Column order with the last three columns first; only referenced by the
# commented-out table view at the bottom of this cell.
columns = list(classification_scores.columns[-3:])
columns.extend(classification_scores.columns[:-3])
tasks = classification_scores.task.unique()
tokenizers = classification_scores.tokenizer.unique()

# Highest ROC_AUC per (tokenizer, task), computed in one grouped pass.
# pandas' max() skips NaN scores, unlike the builtin max() used before, whose
# result depends on where a NaN sits in the Series.
best_roc_auc = classification_scores.groupby(["tokenizer", "task"])["ROC_AUC"].max()

# One list per tokenizer, aligned with `tasks`. A (tokenizer, task) pair with no
# runs becomes NaN instead of raising ValueError from max() on an empty Series.
# NOTE(review): confirm plot_scores tolerates NaN entries.
scores = {
    tokenizer: [best_roc_auc.get((tokenizer, task), float("nan")) for task in tasks]
    for tokenizer in tokenizers
}
plot_scores(scores, tasks, "AUROC", Path("test/classification_report.svg"))
# Alternative tabular view of the best run per (task, tokenizer):
#classification_scores.sort_values(["task", "tokenizer"])[columns].sort_values(["ROC_AUC"], ascending=False).drop(["model_size", "Unnamed: 0", "task_type"], axis="columns").groupby(["task", "tokenizer"]).first()


In [None]:
from plotting import plot_scores
# Column order with the last three columns first (not used further in this cell).
columns = list(regression_scores.columns[-3:])
columns.extend(regression_scores.columns[:-3])
tasks = regression_scores.task.unique()
tokenizers = regression_scores.tokenizer.unique()

# Best run per (tokenizer, task). The metric is an error (plotted as "RMSE"),
# so the best run is the one with the LOWEST value; the previous max() reported
# the worst hyperparameter configuration. This matches the regression table
# cell, which ranks runs by ascending error. pandas' min() also skips NaNs.
best_rmse = regression_scores.groupby(["tokenizer", "task"])["rectified_mean_squared_error"].min()

# One list per tokenizer, aligned with `tasks`. A (tokenizer, task) pair with no
# runs becomes NaN instead of raising ValueError on an empty selection.
# NOTE(review): confirm plot_scores tolerates NaN entries.
scores = {
    tokenizer: [best_rmse.get((tokenizer, task), float("nan")) for task in tasks]
    for tokenizer in tokenizers
}
plot_scores(scores, tasks, "RMSE", Path("test/regression_report.svg"))

In [None]:
from IPython.display import SVG, display
from pathlib import Path

def show_svg(svg_paths=(Path("test/test_pca.svg"), Path("test/test_umap.svg"))):
    """Render SVG files inline in the notebook output.

    Args:
        svg_paths: Iterable of paths to the SVG files to display, in order.
            Defaults to the PCA and UMAP plots under ``test/``.
    """
    for svg_path in svg_paths:
        # Pass the path via `filename=` explicitly: a positional argument is
        # interpreted as raw SVG data unless IPython detects an existing file,
        # which turns a missing file into a confusing parse error.
        display(SVG(filename=str(svg_path)))

show_svg()

In [None]:
# Best ROC_AUC among the "smiles_atom" runs on the "hiv" task.
# Series.max() skips NaN scores and yields NaN for an empty selection, whereas
# the builtin max() raises ValueError on empty input and is order-dependent
# when NaNs are present.
classification_scores.loc[
    (classification_scores["tokenizer"] == "smiles_atom") & (classification_scores["task"] == "hiv"),
    "ROC_AUC",
].max()

In [None]:
import pandas as pd
from constants import REACTION_PREDICTION_DIRECTORY, TOKENIZER_SUFFIXES, PROJECT_PATH
# Display floats with three decimals and a thousands separator.
pd.options.display.float_format = '{:,.3f}'.format

# Load the beam-search output table of every task/tokenizer combination.
dfs = []
for task in ["lef"]:
    for tokenizer in ["selfies_atom", "selfies_sentencepiece", "smiles_sentencepiece"]:
        dfs.append(pd.read_csv(PROJECT_PATH/"reaction_prediction_beam_neu"/task/tokenizer/"output.csv"))
df = pd.concat(dfs)

# Zero-valued "_0" helper columns; top_perc_0 seeds the cumulative sum below.
df["top_perc_0"]=0
df["valid_perc_0"]=0
df["unk_perc_0"]=0
for i in range(1,11):
    # top_perc_i is cumulative: the fraction at rank i plus all lower ranks.
    df["top_perc_"+str(i)] =df["top_"+str(i)]/df["all_samples"]+df["top_perc_"+str(i-1)]
    # valid/unk fractions are per rank (not accumulated).
    df["valid_perc_"+str(i)] =df["valid_"+str(i)]/df["all_samples"]
    df["unk_perc_"+str(i)] =df["unk_"+str(i)]/df["all_samples"]

# Report the identifying columns plus every percentage column except the "_0"
# helpers. The previous filter `"0" not in name` also matched "..._10" and
# silently dropped the rank-10 columns from the report.
report_columns = [
    column for column in df.columns
    if column in ["model", "task"] or ("perc" in column and not column.endswith("_0"))
]
df.sort_values(["task", "model"])[report_columns].to_markdown()

In [None]:
# Descriptors whose presence (value > 0) is counted across the dataset.
descriptors = [ 'NumAliphaticCarbocycles',
 'NumAliphaticHeterocycles',
 'NumAliphaticRings',
 'NumAromaticCarbocycles',
 'NumAromaticHeterocycles',
 'NumAromaticRings',
 'NumHAcceptors',
 'NumHDonors',
 'NumHeteroatoms',
 'NumRotatableBonds',
 'NumSaturatedCarbocycles',
 'NumSaturatedHeterocycles',
 'NumSaturatedRings',
 'RingCount',]
# The CSV columns are selected by the stringified position of each descriptor
# in DESCRIPTORS.
indexes = [str(DESCRIPTORS.index(descriptor)) for descriptor in descriptors]
test = pd.read_csv("processed/10m_deduplicated.csv", skiprows=0, usecols=indexes)
for descriptor, column in zip(descriptors, indexes):
    # Vectorised count of rows with a positive value; the builtin sum() used
    # before iterated over the boolean Series one row at a time in Python.
    amount = test[column].gt(0).sum()
    print(f"The stats for descriptor {descriptor}")
    print(f"Amount of mols with trait: {amount}, mols without trait: {len(test)-amount}")
    print(f"This is {amount/len(test):.3f}, mols without trait: {1-amount/len(test):.3f}")

In [None]:
import pandas as pd
from constants import DESCRIPTORS

# The three heterocycle descriptors that are combined into a single
# "heterocycles" trait below.
descriptors = [
 'NumAliphaticHeterocycles',
 'NumAromaticHeterocycles',
 'NumSaturatedHeterocycles',]
# The CSV columns are selected by the stringified position of each descriptor
# in DESCRIPTORS.
indexes = [str(DESCRIPTORS.index(descriptor)) for descriptor in descriptors]
test = pd.read_csv("processed/10m_deduplicated.csv", skiprows=0, usecols=indexes)

# A row has the trait when the sum of its heterocycle columns is positive.
# skipna=False keeps the NaN propagation of the explicit a + b + c addition,
# and the vectorised .sum() replaces a row-by-row builtin sum() over the Series.
amount = test[indexes].sum(axis="columns", skipna=False).gt(0).sum()
print(f"The stats for descriptor heterocycles")
print(f"Amount of mols with trait: {amount}, mols without trait: {len(test)-amount}")
print(f"This is {amount/len(test):.3f}, mols without trait: {1-amount/len(test):.3f}")