In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [5]:
def extract_data_three_cols_format(filename):
    '''Get loss and AUC data from log files (exp #2-6) with three columns format: 
    number of epoch, number of iteration, loss/AUC

    Parameters
    ----------
    filename : str
        The file location of the log file

    Returns
    -------
    list
        list with mean loss/auc for each epoch
    list
        list with all the loss/auc for each epoch
    '''
    with open(filename, 'r') as f:
        values=list(map(lambda x: x.split('\t'),f.read().split('\n')))[:-1]
    values=list(map(lambda x: list(map(lambda y: float(y),x)),values))
    values=np.array(values)
    epoch_prev=1.0
    epoch_input=[]
    mean_values=[]
    for epoch,input in zip(values[:,0],values[:,2]):
        if epoch_prev==epoch:
            epoch_input.append(input)
        else:
            mean_values.append(np.array(epoch_input).mean())
            epoch_input=[]
        epoch_prev=epoch
    mean_values.append(np.array(epoch_input).mean())
    return (np.array(mean_values)),values

def extract_data_channels(filename):
    '''Get AUC data from log files (exp #9-17) with four columns format: 
    number of epoch, number of iteration, number of channel, AUC

    Parameters
    ----------
    filename : str
        The file location of the log file

    Returns
    -------
    list
        list of mean values of AUC per channel per epoch
    '''
    with open(filename,'r') as f:
        values=list(map(lambda x: x.split('\t'),f.read().split('\n')))[:-1]
    values=list(map(lambda x: list(map(lambda y: float(y),x)),values))
    values=np.array(values)
    aucs=np.zeros(shape=(12,int(np.max(values[:,0]))))

    for p in range(12):
        one_aucs=values[np.where(values[:,2]==p)][:,[0,3]]
        for epoch in range(int(np.max(values[:,0]))):
            one_aucs_epoch=one_aucs[np.where(one_aucs[:,0]==epoch)][:,1].mean()
            aucs[p][epoch]=one_aucs_epoch
    return aucs[:,1:]

In [None]:
data_dir = './experiments'

data=pd.read_csv('../MultiTox/database/data/tox21_10k_data_all_no_salts.csv')

props=list(data)
try:
    props.remove('SMILES')
except:
    pass

experiments_description = pd.read_excel(os.path.join(data_dir,'Experiments_description.xlsx'), index_col = 'Number of experiment')
experiments_description = experiments_description.reindex(columns = experiments_description.columns.tolist() + props)

for i in range(2,18):
    filename = str(i) + '_log_train_auc.txt'
    try:
        if i<9:
            aucs, _ = extract_data_three_cols_format(os.path.join(data_dir, filename))
            experiments_description.loc[i, 'train auc'] = aucs[-1]
        else:
            aucs = extract_data_channels(os.path.join(data_dir, filename))[:,-1]
            experiments_description.loc[i, 'train auc'] = aucs.mean()
    except FileNotFoundError:
        pass
    
    try:
        filename = str(i) + '_log_test_auc.txt'
        if i<9:
            aucs, _ = extract_data_three_cols_format(os.path.join(data_dir, filename))
            experiments_description.loc[i, 'test auc'] = aucs[-1]
        else:
            aucs = extract_data_channels(os.path.join(data_dir, filename))[:,-1]
            experiments_description.loc[i, props] = aucs
            experiments_description.loc[i, 'test auc'] = aucs.mean()
    except FileNotFoundError:
        pass
    
    try:
        filename = str(i) + '_log_test_loss.txt'
        losses, _ = extract_data_three_cols_format(os.path.join(data_dir, filename)) 
        experiments_description.loc[i, 'test loss'] = losses[-1]
    except FileNotFoundError:
        pass
    
    try:
        filename = str(i) + '_log_train_loss.txt'
        losses, _ = extract_data_three_cols_format(os.path.join(data_dir, filename)) 
        experiments_description.loc[i, 'train loss'] = losses[-1]
    except FileNotFoundError:
        pass
# for i in range(9,18):    
#     filename = str(i) + '_log_test_auc.txt'
#     aucs = extract_data_channels(os.path.join(data_dir, filename))[:,-1]
#     experiments_description.loc[i, props] = aucs
#     experiments_description.loc[i, 'test auc'] = aucs.mean()
    

In [80]:
experiments_description

Unnamed: 0_level_0,Date,Augmentation,Sigma,Number of epochs,Learning Rate,Batch Size,Patience(early stopping),Penalty(loss),Comments,train loss,...,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
Number of experiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2019-04-11,Waves,6,20,0.0001,10,-,-,overfit,,...,,,,,,,,,,
2,2019-04-11,Waves,6,65,0.0001,10,-,-,overfit,,...,,,,,,,,,,
3,2019-04-11,Waves,6,63,1e-05,10,-,-,less gap between train and validation,,...,,,,,,,,,,
4,2019-04-11,Waves,6,24,0.0001,16,-,-,overfit,,...,,,,,,,,,,
5,2019-04-11,Waves,6,100,1e-05,32,-,-,,,...,,,,,,,,,,
6,2019-04-11,Gauss,3,75,1e-05,32,-,-,,,...,,,,,,,,,,
7,2019-05-21,Gauss,3,53,1e-05,16,-,-,save AUC for different channels,,...,,,,,,,,,,
8,2019-05-21,Gauss,3,16,1e-05,16,-,0.5,try to add weight to positive classes,,...,,,,,,,,,,
9,2019-05-21,Gauss,3,60 (early stopping),1e-05,16,15,"[0.1,0.2,0.4,0.4,0.4,0.2,0.2,0.6,0.2,0.3,0.6,0.2]",different penalty depending on ration of posit...,0.132037,...,0.627338,0.504177,0.625999,0.640511,0.518519,0.634177,0.576444,0.565987,0.635029,0.637666
10,2019-05-22,Waves,6,82 (early stopping),1e-05,16,25,"[0.1,0.2,0.4,0.4,0.4,0.2,0.2,0.6,0.2,0.3,0.6,0.2]",,0.117969,...,0.642251,0.582035,0.661264,0.648195,0.630307,0.617166,0.562275,0.570163,0.684604,0.592681
