In [1]:
import pandas as pd
# Lift the pandas display limits so frames are shown without row or
# column truncation.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

from raw_processing import other_sources
from config_management import UnifiedConfiguration
from data_management import DataManager
from metrics_management import MetricsManager
from results_management import ResultsManager
from results_analysis import ResultsAnalyzer
from plotting.moe import MOE_CATEGORIES

In [2]:
# Build the unified configuration, then a data manager from its
# `data` and `path` sections.
config = UnifiedConfiguration()
data_manager = DataManager(config.data, config.path)

In [19]:
# Load the training chemicals and display them
# (the recorded output is a list of DTXSID strings).
training_chemicals = data_manager.load_training_chemicals()
training_chemicals

['DTXSID00100074',
 'DTXSID00100498',
 'DTXSID001005033',
 'DTXSID001006300',
 'DTXSID00100756',
 'DTXSID001013107',
 'DTXSID001013890',
 'DTXSID001014486',
 'DTXSID001014636',
 'DTXSID001014967',
 'DTXSID00102856',
 'DTXSID00104436',
 'DTXSID00104719',
 'DTXSID00105261',
 'DTXSID00107669',
 'DTXSID00107720',
 'DTXSID00108550',
 'DTXSID00109007',
 'DTXSID00109108',
 'DTXSID00110077',
 'DTXSID0020076',
 'DTXSID0020078',
 'DTXSID0020107',
 'DTXSID0020151',
 'DTXSID0020153',
 'DTXSID00201818',
 'DTXSID0020232',
 'DTXSID0020234',
 'DTXSID0020280',
 'DTXSID0020286',
 'DTXSID0020311',
 'DTXSID0020315',
 'DTXSID0020319',
 'DTXSID0020440',
 'DTXSID0020442',
 'DTXSID0020446',
 'DTXSID0020448',
 'DTXSID00204642',
 'DTXSID0020494',
 'DTXSID0020498',
 'DTXSID0020523',
 'DTXSID0020529',
 'DTXSID0020573',
 'DTXSID0020575',
 'DTXSID0020602',
 'DTXSID0020606',
 'DTXSID0020650',
 'DTXSID0020654',
 'DTXSID0020868',
 'DTXSID0020941',
 'DTXSID0020943',
 'DTXSID0021094',
 'DTXSID0021096',
 'DTXSID0021125',

In [7]:
# Location of the OPERA structures (.smi) file, taken from the path
# configuration; displayed to confirm which file is used.
smi_file = config.path.opera_structures_file

smi_file

'Input/Raw/OPERA/OPERA2.9-Predictions/2023-12-11/chemical-identifiers.smi'

In [11]:
# Read the whole structures file into memory, one list entry per line
# (line endings are kept).
with open(smi_file, 'r') as smi_handle:
    smi_lines = smi_handle.readlines()

In [20]:
# Write the subset of the structures file that covers the training
# chemicals; every matching line is copied unchanged.
new_file = 'Input/Raw/OPERA/training_chemicals.smi'

# A set gives constant-time membership tests. Looking each identifier up
# in the list would rescan the list for every line of the structures file.
training_ids = set(training_chemicals)

with open(new_file, 'w') as file:
    for line in smi_lines:
        # The identifier is the last tab-separated field of the line
        dtxsid = line.rsplit('\t', 1)[-1].strip()
        if dtxsid in training_ids:
            file.write(line)

In [21]:
# Sanity check: report how many lines the filtered file contains
with open(new_file, 'r') as written:
    print(sum(1 for _ in written))

2867


In [2]:
# Modeling instructions for the final models, keyed by effect category.
# These are used to read the corresponding results.
# Both models share every setting except the target effect, which equals
# the category name.
instruction_for_model = {
    effect: {
        'target_effect': effect,
        'features_source': 'opera',
        'ld50_type': 'predicted',
        'data_condition': 'missing',
        'select_features': 'true',
        'estimators': 'RandomForestRegressor',
    }
    for effect in ('general', 'repro_dev')
}

def get_model_key(effect):
    '''
    Return the key that identifies the final model of an effect category.

    The key is built from the values of the modeling instruction for the
    category, in the order in which they are listed there.

    Parameters
    ----------
    effect : str
        Effect category, 'general' or 'repro_dev'.

    Returns
    -------
    tuple of str
        Model key for the requested category.

    Raises
    ------
    KeyError
        If `effect` is not a key of `instruction_for_model`.
    '''
    instruction = instruction_for_model[effect]
    return tuple(instruction[field] for field in instruction)

In [3]:
# Configuration object used by the cells that follow
config = UnifiedConfiguration()

In [4]:
# Managers used below, each built from a part of the configuration:
# data/path sections, the 'metric' category, and the results file type.
data_manager = DataManager(config.data, config.path)
metrics_manager = MetricsManager(config.category_to_dict('metric'))
results_manager = ResultsManager(
    results_file_type=config.data.file_type
)

In [5]:
# Features file registered for the 'opera' features source
new_features_path = (
    config.path.file_for_features_source['opera']
)

# Load features and target using the instruction of the 'general' model.
# NOTE(review): the output recorded for this cell is an InvalidIndexError
# ("Reindexing only valid with uniquely valued Index objects"); possibly
# related to the duplicated index labels examined further down - confirm.
X_new, y_new = data_manager.load_features_and_target(
    new_features_path, 
    **instruction_for_model['general']
)

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [6]:
# Dimensions of the loaded features and target
X_new.shape, y_new.shape

((1791, 39), (1791,))

In [6]:
# Load the features only, using the instruction of the 'general' model
X = data_manager.load_features(
    new_features_path,
    **instruction_for_model['general']
)

# Number of rows that repeat an earlier row. `duplicated()` on the frame
# compares column values, not index labels, so this is not the number of
# repeated DTXSIDs.
X.duplicated().sum()

22315

In [9]:
# Fraction of rows that repeat an earlier row (mean of the boolean mask)
X.duplicated().mean()

0.027756701287393496

In [13]:
# Check if values are identical
X_duplicated = X.loc[list(X.loc[X.index.duplicated()].index)].fillna(0.)

X_duplicated

Unnamed: 0_level_0,CERAPP_Ago_pred_discrete,CERAPP_Anta_pred_discrete,CERAPP_Bind_pred_discrete,CoMPARA_Ago_pred_discrete,CoMPARA_Anta_pred_discrete,CoMPARA_Bind_pred_discrete,CATMoS_LD50_pred,FUB_pred,Clint_pred,CACO2_pred,OH_pred,BCF_pred,BioDeg_HalfLife_pred,ReadyBiodeg_pred_discrete,HL_pred,KM_pred,KOA_pred,Koc_pred,P_pred,MP_pred,MolWeight,nbAtoms_discrete,nbHeavyAtoms_discrete,nbC_discrete,nbO_discrete,nbN_discrete,nbAromAtom_discrete,nbRing_discrete,nbHeteroRing_discrete,Sp3Sp2HybRatio,nbRotBd_discrete,nbHBdAcc_discrete,ndHBdDon_discrete,nbLipinskiFailures_discrete,TopoPolSurfAir,MolarRefract,CombDipolPolariz,VP_pred,WS_pred
DTXSID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1
DTXSID501007165,0.0,0.0,0.0,0.0,0.0,0.0,9252.0,0.02,17.74,0.0,1.548817e-11,912.010839,0.0,1.0,0.002187762,1.513561,933254300.0,1862.087137,12589250.0,-35.0,284.27153,56.0,20.0,18.0,2.0,0.0,0.0,0.0,0.0,0.944444,15.0,2.0,0.0,2.0,26.3,71.16,0.519,0.001258925,3.548134e-08
DTXSID501007165,0.0,0.0,0.0,0.0,0.0,0.0,9252.0,0.02,17.74,0.0,1.548817e-11,912.010839,0.0,1.0,0.002187762,1.513561,933254300.0,1862.087137,12589250.0,-35.0,284.27153,56.0,20.0,18.0,2.0,0.0,0.0,0.0,0.0,0.944444,15.0,2.0,0.0,2.0,26.3,71.16,0.519,0.001258925,3.548134e-08
DTXSID3026932,0.0,0.0,0.0,0.0,0.0,0.0,6567.0,0.0,0.0,0.0,7.762471e-11,1148.153621,10.0,1.0,0.2454709,14.791084,151356100.0,0.0,100000000.0,18.0,252.281701,54.0,18.0,18.0,0.0,0.0,0.0,0.0,0.0,0.888889,15.0,0.0,0.0,2.0,0.0,61.2601,0.167,6.76083e-05,1.584893e-10
DTXSID3026932,0.0,0.0,0.0,0.0,0.0,0.0,6567.0,0.0,0.0,0.0,7.762471e-11,1148.153621,10.0,1.0,0.2454709,14.791084,151356100.0,0.0,100000000.0,18.0,252.281701,54.0,18.0,18.0,0.0,0.0,0.0,0.0,0.0,0.888889,15.0,0.0,0.0,2.0,0.0,61.2601,0.167,6.76083e-05,1.584893e-10
DTXSID4027781,0.0,0.0,0.0,0.0,1.0,1.0,7703.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001584893,300.0,1076.188107,116.0,72.0,40.0,16.0,12.0,36.0,6.0,2.0,0.2,24.0,28.0,12.0,3.0,463.86,120.9482,15.209,0.0,0.851138
DTXSID4027781,0.0,0.0,0.0,0.0,1.0,1.0,7703.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001584893,300.0,1076.188107,116.0,72.0,40.0,16.0,12.0,36.0,6.0,2.0,0.2,24.0,28.0,12.0,3.0,463.86,120.9482,15.209,0.0,0.851138
DTXSID7042061,0.0,0.0,0.0,0.0,0.0,0.0,5566.0,0.1,28.92,-4.37,1.380384e-11,41.686938,0.0,1.0,0.0006606934,0.281838,741310.2,398.107171,19952.62,19.0,196.14633,34.0,14.0,12.0,2.0,0.0,0.0,2.0,0.0,0.916667,2.0,2.0,0.0,0.0,26.3,53.4331,0.622,0.2238721,0.0003162278
DTXSID7042061,0.0,0.0,0.0,0.0,0.0,0.0,5566.0,0.1,28.92,-4.37,1.380384e-11,41.686938,0.0,1.0,0.0006606934,0.281838,741310.2,398.107171,19952.62,19.0,196.14633,34.0,14.0,12.0,2.0,0.0,0.0,2.0,0.0,0.916667,2.0,2.0,0.0,0.0,26.3,53.4331,0.622,0.2238721,0.0003162278
DTXSID0029397,0.0,0.0,0.0,0.0,1.0,1.0,8466.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,393.0,1132.250708,128.0,76.0,44.0,16.0,12.0,36.0,6.0,2.0,0.272727,24.0,28.0,12.0,3.0,463.86,139.2858,15.053,0.0,0.3311311
DTXSID0029397,0.0,0.0,0.0,0.0,1.0,1.0,8466.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,393.0,1132.250708,128.0,76.0,44.0,16.0,12.0,36.0,6.0,2.0,0.272727,24.0,28.0,12.0,3.0,463.86,139.2858,15.053,0.0,0.3311311


In [14]:
# Print every duplicated index label whose rows are not all identical.
# Index.unique() keeps first-seen order, so the report is reproducible
# (iterating a set of strings is not).
for dup_id in X_duplicated.index.unique():
    # A list selector always yields a frame, even for a single row
    dup_df = X_duplicated.loc[[dup_id]]
    # Compare all rows of the group, not only the first and the last one
    if len(dup_df.drop_duplicates()) > 1:
        print(dup_id)

In [None]:
# TODO: Refactor code to drop duplicated index values and log it

In [16]:
# Total number of feature rows
len(X)

803950

In [18]:
# Rows that would be dropped by keeping only the first row per index label
len(X) - len(X.loc[~X.index.duplicated()])

456

In [19]:
# Number of index labels that repeat an earlier label
X.index.duplicated().sum()

456

In [25]:
# Print a marker if at least one row consists entirely of missing values
if X.isna().all(axis=1).any():
    print('T')

In [26]:
# True if at least one row consists entirely of missing values
any(X.isna().all(axis=1))

False