In [1]:
import warnings
warnings.simplefilter('ignore')

import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import matplotlib.pyplot as plt


In [2]:
ratio_oof_path1 = 'run_lgb_20240730_145315'
ratio_oof_path2 = 'run_lgb_20240730_145329'

In [3]:
label = 'ratio of preparation/free drug'

In [4]:
train = pd.read_csv('./input/train.csv')
ratio_oof1 = pd.read_csv(f'./output/{ratio_oof_path1}/oof.csv')
ratio_oof2 = pd.read_csv(f'./output/{ratio_oof_path2}/oof.csv')

In [5]:
ratio_train1 = train.loc[(~train[label].isna())&(train['onehot_type of the preparation_free drug']!=1)&(train[label]>=0.1)].reset_index(drop=True)
ratio_train1['ori_ratio'] = ratio_train1[label]
ratio_train1[label] = np.log10(ratio_train1[label].apply(lambda x:max(x,1e-6)))
ratio_train1['error'] = (ratio_oof1[label].rank()-ratio_train1[label].rank()).abs()
ns = ratio_train1.loc[ratio_train1['error']>200,'number'].unique()
ratio_train2 = ratio_train1.loc[(~ratio_train1['number'].isin(ns))&((ratio_train1[label]>=np.log10(0.5))&(ratio_train1[label]<=np.log10(100)))].drop(['error'],axis=1).reset_index(drop=True)


In [6]:
numerical_cols = ['PEGylation or not','PEGylation ratio','p-gp inhibition?','size （nm）','PDI','zeta potantial（mV）','encapsulation efficiency（%）',' DL（g/g）',\
                  'target ligand density（mol，%）','target ligand density（number per nanoparticle）',]
onehot_cols = ['type of the preparation','surface coating','type of targeting ligand','targeting ligand','transport mechanism','type of receptor/transporter','name of receptor/transporter','p-gp inhibitor','animal model','administration route',\
               'laser/magnectic field/radiation','core composition of the preparation']


In [7]:
onehot_train_cols = {}
for onehot_col in onehot_cols:
    onehot_train_cols[onehot_col] = [col for col in ratio_train2.columns if f'onehot_{onehot_col}' in col]

In [8]:
numerical_col_num = len(numerical_cols)
onehot_col_num = 0
for onehot_col in onehot_cols:
    onehot_col_num += len([col for col in ratio_train2.columns if f'onehot_{onehot_col}' in col])

In [9]:
numerical_col_num+onehot_col_num

526

In [10]:
feature_values = {}
for col in numerical_cols:
    if col in ['PEGylation or not','PEGylation ratio','size （nm）','target ligand density（mol，%）']:
        vs = ratio_train2[col].dropna().unique().tolist() 
        if col == 'PEGylation ratio':
            vs = [v for v in vs if v >=0 and v <= 55]
        if col == 'target ligand density（mol，%）':
            vs = [v for v in vs if v >=0.3 and v <= 34]
    elif col in ['p-gp inhibition?','encapsulation efficiency（%）',' DL（g/g）','target ligand density（number per nanoparticle）']:
        vs = [np.nan]
    else:
        vs = ratio_train2[col].unique().tolist()
    if col in ['size （nm）']:
        vs = [80,93,131,168,205,250]#list(set([v//20.*20 for v in vs]))
    elif col in ['PDI']:
        vs = [0.1,0.1547,0.2166,0.2785,0.3]#list(set([v//0.05*0.05 for v in vs]))
    elif col in ['zeta potantial（mV）']:
        vs = list(set([v//10.*10 for v in vs]))
        vs = [v for v in vs if not np.isnan(v)]
    feature_values[col] = vs
    
core = {'liposome':['cholesterol',
#  'dmpc',
 'spc',
#  'pc',
 'dotap',
#  'dope',
 'dspe-peg',
 'epc',
#  'bovine serum albumin',
#  'cholestrol',
#  'lecithin',
 'dspe-peg2000',
 'dspc',
 'dppc',
 'dopc',
#  'others',
 'dspe'],#['cholesterol', 'spc', 'pc', 'dspe-peg2000', 'others', 'dspe'],
       'nanoparticles':['plga',
 'spc',
 'au',
 'peg-pla',
 'chitosan',
 'dspe-peg',
 'epc',
 'bovine serum albumin',
 'hspc',
 'stearic acid',
 'peg-plga',
 'mpeg-pcl',
 'glyceryl monostearate',
 'pamam',
 'dspe',
 'peg-pamam',
 'ps',
 'phospholpon 90g',
 'nicotinamide',
 'cetyl palmitate',
 'tristearin',
 'n-butyl-cyanoacrylate',
 'plga-peg']}

targeting_ligand = {'antibody': ['ox26', 'item4 antibody'],
 'cell-penetrating peptide': ['gly-arg-lys-lys-arg-arg-gln-arg-arg-arg- gly (tat)',
  'mastoparan',
  'cell-penetrating peptide tat',
  'cereport (rmp-7)-pegdspe',
  'penetratin',
  'cpps-tat'],
 'folic acid': ['folic acid', 'none'],
 'non-antibody protein': ['transferrin+folic acid',
  'transferrin',
  'lactoferrin'],
 'others': ['β-hydroxybutyric acid',
  'phenylalanine',
  'phosphatidic acid (pa)',
  'ascorbic acid',
  'phosphatidylserine (ps)',
  'dehydroascorbic acid',
  'n-palmitoylglucosamine (npg)',
  'angiopep-2',
  'lactoferrin'],
 'peptide': ['lactofferin',
  'lmwp',
  'a dual receptor recognizing cell penetrating peptiderrrrrrrrdgr (r8dgr, lower case letter representsd-amino acid residue)',
  'amyloid beta',
  'angiopep-2\r\n',
  'penetrating peptide, rgd',
  'pep-1 (cgemgwvrc) peptide',
  'arg-ala-asp (rad)',
  'arg-gly-asp (rgd)',
  'r6dgr',
  'rgd',
  'scrambled peptide (hwpyahpthpsw)',
  'solanum\ntuberosum lectin',
  'crgd',
  'srgd',
  'csspiqgswtwengk(c)wtwgiirleq',
  't7 (haiyprh) peptide',
  't7 (haiyprh) peptide, tat (aygrkkrrqrrr) peptide',
  't7 序列： chaiyprh',
  'tat (aygrkkrrqrrr) peptide',
  'tgnykalhphngc',
  'tgnykalhphngc, qshyrhispaqvc',
  'egfr-targeting peptide (yhwygytpqnvi)',
  'transferrin receptor (tfr) aptamer',
  'glutathione ',
  '细胞穿透肽tat (aygrkkrrqrrr)',
  'i6p7',
  'i6p7scr',
  'internalizing tumor-penetrating peptide, irgd',
  'lactoferrin']}


for col in onehot_cols:
    if col == 'type of the preparation':
        feature_values[col] = ['liposome','nanoparticles']
    if col == 'type of targeting ligand':
        feature_values[col] = ['antibody','cell-penetrating peptide','folic acid','non-antibody protein','others','peptide']
    if col in ['surface coating','targeting ligand','transport mechanism','transport mechanism','type of receptor/transporter','name of receptor/transporter','p-gp inhibitor','laser/magnectic field/radiation']:
        feature_values[col] = [np.nan]
    if col == 'animal model':
        feature_values[col] = ['normal','brain tumor']
    if col == 'administration route':
        feature_values[col] = ['i.v.']
        

# Your designed parameters


In [11]:
feature_values['PEGylation or not'] = [1]*6
feature_values['PEGylation ratio'] = [2,3,3,3,3,2]
feature_values['size （nm）'] = [122.2,114,100.3,131,152,114.7]
feature_values['PDI'] = [0.315,0.204,0.248,0.225,0.243,0.253]
feature_values['zeta potantial（mV）'] = [-15,-3.74,-20.2,10,18.4,-21.9]
feature_values['target ligand density（mol，%）'] = [2,3,3,3,3,2]
feature_values['type of the preparation'] = ['nanoparticle']*6
feature_values['core composition of the preparation'] = ['plga']*6
feature_values['type of targeting ligand'] = ['peptide']*5+['non-antibody protein']
feature_values['targeting ligand'] = ['angiopep-2','t7 (haiyprh) peptide','tat','tat','tat','transferrin']
feature_values['animal model'] = ['normal']*6
feature_values['administration route'] = ['i.v.']*6

In [12]:
all_features = []
features = {}
i = 0
part = 0
for feature in feature_values.keys():
    try:
        if np.isnan(feature_values[feature][0]):
            if feature not in onehot_cols:
                features[feature] = np.nan
            else:
                for col in onehot_train_cols[feature]:
                    features[col] = np.nan
    except:
        pass

In [13]:
for i in range(len(feature_values['PEGylation or not'])):
    features['PEGylation or not'] = feature_values['PEGylation or not'][i]
    features['PEGylation ratio'] = feature_values['PEGylation ratio'][i]
    features['size （nm）'] = feature_values['size （nm）'][i]
    features['PDI'] = feature_values['PDI'][i]
    features['zeta potantial（mV）'] = feature_values['zeta potantial（mV）'][i]
    features['target ligand density（mol，%）'] = feature_values['target ligand density（mol，%）'][i]
    for f in ['type of the preparation','core composition of the preparation','type of targeting ligand','targeting ligand','animal model','administration route']:
        for col in onehot_train_cols[f]:
            print(col)
            if col.split('_')[-1] in feature_values[f][i].split(','):
                features[col] = 1
            else:
                features[col] = 0 
    all_features.append(features.copy())
    

onehot_type of the preparation_dendrimer
onehot_type of the preparation_emulsion
onehot_type of the preparation_extracellular vesicles
onehot_type of the preparation_free drug
onehot_type of the preparation_liposome
onehot_type of the preparation_micelle
onehot_type of the preparation_nanoparticles
onehot_type of the preparation_none
onehot_type of the preparation_others
onehot_core composition of the preparation_plga
onehot_core composition of the preparation_cholesterol
onehot_core composition of the preparation_spc
onehot_core composition of the preparation_au
onehot_core composition of the preparation_pc
onehot_core composition of the preparation_peg-pla
onehot_core composition of the preparation_chitosan
onehot_core composition of the preparation_dotap
onehot_core composition of the preparation_dspe-peg
onehot_core composition of the preparation_dope
onehot_core composition of the preparation_epc
onehot_core composition of the preparation_bovine serum albumin
onehot_core compositi

In [14]:
tmp = pd.DataFrame(all_features)[ratio_train2.columns[:-4]]

In [15]:
import lightgbm as lgb
models = [lgb.Booster(model_file= f'./output/{ratio_oof_path2}/fold%s.ckpt'%i) for i in range(10)]

# Predict ratio of preparation/free drug

In [16]:
10**np.mean([m.predict(tmp) for m in models],axis=0)

array([4.14648283, 6.11219813, 5.63301324, 5.79380783, 5.05133371,
       4.56408865])