# Format: ID, labels, [one hot categories]

In [1]:
import pandas as pd
import numpy as np

# Fill a one hot given a list of indices and the length of the one hot vector
def fill_oh(indices, oh_len):
    """Build a multi-hot vector of length ``oh_len`` with 1s at ``indices``.

    Args:
        indices: Iterable of integer positions to switch on. Repeated
            positions are allowed and simply set the same slot again.
        oh_len: Length of the returned vector.

    Returns:
        A list of ``oh_len`` ints, 1 at every requested position and 0
        everywhere else.

    Raises:
        ValueError: If any index falls outside ``[0, oh_len)``. Negative
            indices are rejected rather than wrapped around.
    """
    oh = [0] * oh_len

    for idx in indices:
        if 0 <= idx < oh_len:
            oh[idx] = 1
        else:
            # The message must use the local names; referencing anything else
            # would raise NameError and hide the real problem from the caller.
            raise ValueError(f"Index {idx} is out of range for one-hot encoding of length {oh_len}")

    return oh

In [2]:
# Load the drug-target index table; the first CSV column is used as the row index
dt_idx = pd.read_csv(
    './index_drug-target.csv',
    index_col=0,
)

In [3]:
dt_idx

Unnamed: 0,index,compound_id,Smiles,Catalog Number,Target
0,0,HY_100144,O=C(OCC1=CC=CC(COC(C2=CC(OC)=C(OC)C(OC)=C2)=O)...,HY_100144,Others
1,1,HY_100384,O=C(NC1=CC=CC=C1N)CCCCCC(NC2=CC=CC=C2)=O,HY_100384,HDAC
2,2,HY_10018,O=C(C1=CC2=CC=C(C(F)(C(F)(F)F)F)C=C2N=C(C1)N)OCC,HY_10018,Toll-like Receptor (TLR)
3,3,HY_100366,O=C([C@@H]1[C@H](C(N)=O)CCCC1)NC2=CC=C(Cl)C(Cl...,HY_100366,mGluR
4,4,HY_100007,O=S(N1C=C(CNC)C=C1C2=CC=CC=C2F)(C3=CC=CN=C3)=O,HY_100007,Proton Pump
...,...,...,...,...,...
10884,10887,HY_Y0189,O=C(OC)C1=CC=CC=C1O,HY_Y0189,COX
10885,10888,HY_N0177,O[C@]1([H])CC2=CC[C@]3([H])[C@@](CC[C@@]4(C)[C...,HY_N0177,STAT
10886,10889,HY_Y0304,O=C(C1=CC=CC=C1C(OCCCC)=O)OCCCC,HY_Y0304,Others
10887,10890,HY_130187A,CCCCCCCCC/C=C\CCCCC(O[Na])=O,HY_130187A,Bacterial


In [4]:
# Convert classes to one hot
# A 'Target' cell may name several classes, separated by '; '
dt_idx['Target_List'] = [x.split('; ') for x in dt_idx['Target']]

# Every distinct class name that appears anywhere in the table
all_classes = {target for targets in dt_idx['Target_List'] for target in targets}

# Alphabetize the classes and create dictionary for one_hot indices
# (class name -> position in the one hot vector)
target_to_oh = {target: position for position, target in enumerate(sorted(all_classes))}

OH_LEN = len(target_to_oh) # Length of one hot vector

# Add one hot values to train dictionary: one list of 0/1 flags per row,
# with a 1 at the position of each class listed for that row
dt_idx['OH_Values'] = dt_idx['Target_List'].apply(
    lambda targets: fill_oh([target_to_oh[target] for target in targets], OH_LEN)
)

In [5]:
dt_idx[['index', 'Target_List', 'OH_Values']]

Unnamed: 0,index,Target_List,OH_Values
0,0,[Others],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,[HDAC],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,[Toll-like Receptor (TLR)],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,3,[mGluR],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,4,[Proton Pump],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
10884,10887,[COX],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
10885,10888,[STAT],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
10886,10889,[Others],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
10887,10890,[Bacterial],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [6]:
# Column labels ordered by their one-hot position, so label i names vector
# slot i regardless of the order in which the dictionary was filled
new_col_names = sorted(target_to_oh, key=target_to_oh.get)

In [7]:
# Spread each one-hot list into one column per target class. Reuse dt_idx's
# row index so that a column-wise concat with dt_idx pairs rows by label
# instead of relying on dt_idx having a contiguous 0..n-1 index.
expanded_df = pd.DataFrame(
    dt_idx['OH_Values'].tolist(),
    index=dt_idx.index,
    columns=new_col_names,
)

In [8]:
expanded_df

Unnamed: 0,15-PGDH,5 alpha Reductase,5-HT Receptor,AChE,ADC Cytotoxin,ALK,AMPK,APC,ATGL,ATM/ATR,...,iGluR,mAChR,mGluR,mTOR,nAChR,p38 MAPK,p97,sFRP-1,β-catenin,γ-secretase
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10884,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10885,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10886,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10887,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Put the identifier and label-list columns side by side with the one-hot columns
total_train_df = pd.concat(
    [
        dt_idx[['index', 'Target_List']],
        expanded_df,
    ],
    axis=1,
)

In [10]:
total_train_df

Unnamed: 0,index,Target_List,15-PGDH,5 alpha Reductase,5-HT Receptor,AChE,ADC Cytotoxin,ALK,AMPK,APC,...,iGluR,mAChR,mGluR,mTOR,nAChR,p38 MAPK,p97,sFRP-1,β-catenin,γ-secretase
0,0,[Others],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,[HDAC],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,[Toll-like Receptor (TLR)],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,[mGluR],0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,4,[Proton Pump],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10884,10887,[COX],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10885,10888,[STAT],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10886,10889,[Others],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10887,10890,[Bacterial],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Write the training table to disk, exposing the 'index' column as 'ID'
# and leaving the DataFrame's own row index out of the file
(
    total_train_df
    .rename(columns={'index': 'ID'})
    .to_csv('train_drugidx.csv', index=False)
)