# Data Processing
Here we set up the processing pipeline, transforming the raw input data into the features that we outlined in the data exploration notebook. In total, we have 52 remaining features that we will try using in the first iteration of this processing. The steps are as follows:

1. Create a row for each unique molecule and create the four label columns: JAK1, JAK2, JAK3, and TYK2. Fill in all labels from the input data set.
2. For each molecule in the data set, we will extract the 52 features from the RDKit descriptors.
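3. We will perform principal components analysis to reduce the dimensionality of the feature set. (In the code below, the scaler and the PCA are fit on the training split only, i.e. after the split described in step 4, and then applied to the test split.)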
4. We will create a training set (80%) and a test set (20%) for each of the individual kinases. As an extension, I could try to implement a multi-task learning (MTL) model with one task per kinase, but for now we will use 4 separate models for the 4 different tasks.
5. Each test and training file will be saved separately as a csv to retain clarity on data separation.

In [1]:
import pandas as pd
# Load the raw measurements, keep only the pKi rows, and reshape so that each
# unique SMILES becomes one row with one column per kinase name.
df = (
    pd.read_csv('ml-challenge-kinase-main/kinase_JAK.csv')
    .loc[lambda measurements: measurements['measurement_type'] == 'pKi']
    .pivot(index="SMILES", columns="Kinase_name", values='measurement_value')
    .reset_index()
)
display(df)

Kinase_name,SMILES,JAK1,JAK2,JAK3,TYK2
0,Brc1cnc2[nH]cc(-c3ccccc3)c2c1,,6.20,6.3,
1,C#Cc1cc2c(cc1OC)-c1[nH]nc(-c3ccc(C#N)nc3)c1C2,,6.20,,
2,C=CC(=O)N1CC(Nc2ncnc3[nH]ccc23)CCC1C,8.20,,8.2,
3,C=CCN(CCOc1ccc(C)cc1)C1CCN(C(=O)Cn2cc(NC(=O)c3...,8.49,8.14,,
4,CC(=NNC(=N)N)c1cc(NC(=O)NCCCCCCNC(=O)Nc2cc(C(C...,,6.00,6.0,
...,...,...,...,...,...
979,c1ccc(Cn2cc(-c3ccc4[nH]ncc4c3)nn2)cc1,,6.55,6.0,
980,c1ccc(Cn2nnc(-c3ccc4[nH]ncc4c3)c2C2CC2)cc1,,6.20,,
981,c1cnc2[nH]cc(-c3ccnc(NC4CCCCC4)n3)c2c1,,6.90,6.8,6.5
982,c1cncc(CN2CCC(n3nnc4cnc5[nH]ccc5c43)CC2)c1,7.80,7.55,,


In [2]:
# Get features 
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors, Descriptors3D

# The 52 descriptor names kept as model features; they are matched by name
# against the entries of Descriptors._descList when computing descriptors.
features = [
    'SMR_VSA7', 'BCUT2D_CHGLO', 'PEOE_VSA8', 'VSA_EState6',
    'PEOE_VSA12', 'EState_VSA2', 'SMR_VSA3', 'BCUT2D_MRHI',
    'fr_amide', 'SMR_VSA9', 'PEOE_VSA14', 'TPSA',
    'fr_alkyl_halide', 'NumHAcceptors', 'PEOE_VSA3', 'VSA_EState1',
    'VSA_EState7', 'BCUT2D_LOGPHI', 'SlogP_VSA2', 'fr_NH0',
    'SMR_VSA1', 'fr_halogen', 'VSA_EState9', 'FractionCSP3',
    'NumAromaticCarbocycles', 'EState_VSA5', 'EState_VSA10', 'NumHeteroatoms',
    'NumRotatableBonds', 'HallKierAlpha', 'NumSaturatedRings', 'qed',
    'PEOE_VSA10', 'PEOE_VSA4', 'MinAbsPartialCharge', 'VSA_EState2',
    'EState_VSA9', 'NumAromaticHeterocycles', 'fr_Ar_NH', 'PEOE_VSA2',
    'BCUT2D_LOGPLOW', 'fr_piperdine', 'Chi3v', 'fr_C_O_noCOO',
    'SlogP_VSA8', 'MolLogP', 'fr_Ar_N', 'Chi1n',
    'fr_sulfonamd', 'MinEStateIndex', 'RingCount', 'NumAliphaticHeterocycles',
]

def getMolDescriptors(mol, features):
    """Compute the requested RDKit descriptors for a single molecule.

    Args:
        mol: Molecule object handed to each descriptor function, or None
            (e.g. when no molecule could be built for the row).
        features: Iterable of descriptor names to compute. Names that do not
            appear in ``Descriptors._descList`` are ignored.

    Returns:
        dict mapping each requested descriptor name to its value, in the order
        the descriptors appear in ``Descriptors._descList``. A descriptor whose
        calculation raises an ``Exception`` is mapped to None; when ``mol`` is
        None every requested descriptor is mapped to None.
    """
    wanted = set(features)  # constant-time membership instead of a list scan
    all_descriptors = {}
    for name, equation in Descriptors._descList:
        if name not in wanted:
            continue
        if mol is None:
            # Nothing to evaluate; record the descriptor as missing.
            all_descriptors[name] = None
            continue
        try:
            all_descriptors[name] = equation(mol)
        except Exception:
            # Best effort: one failing descriptor must not abort the others.
            # Only Exception is caught so that KeyboardInterrupt / SystemExit
            # still propagate (a bare `except:` would swallow them).
            all_descriptors[name] = None
    return all_descriptors

# Compute the descriptors once per molecule and collect them as records.
# The table is built in one go rather than writing single cells with
# df.loc[index, name], which enlarges the frame on every new column.
descriptor_rows = []
unparsed_smiles = []
for smiles in df['SMILES']:
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # No molecule could be built from this SMILES; keep the row (its
        # descriptors stay empty) but remember it so the problem is visible.
        unparsed_smiles.append(smiles)
    descriptor_rows.append(getMolDescriptors(mol, features))

if unparsed_smiles:
    print("Could not parse " + str(len(unparsed_smiles)) + " SMILES; their descriptors are left empty")

# One row per molecule, aligned on df's index; cast to float so missing values
# become NaN and count-style descriptors match the float columns shown before.
descriptor_table = pd.DataFrame(descriptor_rows, index=df.index).astype(float)

# assign() overwrites same-named columns, so re-running the cell is safe.
df = df.assign(**{name: descriptor_table[name] for name in descriptor_table.columns})

display(df)

Kinase_name,SMILES,JAK1,JAK2,JAK3,TYK2,MinEStateIndex,qed,MinAbsPartialCharge,BCUT2D_CHGLO,BCUT2D_LOGPHI,...,MolLogP,fr_Ar_N,fr_Ar_NH,fr_C_O_noCOO,fr_NH0,fr_alkyl_halide,fr_amide,fr_halogen,fr_piperdine,fr_sulfonamd
0,Brc1cnc2[nH]cc(-c3ccccc3)c2c1,,6.20,6.3,,0.918568,0.713045,0.137436,-1.982411,2.284200,...,3.99240,2.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,C#Cc1cc2c(cc1OC)-c1[nH]nc(-c3ccc(C#N)nc3)c1C2,,6.20,,,0.386681,0.577516,0.139982,-2.068753,2.419330,...,2.90448,3.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
2,C=CC(=O)N1CC(Nc2ncnc3[nH]ccc23)CCC1C,8.20,,8.2,,-0.010668,0.845167,0.245766,-2.375193,2.230954,...,1.93530,3.0,1.0,1.0,3.0,0.0,1.0,0.0,1.0,0.0
3,C=CCN(CCOc1ccc(C)cc1)C1CCN(C(=O)Cn2cc(NC(=O)c3...,8.49,8.14,,,-3.122653,0.139748,0.387054,-2.377773,2.370100,...,5.96632,5.0,0.0,2.0,7.0,2.0,2.0,3.0,1.0,0.0
4,CC(=NNC(=N)N)c1cc(NC(=O)NCCCCCCNC(=O)Nc2cc(C(C...,,6.00,6.0,,-0.412285,0.047129,0.318735,-2.127161,2.184908,...,1.01848,0.0,0.0,2.0,4.0,0.0,4.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
979,c1ccc(Cn2cc(-c3ccc4[nH]ncc4c3)nn2)cc1,,6.55,6.0,,0.724003,0.625137,0.112573,-1.992031,2.209384,...,2.86970,5.0,1.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
980,c1ccc(Cn2nnc(-c3ccc4[nH]ncc4c3)c2C2CC2)cc1,,6.20,,,0.582184,0.623770,0.116067,-2.073532,2.435926,...,3.74710,5.0,1.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
981,c1cnc2[nH]cc(-c3ccnc(NC4CCCCC4)n3)c2c1,,6.90,6.8,6.5,0.507996,0.771229,0.222909,-2.222322,2.265868,...,3.76450,4.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
982,c1cncc(CN2CCC(n3nnc4cnc5[nH]ccc5c43)CC2)c1,7.80,7.55,,,0.384860,0.623421,0.139055,-2.315541,2.286867,...,2.53970,6.0,1.0,0.0,6.0,0.0,0.0,0.0,1.0,0.0


In [3]:
# Build one frame per kinase: keep only the molecules that have a label for
# that kinase and remove the label columns of the other kinases.
kinases = ['JAK1', 'JAK2', 'JAK3', 'TYK2']
df_kins = {}
for kinase in kinases:
    other_labels = [label for label in kinases if label != kinase]
    labelled = df[df[kinase].notna()]
    df_kins[kinase] = labelled.drop(columns=other_labels)

display(df_kins['JAK1'])

Kinase_name,SMILES,JAK1,MinEStateIndex,qed,MinAbsPartialCharge,BCUT2D_CHGLO,BCUT2D_LOGPHI,BCUT2D_LOGPLOW,BCUT2D_MRHI,Chi1n,...,MolLogP,fr_Ar_N,fr_Ar_NH,fr_C_O_noCOO,fr_NH0,fr_alkyl_halide,fr_amide,fr_halogen,fr_piperdine,fr_sulfonamd
2,C=CC(=O)N1CC(Nc2ncnc3[nH]ccc23)CCC1C,8.20,-0.010668,0.845167,0.245766,-2.375193,2.230954,-2.487658,5.878355,7.164146,...,1.93530,3.0,1.0,1.0,3.0,0.0,1.0,0.0,1.0,0.0
3,C=CCN(CCOc1ccc(C)cc1)C1CCN(C(=O)Cn2cc(NC(=O)c3...,8.49,-3.122653,0.139748,0.387054,-2.377773,2.370100,-2.480514,6.305795,16.422817,...,5.96632,5.0,0.0,2.0,7.0,2.0,2.0,3.0,1.0,0.0
6,CC(=O)N1CCC(CN2CC3CN(C(=O)Cn4cc(NC(=O)c5cnn6cc...,9.03,-3.114085,0.280473,0.387054,-2.333567,2.376191,-2.408864,6.305798,16.284944,...,3.74860,5.0,0.0,3.0,8.0,2.0,3.0,3.0,1.0,0.0
7,CC(=O)N1CCC(CN2CCC(n3cc(NC(=O)c4c(N)nn5cccnc45...,9.24,-3.075247,0.285504,0.387054,-2.371044,2.385437,-2.425219,6.305842,14.878753,...,4.57750,5.0,0.0,2.0,7.0,2.0,2.0,3.0,2.0,0.0
8,CC(=O)N1CCC(N2CCC(n3cc(NC(=O)c4c(N)nn5cccnc45)...,9.27,-3.077455,0.309099,0.387054,-2.412182,2.386678,-2.483326,6.305841,14.412476,...,4.32990,5.0,0.0,2.0,7.0,2.0,2.0,3.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
971,c1ccc(CN2CCC(n3cnc4cnc5[nH]ccc5c43)CC2)cc1,8.62,0.511596,0.621876,0.138992,-2.325111,2.298905,-2.335959,6.004582,8.932392,...,3.74970,4.0,1.0,0.0,4.0,0.0,0.0,0.0,1.0,0.0
972,c1ccc(CN2CCC(n3nnc4cnc5[nH]ccc5c43)CC2)cc1,8.07,0.400485,0.625912,0.139055,-2.316517,2.287277,-2.340621,6.000115,8.815994,...,3.14470,5.0,1.0,0.0,5.0,0.0,0.0,0.0,1.0,0.0
973,c1ccc(CN2CCCC(n3cnc4cnc5[nH]ccc5c43)C2)cc1,7.62,0.461076,0.621876,0.138992,-2.333652,2.293425,-2.354883,6.004649,8.932392,...,3.74970,4.0,1.0,0.0,4.0,0.0,0.0,0.0,1.0,0.0
974,c1ccc(CN2CCCC(n3nnc4cnc5[nH]ccc5c43)C2)cc1,7.60,0.349965,0.625912,0.139055,-2.324443,2.281902,-2.360507,6.000178,8.815994,...,3.14470,5.0,1.0,0.0,5.0,0.0,0.0,0.0,1.0,0.0


In [6]:
# Split each dataset into train and test sets, perform PCA, and save as csvs
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import os

# Number of principal components kept for every kinase.
# NOTE(review): the earlier comment stated the aim was to retain 95% of the
# variance, but a fixed count is what is actually used. Passing a float such
# as 0.95 to PCA would instead pick the count from the explained variance;
# confirm that 33 meets the intended target.
N_COMPONENTS = 33

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("data", exist_ok=True)

for kinase, data in df_kins.items():
    # Split into test and train (features indexed by SMILES, label = kinase column)
    X = data.drop(columns=[kinase]).set_index('SMILES')
    y = data[kinase]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        random_state=123,
        test_size=0.2,
        shuffle=True)

    # Standardize the data; the scaler is fit on the training split only and
    # then applied to both splits.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Project onto a fixed number of principal components, fit on the
    # training split only.
    pca = PCA(n_components=N_COMPONENTS)
    pca.fit(X_train)
    print("# of components for " + kinase + ": " + str(pca.n_components_))

    X_train = pca.transform(X_train)
    X_test = pca.transform(X_test)

    # Transform back to df with columns PC1..PCn plus the label column
    col_names = ["PC" + str(i + 1) for i in range(pca.n_components_)]
    train = pd.DataFrame(X_train, columns=col_names)
    test = pd.DataFrame(X_test, columns=col_names)

    # train_test_split keeps X and y rows aligned, so positional assignment
    # of the label values is safe here.
    train['target'] = y_train.values
    test['target'] = y_test.values

    # Save output as csv files
    train.to_csv("data/train_" + kinase + ".csv")
    test.to_csv("data/test_" + kinase + ".csv")


# of components for JAK1: 33
# of components for JAK2: 33
# of components for JAK3: 33
# of components for TYK2: 33
