In [1]:
pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.7/29.7 MB[0m [31m66.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.3.3


In [2]:
import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit.Chem import AllChem
import rdkit.Chem.Fragments as f
import rdkit.Chem.rdMolDescriptors as d
from rdkit.Chem import Lipinski as l

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV

from sklearn import metrics
from sklearn.metrics import auc, roc_curve

import matplotlib.pyplot as plt

## Importing Data

In [3]:
# Colab-specific: mount Google Drive at /content/drive so the CSV files
# referenced through `filepath` can be read from it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Path to the shared Drive folder holding the assignment data
# (add the 'Programming for Data Science' folder to your own Drive first).
filepath = '/content/drive/MyDrive/Programming for Data Science/Assignment 4/'

# Alternative: path to a folder in your own Drive, if the shared folder above doesn't work.
#filepath = '/content/drive/MyDrive/Colab Notebooks/ID2214/'

In [5]:
# Load the training set; later cells read its INDEX, SMILES and ACTIVE columns.
train_data = pd.read_csv(filepath + "training_smiles.csv")

In [6]:
# Label vector: the ACTIVE column, used as the target for every model below.
y = train_data["ACTIVE"]

In [7]:
def get_atom_features(dataframe):
    """Add RDKit descriptor columns for every molecule in ``dataframe``.

    Each value of the ``SMILES`` column is parsed with RDKit and 18 descriptors
    (Lipinski counts plus the exact molecular weight) are computed for it.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``INDEX`` and ``SMILES``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataframe`` without ``INDEX`` and ``SMILES``, followed by the
        descriptor columns. Any other input column (e.g. ``ACTIVE``) is kept, and
        the row index of the input is preserved.

    Raises
    ------
    ValueError
        If a SMILES string cannot be parsed by RDKit.
    KeyError
        If ``INDEX`` or ``SMILES`` is missing from ``dataframe``.
    """
    # Output column -> RDKit function computing it; dict order is the column order.
    descriptors = {
        'FRAC_CSP3': l.FractionCSP3,
        'HEAVY_ATOMS': l.HeavyAtomCount,
        'MOL_WEIGHT': d.CalcExactMolWt,
        'NHOH': l.NHOHCount,
        'NO': l.NOCount,
        'ALIPH_CARB': l.NumAliphaticCarbocycles,
        'ALIPH_HETE': l.NumAliphaticHeterocycles,
        'ALIPH_RING': l.NumAliphaticRings,
        'AROM_CARB': l.NumAromaticCarbocycles,
        'AROM_HETER': l.NumAromaticHeterocycles,
        'AROM_RING': l.NumAromaticRings,
        'H_ACCEPTORS': l.NumHAcceptors,
        'H_DONORS': l.NumHDonors,
        'H_ATOMS': l.NumHeteroatoms,  # heteroatom count (name kept for compatibility)
        'SATUR_CARBS': l.NumSaturatedCarbocycles,
        'SATUR_HETER': l.NumSaturatedHeterocycles,
        'SATUR_RING': l.NumSaturatedRings,
        'RING': l.RingCount,
    }

    df = dataframe.copy()
    values = {column: [] for column in descriptors}

    # Iterate over the column values (not index labels) so duplicate index
    # labels cannot turn a single SMILES lookup into a Series.
    for position, smile in enumerate(df['SMILES']):
        m = Chem.MolFromSmiles(smile)
        if m is None:
            # MolFromSmiles signals a parse failure by returning None.
            raise ValueError(
                "Could not parse SMILES at row position {}: {!r}".format(position, smile)
            )
        for column, compute in descriptors.items():
            values[column].append(compute(m))

    # Lists are assigned positionally, one new column per descriptor.
    for column, column_values in values.items():
        df[column] = column_values

    df_atom = df.drop(["INDEX", 'SMILES'], axis=1)

    return df_atom

In [8]:
def get_morgan_features(dataframe, radius=2, n_bits=124):
    """Compute Morgan fingerprint bit columns for every molecule in ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``SMILES`` column. It is not modified.
    radius : int, default 2
        Radius passed to ``AllChem.GetMorganFingerprintAsBitVect``.
    n_bits : int, default 124
        Length of the bit vector, i.e. the number of output columns.

    Returns
    -------
    pandas.DataFrame
        One row per input row and ``n_bits`` columns named ``f0`` .. ``f{n_bits-1}``.
        The result carries the index of ``dataframe`` so that an index-based merge
        with other per-molecule features stays row-aligned. For an empty input an
        empty frame with the same columns is returned.

    Raises
    ------
    ValueError
        If a SMILES string cannot be parsed by RDKit.
    """
    fingerprints = []

    for position, smile in enumerate(dataframe['SMILES']):
        m = Chem.MolFromSmiles(smile)
        if m is None:
            # MolFromSmiles signals a parse failure by returning None.
            raise ValueError(
                "Could not parse SMILES at row position {}: {!r}".format(position, smile)
            )
        fingerprints.append(
            np.array(AllChem.GetMorganFingerprintAsBitVect(m, radius, nBits=n_bits))
        )

    if fingerprints:
        bits = np.vstack(fingerprints)
    else:
        # No molecules: keep the column layout instead of failing on fingerprints[0].
        bits = np.empty((0, n_bits), dtype=int)

    columns = ['f{}'.format(i) for i in range(n_bits)]
    df_morgan = pd.DataFrame(bits, index=dataframe.index, columns=columns)

    return df_morgan

In [9]:
def get_complete_features(dataframe):
    """Build the combined feature table (descriptors + Morgan bits) for ``dataframe``.

    The features are computed from the ``dataframe`` argument itself rather than
    from notebook-level variables, so the function works on a fresh kernel and on
    any data set that has ``INDEX`` and ``SMILES`` columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input with ``INDEX`` and ``SMILES`` columns; an ``ACTIVE`` label column,
        if present, is left out of the result.

    Returns
    -------
    pandas.DataFrame
        Descriptor columns followed by fingerprint columns, joined on the row index.
        The table is also displayed as a side effect.
    """
    # Drop the label if it is there so the result contains features only.
    atom_features = get_atom_features(dataframe).drop(columns=["ACTIVE"], errors="ignore")
    morgan_features = get_morgan_features(dataframe)

    df_complete = pd.merge(atom_features, morgan_features, left_index=True, right_index=True)
    display(df_complete)

    return df_complete

In [None]:
# Descriptor features of the training molecules, with the label column removed.
df_atom = get_atom_features(train_data).drop(columns=["ACTIVE"])

# Morgan fingerprint bit columns of the same molecules.
df_morgan = get_morgan_features(train_data)

# Both feature sets side by side, joined on the row index.
df_complete = df_atom.merge(df_morgan, left_index=True, right_index=True)

## Split and Normalization

In [11]:
def split(data, y, test_size=0.2, random_state=1):
    """Split features and labels into stratified training and validation parts.

    Parameters
    ----------
    data : feature table to split.
    y : labels aligned with ``data``; also used for stratification.
    test_size : float, default 0.2
        Share of the rows placed in the validation part.
    random_state : int, default 1
        Seed of the shuffle, fixed so that repeated calls give the same split.

    Returns
    -------
    tuple
        ``(x_train, x_val, y_train, y_val)``.
    """
    x_train, x_val, y_train, y_val = train_test_split(
        data, y, test_size=test_size, random_state=random_state, stratify=y
    )
    return x_train, x_val, y_train, y_val


# Columns to normalize: all descriptor columns of df_atom. This list is also bound
# as the default `norm_list` argument of baseline() when that function is defined.
norm_features = list(df_atom.columns)

def scale(x_train, x_val, label_list):
    """Min-max scale the columns in ``label_list`` of both frames.

    The scaler is fitted on ``x_train`` only and then applied to ``x_val``, so no
    information from the validation rows enters the scaling. The inputs are left
    untouched: scaled copies are returned, which avoids writing float values into
    the caller's (possibly integer-typed, possibly sliced) frames.

    Parameters
    ----------
    x_train, x_val : pandas.DataFrame
        Training and validation features; both must contain ``label_list``.
    label_list : iterable of column labels
        Columns to scale. Other columns are passed through unchanged.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(x_train_scaled, x_val_scaled)``.
    """
    labels = list(label_list)
    x_train = x_train.copy()
    x_val = x_val.copy()

    # Nothing to scale: fitting a scaler on zero columns would only raise.
    if not labels:
        return x_train, x_val

    scaler = MinMaxScaler()
    train_scaled = scaler.fit_transform(x_train[labels])
    val_scaled = scaler.transform(x_val[labels])

    # Replace whole columns so they take the float dtype of the scaled values.
    x_train[labels] = train_scaled
    x_val[labels] = val_scaled

    return x_train, x_val

## Models

In [12]:
def mlp(x_train, x_val,  y_train, y_val, params=None):
    """Fit an ``MLPClassifier`` and score it on the validation data.

    Parameters
    ----------
    x_train, y_train : training features and labels.
    x_val, y_val : validation features and labels.
    params : dict or None
        Extra keyword arguments for ``MLPClassifier``. ``random_state`` defaults
        to 10 and can be overridden through ``params``.

    Returns
    -------
    tuple
        ``(model, auc_, prediction)``: the fitted model, the ROC AUC on the
        validation data, and the ``predict_proba`` output for ``x_val``.
    """
    # Merge instead of passing both, so a `random_state` inside `params` overrides
    # the default rather than raising a duplicate-keyword TypeError.
    settings = {'random_state': 10, **(params or {})}
    model = MLPClassifier(**settings)

    model.fit(x_train, y_train)
    prediction = model.predict_proba(x_val)
    # Column 1 holds the probability of the second class in model.classes_.
    fpr, tpr, _ = metrics.roc_curve(y_val, prediction[:, 1])
    auc_ = metrics.auc(fpr, tpr)
    return model, auc_, prediction


def random_forest(x_train, x_val,  y_train, y_val, params=None):
    """Fit a ``RandomForestClassifier`` and score it on the validation data.

    Parameters
    ----------
    x_train, y_train : training features and labels.
    x_val, y_val : validation features and labels.
    params : dict or None
        Keyword arguments for ``RandomForestClassifier``. When ``None``, the forest
        uses ``class_weight='balanced'``; when given, ``class_weight`` is only set
        if ``params`` contains it. ``random_state`` defaults to 10 either way and
        can be overridden through ``params``.

    Returns
    -------
    tuple
        ``(model, auc_, prediction)``: the fitted model, the ROC AUC on the
        validation data, and the ``predict_proba`` output for ``x_val``.
    """
    if params is None:
        settings = {'class_weight': 'balanced', 'random_state': 10}
    else:
        # Merge so a `random_state` inside `params` overrides the default instead
        # of raising a duplicate-keyword TypeError.
        settings = {'random_state': 10, **params}
    model = RandomForestClassifier(**settings)

    model.fit(x_train, y_train)
    prediction = model.predict_proba(x_val)
    # Column 1 holds the probability of the second class in model.classes_.
    fpr, tpr, _ = metrics.roc_curve(y_val, prediction[:, 1])
    auc_ = metrics.auc(fpr, tpr)
    return model, auc_, prediction

def logistic_regression(x_train, x_val,  y_train, y_val, params=None):
    """Fit a ``LogisticRegression`` and score it on the validation data.

    Parameters
    ----------
    x_train, y_train : training features and labels.
    x_val, y_val : validation features and labels.
    params : dict or None
        Keyword arguments for ``LogisticRegression``. When ``None``, the model uses
        ``class_weight='balanced'``; when given, ``class_weight`` is only set if
        ``params`` contains it. ``max_iter`` defaults to 2000 either way and can be
        overridden through ``params``.

    Returns
    -------
    tuple
        ``(model, auc_, prediction)``: the fitted model, the ROC AUC on the
        validation data, and the ``predict_proba`` output for ``x_val``.
    """
    if params is None:
        settings = {'class_weight': 'balanced', 'max_iter': 2000}
    else:
        # Merge so a `max_iter` inside `params` overrides the default instead of
        # raising a duplicate-keyword TypeError.
        settings = {'max_iter': 2000, **params}
    model = LogisticRegression(**settings)

    model.fit(x_train, y_train)
    prediction = model.predict_proba(x_val)
    # Column 1 holds the probability of the second class in model.classes_.
    fpr, tpr, _ = metrics.roc_curve(y_val, prediction[:, 1])
    auc_ = metrics.auc(fpr, tpr)

    return model, auc_, prediction


# basic models results, without tuning
def baseline(train, y, seed = 0, norm=False, norm_list=norm_features):
    """Validation AUC of the three untuned models on one feature table.

    Parameters
    ----------
    train : feature table.
    y : labels aligned with ``train``.
    seed : int, default 0
        Seed for NumPy's global random generator. The split and the MLP / random
        forest models are created with their own fixed ``random_state`` values, so
        ``seed`` only affects code that draws from the global generator.
    norm : bool, default False
        When truthy, the columns in ``norm_list`` are min-max scaled first.
    norm_list : list of column labels
        Columns to scale when ``norm`` is set.

    Returns
    -------
    list of float
        AUCs in the order ``[logistic regression, MLP, random forest]``.
    """
    np.random.seed(seed)
    x_train, x_val, y_train, y_val = split(train, y)

    if norm:
        x_train, x_val = scale(x_train, x_val, norm_list)

    # Element 1 of each trainer's return value is the AUC; the tuple order below
    # defines the order of the returned list.
    baseline_r = [
        trainer(x_train, x_val, y_train, y_val)[1]
        for trainer in (logistic_regression, mlp, random_forest)
    ]

    print("Basaline without tuning: calculation finished")

    return baseline_r

In [13]:
# Baseline validation AUC: one row per feature set, one column per untuned model.
baseline_rows = ["basic", "m_f", "basic+m_f"]
baseline_columns = ["logistic_reg", "neural_net", "random_forest"]

# Descriptors only, Morgan bits only, then both combined (same order as the rows).
r1, r2, r3 = [baseline(features, y) for features in (df_atom, df_morgan, df_complete)]

baseline_r = [r1, r2, r3]
r_df = pd.DataFrame(data=baseline_r, index=baseline_rows, columns=baseline_columns)
display(r_df)

Basaline without tuning: calculation finished
Basaline without tuning: calculation finished
Basaline without tuning: calculation finished


Unnamed: 0,logistic_reg,neural_net,random_forest
basic,0.762281,0.681536,0.704159
m_f,0.730916,0.734402,0.730848
basic+m_f,0.807149,0.800142,0.788136


## Tuning Parameters

In [14]:
def hyper_tuning(x_train,  y_train, model, kf, params):
    """Grid-search ``params`` for ``model``, scored by cross-validated ROC AUC.

    Parameters
    ----------
    x_train, y_train : data the search is fitted on.
    model : estimator handed to ``GridSearchCV``.
    kf : cross-validation splitter passed as ``cv``.
    params : parameter grid passed as ``param_grid``.

    Returns
    -------
    tuple
        ``(best_params, grid_search)``: the best parameter combination and the
        fitted ``GridSearchCV`` object.
    """
    searcher = GridSearchCV(estimator=model, param_grid=params, scoring='roc_auc', cv=kf)
    searcher.fit(x_train, y_train)

    return searcher.best_params_, searcher

### Neural Network

In [15]:
# Hyperparameter grid for the MLP: 2 layer sizes x 2 solvers x 2 alpha values.
nn_params = {
    'hidden_layer_sizes': [(100,),(150,)],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.005]
}

# 5-fold stratified cross-validation used by the grid search.
kf = StratifiedKFold(n_splits=5)

# Train/validation split of the combined features; scaling is left disabled.
x_train, x_val, y_train, y_val = split(df_complete, y)
#x_train, x_val = scale(x_train, x_val, norm_features)

# Untuned MLP; `nn` is the estimator later handed to hyper_tuning.
# NOTE(review): `auc` here rebinds the `auc` function imported from sklearn.metrics
# to a float; the model helpers call metrics.auc, so they are unaffected.
nn, auc, pred = mlp(x_train, x_val,  y_train, y_val)

In [16]:
# Grid-search the MLP on the training split, then refit with the winning
# settings and score that model on the validation split.
best_params_nn, grid_search_nn = hyper_tuning(x_train, y_train, nn, kf, nn_params)
nn_updated, nn_auc_updated, nn_prediction_updated = mlp(
    x_train, x_val, y_train, y_val, params=best_params_nn
)

# Cross-validation results of every grid point, kept for inspection.
results_nn = pd.DataFrame(grid_search_nn.cv_results_)

### Random Forest

In [17]:
# Hyperparameter grid for the random forest: 3 forest sizes x 2 depth limits,
# always with balanced class weights.
rf_params = {
    'n_estimators': [200, 300, 400],
    'class_weight': ['balanced'],
    'max_depth': [30, 50]
}

# 5-fold stratified cross-validation used by the grid search.
kf = StratifiedKFold(n_splits=5)

# Train/validation split of the combined features; scaling is left disabled.
x_train, x_val, y_train, y_val = split(df_complete, y)
#x_train, x_val = scale(x_train, x_val, norm_features)

# Untuned random forest; `rf` is the estimator later handed to hyper_tuning.
# NOTE(review): `auc` here rebinds the `auc` function imported from sklearn.metrics
# to a float; the model helpers call metrics.auc, so they are unaffected.
rf, auc, pred = random_forest(x_train, x_val,  y_train, y_val)

In [18]:
# Grid-search the random forest on the training split, then refit with the
# winning settings and score that model on the validation split.
best_params_rf, grid_search_rf = hyper_tuning(x_train, y_train, rf, kf, rf_params)
rf_updated, rf_auc_updated, rf_prediction_updated = random_forest(
    x_train, x_val, y_train, y_val, params=best_params_rf
)

# Cross-validation results of every grid point, kept for inspection.
results_rf = pd.DataFrame(grid_search_rf.cv_results_)

### Logistic Regression

In [19]:
# Hyperparameter grid for logistic regression: with/without intercept and
# with/without L2 penalty, always with balanced class weights.
# NOTE(review): `None` as a penalty value is presumably only accepted by newer
# scikit-learn releases — confirm against the installed version.
lg_params = {
    'fit_intercept': [True, False],
    'class_weight': ['balanced'],
    'penalty': ["l2", None]
}

# 5-fold stratified cross-validation used by the grid search.
kf = StratifiedKFold(n_splits=5)

# Train/validation split of the combined features; scaling is left disabled.
x_train, x_val, y_train, y_val = split(df_complete, y)
#x_train, x_val = scale(x_train, x_val, norm_features)

# Untuned logistic regression; `lg` is the estimator later handed to hyper_tuning.
# NOTE(review): `auc` here rebinds the `auc` function imported from sklearn.metrics
# to a float; the model helpers call metrics.auc, so they are unaffected.
lg, auc, pred = logistic_regression(x_train, x_val,  y_train, y_val)

In [20]:
# Grid-search logistic regression on the training split, then refit with the
# winning settings and score that model on the validation split.
best_params_lg, grid_search_lg = hyper_tuning(x_train, y_train, lg, kf, lg_params)
lg_updated, lg_auc_updated, lg_prediction_updated = logistic_regression(
    x_train, x_val, y_train, y_val, params=best_params_lg
)

# Cross-validation results of every grid point, kept for inspection.
results_lg = pd.DataFrame(grid_search_lg.cv_results_)

## Validation

Each model is refit with its best grid-search parameters and its AUC is reported on the held-out validation split.

In [21]:
# Report, per model, the validation AUC of the refit with the best grid-search
# parameters, together with those parameters.

# Neural network
print("AUC Neural Networks: ", nn_auc_updated, "with best parameters: ", best_params_nn)
print()

# Random forest
print("AUC Random Forest: ", rf_auc_updated, "with best parameters: ", best_params_rf)
print()

# Logistic regression
print("AUC Logistic Regression: ", lg_auc_updated, "with best parameters: ", best_params_lg)

AUC Neural Networks:  0.8053911562526535 with best parameters:  {'alpha': 0.0001, 'hidden_layer_sizes': (150,), 'solver': 'adam'}

AUC Random Forest:  0.8543416097815154 with best parameters:  {'class_weight': 'balanced', 'max_depth': 30, 'n_estimators': 400}

AUC Logistic Regression:  0.8071488634752628 with best parameters:  {'class_weight': 'balanced', 'fit_intercept': True, 'penalty': 'l2'}


## Best Model

In [22]:
# The tuned random forest is chosen as the final model.
final_params = best_params_rf  # its grid-search parameters

final_model = rf_updated                      # fitted estimator
final_auc = rf_auc_updated                    # AUC on the validation split
final_val_prediction = rf_prediction_updated  # predict_proba output on the validation split

print("best params: " + str(final_params))
print("final auc on val data: " + str(round(final_auc, 4)))

best params: {'class_weight': 'balanced', 'max_depth': 30, 'n_estimators': 400}
final auc on val data: 0.8543


## Test Data

In [None]:
# Load the test molecules.
test_data = pd.read_csv(filepath + "test_smiles.csv")

# Replace the raw table by its feature table: descriptor columns joined with the
# Morgan bit columns on the row index, as was done for the training data.
test_data = get_atom_features(test_data).merge(
    get_morgan_features(test_data), left_index=True, right_index=True
)

In [24]:
# Class probabilities of the final model for every test molecule
# (one row per molecule; columns follow final_model.classes_).
final_results = final_model.predict_proba(test_data)
print(final_results)

[[0.99   0.01  ]
 [0.9925 0.0075]
 [0.9825 0.0175]
 ...
 [0.9925 0.0075]
 [0.995  0.005 ]
 [1.     0.    ]]
