In [None]:
import math
import numpy as np
import pandas as pd
import random as rd
from scipy import stats
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

import optuna
from optuna.trial import TrialState
optuna.logging.set_verbosity(optuna.logging.ERROR)

import warnings
warnings.filterwarnings('ignore')

In [None]:
def prepare_data(train_df, test_df, cell_features, cell_map=None):
    """Build feature matrices and AUC target vectors for a train/test pair.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames with a 'cell' column (keys of ``cell_map``) and an 'auc' column.
    cell_features : array-like
        2-D per-cell feature table; row ``cell_map[cell]`` holds the features
        of ``cell``.
    cell_map : mapping, optional
        Maps a cell identifier to its row number in ``cell_features``. When
        omitted, the notebook-level ``cell_map`` global is used, which is what
        this function always did before the parameter existed.

    Returns
    -------
    train_X, train_Y, test_X, test_Y : numpy.ndarray
        ``*_X`` are float64 matrices with one row per frame row (in frame
        order); ``*_Y`` are the corresponding 'auc' columns.

    Raises
    ------
    NameError
        If ``cell_map`` is not passed and no global ``cell_map`` exists.
    KeyError
        If a cell in either frame is missing from ``cell_map``.
    """
    if cell_map is None:
        try:
            cell_map = globals()['cell_map']
        except KeyError:
            raise NameError("name 'cell_map' is not defined") from None

    features = np.asarray(cell_features)

    def _feature_rows(df):
        # Look rows up by position in the frame, not by index label, so frames
        # that were filtered or sliced without reset_index() are still handled.
        rows = np.array([int(cell_map[cell]) for cell in df['cell']], dtype=np.intp)
        return features[rows].astype(np.float64)

    train_X = _feature_rows(train_df)
    test_X = _feature_rows(test_df)

    train_Y = np.array(train_df['auc'])
    test_Y = np.array(test_df['auc'])

    return train_X, train_Y, test_X, test_Y

In [None]:
def prepare_data_feat(df, gene_list, cell_map, cell_features):
    """Build the feature matrix and AUC target vector for one frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'cell' column (keys of ``cell_map``) and an 'auc' column.
    gene_list : any
        Not used by this function; kept so existing callers keep working.
    cell_map : mapping
        Maps a cell identifier to its row number in ``cell_features``.
    cell_features : array-like or pandas.DataFrame
        2-D per-cell feature table. It is converted with ``np.asarray`` so a
        DataFrame (as produced by ``pd.read_csv``) is accepted as well as a
        NumPy array.

    Returns
    -------
    X : numpy.ndarray
        float64 matrix with one row per row of ``df``, in frame order.
    Y : numpy.ndarray
        The 'auc' column of ``df``.

    Raises
    ------
    KeyError
        If a cell in ``df`` is missing from ``cell_map``.
    """
    features = np.asarray(cell_features)

    # Positional lookup: independent of df's index labels.
    rows = np.array([int(cell_map[cell]) for cell in df['cell']], dtype=np.intp)
    X = features[rows].astype(np.float64)

    Y = np.array(df['auc'])

    return X, Y

In [None]:
def split_train_data(all_df, seed=None):
    """Split a frame into train/validation parts by holding out whole cell lines.

    One fifth of the distinct values in the 'cell' column (rounded down) is
    drawn at random; every row belonging to a drawn cell goes to the
    validation frame and all remaining rows go to the training frame.

    Parameters
    ----------
    all_df : pandas.DataFrame
        Frame with a 'cell' column.
    seed : int, optional
        When given, the draw uses a private ``random.Random(seed)`` and is
        fully reproducible. When omitted, the shared ``random`` module state is
        used, as before.

    Returns
    -------
    train_df, val_df : pandas.DataFrame
        Both with a fresh 0..n-1 index.
    """
    # Distinct cells in order of first appearance. A plain set() would order
    # string cells differently from one interpreter run to the next, making
    # the split irreproducible even with a seeded generator.
    cell_lines = list(dict.fromkeys(all_df['cell']))
    val_size = int(len(cell_lines) / 5)

    rng = rd if seed is None else rd.Random(seed)
    val_cell_lines = rng.sample(cell_lines, val_size)

    held_out = set(val_cell_lines)
    train_cell_lines = [cell for cell in cell_lines if cell not in held_out]

    val_df = all_df[all_df['cell'].isin(val_cell_lines)].reset_index(drop=True)
    train_df = all_df[all_df['cell'].isin(train_cell_lines)].reset_index(drop=True)

    return train_df, val_df

In [None]:
def optimize_elasticnet(trial, train_df, val_df, cell_features):
    """Optuna objective: validation Pearson correlation of an ElasticNet fit.

    Samples ``l1_ratio`` and ``alpha`` log-uniformly from [0.01, 1.0], fits an
    intercept-free ElasticNet on ``train_df`` and scores it on ``val_df``.

    Returns the Pearson correlation between the validation 'auc' values and
    the predictions, or 0.0 when that correlation is NaN.
    """
    params = {
        'l1_ratio': trial.suggest_float('l1_ratio', 0.01, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 0.01, 1.0, log=True),
    }

    X_fit, y_fit, X_val, y_val = prepare_data(train_df, val_df, cell_features)

    model = ElasticNet(fit_intercept=False, max_iter=3000, tol=1e-3, **params)
    model.fit(X_fit, y_fit)

    score = stats.pearsonr(y_val, model.predict(X_val))[0]
    # A NaN correlation is reported to the study as 0.0.
    return 0.0 if math.isnan(score) else score

In [None]:
def run_elastic_net(dataset, ont, drug, cell_features, folds=5):
    """Tune, fit and evaluate an ElasticNet on every fold of one drug.

    For each fold the training file is split by cell line into train and
    validation parts, a 50-trial Optuna study picks ``l1_ratio`` and ``alpha``
    on that split, and a final model is refitted on the whole training file
    and scored on the fold's test file. Test predictions are written to
    ``../models/elastic_net/``.

    ``ont`` is not used inside this function.

    Returns ``(fold_corr_list, mean_corr)``: the per-fold Pearson correlations
    (NaN replaced by 0.0) and their sum divided by ``folds``.
    """
    columns = ['cell', 'smiles', 'auc', 'dataset']
    modeldir = "../models/elastic_net/"

    fold_corr_list = []
    corr_sum = 0.0

    for fold in range(1, folds + 1):
        train_path = "../data/training_files_av/" + str(fold) + "_train_" + dataset + '_' + drug + ".txt"
        all_df = pd.read_csv(train_path, sep='\t', header=None, names=columns)

        # Hyper-parameter search on an internal train/validation split.
        fit_df, val_df = split_train_data(all_df)
        study = optuna.create_study(direction="maximize")
        study.optimize(
            lambda trial: optimize_elasticnet(trial, fit_df, val_df, cell_features),
            n_trials=50,
        )
        tuned = study.best_params

        test_path = "../data/training_files_av/" + str(fold) + "_test_" + dataset + '_' + drug + ".txt"
        test_df = pd.read_csv(test_path, sep='\t', header=None, names=columns)
        X_all, y_all, X_test, y_test = prepare_data(all_df, test_df, cell_features)

        # Refit on the complete training file with the selected parameters.
        model = ElasticNet(
            fit_intercept=False,
            max_iter=3000,
            tol=1e-3,
            l1_ratio=tuned['l1_ratio'],
            alpha=tuned['alpha'],
        )
        model.fit(X_all, y_all)
        y_pred = model.predict(X_test)

        fold_corr = stats.pearsonr(y_test, y_pred)[0]
        if math.isnan(fold_corr):
            fold_corr = 0.0
        fold_corr_list.append(fold_corr)
        corr_sum += fold_corr

        out_path = modeldir + "predict_" + dataset + '_' + drug + '_' + str(fold) + ".txt"
        np.savetxt(out_path, y_pred, fmt='%.4e')

    return fold_corr_list, (corr_sum / folds)


In [None]:
def run_random_forest(dataset, ont, drug, cell_features, folds=5):
    """Fit and evaluate a random forest on every cross-validation fold of one drug.

    For each fold ``i`` in ``1..folds`` the fold's train and test files are
    read from ``../data/training_files_av/``, a ``RandomForestRegressor``
    (``random_state=0``) is fitted on the training rows, and the test
    predictions are written to
    ``../models/random_forest/predict_<dataset>_<drug>_<i>.txt``.

    Parameters
    ----------
    dataset, drug : str
        Used to build the input and output file names.
    ont : any
        Not used inside this function.
    cell_features : array-like
        Per-cell feature table forwarded to ``prepare_data``.
    folds : int
        Number of folds to process.

    Returns
    -------
    (list, float)
        Per-fold Pearson correlations between the test 'auc' values and the
        predictions (NaN replaced by 0.0), and their sum divided by ``folds``.
    """
    fold_corr_list = []
    corr_sum = 0.0
    for i in range(1, folds+1):

        train_df = pd.read_csv("../data/training_files_av/" + str(i) + "_train_" + dataset + '_' + drug + ".txt", sep='\t', header=None, names=(['cell', 'smiles', 'auc', 'dataset']))
        test_df = pd.read_csv("../data/training_files_av/" + str(i) + "_test_" + dataset + '_' + drug + ".txt", sep='\t', header=None, names=['cell', 'smiles', 'auc', 'dataset'])
        train_X, train_Y, test_X, test_Y = prepare_data(train_df, test_df, cell_features)

        regr = RandomForestRegressor(random_state=0, n_jobs=-2)
        regr.fit(train_X, train_Y)
        predicted_Y = regr.predict(test_X)

        corr = stats.pearsonr(test_Y, predicted_Y)[0]
        # pearsonr gives NaN when either vector is constant. Score such a fold
        # as 0.0, as run_elastic_net does, so that one degenerate fold cannot
        # turn the reported average into NaN.
        if math.isnan(corr):
            corr = 0.0
        fold_corr_list.append(corr)
        corr_sum += corr

        modeldir = "../models/random_forest/"
        np.savetxt(modeldir + "predict_" + dataset + '_' + drug + '_' + str(i) + ".txt", predicted_Y, fmt = '%.4e')

    return fold_corr_list, (corr_sum/folds)


In [None]:
def optimize_mlp(trial, train_df, val_df, cell_features):
    """Optuna objective: validation Pearson correlation of an MLP fit.

    Only the L2 penalty ``alpha`` is tuned (log-uniform in [1e-5, 1e-2]); the
    hidden-layer layout and the other settings are fixed.

    Returns the Pearson correlation between the validation 'auc' values and
    the predictions, or 0.0 when that correlation is NaN.
    """
    hidden = (30, 84, 150, 240, 258, 18, 6)
    penalty = trial.suggest_float('alpha', 1e-5, 1e-2, log=True)

    X_fit, y_fit, X_val, y_val = prepare_data(train_df, val_df, cell_features)

    model = MLPRegressor(
        hidden_layer_sizes=hidden,
        activation='tanh',
        alpha=penalty,
        batch_size=64,
        learning_rate='constant',
        shuffle=True,
        early_stopping=True,
    )
    model.fit(X_fit, y_fit)

    score = stats.pearsonr(y_val, model.predict(X_val))[0]
    # A NaN correlation is reported to the study as 0.0.
    return 0.0 if math.isnan(score) else score

In [None]:
def run_mlp(dataset, ont, drug, cell_features, folds=5):
    """Tune, fit and evaluate an MLP regressor on every fold of one drug.

    For each fold the training file is split by cell line into train and
    validation parts, a 10-trial Optuna study picks ``alpha`` on that split,
    and a final ``MLPRegressor`` is refitted on the whole training file and
    scored on the fold's test file. Test predictions are written to
    ``../models/mlp/predict_<dataset>_<drug>_<i>.txt``.

    Parameters
    ----------
    dataset, drug : str
        Used to build the input and output file names.
    ont : any
        Not used inside this function.
    cell_features : array-like
        Per-cell feature table forwarded to ``prepare_data``.
    folds : int
        Number of folds to process.

    Returns
    -------
    (list, float)
        Per-fold Pearson correlations between the test 'auc' values and the
        predictions (NaN replaced by 0.0), and their sum divided by ``folds``.
    """
    fold_corr_list = []
    corr_sum = 0.0
    for i in range(1, folds+1):

        all_df = pd.read_csv("../data/training_files_av/" + str(i) + "_train_" + dataset + '_' + drug + ".txt", sep='\t', header=None, names=(['cell', 'smiles', 'auc', 'dataset']))
        train_df, val_df = split_train_data(all_df)

        # Hyper-parameter search on the internal train/validation split.
        study = optuna.create_study(direction="maximize")
        study.optimize(lambda trial: optimize_mlp(trial, train_df, val_df, cell_features), n_trials=10)
        best_params = study.best_params

        test_df = pd.read_csv("../data/training_files_av/" + str(i) + "_test_" + dataset + '_' + drug + ".txt", sep='\t', header=None, names=['cell', 'smiles', 'auc', 'dataset'])
        train_X, train_Y, test_X, test_Y = prepare_data(all_df, test_df, cell_features)

        # Same fixed architecture as in optimize_mlp; only alpha comes from the study.
        layers = (30, 84, 150, 240, 258, 18, 6)
        regr = MLPRegressor(shuffle=True, early_stopping=True, learning_rate='constant', batch_size=64, activation='tanh', hidden_layer_sizes=layers, alpha=best_params['alpha'])
        regr.fit(train_X, train_Y)
        predicted_Y = regr.predict(test_X)

        corr = stats.pearsonr(test_Y, predicted_Y)[0]
        # pearsonr gives NaN when either vector is constant. Score such a fold
        # as 0.0, matching optimize_mlp and run_elastic_net, so that one
        # degenerate fold cannot turn the reported average into NaN.
        if math.isnan(corr):
            corr = 0.0
        fold_corr_list.append(corr)
        corr_sum += corr

        modeldir = "../models/mlp/"
        np.savetxt(modeldir + "predict_" + dataset + '_' + drug + '_' + str(i) + ".txt", predicted_Y, fmt = '%.4e')

    return fold_corr_list, (corr_sum/folds)


In [None]:
def run_elastic_net_genie(dataset, ont, drug, gene_list, cell_map_train, cell_features_train, cell_map_test, cell_features_test, folds=5):
    """Fit ElasticNetCV on each training fold and predict the GENIE test set.

    For every fold the fold's training file is read, an intercept-free
    ``ElasticNetCV`` (10-fold internal CV) is fitted on it, and predictions
    for ``../data/GENIE/GENIE_test_<dataset>_<drug>.txt`` are written to
    ``../models/elastic_net/predict_genie_<dataset>_<drug>_<fold>.txt``.

    ``ont`` is not used inside this function. Nothing is returned.
    """
    columns = ['cell', 'smiles', 'auc', 'dataset']
    modeldir = "../models/elastic_net/"

    for fold in range(1, folds + 1):
        train_path = "../data/training_files_av/" + str(fold) + "_train_" + dataset + '_' + drug + ".txt"
        fit_df = pd.read_csv(train_path, sep='\t', header=None, names=columns)
        X_fit, y_fit = prepare_data_feat(fit_df, gene_list, cell_map_train, cell_features_train)

        # The GENIE file does not depend on the fold; only the model changes.
        genie_path = "../data/GENIE/GENIE_test_" + dataset + '_' + drug + ".txt"
        genie_df = pd.read_csv(genie_path, sep='\t', header=None, names=columns)
        X_genie, _ = prepare_data_feat(genie_df, gene_list, cell_map_test, cell_features_test)

        model = ElasticNetCV(fit_intercept=False, cv=10, max_iter=3000, tol=1e-3, n_jobs=-2)
        model.fit(X_fit, y_fit)

        out_path = modeldir + "predict_genie_" + dataset + '_' + drug + '_' + str(fold) + ".txt"
        np.savetxt(out_path, model.predict(X_genie), fmt='%.4e')


In [None]:
def run_random_forest_genie(dataset, ont, drug, gene_list, cell_map_train, cell_features_train, cell_map_test, cell_features_test, folds=5):
    """Fit a random forest on each training fold and predict the GENIE test set.

    For every fold the fold's training file is read, a
    ``RandomForestRegressor`` (``random_state=0``) is fitted on it, and
    predictions for ``../data/GENIE/GENIE_test_<dataset>_<drug>.txt`` are
    written to
    ``../models/random_forest/predict_genie_<dataset>_<drug>_<fold>.txt``.

    ``ont`` is not used inside this function. Nothing is returned.
    """
    columns = ['cell', 'smiles', 'auc', 'dataset']
    modeldir = "../models/random_forest/"

    for fold in range(1, folds + 1):
        train_path = "../data/training_files_av/" + str(fold) + "_train_" + dataset + '_' + drug + ".txt"
        fit_df = pd.read_csv(train_path, sep='\t', header=None, names=columns)
        X_fit, y_fit = prepare_data_feat(fit_df, gene_list, cell_map_train, cell_features_train)

        # The GENIE file does not depend on the fold; only the model changes.
        genie_path = "../data/GENIE/GENIE_test_" + dataset + '_' + drug + ".txt"
        genie_df = pd.read_csv(genie_path, sep='\t', header=None, names=columns)
        X_genie, _ = prepare_data_feat(genie_df, gene_list, cell_map_test, cell_features_test)

        model = RandomForestRegressor(random_state=0, n_jobs=-2)
        model.fit(X_fit, y_fit)

        out_path = modeldir + "predict_genie_" + dataset + '_' + drug + '_' + str(fold) + ".txt"
        np.savetxt(out_path, model.predict(X_genie), fmt='%.4e')


In [None]:
# Identifiers used to build the input file names below.
dataset = 'av'
ont = 'ctg'
# NOTE(review): zscore_method is not referenced anywhere else in the visible notebook.
zscore_method = 'auc'

# Gene index file: tab-separated (index, gene) pairs. The gene column is used
# below as the column names of the per-cell feature tables.
gene_index = pd.read_csv('../data/training_files_av/gene2ind_' + ont + '_' + dataset + '.txt', sep='\t', header=None, names=(['I', 'G']))
gene_list = gene_index['G']

# Cell index file: tab-separated (index, cell) pairs, turned into a
# cell -> index lookup. prepare_data reads this global directly.
cell_index = pd.read_csv('../data/training_files_av/cell2ind_' + dataset + '.txt', sep='\t', header=None, names=(['I', 'C']))
cell_map = dict(zip(cell_index['C'], cell_index['I']))

# Three headerless per-cell tables, each labelled with gene_list.
# presumably row k belongs to the cell whose index is k -- verify against the data files.
mutations = pd.read_csv('../data/training_files_av/cell2mutation_' + ont + '_' + dataset + '.txt', header=None, names=gene_list)
cn_deletions = pd.read_csv('../data/training_files_av/cell2cndeletion_' + ont + '_' + dataset + '.txt', header=None, names=gene_list)
cn_amplifications = pd.read_csv('../data/training_files_av/cell2cnamplification_' + ont + '_' + dataset + '.txt', header=None, names=gene_list)

# The three tables are merged with an element-wise OR into one NumPy matrix.
# The disabled line below is an alternative that concatenates them instead.
#cell_features = np.concatenate([mutations, cn_deletions, cn_amplifications])
cell_features = np.array(mutations | cn_deletions | cn_amplifications)

In [None]:
# Load the drug names (column 'D') from the headerless drug list file.
drugs = list(pd.read_csv('../data/training_files_av/drugname_av.txt', header=None, names=['D'])['D'])

In [None]:
# One row per drug: the five per-fold correlations and their average.
elasticnet_df = pd.DataFrame(index=range(len(drugs)), columns=['Drug', 'Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_5', 'Average'])

for i, drug in enumerate(drugs):

    corr_list, avg_corr = run_elastic_net(dataset, ont, drug, cell_features)
    print(drug, corr_list, avg_corr)

    # Write with a single .loc[row, column] call. The chained form
    # df.loc[i]['col'] = value assigns into a temporary row object and is
    # silently lost when pandas Copy-on-Write is active.
    elasticnet_df.loc[i, 'Drug'] = drug
    elasticnet_df.loc[i, 'Average'] = avg_corr
    for k in range(5):
        elasticnet_df.loc[i, 'Fold_'+str(k+1)] = corr_list[k]

In [None]:
# Save the ElasticNet correlation table as a tab-separated file without the index column.
elasticnet_df.to_csv('../models/elastic_net/corr_50.txt', sep='\t', index=False)

In [None]:
# One row per drug: the five per-fold correlations and their average.
randomforest_df = pd.DataFrame(index=range(len(drugs)), columns=['Drug', 'Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_5', 'Average'])

for i, drug in enumerate(drugs):

    corr_list, avg_corr = run_random_forest(dataset, ont, drug, cell_features)
    print(drug, corr_list, avg_corr)

    # Write with a single .loc[row, column] call. The chained form
    # df.loc[i]['col'] = value assigns into a temporary row object and is
    # silently lost when pandas Copy-on-Write is active.
    randomforest_df.loc[i, 'Drug'] = drug
    randomforest_df.loc[i, 'Average'] = avg_corr
    for k in range(5):
        randomforest_df.loc[i, 'Fold_'+str(k+1)] = corr_list[k]

In [None]:
# Save the random-forest correlation table as a tab-separated file without the index column.
randomforest_df.to_csv('../models/random_forest/corr_50.txt', sep='\t', index=False)

In [None]:
# One row per drug: the five per-fold correlations and their average.
mlp_df = pd.DataFrame(index=range(len(drugs)), columns=['Drug', 'Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_5', 'Average'])

for i, drug in enumerate(drugs):

    corr_list, avg_corr = run_mlp(dataset, ont, drug, cell_features)
    print(drug, corr_list, avg_corr)

    # Write with a single .loc[row, column] call. The chained form
    # df.loc[i]['col'] = value assigns into a temporary row object and is
    # silently lost when pandas Copy-on-Write is active.
    mlp_df.loc[i, 'Drug'] = drug
    mlp_df.loc[i, 'Average'] = avg_corr
    for k in range(5):
        mlp_df.loc[i, 'Fold_'+str(k+1)] = corr_list[k]

In [None]:
# Save the MLP correlation table as a tab-separated file without the index column.
mlp_df.to_csv('../models/mlp/corr_50.txt', sep='\t', index=False)

In [None]:
# Inputs for the GENIE prediction cells that follow.
dataset = 'av'
ont = 'ctg'

# Gene index file: tab-separated (index, gene) pairs; the gene column labels the feature tables.
gene_index = pd.read_csv('../data/training_files_av/gene2ind_' + ont + '_' + dataset + '.txt', sep='\t', header=None, names=(['I', 'G']))
gene_list = gene_index['G']

# Training-side cell lookup and features.
# NOTE(review): this rebinds cell_features to a pandas DataFrame holding only
# the mutation table, whereas the earlier feature cell built a NumPy array from
# mutations | deletions | amplifications. Re-running the earlier model cells
# after this one would use different features.
cell_index = pd.read_csv('../data/training_files_av/cell2ind_' + dataset + '.txt', sep='\t', header=None, names=(['I', 'C']))
cell_map = dict(zip(cell_index['C'], cell_index['I']))
cell_features = pd.read_csv('../data/training_files_av/cell2mutation_' + ont + '_' + dataset + '.txt', header=None, names=gene_list)

# GENIE-side cell lookup and mutation features, labelled with the same gene_list.
cell_index_genie = pd.read_csv('../data/GENIE/GENIE_all_cell2ind.txt', sep='\t', header=None, names=(['I', 'C']))
cell_map_genie = dict(zip(cell_index_genie['C'], cell_index_genie['I']))
cell_features_genie = pd.read_csv('../data/GENIE/GENIE_cell2mutation_' + dataset + '.txt', header=None, names=gene_list)

In [None]:
# Restrict the GENIE runs to a single drug (this rebinds `drugs`).
drugs = ["Palbociclib"]

# Fit ElasticNetCV per training fold and write the GENIE predictions to disk.
for drug in drugs:
    run_elastic_net_genie(dataset, ont, drug, gene_list, cell_map, cell_features, cell_map_genie, cell_features_genie)

In [None]:
# Fit a random forest per training fold and write the GENIE predictions to disk.
for drug in drugs:
    run_random_forest_genie(dataset, ont, drug, gene_list, cell_map, cell_features, cell_map_genie, cell_features_genie)