In [None]:
import copy
from lifelines import KaplanMeierFitter
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from lifelines.statistics import logrank_test
from lifelines import CoxPHFitter
from scipy.stats import norm

In [None]:
# Keep text in exported SVGs as text instead of converting glyphs to paths.
plt.rcParams['svg.fonttype'] = 'none'

# Three font-size tiers used by the rc settings below.
SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 12, 16, 22

plt.rc('font', size=SMALL_SIZE)                                # default text size
plt.rc('axes', titlesize=SMALL_SIZE, labelsize=MEDIUM_SIZE)    # axes title / x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)                          # x tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)                          # y tick labels
plt.rc('legend', fontsize=SMALL_SIZE)                          # legend entries
plt.rc('figure', titlesize=BIGGER_SIZE)                        # figure-level title

In [None]:
def create_line_curve(X, Y, x_title, y_title, x_lim, y_lim):
    """Plot ``Y`` against ``X`` as a single line, show the figure and return it.

    Args:
        X: x values passed to ``Axes.plot``.
        Y: y values passed to ``Axes.plot``.
        x_title: label for the x axis.
        y_title: label for the y axis.
        x_lim: x-axis limits, passed to ``Axes.set_xlim``.
        y_lim: y-axis limits, passed to ``Axes.set_ylim``.

    Returns:
        The 5x5 inch matplotlib figure holding the plot.
    """
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.plot(X, Y)
    ax.grid(False)

    # Labels first, then limits; the two are independent of each other.
    ax.set_xlabel(x_title)
    ax.set_ylabel(y_title)
    ax.set_xlim(x_lim)
    ax.set_ylim(y_lim)

    plt.show()
    return fig

In [None]:
def create_kaplan_meier(T1, T2, E1, E2, entity):
    """Draw Kaplan-Meier curves for two groups on one axes and return the figure.

    The first group (``T1``/``E1``) is labelled "Sensitive" and the second
    (``T2``/``E2``) "Resistant"; each legend entry carries the group size.
    The median survival time of each fitted curve is printed.

    Args:
        T1: durations of the sensitive group.
        T2: durations of the resistant group.
        E1: event-observed flags matching ``T1``.
        E2: event-observed flags matching ``T2``.
        entity: text prefixed to " Survival plot" in the title.

    Returns:
        The 5x5 inch matplotlib figure containing both curves.
    """
    fig, ax = plt.subplots(figsize=(5, 5))
    kmf_d = KaplanMeierFitter()

    # One fitter is refitted per group; each curve is drawn before the next fit.
    for group_name, durations, observed in (('Sensitive', T1, E1), ('Resistant', T2, E2)):
        curve_label = '{} (n = {})'.format(group_name, len(durations))
        kmf_d = kmf_d.fit(durations, event_observed=observed, label=curve_label)
        kmf_d.plot(ci_show=False, ax=ax)
        print('Median Survival ({}): {:.1f}'.format(group_name, kmf_d.median_survival_time_))

    #ax.set_xlim(0, 60)
    ax.set_ylim(0.0, 1.1)
    ax.set_xlabel('Overall Survival Months')
    ax.set_ylabel('Surviving Fractions')
    plt.title(entity + " Survival plot")
    plt.show()

    return fig

In [None]:
def categorize_samples(genie_drug_data, drug, fraction):
    """Label the lowest-ranked fraction of samples as responders.

    ``genie_drug_data`` is sorted IN PLACE by the ``drug`` column (ascending,
    index reset to 0..n-1); callers observe this reordering. The first
    ``int(len(genie_drug_data) * fraction)`` rows of the sorted frame get
    ``response = 1`` and all remaining rows get ``response = 0``.

    Args:
        genie_drug_data: DataFrame with the ``drug`` column plus
            'Overall_Survival_Months' and 'Overall_Survival_Status'.
        drug: name of the column to rank samples by.
        fraction: share of the samples placed in the ``response = 1`` group.

    Returns:
        DataFrame with columns 'Overall_Survival_Months', 'response' and
        'event', one row per input row in sorted order. ``event`` is 0 when the
        status string contains 'LIVING' and 1 otherwise. An empty input yields
        an empty frame with the same three columns.

    Raises:
        TypeError: if a status value is not a string (e.g. a missing value).
    """
    data_size = len(genie_drug_data)
    category_size = int(data_size * fraction)
    genie_drug_data.sort_values(by=drug, inplace=True, ignore_index=True)

    # Build each column once instead of one single-row DataFrame per sample.
    response = [1 if position < category_size else 0 for position in range(data_size)]
    event = [
        0 if 'LIVING' in status else 1
        for status in genie_drug_data['Overall_Survival_Status']
    ]

    return pd.DataFrame({
        'Overall_Survival_Months': genie_drug_data['Overall_Survival_Months'].to_numpy(),
        'response': response,
        'event': event,
    })

In [None]:
def categorize_samples_cl(genie_drug_data, drug, cl_median, cl_std):
    """Label samples as responders using a fixed threshold on the ``drug`` column.

    A row gets ``response = 1`` when its ``drug`` value is less than or equal
    to ``cl_median - cl_std`` and ``response = 0`` otherwise. Row order is
    preserved and the input frame is not modified.

    Args:
        genie_drug_data: DataFrame with the ``drug`` column plus
            'Overall_Survival_Months' and 'Overall_Survival_Status'.
        drug: name of the column compared against the threshold.
        cl_median: reference median used to build the threshold.
        cl_std: reference standard deviation subtracted from ``cl_median``.

    Returns:
        DataFrame with columns 'Overall_Survival_Months', 'response' and
        'event', one row per input row. ``event`` is 0 when the status string
        contains 'LIVING' and 1 otherwise. An empty input yields an empty
        frame with the same three columns.

    Raises:
        TypeError: if a status value is not a string (e.g. a missing value).
    """
    threshold = cl_median - cl_std

    # Vectorised comparison; missing drug values compare False and get response 0.
    response = (genie_drug_data[drug] <= threshold).astype(int).to_numpy()
    event = [
        0 if 'LIVING' in status else 1
        for status in genie_drug_data['Overall_Survival_Status']
    ]

    return pd.DataFrame({
        'Overall_Survival_Months': genie_drug_data['Overall_Survival_Months'].to_numpy(),
        'response': response,
        'event': event,
    })

In [None]:
def _logrank_pvalue(survival_df):
    """Return the log-rank p-value between the response == 1 and response == 0 rows."""
    sensitive = survival_df[survival_df['response'] == 1]
    resistant = survival_df[survival_df['response'] == 0]
    return logrank_test(
        durations_A=list(sensitive['Overall_Survival_Months']),
        durations_B=list(resistant['Overall_Survival_Months']),
        event_observed_A=list(sensitive['event']),
        event_observed_B=list(resistant['event']),
    ).p_value


def get_best_param(genie_drug_data, drug, cl_median, cl_std, th_type=None):
    """Scan split cut-offs and return the split with the smallest log-rank p-value.

    Cut-offs start at 0.1 and grow by 0.01 while they stay below 0.91. For each
    cut-off the samples are split with ``categorize_samples`` and the two
    groups are compared with a log-rank test. When ``th_type == 'cl'`` the
    split comes from ``categorize_samples_cl`` instead; it does not depend on
    the cut-off, so it is computed once and reused for every cut-off.

    The best cut-off is printed and the p-value-versus-cut-off curve is drawn
    with ``create_line_curve``. ``genie_drug_data`` is sorted in place by
    ``drug`` in both modes.

    Args:
        genie_drug_data: DataFrame with the ``drug`` column and the survival
            columns required by the categorize helpers.
        drug: name of the column holding the per-sample prediction.
        cl_median: reference median, used only when ``th_type == 'cl'``.
        cl_std: reference standard deviation, used only when ``th_type == 'cl'``.
        th_type: ``'cl'`` selects the fixed threshold split; any other value
            selects the fraction split.

    Returns:
        The survival DataFrame of the best split. If no cut-off produces a
        p-value below 1.0 (or every p-value is NaN) the split of the first
        cut-off is returned, so the result is always a DataFrame.
    """
    f_min = 0.1
    f_max = 0.91

    fixed_df = None
    fixed_pval = None
    if th_type == 'cl':
        # The fraction path sorts the caller's frame in place; do the same here
        # so both modes leave the frame in the same order.
        genie_drug_data.sort_values(by=drug, inplace=True, ignore_index=True)
        fixed_df = categorize_samples_cl(genie_drug_data, drug, cl_median, cl_std)
        fixed_pval = _logrank_pvalue(fixed_df)

    best_pval = 1.0
    best_f = 0
    best_df = None
    records = []

    f = f_min
    while f < f_max:
        if fixed_df is not None:
            survival_df, pval = fixed_df, fixed_pval
        else:
            survival_df = categorize_samples(genie_drug_data, drug, f)
            pval = _logrank_pvalue(survival_df)

        # Always accept the first split so a DataFrame is returned even when no
        # p-value beats 1.0; afterwards accept strict improvements, and let any
        # real p-value replace a NaN one.
        replaces_nan = np.isnan(best_pval) and not np.isnan(pval)
        if best_df is None or pval < best_pval or replaces_nan:
            best_pval = pval
            best_f = f
            best_df = survival_df

        records.append([f, pval])
        # Accumulated step; presumably f_max is 0.91 rather than 0.90 so that
        # the 0.90 cut-off is still evaluated despite float drift.
        f += 0.01

    print('Most optimal cut-off: {:.4f}'.format(best_f))
    pval_df = pd.DataFrame(records, columns=['Cut_off', 'P_Value'])
    create_line_curve(pval_df['Cut_off'], pval_df['P_Value'], 'Cut_off', 'P_Value', (f_min, f_max), (0.0, 0.5))

    return best_df

In [None]:
def genie_analysis_folds(test_data, genie_drug_data, d, fold_size=5, th_type=None):
    """Run the survival analysis separately for each model fold.

    For every fold ``k`` in ``1..fold_size`` this:
      1. loads ``../models/model_ctg_av_<d>_auc_<k>/predict.txt`` and takes its
         median and standard deviation (passed on as ``cl_median``/``cl_std``);
      2. loads ``predict_genie_428.txt`` from the same directory and pairs the
         predictions with the rows of ``test_data`` by position;
      3. writes each sample's prediction into column ``d`` of
         ``genie_drug_data`` by matching 'Sample_ID' to 'cell_line' (first match);
      4. sorts ``genie_drug_data`` by ``d``, picks the best split with
         ``get_best_param``, prints the log-rank p-value and draws the
         Kaplan-Meier plot.

    ``genie_drug_data`` is modified in place (column ``d`` is added or
    overwritten and the rows are re-sorted).

    Args:
        test_data: DataFrame with 'cell_line' and 'smiles' columns, one row per
            prediction in ``predict_genie_428.txt``.
        genie_drug_data: DataFrame with 'Sample_ID' and the survival columns.
        d: drug name; used in the model path, as the prediction column name and
            as the plot title prefix.
        fold_size: number of folds to process.
        th_type: forwarded to ``get_best_param``.

    Returns:
        None.

    Raises:
        IndexError: if a 'Sample_ID' has no matching 'cell_line' in ``test_data``.
    """
    genie_drug_data[d] = 0.0

    for k in range(1, fold_size+1):

        model_dir = '../models/model_ctg_av_' + d + '_auc_' + str(k)

        cl_predict_data = np.loadtxt(model_dir + '/predict.txt')
        cl_median = np.median(cl_predict_data)
        cl_std = np.std(cl_predict_data)

        predict_data = np.loadtxt(model_dir + '/predict_genie_428.txt')
        pred_df = pd.Series(predict_data, name='pred')
        pred_df = pd.concat([test_data, pred_df], axis=1)[['cell_line', 'smiles', 'pred']]

        for i, row in genie_drug_data.iterrows():
            sampleId = row['Sample_ID']
            p_auc = list(pred_df[pred_df['cell_line'] == sampleId]['pred'])[0]
            genie_drug_data.at[i, d] = p_auc

        # Sort by the column this function just filled (parameter ``d``), not
        # by whatever a notebook-level ``drug`` variable happens to hold.
        genie_drug_data.sort_values(by=d, inplace=True, ignore_index=True)
        survival_df = get_best_param(genie_drug_data, d, cl_median, cl_std, th_type=th_type)

        T1 = list(survival_df[survival_df['response'] == 1]['Overall_Survival_Months'])
        T2 = list(survival_df[survival_df['response'] == 0]['Overall_Survival_Months'])
        E1 = list(survival_df[survival_df['response'] == 1]['event'])
        E2 = list(survival_df[survival_df['response'] == 0]['event'])
        print('p-value: {:.4f}'.format(logrank_test(durations_A=T1, durations_B=T2, event_observed_A=E1, event_observed_B=E2).p_value))
        km_fig = create_kaplan_meier(T1, T2, E1, E2, d)
        # km_fig.savefig('../plots/genie_survival_plot_' + str(k) + '.svg')

In [None]:
def genie_analysis_avg(test_data, genie_drug_data, d, fold_size=5, th_type=None):
    """Run the survival analysis once on predictions averaged over all folds.

    Steps:
      1. pool ``../models/model_ctg_av_<d>_auc_<k>/predict.txt`` over folds
         ``1..fold_size`` and print the pooled median and standard deviation
         (passed on as ``cl_median``/``cl_std``);
      2. average ``predict_genie_428.txt`` element-wise over the folds, print
         the median of the averages, and pair them with the rows of
         ``test_data`` by position;
      3. write each sample's averaged prediction into column ``d`` of
         ``genie_drug_data`` by matching 'Sample_ID' to 'cell_line' (first match);
      4. sort ``genie_drug_data`` by ``d``, pick the best split with
         ``get_best_param``, print the log-rank p-value and draw the
         Kaplan-Meier plot.

    ``genie_drug_data`` is modified in place (column ``d`` is added or
    overwritten and the rows are re-sorted).

    Args:
        test_data: DataFrame with 'cell_line' and 'smiles' columns; its length
            must match the number of predictions per fold.
        genie_drug_data: DataFrame with 'Sample_ID' and the survival columns.
        d: drug name; used in the model path, as the prediction column name and
            as the plot title prefix.
        fold_size: number of folds to combine.
        th_type: forwarded to ``get_best_param``.

    Returns:
        None.

    Raises:
        IndexError: if a 'Sample_ID' has no matching 'cell_line' in ``test_data``.
    """
    genie_drug_data[d] = 0.0

    all_cl_predict_data = []
    for k in range(1, fold_size+1):
        cl_predict_data = np.loadtxt('../models/model_ctg_av_' + d + '_auc_' + str(k) + '/predict.txt')
        all_cl_predict_data.extend(cl_predict_data)
    cl_median = np.median(all_cl_predict_data)
    cl_std = np.std(all_cl_predict_data)
    print(cl_median, cl_std)

    all_predict_data = np.zeros(len(test_data))
    for k in range(1, fold_size+1):
        predict_data = np.loadtxt('../models/model_ctg_av_' + d + '_auc_' + str(k) + '/predict_genie_428.txt')
        all_predict_data += predict_data
    all_predict_data /= fold_size
    print(np.median(all_predict_data))

    pred_df = pd.Series(all_predict_data, name='pred')
    pred_df = pd.concat([test_data, pred_df], axis=1)[['cell_line', 'smiles', 'pred']]

    for i, row in genie_drug_data.iterrows():
        sampleId = row['Sample_ID']
        p_auc = list(pred_df[pred_df['cell_line'] == sampleId]['pred'])[0]
        genie_drug_data.at[i, d] = p_auc

    # Sort by the column this function just filled (parameter ``d``), not by
    # whatever a notebook-level ``drug`` variable happens to hold.
    genie_drug_data.sort_values(by=d, inplace=True, ignore_index=True)
    survival_df = get_best_param(genie_drug_data, d, cl_median, cl_std, th_type=th_type)

    T1 = list(survival_df[survival_df['response'] == 1]['Overall_Survival_Months'])
    T2 = list(survival_df[survival_df['response'] == 0]['Overall_Survival_Months'])
    E1 = list(survival_df[survival_df['response'] == 1]['event'])
    E2 = list(survival_df[survival_df['response'] == 0]['event'])
    print('p-value: {:.4f}'.format(logrank_test(durations_A=T1, durations_B=T2, event_observed_A=E1, event_observed_B=E2).p_value))
    km_fig = create_kaplan_meier(T1, T2, E1, E2, d)
    # km_fig.savefig('../plots/genie_survival_plot.svg')

In [None]:
# Drug name -> clinical-table column used to select patients for that drug.
func_map = {'Palbociclib': 'CDK4_6_Inhibitor_Overall'}

# Drug name -> SMILES string.
smiles_map = dict(Palbociclib="CC1=C(C(=O)N(C2=NC(=NC=C12)NC3=NC=C(C=C3)N4CCNCC4)C5CCCC5)C(=O)C")

In [None]:
# Common data

# Clinical table filtered by every cohort cell in this notebook.
genie_data = pd.read_csv('../data/GENIE/brca_akt1_genie_2019_clinical_data.tsv', sep='\t')

# Normalise column names: spaces and slashes become underscores, parentheses
# are removed (all literal, non-regex replacements).
genie_data.columns = (
    genie_data.columns
    .str.replace(' ', '_', regex=False)
    .str.replace('/', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

# Single-column, header-less file read as a Series named 'G'.
all_genie_genes = pd.read_csv('../data/GENIE/GENIE_gene_list.txt', header=None, names=['G'])['G']

In [None]:
drug = 'Palbociclib'

# Tab-separated, header-less file; column names are assigned here.
test_data = pd.read_csv(
    "../data/GENIE/test_428_" + drug + ".txt",
    sep='\t',
    header=None,
    names=['cell_line', 'smiles', 'auc', 'dataset'],
)

# Cohort: rows whose drug-class column equals 'Yes', re-indexed from 0.
genie_drug_data = genie_data[genie_data[func_map[drug]] == 'Yes'].reset_index(drop=True)
print('Sample size:', len(genie_drug_data))

#genie_analysis_folds(test_data, genie_drug_data, drug)

In [None]:
drug = 'Palbociclib'

# Tab-separated, header-less file; column names are assigned here.
test_data = pd.read_csv(
    "../data/GENIE/test_428_" + drug + ".txt",
    sep='\t',
    header=None,
    names=['cell_line', 'smiles', 'auc', 'dataset'],
)

# Cohort: rows whose drug-class column equals 'Yes', re-indexed from 0.
genie_drug_data = genie_data[genie_data[func_map[drug]] == 'Yes'].reset_index(drop=True)
print('Sample size:', len(genie_drug_data))

#genie_analysis_folds(test_data, genie_drug_data, drug, th_type='cl')

In [None]:
drug = 'Palbociclib'

# Tab-separated, header-less file; column names are assigned here.
test_data = pd.read_csv(
    "../data/GENIE/test_428_" + drug + ".txt",
    sep='\t',
    header=None,
    names=['cell_line', 'smiles', 'auc', 'dataset'],
)

# Cohort: drug-class column 'Yes', mTOR and AKT inhibitor columns both 'No'.
genie_drug_data = genie_data[
    (genie_data[func_map[drug]] == 'Yes')
    & (genie_data['mTOR_Inhibitor_Overall'] == 'No')
    & (genie_data['AKT_Inhibitor_Overall'] == 'No')
].reset_index(drop=True)
print('Sample size:', len(genie_drug_data))

#genie_analysis_folds(test_data, genie_drug_data, drug)

In [None]:
drug = 'Palbociclib'

# Tab-separated, header-less file; column names are assigned here.
test_data = pd.read_csv(
    "../data/GENIE/test_428_" + drug + ".txt",
    sep='\t',
    header=None,
    names=['cell_line', 'smiles', 'auc', 'dataset'],
)

# Cohort: drug-class column 'Yes' and Sample_Type 'Metastatic'.
genie_drug_data = genie_data[
    (genie_data[func_map[drug]] == 'Yes')
    & (genie_data['Sample_Type'] == 'Metastatic')
].reset_index(drop=True)
print('Sample size:', len(genie_drug_data))

#genie_analysis_folds(test_data, genie_drug_data, drug)

In [None]:
drug = 'Palbociclib'

# Tab-separated, header-less file; column names are assigned here.
test_data = pd.read_csv(
    "../data/GENIE/test_428_" + drug + ".txt",
    sep='\t',
    header=None,
    names=['cell_line', 'smiles', 'auc', 'dataset'],
)

# Cohort: drug-class column 'Yes' and Sample_Type 'Primary'.
genie_drug_data = genie_data[
    (genie_data[func_map[drug]] == 'Yes')
    & (genie_data['Sample_Type'] == 'Primary')
].reset_index(drop=True)
print('Sample size:', len(genie_drug_data))

#genie_analysis_folds(test_data, genie_drug_data, drug)

In [None]:
drug = 'Palbociclib'

# Tab-separated, header-less file; column names are assigned here.
test_data = pd.read_csv(
    "../data/GENIE/test_428_" + drug + ".txt",
    sep='\t',
    header=None,
    names=['cell_line', 'smiles', 'auc', 'dataset'],
)

# Cohort: drug-class column 'Yes', mTOR and AKT inhibitor columns both 'No',
# and Sample_Type 'Metastatic'.
genie_drug_data = genie_data[
    (genie_data[func_map[drug]] == 'Yes')
    & (genie_data['mTOR_Inhibitor_Overall'] == 'No')
    & (genie_data['AKT_Inhibitor_Overall'] == 'No')
    & (genie_data['Sample_Type'] == 'Metastatic')
].reset_index(drop=True)
print('Sample size:', len(genie_drug_data))

#genie_analysis_folds(test_data, genie_drug_data, drug)

In [None]:
drug = 'Palbociclib'

# Tab-separated, header-less file; column names are assigned here.
test_data = pd.read_csv(
    "../data/GENIE/test_428_" + drug + ".txt",
    sep='\t',
    header=None,
    names=['cell_line', 'smiles', 'auc', 'dataset'],
)

# Cohort: rows whose drug-class column equals 'Yes', re-indexed from 0.
genie_drug_data = genie_data[genie_data[func_map[drug]] == 'Yes'].reset_index(drop=True)
print('Sample size:', len(genie_drug_data))

genie_analysis_avg(test_data, genie_drug_data, drug)

In [None]:
drug = 'Palbociclib'

# Tab-separated, header-less file; column names are assigned here.
test_data = pd.read_csv(
    "../data/GENIE/test_428_" + drug + ".txt",
    sep='\t',
    header=None,
    names=['cell_line', 'smiles', 'auc', 'dataset'],
)

# Cohort: rows whose drug-class column equals 'Yes', re-indexed from 0.
genie_drug_data = genie_data[genie_data[func_map[drug]] == 'Yes'].reset_index(drop=True)
print('Sample size:', len(genie_drug_data))

#genie_analysis_avg(test_data, genie_drug_data, drug, th_type='cl')

In [None]:
drug = 'Palbociclib'

# Tab-separated, header-less file; column names are assigned here.
test_data = pd.read_csv(
    "../data/GENIE/test_428_" + drug + ".txt",
    sep='\t',
    header=None,
    names=['cell_line', 'smiles', 'auc', 'dataset'],
)

# Cohort: drug-class column 'Yes' and Sample_Type 'Metastatic'.
genie_drug_data = genie_data[
    (genie_data[func_map[drug]] == 'Yes')
    & (genie_data['Sample_Type'] == 'Metastatic')
].reset_index(drop=True)
print('Sample size:', len(genie_drug_data))

genie_analysis_avg(test_data, genie_drug_data, drug)

In [None]:
drug = 'Palbociclib'

# Tab-separated, header-less file; column names are assigned here.
test_data = pd.read_csv(
    "../data/GENIE/test_428_" + drug + ".txt",
    sep='\t',
    header=None,
    names=['cell_line', 'smiles', 'auc', 'dataset'],
)

# Cohort: drug-class column 'Yes' and Sample_Type 'Primary'.
genie_drug_data = genie_data[
    (genie_data[func_map[drug]] == 'Yes')
    & (genie_data['Sample_Type'] == 'Primary')
].reset_index(drop=True)
print('Sample size:', len(genie_drug_data))

genie_analysis_avg(test_data, genie_drug_data, drug)

In [None]:
drug = 'Palbociclib'

# Tab-separated, header-less file; column names are assigned here.
test_data = pd.read_csv(
    "../data/GENIE/test_428_" + drug + ".txt",
    sep='\t',
    header=None,
    names=['cell_line', 'smiles', 'auc', 'dataset'],
)

# Cohort: drug-class column 'Yes', mTOR and AKT inhibitor columns both 'No'.
genie_drug_data = genie_data[
    (genie_data[func_map[drug]] == 'Yes')
    & (genie_data['mTOR_Inhibitor_Overall'] == 'No')
    & (genie_data['AKT_Inhibitor_Overall'] == 'No')
].reset_index(drop=True)
print('Sample size:', len(genie_drug_data))

genie_analysis_avg(test_data, genie_drug_data, drug)

In [None]:
drug = 'Palbociclib'

# Tab-separated, header-less file; column names are assigned here.
test_data = pd.read_csv(
    "../data/GENIE/test_428_" + drug + ".txt",
    sep='\t',
    header=None,
    names=['cell_line', 'smiles', 'auc', 'dataset'],
)

# Cohort: drug-class column 'Yes', mTOR and AKT inhibitor columns both 'No'.
genie_drug_data = genie_data[
    (genie_data[func_map[drug]] == 'Yes')
    & (genie_data['mTOR_Inhibitor_Overall'] == 'No')
    & (genie_data['AKT_Inhibitor_Overall'] == 'No')
].reset_index(drop=True)
print('Sample size:', len(genie_drug_data))

genie_analysis_avg(test_data, genie_drug_data, drug, th_type='cl')

In [None]:
drug = 'Palbociclib'

# Tab-separated, header-less file; column names are assigned here.
test_data = pd.read_csv(
    "../data/GENIE/test_428_" + drug + ".txt",
    sep='\t',
    header=None,
    names=['cell_line', 'smiles', 'auc', 'dataset'],
)

# Cohort: drug-class column 'Yes', mTOR and AKT inhibitor columns both 'No',
# and Sample_Type 'Metastatic'.
genie_drug_data = genie_data[
    (genie_data[func_map[drug]] == 'Yes')
    & (genie_data['mTOR_Inhibitor_Overall'] == 'No')
    & (genie_data['AKT_Inhibitor_Overall'] == 'No')
    & (genie_data['Sample_Type'] == 'Metastatic')
].reset_index(drop=True)
print('Sample size:', len(genie_drug_data))

genie_analysis_avg(test_data, genie_drug_data, drug)

In [None]:
drug = 'Palbociclib'

# Tab-separated, header-less file; column names are assigned here.
test_data = pd.read_csv(
    "../data/GENIE/test_428_" + drug + ".txt",
    sep='\t',
    header=None,
    names=['cell_line', 'smiles', 'auc', 'dataset'],
)

# Cohort: drug-class column 'Yes', mTOR and AKT inhibitor columns both 'No',
# and Sample_Type 'Primary'.
genie_drug_data = genie_data[
    (genie_data[func_map[drug]] == 'Yes')
    & (genie_data['mTOR_Inhibitor_Overall'] == 'No')
    & (genie_data['AKT_Inhibitor_Overall'] == 'No')
    & (genie_data['Sample_Type'] == 'Primary')
].reset_index(drop=True)
print('Sample size:', len(genie_drug_data))

genie_analysis_avg(test_data, genie_drug_data, drug)