In [None]:
import copy
import math
import os
import random as rd

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
%matplotlib inline

# Font sizes shared by every figure drawn in this notebook.
SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 12, 16, 22

# rc group -> settings applied to that group, in the order listed.
_RC_OVERRIDES = {
    'font': {'size': SMALL_SIZE},              # default text size
    'axes': {'titlesize': SMALL_SIZE,          # axes title
             'labelsize': MEDIUM_SIZE},        # x and y axis labels
    'xtick': {'labelsize': SMALL_SIZE},        # x tick labels
    'ytick': {'labelsize': SMALL_SIZE},        # y tick labels
    'legend': {'fontsize': SMALL_SIZE},        # legend text
    'figure': {'titlesize': BIGGER_SIZE},      # figure-level title
}

for _group, _settings in _RC_OVERRIDES.items():
    plt.rc(_group, **_settings)

In [None]:
def create_bar_plot(data, x_title, y_title):
    """Draw a histogram of ``data`` on a new 9x8 figure, show it and return it.

    Despite the name, the values are binned with ``Axes.hist`` (automatic bin
    selection), not drawn as one bar per value.

    Parameters
    ----------
    data : values handed to ``Axes.hist`` as ``x``.
    x_title, y_title : labels for the x and y axes.

    Returns
    -------
    The figure that holds the histogram.
    """
    figure = plt.figure(figsize=(9, 8))
    axes = figure.add_subplot(1, 1, 1)

    hist_style = dict(bins='auto', alpha=0.7, rwidth=0.85)
    axes.hist(data, **hist_style)

    axes.grid(False)
    axes.set_xlabel(x_title)
    axes.set_ylabel(y_title)

    plt.show()
    return figure

In [None]:
#Separating data for each drug/cell

def get_pos_map(obj_list, test_df, col):
    """Map every object to the row positions at which it occurs in ``test_df[col]``.

    Positions are 0-based row offsets (what ``iloc`` / ``np.take`` expect),
    not index labels, so the result is valid for frames whose index is not
    the default ``0..n-1`` range (for example after filtering or sorting).

    Parameters
    ----------
    obj_list : iterable of hashable values expected in ``test_df[col]``.
    test_df : DataFrame to scan.
    col : name of the column holding the object identifiers.

    Returns
    -------
    dict mapping each entry of ``obj_list`` to a list of row positions, in
    ascending order. Objects that never occur map to an empty list.

    Raises
    ------
    KeyError
        If ``test_df[col]`` contains a value that is not in ``obj_list``.
    """
    pos_map = {obj: [] for obj in obj_list}
    # Walking the single column avoids materialising a Series for every row.
    for pos, value in enumerate(test_df[col]):
        pos_map[value].append(pos)
    return pos_map

In [None]:
#Arrange the obj_list in the descending order of variance

def sort_var(obj_list, var_list):
    """Pair each object with its value and return them ordered largest value first.

    ``obj_list[i]`` is paired with ``var_list[i]``. If an object appears more
    than once, the value from its last occurrence is kept. Objects with equal
    values keep their first-seen relative order.

    Returns
    -------
    dict of object -> value whose iteration order is descending by value.
    """
    paired = {obj: var_list[idx] for idx, obj in enumerate(obj_list)}
    ranked = sorted(paired.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)

In [None]:
def calc_stddev(obj_list, train_df, col):
    """Compute the spread of ``auc`` for every object and rank objects by it.

    Rows of ``train_df`` are grouped by the value in ``col`` (via
    ``get_pos_map``); for each object in ``obj_list`` the standard deviation
    of its ``'auc'`` values is computed with ``np.std`` under its default
    settings.

    Returns
    -------
    dict of object -> standard deviation, ordered largest first (see
    ``sort_var``).
    """
    positions = get_pos_map(obj_list, train_df, col)
    spreads = [
        np.std(np.take(train_df['auc'], positions[obj]))
        for obj in obj_list
    ]
    return sort_var(obj_list, spreads)

In [None]:
def get_filtered_data(df, fraction, random_state=None):
    """Keep only the lowest- and highest-``auc`` rows of ``df``, shuffled.

    The rows are ordered by ``'auc'``; the first ``int(len(df) * fraction)``
    rows and the same number of rows from the end are kept, concatenated and
    shuffled. ``df`` itself is left untouched.

    Parameters
    ----------
    df : DataFrame with an ``'auc'`` column.
    fraction : share of the rows to take from each end. Values above 0.5
        make the two slices overlap, so rows in the overlap appear twice.
    random_state : optional seed / RandomState forwarded to
        ``DataFrame.sample`` to make the shuffle reproducible. The default
        (``None``) shuffles differently on every call.

    Returns
    -------
    New DataFrame with a fresh ``0..n-1`` index.
    """
    # Sort into a new frame: sorting in place would reorder the caller's data
    # (and is unsafe when ``df`` is itself a slice/query of another frame).
    ordered = df.sort_values(by='auc', ignore_index=True)

    data_size = len(ordered)
    class_size = int(data_size * fraction)
    low_auc_df = ordered.iloc[:class_size]
    high_auc_df = ordered.iloc[data_size - class_size:data_size]

    filtered_df = pd.concat([low_auc_df, high_auc_df], axis=0, ignore_index=True)
    shuffled_df = filtered_df.sample(frac=1, random_state=random_state)
    return shuffled_df.reset_index(drop=True)

In [None]:
def create_strict_cv_data(train_df, cell_lines, dataset, drug, fold_size=5,
                          out_dir="../data/training_files_av/"):
    """Write cross-validation train/test files split by cell line.

    The cell lines are shuffled and cut into ``fold_size`` disjoint folds whose
    sizes differ by at most one, so every cell line lands in exactly one test
    fold. For fold ``k`` (1-based) the rows of ``train_df`` whose
    ``'cell_line'`` is in the fold go to ``<k>_test_<dataset>_<drug>.txt`` and
    all remaining rows go to ``<k>_train_<dataset>_<drug>.txt``. Both files are
    tab-separated, with no header and no index column.

    Parameters
    ----------
    train_df : DataFrame with a ``'cell_line'`` column.
    cell_lines : sequence of cell-line identifiers to distribute over the
        folds. It is copied, not modified.
    dataset, drug : strings used to build the file names.
    fold_size : number of folds to create. Values below 1 write nothing.
    out_dir : directory the files are written to; it must already exist.

    Notes
    -----
    The shuffle uses the global ``random`` state (``rd``); seed it beforehand
    for reproducible folds. When there are fewer cell lines than folds, the
    surplus folds get an empty test file.
    """
    if fold_size < 1:
        return

    # Work on a copy so the caller's list is not consumed.
    shuffled = list(cell_lines)
    rd.shuffle(shuffled)

    # The first ``remainder`` folds take one extra cell line so the fold sizes
    # always add up to exactly len(cell_lines).
    base_size, remainder = divmod(len(shuffled), fold_size)

    start = 0
    for k in range(1, fold_size + 1):
        stop = start + base_size + (1 if k <= remainder else 0)
        k_cell_lines = shuffled[start:stop]
        start = stop

        k_test_data = train_df.query('cell_line in @k_cell_lines')
        k_train_data = train_df.drop(k_test_data.index)

        suffix = "_" + dataset + "_" + drug + ".txt"
        k_test_data.to_csv(os.path.join(out_dir, str(k) + "_test" + suffix),
                           sep="\t", header=False, index=False)
        k_train_data.to_csv(os.path.join(out_dir, str(k) + "_train" + suffix),
                            sep="\t", header=False, index=False)

In [None]:
def create_select_drug_data(cell_list, drugs, drug_name_map, train_df, dataset):
    """Write a per-drug training file plus its cross-validation splits.

    For every name in ``drugs`` the SMILES string is looked up in
    ``drug_name_map``, the rows of ``train_df`` with that ``smiles`` value are
    written to ``../data/training_files_av/train_<dataset>_<drug>.txt``
    (tab-separated, no header, no index), and ``create_strict_cv_data`` is
    called on the same rows with a private copy of ``cell_list``.

    Raises
    ------
    KeyError
        If a drug name is missing from ``drug_name_map``.
    """
    for drug in drugs:
        drug_smiles = drug_name_map[drug]
        per_drug_df = train_df.query("smiles == @drug_smiles")
        # Optional step, currently disabled: restrict the rows with
        # get_filtered_data(per_drug_df, 0.3) before writing.
        train_path = "../data/training_files_av/train_" + dataset + "_" + drug + ".txt"
        per_drug_df.to_csv(train_path, sep="\t", header=False, index=False)

        # Hand over a copy so the shared cell list survives for the next drug.
        create_strict_cv_data(per_drug_df, copy.deepcopy(cell_list), dataset, drug)

In [None]:
dataset = "av"

# Index files: two tab-separated columns, read here as index ('I') and identifier ('C' / 'D').
cell_index_df = pd.read_csv("../data/training_files_av/cell2ind_" + dataset + ".txt", sep="\t", header=None, names=['I', 'C'])
cell_list = list(cell_index_df['C'])

drug_index_df = pd.read_csv("../data/training_files_av/drug2ind_" + dataset + ".txt", sep="\t", header=None, names=['I', 'D'])
drug_list = list(drug_index_df['D'])

# Full training table: one row per (cell_line, smiles) measurement.
all_df = pd.read_csv("../data/training_files_av/train_" + dataset + ".txt", sep="\t", header=None, names=['cell_line', 'smiles', 'auc', 'dataset'])

# Drug names are normalised: spaces become '-', double quotes are removed.
drug_info = pd.read_csv("../data/master_druglist_smiles_final.csv")[['name', 'isomeric_smiles']]
drug_info['name'] = drug_info['name'].str.replace(' ','-').str.replace('"','')

# Lookups in both directions between SMILES string and normalised drug name.
drug_smiles_map = dict(zip(drug_info['isomeric_smiles'], drug_info['name']))
drug_name_map = dict(zip(drug_info['name'], drug_info['isomeric_smiles']))

In [None]:
# Standard deviation of 'auc' for each entry of drug_list (matched against the
# 'smiles' column of all_df), ordered largest first.
stddev_map = calc_stddev(drug_list, all_df, 'smiles')
# Histogram of those per-drug values; the figure is kept in av_drug_hist.
av_drug_hist = create_bar_plot(stddev_map.values(), 'Std Dev', '# of Drugs')

In [None]:
# Per-drug standard deviations as a NumPy array, in stddev_map order.
std_vals = np.array(list(stddev_map.values()))

In [None]:
# Cut-off at median + 1.5 * std of the per-drug standard deviations.
# NOTE(review): shown as cell output only; no later cell visible here uses it.
threshold = np.median(std_vals) + 1.5*np.std(std_vals)
threshold

In [None]:
# stddev_map is ordered by descending std dev, so its first 50 keys are the
# 50 most variable entries.
top_smiles = list(stddev_map.keys())[:50]

# Skip keys containing '.' (presumably multi-component SMILES; confirm) and
# translate the remaining ones to drug names.
top_drugs = [drug_smiles_map[smiles] for smiles in top_smiles if '.' not in smiles]

In [None]:
# Display the selected drug names as the cell output.
top_drugs

In [None]:
# Single-column frame of the selected drug names (one name per row).
top_drugs_df = pd.DataFrame(top_drugs)

In [None]:
#top_drugs_df.to_csv("../data/training_files_av/drugname_av.txt", header=False, index=False)

In [None]:
# One drug name per line; names get the same normalisation as drug_info
# (spaces become '-', double quotes are removed).
cmap_drugs = pd.read_csv("../data/CMAP/drugs_cmap.txt", header=None, names=['name'])
cmap_drugs['name'] = cmap_drugs['name'].str.replace(' ','-').str.replace('"','')

In [None]:
# Save the normalised CMAP drug names, one per line, without header or index.
cmap_drugs.to_csv("../data/training_files_av/drugname_cmap.txt", header=False, index=False)

In [None]:
# Normalised CMAP drug names as a plain list.
drugs = list(cmap_drugs['name'])

In [None]:
# Write the per-drug training file and cross-validation splits for every CMAP drug.
# NOTE(review): each name must be a key of drug_name_map, otherwise the lookup raises KeyError.
create_select_drug_data(cell_list, drugs, drug_name_map, all_df, dataset)