# Generating The Connectivity Scores

## *1. Import the DRH datasets*

In [1]:
# import the data manipulation tool
import pandas as pd

In [2]:
### cell lines with a DRH dataset on disk
cell_lines = ["A375", "MCF7", "PC3"]
### filtered DRH table for each cell line, keyed by cell line name
corrs = {}

### directory holding one "<cell line>_unique_ind.txt" file per cell line
data_dir = "./DRH_clin_data/DRH_data/"

for cell_line in cell_lines:
    ## read the DRH table of this cell line
    drh_table = pd.read_csv(data_dir + cell_line + "_unique_ind.txt")
    ## keep only the rows where both compounds have a TAS above 0.2
    ## (i.e. both compounds show a high transcriptional response)
    high_tas = (drh_table["drug1 tas"] > 0.2) & (drh_table["drug2 tas"] > 0.2)
    corrs[cell_line] = drh_table[high_tas]

corrs["MCF7"].head()

Unnamed: 0,drug1,drug1 tas,drug2,drug2 tas,spearman corr,indication,known ind
0,midostaurin,0.799386,altrenogest,0.812819,0.908978,Estrus,False
1,altrenogest,0.812819,midostaurin,0.799386,0.908978,"Leukemia, Myeloid, Acute",False
2,ixazomib-citrate,0.747696,ixazomib,0.791817,0.889649,Multiple Myeloma,True
3,ixazomib,0.791817,ixazomib-citrate,0.747696,0.889649,Multiple Myeloma,True
4,carfilzomib,0.763093,ixazomib-citrate,0.747696,0.839315,Multiple Myeloma,True


## *2. Generate the commands to input into the Clue Command App to obtain the connectivity scores*

In [3]:
# load the table mapping each compound id (pert_id) to its name (pert_iname)
pert_info_path = "./ref_data/GSE70138_Broad_LINCS_pert_info.txt"
pert_info = pd.read_csv(
    pert_info_path,
    sep="\t",
    usecols=["pert_id", "pert_iname"],
)
pert_info.head()

Unnamed: 0,pert_id,pert_iname
0,BRD-K70792160,10-DEBC
1,BRD-K68552125,phorbol-myristate-acetate
2,BRD-K92301463,"16,16-dimethylprostaglandin-e2"
3,BRD-A29731977,17-hydroxyprogesterone-caproate
4,BRD-K07954936,2-iminobiotin


In [4]:
# import package for accessing file location
import os

In [5]:
### function to get the list of all drugs available (and optionally write the Clue commands file)
def gen_clue_commands(file_name, write_file=False, chunk_size=50):
    """Collect every compound in ``corrs`` and optionally write Clue "/conn" commands.

    Reads the notebook-level ``corrs`` dictionary (cell line -> DataFrame with
    "drug1" and "drug2" columns) and, when writing, the notebook-level
    ``pert_info`` table ("pert_id" / "pert_iname" columns).

    Parameters
    ----------
    file_name : str
        Name of the commands file, created inside "./conn_scores/clue_app/".
        Only used when ``write_file`` is True.
    write_file : bool, default False
        When True, look up the pert ids of the compounds and write them to the
        commands file, one "/conn ..." line per chunk. The default leaves the
        file system untouched, so re-running the notebook does not overwrite a
        commands file that was edited by hand.
    chunk_size : int, default 50
        Maximum number of pert ids placed on a single "/conn" line.

    Returns
    -------
    list
        The unique compound names found in the "drug1" and "drug2" columns of
        every table in ``corrs``. The order is arbitrary (it comes from a set).
    """
    ## gather the unique compounds across all cell lines
    all_cps = set()
    for cell_line_data in corrs.values():
        all_cps.update(cell_line_data["drug1"])
        all_cps.update(cell_line_data["drug2"])

    ## convert set to a list of all compounds available
    all_cps = list(all_cps)

    if write_file:
        ## get the pert ids of the compounds (only needed for the commands file)
        pert_ids = pert_info.loc[pert_info["pert_iname"].isin(all_cps), "pert_id"].tolist()
        ## store the location of the file
        file_path = os.path.expanduser("./conn_scores/clue_app/" + file_name)
        ## create a new file to save the ids of the compounds
        with open(file_path, "w") as file:
            file.write("Drug Repurposing Hub Compound IDs to Obtain Connectivity Scores for in Clue.IO Command App:\n")
            # one "/conn" command per chunk of at most `chunk_size` ids
            for idx in range(0, len(pert_ids), chunk_size):
                subset_cps = " ".join(pert_ids[idx:idx + chunk_size])
                file.write("/conn " + subset_cps + "\n")
                file.write("\n")

    return all_cps

In [6]:
### every compound found in the TAS-filtered DRH tables
all_cps = gen_clue_commands(file_name="clue_commands.txt")

This generated text file contains the compounds found in both the Drug Repurposing Hub and clinical trials. Each section of the file was entered as one command into the Broad Institute's [Clue.io app](https://clue.io). There were a total of 8 batches, each with at most 50 compounds.

The following steps were then performed to download the connectivity scores:
1. Next to the "see all connections" section, the list option was selected.
2. Underneath cell line, only the summary option was selected (to get connectivity scores across all cell lines).
3. Underneath perturbagen type, only compounds were selected.
4. File –> Save Dataset –> select GCT version 1.3 and "selection only" –> save as a .txt file (I saved it in the format of "conn-scores(batch #).txt").
5. Upload all batches to the same directory as the clue commands file.


***NOTE:*** not all compounds had connectivity scores recorded in the web app. BRD-A17883755 and BRD-K54997624 caused an error when using the command app so they were manually removed from clue_commands.txt.

In [7]:
### function to restructure the dataframe
def summarize_cs_scores(cs_scores, unique_cps, n_meta_cols=8, n_meta_rows=25):
    """Reshape one batch of connectivity scores into a long (drug1, drug2, score) table.

    Parameters
    ----------
    cs_scores : pandas.DataFrame
        One batch file as loaded by the notebook
        (``pd.read_csv(..., sep="\\t", skiprows=2, index_col=0)``). It must have
        a row labelled "name" holding the query compound of every score column
        and a column labelled "name" holding the connected compound of every
        score row.
    unique_cps : list-like
        Compound names to keep as drug2; all other score rows are dropped.
    n_meta_cols : int, default 8
        Number of leading columns that are not score columns.
        NOTE(review): presumably the row-metadata columns of the GCT export —
        confirm against the downloaded files.
    n_meta_rows : int, default 25
        Number of leading rows that are not score rows.
        NOTE(review): presumably the column-metadata rows of the GCT export —
        confirm against the downloaded files.

    Returns
    -------
    pandas.DataFrame
        Columns "drug1", "drug2" and "conn score", one row per (drug1, drug2)
        pair. Scores of repeated drug1 or drug2 names are averaged. A drug name
        occurring several times in the batch yields repeated rows, and missing
        scores are kept as NaN; the caller is expected to drop both.
    """
    ## store the drug1s
    cols = cs_scores.columns[n_meta_cols:]
    drug1s = cs_scores.loc["name", cols].tolist()
    ## store all the drugs with a recorded connectivity score to the drug1s
    rows = cs_scores.index[n_meta_rows:]
    conn_drugs = cs_scores.loc[rows, "name"].tolist()
    ## store the connectivity scores, labelled by drug name instead of by id
    cs_matrix = cs_scores.loc[rows, cols]
    cs_matrix.columns = drug1s
    cs_matrix.index = conn_drugs
    ## remove drug2s that are not in DRH or clinical trials
    cs_matrix = cs_matrix.loc[cs_matrix.index.isin(unique_cps)]
    drug2s = cs_matrix.index.tolist()
    ## average the scores of duplicated drug2s (rows) and drug1s (columns);
    ## the column average goes through a transpose because groupby(axis=1)
    ## is deprecated in recent pandas
    cs_matrix = cs_matrix.astype(float)
    cs_matrix = cs_matrix.groupby(level=0).mean()
    cs_matrix = cs_matrix.T.groupby(level=0).mean().T

    ## collect every drug pair first and build the dataframe once: appending
    ## row by row is quadratic and DataFrame.append no longer exists in pandas 2
    records = [
        {"drug1": drug1, "drug2": drug2, "conn score": cs_matrix.at[drug2, drug1]}
        for drug1 in drug1s
        for drug2 in drug2s
    ]

    return pd.DataFrame(records, columns=["drug1", "drug2", "conn score"])

In [9]:
### directory containing all the connectivity scores
file_path = "./conn_scores/clue_app/"

### summarize each batch and keep the per-batch tables in a list
# NOTE(review): 14 batch files are read here, while the text above mentions
# 8 batches — confirm how many "conn-scores(#).txt" files are expected
batch_summaries = []
for num in range(1, 15):
    # import the batch of connectivity scores
    file_name = file_path + "conn-scores(" + str(num) + ").txt"
    batch_cs = pd.read_csv(file_name, sep="\t", skiprows=2, index_col=0)
    ## summarize the connectivity scores for all compounds in DRH
    batch_summaries.append(summarize_cs_scores(batch_cs, all_cps))

### combine all batches at once (DataFrame.append was removed in pandas 2.0,
### and growing a dataframe inside a loop copies it on every iteration)
cs = pd.concat(batch_summaries, ignore_index=True)

### remove drug pairs without a score, then remove duplicate rows
cs = cs.dropna()
cs = cs.drop_duplicates()
### save file for future access
# cs.to_csv("./conn_scores/conn_scores.txt", index=False)
cs.head()

Unnamed: 0,drug1,drug2,conn score
0,acyclovir,cabergoline,98.41
1,acyclovir,trimebutine,59.39
2,acyclovir,secnidazole,96.7
3,acyclovir,ipratropium,85.87
4,acyclovir,mestinon,96.12
