In [25]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from bisect import bisect_left
from scipy import stats
import warnings

In [4]:
# Trim two dataframes to the keys (first column) and columns they share.
def trim(dataframe_1, dataframe_2):
    """Restrict two dataframes to their common rows and common columns.

    The first column of each frame is treated as the row key; its values are
    whitespace-stripped before they are compared. Every other column is data.

    Parameters
    ----------
    dataframe_1, dataframe_2 : pandas.DataFrame
        Frames whose first column holds string keys. Neither input is
        modified.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The two frames reduced to keys present in both and to columns whose
        stripped label is present in both. The key column is always kept as
        column 0, data columns are in sorted label order, and rows are in
        sorted key order, so that row ``i`` of one output carries the same key
        as row ``i`` of the other. If a key is repeated a different number of
        times in the two inputs the outputs can still differ in length.
    """
    def process(df):
        # Stripped key column first, remaining columns in sorted label order.
        df = df.sort_index()
        key = df.iloc[:, 0].str.strip()
        data = df.drop(columns=df.columns[0]).sort_index(axis=1)
        data.insert(0, key.name, key)
        return data

    df1 = process(dataframe_1)
    df2 = process(dataframe_2)

    # Keep only keys that occur in both frames.
    df1 = df1[df1.iloc[:, 0].isin(df2.iloc[:, 0])]
    df2 = df2[df2.iloc[:, 0].isin(df1.iloc[:, 0])]

    # Filtering alone preserves each input's own row order. Order both frames
    # by key so that positional pairing of the two outputs compares like with
    # like (mergesort is stable, so repeated keys keep their relative order).
    df1 = df1.sort_values(by=df1.columns[0], kind="mergesort")
    df2 = df2.sort_values(by=df2.columns[0], kind="mergesort")

    # Keep only columns whose stripped label occurs in both frames.
    keep1 = df1.columns.str.strip().isin(df2.columns.str.strip()).copy()
    keep2 = df2.columns.str.strip().isin(df1.columns.str.strip()).copy()
    # The key column must survive even when its header differs between frames.
    keep1[0] = True
    keep2[0] = True

    return df1.loc[:, keep1], df2.loc[:, keep2]

In [5]:
def mut_trim(mutation, dataframe):
    """Build a 0/1 mutation matrix with the same layout as ``dataframe``.

    Parameters
    ----------
    mutation : pandas.DataFrame
        One record per row. The first column is matched against the stripped
        values of ``dataframe``'s first column, the last column against
        ``dataframe``'s stripped column labels.
    dataframe : pandas.DataFrame
        Reference frame: first column holds string keys, the remaining
        columns define the matrix columns. It is not modified.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        * the matrix: column 0 holds the stripped keys of ``dataframe``, the
          other columns (stripped labels of ``dataframe``) hold integer 1
          where at least one record of ``mutation`` names that key/column pair
          and 0 elsewhere;
        * a copy of ``dataframe`` whose first column has been stripped.

    Records whose key or column label is not found in ``dataframe`` are
    ignored.
    """
    df_out = dataframe.copy()
    df_out.iloc[:, 0] = dataframe.iloc[:, 0].str.strip()

    gene_names = df_out.iloc[:, 0].to_numpy()
    col_names = df_out.columns.str.strip()

    # Label -> position lookups; on duplicate labels the last occurrence wins.
    # Position 0 (the key column) is left out of the column lookup so that a
    # record can never overwrite a key with a flag.
    row_of = {gene: i for i, gene in enumerate(gene_names)}
    col_of = {name: j for j, name in enumerate(col_names) if j > 0}

    # Resolve every record once, then set all flags with a single array write
    # instead of one scalar DataFrame write per record.
    rows = np.array([row_of.get(g, -1) for g in mutation.iloc[:, 0]], dtype=np.intp)
    cols = np.array([col_of.get(s, -1) for s in mutation.iloc[:, -1]], dtype=np.intp)
    known = (rows >= 0) & (cols >= 0)

    flags = np.zeros((len(gene_names), len(col_names) - 1), dtype=np.int64)
    flags[rows[known], cols[known] - 1] = 1  # -1: flags has no key column

    df = pd.DataFrame(flags, columns=col_names[1:])
    df.insert(0, col_names[0], gene_names, allow_duplicates=True)

    return df, df_out

In [6]:
# Centre every row of a dataframe on zero.
def normalize(dataframe):
    """Return a copy of ``dataframe`` with each row's mean subtracted.

    The first column is left as it is; the row mean is taken over the
    remaining columns (numeric ones only) and subtracted from them.
    The input frame is not modified.
    """
    centred = dataframe.copy()
    data = centred.iloc[:, 1:]
    row_means = data.mean(axis=1, numeric_only=True)
    centred.iloc[:, 1:] = data.sub(row_means, axis=0)
    return centred

In [7]:
def get_std(df1, df2):
    """Return the median per-row standard deviation of each frame.

    For each frame, the first column is skipped, the rest is converted to
    float64, the NaN-ignoring standard deviation of every row is computed and
    the median of those row values is taken.

    Returns
    -------
    (float, float)
        The value for ``df1`` followed by the value for ``df2``.
    """
    def median_row_std(frame):
        row_std = np.nanstd(frame.iloc[:, 1:].to_numpy(dtype='float64'), axis=1)
        return np.median(row_std)

    return median_row_std(df1), median_row_std(df2)

In [10]:
# Select the data of one gene from a dataframe, optionally clamping extremes.
def extract(df, gene_name, cutoff=False, std=1, max=7):
    """Return the data columns of the rows whose first column equals ``gene_name``.

    Parameters
    ----------
    df : pandas.DataFrame
        First column holds the names to match; the rest is data.
    gene_name : str
        Value looked up in the first column.
    cutoff : bool
        When true, values above ``std * max`` are replaced by ``std * max``
        and values below ``-std * max`` by ``-std * max``.
    std, max : number
        Factors of the clamping bound (only used when ``cutoff`` is true).

    Returns
    -------
    pandas.DataFrame
        The matching rows without the first column (empty if nothing matches).
    """
    matching_rows = df[df.iloc[:, 0].eq(gene_name)]
    values = matching_rows.iloc[:, 1:]

    if not cutoff:
        return values

    # Clamp to the symmetric interval [-bound, +bound].
    bound = std * max
    values[values > bound] = bound
    values[values < -bound] = -bound
    return values

In [23]:
def correlation(df1, df2, list, type='pearson', *args):
    """Correlate matching rows of two tables, one coefficient per gene.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Input tables. Each must carry a ``name`` attribute; the output column
        is labelled ``df1.name + '.' + df2.name``.
    list : sequence of str
        Gene names defining the output rows: the coefficient of a gene is
        written to the position that gene has in this sequence.
    type : {'pearson', 'spearman'}
        Correlation coefficient to compute.
    *args
        If ``'mut'`` is among the extra arguments the tables are prepared with
        ``mut_trim(df1, df2)``, otherwise with ``trim(df1, df2)``.

    Returns
    -------
    pandas.DataFrame
        One column, ``len(list)`` rows. Rows of genes that were not evaluated
        (absent from the prepared tables, or with fewer than two paired
        observations) stay NaN.

    Raises
    ------
    ValueError
        If ``type`` is not a supported coefficient.
    KeyError
        If a gene of the prepared tables is missing from ``list``.
    """
    methods = {'pearson': stats.pearsonr, 'spearman': stats.spearmanr}
    if type not in methods:
        raise ValueError(
            "unsupported correlation type %r; expected one of %s"
            % (type, sorted(methods)))
    corr = methods[type]

    # Output position of every gene, taken from the argument rather than from
    # notebook-global state.
    position = {name: i for i, name in enumerate(list)}
    result = pd.DataFrame(index=range(len(list)), columns=[df1.name + '.' + df2.name])

    if 'mut' in args:
        df1_t, df2_t = mut_trim(df1, df2)
    else:
        df1_t, df2_t = trim(df1, df2)

    # Rows are paired by position: row i of df1_t with row i of df2_t.
    for i, gene_name in enumerate(df1_t.iloc[:, 0]):
        # Stack the two rows (columns align by label) and keep only the
        # columns observed in both.
        pair = pd.concat([df1_t.iloc[i:i + 1, 1:], df2_t.iloc[i:i + 1, 1:]])
        pair = pair.dropna(axis=1)
        x = pair.iloc[0].to_numpy(dtype='float64')
        y = pair.iloc[1].to_numpy(dtype='float64')

        # scipy rejects inputs with fewer than two observations; leave NaN.
        if len(x) < 2:
            continue

        result.iloc[position[gene_name], 0] = corr(x, y).statistic

    return result
        

In [12]:
def build(df1, df2, boxes=7, *args):
    """Build a table of log-transformed, flattened density maps, one row per gene.

    Reads the notebook globals ``dataset`` (its first column gives each gene's
    output row) and ``genes`` (its length gives the number of output rows).
    ``df1`` and ``df2`` must carry a ``name`` attribute, used in the column
    labels ``df1.name.df2.name.<k>``.

    Flags in ``*args`` select the code path:
      * ``'mut'``         -> tables prepared with ``mut_trim``, ``2*boxes`` columns
      * without ``'mut'`` -> tables prepared with ``trim``, ``boxes*boxes`` columns
      * ``'cn'``          -> selects the copy-number variant of either path

    NOTE(review): ``drop_na_extract``, ``mut_cn_densitymap``, ``single_extract``,
    ``mut_densitymap``, ``cn_densitymap`` and ``densitymap`` are not defined in
    the visible part of this notebook; their contracts could not be checked.
    NOTE(review): the last branch calls ``extract`` with seven positional
    arguments and unpacks four results, which does not match the
    ``extract(df, gene_name, cutoff, std, max)`` defined in this notebook --
    confirm which definition is meant to be in scope.
    """
    # Output row of every gene, taken from the global `dataset`.
    key = {key: i for i, key in enumerate(dataset.iloc[:, 0])}
    
    if 'mut' in args:
        temp = pd.DataFrame(index=range(len(genes)), 
                            columns = [df1.name+'.'+df2.name+'.'+value for value in map(str, range(2*boxes))])
        df1_t, df2_t = mut_trim(df1, df2)
        if 'cn' in args:
            for gene_name in df2_t.iloc[:,0]:
                # note the swapped order: the helper is called with (df2_t, df1_t)
                y, x = drop_na_extract(df2_t, df1_t, gene_name)
                mat = mut_cn_densitymap(x,y,[0,1,2,3,4,6,8])
                temp.iloc[key[gene_name]] = mat.flatten()
        else:
            _, xstd = get_std(df1_t, df2_t)
            for gene_name in df2_t.iloc[:,0]:
                x, y, yd = single_extract(df1_t, df2_t, xstd, gene_name, 7, boxes)
                mat = mut_densitymap(x,y,yd,0.2)
                temp.iloc[key[gene_name]] = mat.flatten()
    else:
        temp = pd.DataFrame(index=range(len(genes)), 
                            columns = [df1.name+'.'+df2.name+'.'+value for value in map(str, range(boxes*boxes))])
        df1_t, df2_t = trim(df1, df2)
        if'cn' in args:
            xstd, _ = get_std(df1_t, df2_t)
            for gene_name in df1_t.iloc[:,0]:
                # note the swapped order: the helper is called with (df2_t, df1_t)
                y, x, xd = single_extract(df2_t, df1_t, xstd, gene_name, 7, boxes)
                mat = cn_densitymap(x,y,xd,[0,1,2,3,4,6,8],0.2)
                temp.iloc[key[gene_name]] = mat.flatten()
        else:
            xstd, ystd = get_std(df1_t, df2_t)
            for gene_name in df1_t.iloc[:,0]:
                x, y, xd, yd = extract(df1_t, df2_t, xstd, ystd, gene_name, 7, boxes)
                mat = densitymap(x,y,xd,yd,0.2)
                temp.iloc[key[gene_name]] = mat.flatten()
    
    # Add a constant before the log so that zero entries stay finite.
    # len(df1_t.columns) counts the gene-name column as well.
    temp += 1/len(df1_t.columns)
    # NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 in favour
    # of DataFrame.map, which another cell of this notebook already uses.
    temp = temp.applymap(np.log)
    
    return temp

In [16]:
# Gene expression: read the table and centre every row on zero.
gene_exp = normalize(
    pd.read_csv(r"/Users/justinxu/Documents/Coding_Project/Han Xu/DepMap_data/DepMap_data/CCLE_gene_expression_trimmed_Wei.csv")
)
gene_exp.name = 'gene_exp'

# Copy number: read the table and double every data column (the first column
# is left alone).
copy_num = pd.read_csv(r"/Users/justinxu/Documents/Coding_Project/Han Xu/DepMap_data/DepMap_data/CCLE_gene_cn_trimmed_Wei.csv")
copy_num.iloc[:, 1:] = copy_num.iloc[:, 1:] * 2
def take_closest(myList, myNumber):
    """Return the element of sorted ``myList`` that is closest to ``myNumber``.

    Parameters
    ----------
    myList : sequence
        Non-empty sequence sorted in ascending order.
    myNumber : number
        Value to snap to the nearest element.

    Returns
    -------
    The nearest element of ``myList``. When two elements are equally close
    the smaller one is returned. A NaN input is returned unchanged.
    """
    # NaN compares false against everything, so bisect_left would report
    # position 0 and a missing value would come back as myList[0].
    if myNumber != myNumber:
        return myNumber

    pos = bisect_left(myList, myNumber)
    if pos == 0:
        return myList[0]
    if pos == len(myList):
        return myList[-1]

    before, after = myList[pos - 1], myList[pos]
    return after if after - myNumber < myNumber - before else before
# Copy-number states that every measured value is snapped to.
values_to_compare = [0, 1, 2, 3, 4, 6, 8]
# Replace every data cell by its nearest state. na_action='ignore' leaves
# missing cells as NaN instead of handing them to take_closest, so a missing
# measurement is never turned into a real copy-number state.
copy_num.iloc[:, 1:] = copy_num.iloc[:, 1:].map(
    lambda x: take_closest(values_to_compare, x), na_action='ignore'
)
copy_num.name = 'copy_num'

# shRNA table.
shRNA = pd.read_csv(r"/Users/justinxu/Documents/Coding_Project/Han Xu/DepMap_data/DepMap_data/shRNA_Broad_Trimmed_Wei.csv")
shRNA.name = 'shRNA'

# Mutation table; the first column of the file is discarded after loading.
gene_mut = pd.read_csv(r"/Users/justinxu/Documents/Coding_Project/Han Xu/DepMap_data/DepMap_data/CCLE_gene_mutation_trimmed_Wei.csv")
gene_mut = gene_mut.drop(columns=gene_mut.columns[0])
gene_mut.name = 'gene_mut'

# CRISPR table (Avana gene-effect file).
CRISPR = pd.read_csv(r"/Users/justinxu/Documents/Coding_Project/Han Xu/DepMap_data/DepMap_data/Avana_gene_effect_20Q3_Trimmed_Wei.csv")
CRISPR.name = 'CRISPR'

  gene_mut = pd.read_csv(r"/Users/justinxu/Documents/Coding_Project/Han Xu/DepMap_data/DepMap_data/CCLE_gene_mutation_trimmed_Wei.csv")


In [27]:
# Pearson correlation table: one row per gene, one column per pair of tables.

# Every gene name that occurs in at least one of the four value tables.
genes = sorted(set().union(*(
    table.iloc[:, 0].str.strip()
    for table in (gene_exp, copy_num, shRNA, CRISPR)
)))

corr_method = 'pearson'

# (first table, second table, extra flags), in output-column order.
# 'mut' marks pairs whose first table is the mutation list.
corr_pairs = [
    (gene_exp, copy_num, ()),
    (gene_exp, shRNA, ()),
    (gene_mut, gene_exp, ('mut',)),
    (gene_exp, CRISPR, ()),
    (shRNA, copy_num, ()),
    (gene_mut, copy_num, ('mut',)),
    (CRISPR, copy_num, ()),
    (gene_mut, shRNA, ('mut',)),
    (shRNA, CRISPR, ()),
    (gene_mut, CRISPR, ('mut',)),
]

dataset = pd.DataFrame({'gene name': genes})

# Silence warnings only while the correlations run; the previous warning
# filters are restored on exit, including when a call raises.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    corr_columns = [
        correlation(first, second, genes, corr_method, *flags)
        for first, second, flags in corr_pairs
    ]

dataset = pd.concat([dataset, *corr_columns], axis=1)

In [28]:
# Write the current `dataset` table to disk without the row index.
dataset.to_csv(
    path_or_buf=r"/Users/justinxu/Documents/Coding_Project/Han Xu/pearson.csv",
    index=False,
)

In [29]:
# Spearman correlation table: one row per gene, one column per pair of tables.

# Every gene name that occurs in at least one of the four value tables.
genes = sorted(set().union(*(
    table.iloc[:, 0].str.strip()
    for table in (gene_exp, copy_num, shRNA, CRISPR)
)))

corr_method = 'spearman'

# (first table, second table, extra flags), in output-column order.
# 'mut' marks pairs whose first table is the mutation list.
corr_pairs = [
    (gene_exp, copy_num, ()),
    (gene_exp, shRNA, ()),
    (gene_mut, gene_exp, ('mut',)),
    (gene_exp, CRISPR, ()),
    (shRNA, copy_num, ()),
    (gene_mut, copy_num, ('mut',)),
    (CRISPR, copy_num, ()),
    (gene_mut, shRNA, ('mut',)),
    (shRNA, CRISPR, ()),
    (gene_mut, CRISPR, ('mut',)),
]

dataset = pd.DataFrame({'gene name': genes})

# Silence warnings only while the correlations run; the previous warning
# filters are restored on exit, including when a call raises.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    corr_columns = [
        correlation(first, second, genes, corr_method, *flags)
        for first, second, flags in corr_pairs
    ]

dataset = pd.concat([dataset, *corr_columns], axis=1)

In [30]:
# Write the current `dataset` table to disk without the row index.
dataset.to_csv(
    path_or_buf=r"/Users/justinxu/Documents/Coding_Project/Han Xu/spearman.csv",
    index=False,
)