In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from bisect import bisect_left

ModuleNotFoundError: No module named 'numpy'

In [3]:
# trim 2 dataframes to match
def trim(dataframe_1, dataframe_2):
    """Restrict two dataframes to the row labels and columns they share.

    The first column of each frame is used as the row label and has
    surrounding whitespace stripped; the remaining columns are sorted by
    name. A row is kept only when its label also appears in the other
    frame, and a column only when its whitespace-stripped name also
    appears among the other frame's stripped column names. Column names
    themselves are not rewritten. The inputs are left untouched.

    Returns:
        tuple: ``(trimmed_1, trimmed_2)``.
    """
    def _tidy(frame):
        # Sort rows by index, strip the label column, order the remaining
        # columns by name, then put the label column back in front.
        frame = frame.sort_index()
        labels = frame.iloc[:, 0].str.strip()
        frame = frame.drop(columns=frame.columns[0]).sort_index(axis=1)
        frame.insert(0, labels.name, labels)
        return frame

    left = _tidy(dataframe_1)
    right = _tidy(dataframe_2)

    # Rows: `right` is filtered against the already-filtered `left`.
    left = left[left.iloc[:, 0].isin(right.iloc[:, 0])]
    right = right[right.iloc[:, 0].isin(left.iloc[:, 0])]

    # Columns: compared on stripped names, same two-step filtering.
    left = left.loc[:, left.columns.str.strip().isin(right.columns.str.strip())]
    right = right.loc[:, right.columns.str.strip().isin(left.columns.str.strip())]

    return left, right

In [4]:
# center each row of the dataframe on 0 by subtracting that row's mean
def normalize(dataframe):
    """Return a copy of ``dataframe`` with every row centred on zero.

    The first column is left as-is. For all other columns, the mean of
    each row (over numeric values) is subtracted from that row. The
    input frame is not modified.
    """
    centered = dataframe.copy()
    values = centered.iloc[:, 1:]
    row_means = values.mean(axis=1, numeric_only=True)
    centered.iloc[:, 1:] = values.sub(row_means, axis=0)
    return centered

In [5]:
def get_std(df1, df2):
    """Return the median per-row standard deviation of each dataframe.

    For each frame, the first column is skipped, the rest is converted to
    a float64 array, the NaN-ignoring standard deviation of every row is
    taken, and the median of those row values is returned.

    Returns:
        tuple: ``(xstd, ystd)`` for ``df1`` and ``df2`` respectively.
    """
    def _median_row_std(frame):
        values = frame.iloc[:, 1:].to_numpy(dtype='float64')
        return np.median(np.nanstd(values, axis=1))

    return _median_row_std(df1), _median_row_std(df2)

In [6]:
# extracts a specific gene and creates density centers
def extract(df1, df2, xstd, ystd, gene_name, std, boxes):
    """Pull one gene's paired values from two frames and build grid centers.

    The row whose first-column value equals ``gene_name`` is taken from
    each frame (first column excluded), the two rows are stacked, and any
    column holding a NaN in either row is dropped. Values are then
    saturated to ``[-xstd*std, xstd*std]`` for ``df1`` and
    ``[-ystd*std, ystd*std]`` for ``df2``.

    Returns:
        tuple: ``(x, y, xd, yd)`` where ``x``/``y`` are pandas Series (the
        saturated rows) and ``xd``/``yd`` are ``boxes`` evenly spaced
        centers spanning the respective saturation range, endpoints
        included.
    """
    x_row = df1[df1.iloc[:, 0] == gene_name].iloc[:, 1:]
    y_row = df2[df2.iloc[:, 0] == gene_name].iloc[:, 1:]

    # keep only the columns that are present (non-NaN) in both rows
    paired = pd.concat([x_row, y_row]).dropna(axis=1)

    x_limit = xstd * std
    y_limit = ystd * std

    def _saturate(values, limit):
        # values beyond the plotted range are pinned to its edge
        values = values.mask(values > limit, limit)
        return values.mask(values < -limit, -limit)

    x = _saturate(paired.iloc[0], x_limit)
    y = _saturate(paired.iloc[1], y_limit)

    # grid of density centers along each axis
    xd = np.linspace(-x_limit, x_limit, endpoint=True, num=boxes)
    yd = np.linspace(-y_limit, y_limit, endpoint=True, num=boxes)

    return x, y, xd, yd

In [7]:
# convert 2 vectors into a heatmap
def densitymap(x, y, xDensityCenters, yDensityCenters, sigma):
    """Build a normalized Gaussian density grid from paired points.

    Each grid cell ``[j, i]`` holds the sum over all points of
    ``exp(-0.5 * d**2 / sigma**2)``, where ``d`` is the distance from the
    point ``(x, y)`` to the center ``(xDensityCenters[i],
    yDensityCenters[j])``. The grid is finally divided by its total.

    Returns:
        numpy.ndarray of shape ``(len(yDensityCenters),
        len(xDensityCenters))``, or the string
        ``"inconsistent size of x and y vectors"`` when ``x`` and ``y``
        differ in length (no exception is raised).
    """
    if len(x) != len(y):
        return "inconsistent size of x and y vectors"

    # 1 / sigma**2; sigma is passed through np.asarray, it is not validated
    inv_var = (1 / np.asarray(sigma)) ** 2

    grid_shape = (len(yDensityCenters), len(xDensityCenters))
    mat = np.zeros(grid_shape)

    for col, cx in enumerate(xDensityCenters):
        # the x contribution is the same for every row of this column
        dx_sq = (x - cx) ** 2
        for row, cy in enumerate(yDensityCenters):
            sq_dist = dx_sq + (y - cy) ** 2
            mat[row, col] = np.sum(np.exp(-0.5 * sq_dist * inv_var))

    # scale so that all cells sum to 1
    mat /= np.sum(mat)

    return mat

In [8]:
def mut_trim(mutation, dataframe):
    """Turn a long-format mutation table into a 0/1 matrix shaped like ``dataframe``.

    Args:
        mutation: table with one row per mutation; its first column is
            matched against the (stripped) first column of ``dataframe``
            and its last column against the (stripped) column names of
            ``dataframe``.
        dataframe: reference table whose first column holds row labels.

    Returns:
        tuple: ``(flags, df_out)``. ``flags`` has one row per row of
        ``dataframe``, the stripped column names of ``dataframe``, the
        stripped row labels in its first column, and integer 0/1 values
        elsewhere (1 where at least one mutation row matches that
        label/column pair). ``df_out`` is a copy of ``dataframe`` with
        only its first column stripped; its column names are unchanged.

    Mutation rows whose label or column is not present in ``dataframe``
    are ignored. When a label or column name occurs more than once in
    ``dataframe``, the last occurrence receives the flag.
    """
    df_out = dataframe.copy()
    df_out.iloc[:, 0] = dataframe.iloc[:, 0].str.strip()

    row_labels = df_out.iloc[:, 0].to_numpy()
    col_names = df_out.columns.str.strip()

    # label -> position lookups (later duplicates win, as in a plain dict build)
    row_of = {key: i for i, key in enumerate(row_labels)}
    col_of = {key: i for i, key in enumerate(col_names)}

    # Resolve every mutation row to a (row, column) position; -1 = unknown.
    n_mut = len(mutation)
    rows = np.fromiter((row_of.get(k, -1) for k in mutation.iloc[:, 0]),
                       dtype='int64', count=n_mut)
    cols = np.fromiter((col_of.get(k, -1) for k in mutation.iloc[:, -1]),
                       dtype='int64', count=n_mut)

    # Column 0 is the label column, so only positions >= 1 are data columns.
    known = (rows >= 0) & (cols >= 1)

    flags = np.zeros((len(row_labels), len(col_names) - 1), dtype='int64')
    flags[rows[known], cols[known] - 1] = 1

    df = pd.DataFrame(flags, columns=col_names[1:])
    df.insert(0, col_names[0], row_labels, allow_duplicates=True)

    return df, df_out

In [9]:
# extracts a specific gene and creates density centers for mutation data
def single_extract(static_df, df1, xstd, gene_name, std, boxes):
    """Pull one gene's paired values where only the second frame is continuous.

    The row whose first-column value equals ``gene_name`` is taken from
    ``static_df`` and from ``df1`` (first column excluded), the rows are
    stacked, and columns with a NaN in either row are dropped. Only the
    ``df1`` values are saturated, to ``[-xstd*std, xstd*std]``; the
    ``static_df`` values are returned unchanged.

    Returns:
        tuple: ``(x, y, yd)`` -- float64 arrays of the ``static_df`` and
        ``df1`` values, plus ``boxes`` evenly spaced centers covering the
        saturation range (endpoints included).
    """
    static_row = static_df[static_df.iloc[:, 0] == gene_name].iloc[:, 1:]
    value_row = df1[df1.iloc[:, 0] == gene_name].iloc[:, 1:]

    # keep only the columns that are non-NaN in both rows
    paired = pd.concat([static_row, value_row]).dropna(axis=1)
    x = paired.iloc[0].to_numpy(dtype='float64')
    y = paired.iloc[1].to_numpy(dtype='float64')

    # pin out-of-range values to the edge of the grid
    limit = xstd * std
    y = np.where(y > limit, limit, y)
    y = np.where(y < -limit, -limit, y)

    # density centers along the continuous axis
    yd = np.linspace(-limit, limit, endpoint=True, num=boxes)

    return x, y, yd

In [10]:
# extracts a specific gene's paired values, dropping columns that are NaN in either dataframe
def drop_na_extract(df1, df2, gene_name):
    """Return one gene's values from two frames, restricted to complete pairs.

    The row whose first-column value equals ``gene_name`` is taken from
    each frame (first column excluded); the rows are stacked and every
    column containing a NaN in either row is discarded.

    Returns:
        tuple: ``(x, y)`` float64 arrays taken from ``df1`` and ``df2``.
    """
    selected = [frame[frame.iloc[:, 0] == gene_name].iloc[:, 1:]
                for frame in (df1, df2)]

    # only columns present (non-NaN) in both rows survive
    paired = pd.concat(selected).dropna(axis=1)

    x, y = (paired.iloc[pos].to_numpy(dtype='float64') for pos in (0, 1))
    return x, y

In [11]:
# convert 2 vectors into a heatmap for mutation data
def mut_densitymap(x, y, yDensityCenters, sigma):
    """Build a normalized two-row Gaussian density grid split by a 0/1 flag.

    Row 0 is computed from the ``y`` values where ``x == 0`` and row 1
    from those where ``x == 1``. Each cell sums
    ``exp(-0.5 * (y - center)**2 / sigma**2)`` over the selected values,
    and the grid is finally divided by its total.

    Returns:
        numpy.ndarray of shape ``(2, len(yDensityCenters))``, or the
        string ``"inconsistent size of x and y vectors"`` when ``x`` and
        ``y`` differ in length (no exception is raised).
    """
    if len(x) != len(y):
        return "inconsistent size of x and y vectors"

    # 1 / sigma**2; sigma is passed through np.asarray, it is not validated
    inv_var = (1 / np.asarray(sigma)) ** 2

    mat = np.zeros((2, len(yDensityCenters)))

    # y values for flag 0 and flag 1, in row order
    groups = (y[x == 0], y[x == 1])

    for col, center in enumerate(yDensityCenters):
        for row, values in enumerate(groups):
            sq_dist = (values - center) ** 2
            mat[row, col] = np.sum(np.exp(-0.5 * sq_dist * inv_var))

    # scale so that all cells sum to 1
    mat /= np.sum(mat)

    return mat

In [12]:
# convert 2 vectors into a heatmap for copy number data
def cn_densitymap(x, y, xDensityCenters, yDensityCenters, sigma):
    """Build a normalized density grid: continuous ``x`` against discrete ``y``.

    For every value in ``yDensityCenters`` the ``x`` entries whose ``y``
    equals that value exactly are selected, and each cell sums
    ``exp(-0.5 * (x - center_x)**2 / sigma**2)`` over them. The grid is
    finally divided by its total.

    Returns:
        numpy.ndarray of shape ``(len(xDensityCenters),
        len(yDensityCenters))``, or the string
        ``"inconsistent size of y and x vectors"`` when ``x`` and ``y``
        differ in length (no exception is raised).
    """
    if len(y) != len(x):
        return "inconsistent size of y and x vectors"

    # 1 / sigma**2; sigma is passed through np.asarray, it is not validated
    inv_var = (1 / np.asarray(sigma)) ** 2

    mat = np.zeros((len(xDensityCenters), len(yDensityCenters)))

    for col, level in enumerate(yDensityCenters):
        # x values observed at exactly this discrete y level
        at_level = x[y == level]
        for row, center in enumerate(xDensityCenters):
            sq_dist = (at_level - center) ** 2
            mat[row, col] = np.sum(np.exp(-0.5 * sq_dist * inv_var))

    # scale so that all cells sum to 1
    mat /= np.sum(mat)

    return mat

In [13]:
# convert 2 vectors into a heatmap for mutation and copy number data
def mut_cn_densitymap(x, y, yDensityCenters):
    """Build a normalized 2 x N count grid: 0/1 flag ``x`` against discrete ``y``.

    For every value in ``yDensityCenters`` the ``x`` entries whose ``y``
    equals that value exactly are selected; row 0 counts how many of them
    are 0 and row 1 how many are 1. The grid is finally divided by its
    total.

    Returns:
        numpy.ndarray of shape ``(2, len(yDensityCenters))``, or the
        string ``"inconsistent size of y and x vectors"`` when ``x`` and
        ``y`` differ in length (no exception is raised).
    """
    if len(y) != len(x):
        return "inconsistent size of y and x vectors"

    mat = np.zeros((2, len(yDensityCenters)))

    for j, center_y in enumerate(yDensityCenters):
        arr = x[y == center_y]
        mat[0, j] = np.count_nonzero(arr == 0)
        # Row 1 is the flag == 1 count; it previously repeated the
        # flag == 0 count, making both rows identical.
        mat[1, j] = np.count_nonzero(arr == 1)

    # scale so that all cells sum to 1
    mat /= np.sum(mat)

    return mat

In [23]:
def build(df1, df2, boxes=7, *args):
    """Build the log-density feature block for one pair of datasets.

    Reads two module-level names: ``dataset`` (its first column maps a
    gene name to its output row) and ``genes`` (its length sets the
    number of output rows). Both inputs must carry a ``.name`` attribute,
    which is used to prefix the output column names as
    ``"<df1.name>.<df2.name>.<k>"``.

    Args:
        df1, df2: the two datasets to pair. With the ``'mut'`` flag,
            ``df1`` is the mutation table handed to ``mut_trim``.
        boxes: number of grid centers per continuous axis.
        *args: optional string flags. ``'mut'`` treats ``df1`` as
            mutation data (two-row grid, ``2*boxes`` columns); ``'cn'``
            treats ``df2`` as copy-number data binned at the fixed levels
            ``[0, 1, 2, 3, 4, 6, 8]``. Without ``'mut'`` the block has
            ``boxes*boxes`` columns.

    Returns:
        DataFrame with one row per entry of ``genes``. Each density value
        has ``1 / len(df1_t.columns)`` added and is then passed through
        ``np.log``. Rows for genes that were not processed for this pair
        are never assigned and stay NaN.

    NOTE: the ``'cn'`` paths always use seven copy-number levels, so the
    flattened grid only matches the column count when ``boxes == 7``. The
    range multiplier ``7`` and kernel width ``0.2`` are hard-coded.
    """
    row_of_gene = {name: pos for pos, name in enumerate(dataset.iloc[:, 0])}

    has_mut = 'mut' in args
    has_cn = 'cn' in args
    cn_levels = [0, 1, 2, 3, 4, 6, 8]

    # mutation grids have two rows; continuous/continuous grids are square
    n_features = 2 * boxes if has_mut else boxes * boxes
    prefix = df1.name + '.' + df2.name + '.'
    temp = pd.DataFrame(index=range(len(genes)),
                        columns=[prefix + str(k) for k in range(n_features)])

    if has_mut:
        df1_t, df2_t = mut_trim(df1, df2)
        if has_cn:
            # mutation flag vs. discrete copy number: plain counts
            for gene_name in df2_t.iloc[:, 0]:
                y, x = drop_na_extract(df2_t, df1_t, gene_name)
                grid = mut_cn_densitymap(x, y, cn_levels)
                temp.iloc[row_of_gene[gene_name]] = grid.flatten()
        else:
            # mutation flag vs. continuous values
            _, xstd = get_std(df1_t, df2_t)
            for gene_name in df2_t.iloc[:, 0]:
                x, y, yd = single_extract(df1_t, df2_t, xstd, gene_name, 7, boxes)
                grid = mut_densitymap(x, y, yd, 0.2)
                temp.iloc[row_of_gene[gene_name]] = grid.flatten()
    elif has_cn:
        # continuous values vs. discrete copy number
        df1_t, df2_t = trim(df1, df2)
        xstd, _ = get_std(df1_t, df2_t)
        for gene_name in df1_t.iloc[:, 0]:
            y, x, xd = single_extract(df2_t, df1_t, xstd, gene_name, 7, boxes)
            grid = cn_densitymap(x, y, xd, cn_levels, 0.2)
            temp.iloc[row_of_gene[gene_name]] = grid.flatten()
    else:
        # continuous vs. continuous
        df1_t, df2_t = trim(df1, df2)
        xstd, ystd = get_std(df1_t, df2_t)
        for gene_name in df1_t.iloc[:, 0]:
            x, y, xd, yd = extract(df1_t, df2_t, xstd, ystd, gene_name, 7, boxes)
            grid = densitymap(x, y, xd, yd, 0.2)
            temp.iloc[row_of_gene[gene_name]] = grid.flatten()

    # small offset so that empty cells do not become log(0)
    temp += 1 / len(df1_t.columns)
    return temp.applymap(np.log)

In [24]:
# Gene expression: load, then centre each row on its own mean (see normalize()).
gene_exp = pd.read_csv(r"C:\Users\justi\Coding\Project\Han Xu\DepMap_data\DepMap_data\CCLE_gene_expression_trimmed_Wei.csv")
gene_exp = normalize(gene_exp)
# build() reads `.name` to prefix the feature columns it generates.
gene_exp.name = 'gene_exp'

# Copy number: load and double every value column (first column excluded)
# before the values are snapped to discrete levels further down.
copy_num = pd.read_csv(r"C:\Users\justi\Coding\Project\Han Xu\DepMap_data\DepMap_data\CCLE_gene_cn_trimmed_Wei.csv")
copy_num.iloc[:,1:] *= 2
def take_closest(myList, myNumber):
    """Return the element of ``myList`` closest to ``myNumber``.

    ``myList`` must be sorted in ascending order (it is searched with
    ``bisect_left``). Values below the first element map to the first
    element, values above the last map to the last. When two neighbours
    are equally close, the smaller one is returned.

    A NaN ``myNumber`` is returned unchanged. Without this guard every
    comparison against NaN is False, ``bisect_left`` yields position 0,
    and a missing value would silently become ``myList[0]``.
    """
    # NaN is the only value that compares unequal to itself.
    if myNumber != myNumber:
        return myNumber

    pos = bisect_left(myList, myNumber)
    if pos == 0:
        return myList[0]
    if pos == len(myList):
        return myList[-1]

    before, after = myList[pos - 1], myList[pos]
    # strict '<' keeps ties on the lower neighbour
    return after if after - myNumber < myNumber - before else before
# Discrete levels that every copy-number value is snapped to
# (must be sorted ascending: take_closest() searches it with bisect_left)
values_to_compare = [0, 1, 2, 3, 4, 6, 8]
# Replace every value (first column excluded) by its closest level
copy_num.iloc[:,1:] = copy_num.iloc[:,1:].applymap(lambda x: take_closest(values_to_compare, x))
# build() reads `.name` to prefix the feature columns it generates.
copy_num.name = 'copy_num'

# shRNA table, used as loaded
shRNA = pd.read_csv(r"C:\Users\justi\Coding\Project\Han Xu\DepMap_data\DepMap_data\shRNA_Broad_Trimmed_Wei.csv")
shRNA.name = 'shRNA'

# Mutation table: the first column of the file is discarded, so that
# mut_trim() sees the next column as its first column.
gene_mut = pd.read_csv(r"C:\Users\justi\Coding\Project\Han Xu\DepMap_data\DepMap_data\CCLE_gene_mutation_trimmed_Wei.csv")
gene_mut.drop(columns=gene_mut.columns[0],inplace=True)
gene_mut.name = 'gene_mut'

# CRISPR gene-effect table, used as loaded
CRISPR = pd.read_csv(r"C:\Users\justi\Coding\Project\Han Xu\DepMap_data\DepMap_data\Avana_gene_effect_20Q3_Trimmed_Wei.csv")
CRISPR.name = 'CRISPR'

  copy_num.iloc[:,1:] = copy_num.iloc[:,1:].applymap(lambda x: take_closest(values_to_compare, x))
  gene_mut = pd.read_csv(r"C:\Users\justi\Coding\Project\Han Xu\DepMap_data\DepMap_data\CCLE_gene_mutation_trimmed_Wei.csv")


In [26]:
# Union of the (stripped) gene names found in the four gene-indexed tables.
genes = sorted(set().union(*(frame.iloc[:, 0].str.strip()
                             for frame in (gene_exp, copy_num, shRNA, CRISPR))))
dataset = pd.DataFrame({'gene name': genes})

# One feature block per dataset pair: (df1, df2, flags passed on to build()).
# The blocks are appended left to right in this order.
pair_specs = [
    (gene_exp, copy_num, ('cn',)),
    (gene_exp, shRNA, ()),
    (gene_mut, gene_exp, ('mut',)),
    (gene_exp, CRISPR, ()),
    (shRNA, copy_num, ('cn',)),
    (gene_mut, copy_num, ('cn', 'mut')),
    (CRISPR, copy_num, ('cn',)),
    (gene_mut, shRNA, ('mut',)),
    (shRNA, CRISPR, ()),
    (gene_mut, CRISPR, ('mut',)),
]
for left, right, flags in pair_specs:
    dataset = pd.concat([dataset, build(left, right, 7, *flags)], axis=1)

In [27]:
# Keep only the rows that have a value in every feature column.
dataset = dataset.dropna()
dataset

Unnamed: 0,gene name,gene_exp.copy_num.0,gene_exp.copy_num.1,gene_exp.copy_num.2,gene_exp.copy_num.3,gene_exp.copy_num.4,gene_exp.copy_num.5,gene_exp.copy_num.6,gene_exp.copy_num.7,gene_exp.copy_num.8,...,gene_mut.CRISPR.4,gene_mut.CRISPR.5,gene_mut.CRISPR.6,gene_mut.CRISPR.7,gene_mut.CRISPR.8,gene_mut.CRISPR.9,gene_mut.CRISPR.10,gene_mut.CRISPR.11,gene_mut.CRISPR.12,gene_mut.CRISPR.13
0,A1BG,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.118440,...,-1.053758,-3.201021,-5.819234,-6.671945,-6.628317,-5.493931,-4.052666,-4.288081,-5.905126,-6.654196
2,A1CF,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,...,-1.257186,-3.589475,-6.307292,-6.671578,-6.539213,-4.854680,-3.489741,-3.998311,-5.907700,-6.655811
3,A2M,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,...,-2.047062,-4.878248,-6.515009,-6.656328,-5.852582,-3.893201,-3.336245,-4.560143,-6.400044,-6.670077
5,A2ML1,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,...,-0.884446,-2.641586,-5.327741,-6.671650,-6.560098,-4.958254,-3.241637,-3.136396,-4.856608,-6.578655
8,A4GALT,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,...,-1.892202,-4.604324,-6.400253,-6.669687,-6.446942,-5.251609,-4.752306,-5.667741,-6.603665,-6.671719
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28545,ZXDC,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.066659,...,-1.073525,-3.150751,-5.776114,-6.669076,-6.429197,-5.029861,-3.883235,-4.134244,-5.694593,-6.636581
28546,ZYG11A,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,...,-1.235497,-3.529519,-5.938482,-6.671310,-6.519521,-4.977586,-3.725286,-4.098734,-5.851332,-6.654371
28548,ZYX,-7.165493,-7.165493,-3.509135,-4.439900,-7.165493,-7.165493,-7.165493,-7.165493,-6.982534,...,-0.969288,-2.955374,-6.064832,-6.671853,-6.621805,-5.573597,-4.126896,-4.272159,-5.884935,-6.654416
28549,ZZEF1,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165456,...,-2.785875,-5.367652,-6.269565,-6.267631,-4.554152,-3.125132,-3.140908,-4.841502,-6.508759,-6.670974


In [28]:
# Write the assembled feature table to disk; the row index is not written.
dataset.to_csv(r"C:\Users\justi\Coding\Project\Han Xu\dataset_trimmed_v2.csv", index=False)