In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from bisect import bisect_left

In [2]:
# trim 2 dataframes to match
def trim(dataframe_1, dataframe_2):
    def process(df):
        df.sort_index(inplace=True)
        temp = df.iloc[:,0].str.strip()
        df.drop(columns=df.columns[0], inplace = True)
        df.sort_index(axis=1, inplace=True)
        df.insert(0,temp.name,temp)
        return df
    
    df1 = process(dataframe_1.copy())
    df2 = process(dataframe_2.copy())
    
    # match indexes
    mask = df1.iloc[:, 0].isin(df2.iloc[:, 0])
    df1 = df1[mask]
    mask = df2.iloc[:, 0].isin(df1.iloc[:, 0])
    df2 = df2[mask]
    
    mask = df1.columns.str.strip().isin(df2.columns.str.strip())
    df1 = df1.loc[:, mask]
    mask = df2.columns.str.strip().isin(df1.columns.str.strip())
    df2 = df2.loc[:, mask]
    
    return df1, df2

def mut_trim(mutation, dataframe):
    df_out = dataframe.copy()
    df_out.iloc[:,0] = dataframe.iloc[:,0].str.strip()
    # initialize dataframe
    df = pd.DataFrame(columns = np.append(df_out.columns.str.strip(), 'throwaway'))
    df.iloc[:,0] = np.append(df_out.iloc[:,0], 'throwaway')
    df.iloc[:,1:] = 0
    
    # add mutations
    r = {key: i for i, key in enumerate(df.iloc[:, 0])}
    c = {key: i for i, key in enumerate(df.columns)}
    for i in range(len(mutation)): 
        df.iloc[r.get(mutation.iloc[i, 0], -1), c.get(mutation.iloc[i, -1], -1)] = 1
    
    return df.iloc[:-1, :-1], df_out

In [3]:
# normalize dataframe to 0
def normalize(matrix):
    mat = np.copy(matrix)
    mean = np.nanmean(matrix, axis=1)[:,np.newaxis]
    return mat-mean

In [None]:
def get_std(mat1, mat2):
    return np.nanstd(mat1), np.nanstd(mat2);

def avg_std(mat1, mat2):
    return math.sqrt((np.nanvar(mat1)+np.nanvar(mat2))/2)

In [9]:
# extracts a specific gene and creates density centers
# cutoff must exist to have density centers
def extract(df, name, cutoff=False, std=1, max=7, density_center=False, boxes=7):
    values = df[df.iloc[:, 0] == name].iloc[:, 1:].to_numpy()[0]

    if cutoff:
        # set values greater than max in graph to be max or values less than min in graph to be min
        values[values > std*max] = std*max
        values[values < -std*max] = -std*max
    else:
        return values

    if density_center:
        centers = np.linspace(-std*max, std*max, num=boxes*2, endpoint=False)[1::2]
        return values, centers
    else:
        return values

In [10]:
# extracts a specific gene and creates density centers for static_dfation data
def drop_nan(x, y):
    combined = np.array(np.concatenate(([x],[y]), axis=0))
    # drop NaN
    combined = combined[:, ~pd.isna(combined).any(axis=0)]

    return combined[0], combined[1]

In [11]:
# convert 2 vectors into a heatmap
def densitymap(x, y, xDensityCenters, yDensityCenters, xdiscrete=False, ydiscrete=False, sigma=1):
    if len(x) != len(y):
        return "inconsistent size of x and y vectors"
    
    sigma_sq_inv = (1/sigma)**2
    mat = np.zeros((len(yDensityCenters), len(xDensityCenters)))

    if not xdiscrete and not ydiscrete:
        for pt in range(len(x)):
            temp = np.zeros((len(yDensityCenters), len(xDensityCenters)))
            for i, center_x in enumerate(xDensityCenters):
                for j, center_y in enumerate(yDensityCenters):
                    dist_sq = (x[pt] - center_x)**2+(y[pt] - center_y)**2
                    temp[j, i] = np.exp(-0.5*sigma_sq_inv*dist_sq)
                    
            temp /= np.sum(temp)
            mat += temp
            
    elif xdiscrete and ydiscrete:
        for i, center_x in enumerate(xDensityCenters):
            for j, center_y in enumerate(yDensityCenters):
                mat[j, i] += np.sum((x[y==center_y]==center_x))
                
    elif xdiscrete:
        for pt in range(len(x)):
            temp = np.zeros(len(yDensityCenters))
            for i, center_y in enumerate(yDensityCenters):
                dist_sq = (y[pt] - center_y)**2
                temp[i] = np.exp(-0.5*sigma_sq_inv*dist_sq)
                    
            temp /= np.sum(temp)
            mat[:, xDensityCenters.index(x[pt])] += temp
    else:
        for pt in range(len(y)):
            temp = np.zeros(len(xDensityCenters))
            for i, center_x in enumerate(xDensityCenters):
                dist_sq = (x[pt] - center_x)**2
                temp[i] = np.exp(-0.5*sigma_sq_inv*dist_sq)
                    
            temp /= np.sum(temp)
            mat[yDensityCenters.index(y[pt])] += temp
    
    # Normalize the kernel
    mat /= len(x)
    
    return mat

In [12]:
def build(df1, df2, *args):
    key = {key: i for i, key in enumerate(dataset.iloc[:, 0])}
    
    if 'mut' in args:
        temp = pd.DataFrame(index=range(len(genes)), 
                            columns = [df1.name+'.'+df2.name+'.'+value for value in map(str, range(2*7))])
        df1_t, df2_t = mut_trim(df1, df2)
        if 'cn' in args:
            for gene_name in df2_t.iloc[:,0]:
                x = extract(df1_t, gene_name, cutoff=False)
                y = extract(df2_t, gene_name, cutoff=False)
                x, y = drop_nan(x, y)
                mat = densitymap(x, y, [0,1], [0,1,2,3,4,6,8], xdiscrete=True, ydiscrete=True)
                temp.iloc[key[gene_name]] = mat.flatten()
        else:
            df2_t.iloc[:,1:] = normalize(df2_t.iloc[:,1:])
            std = np.nanstd(df2_t.iloc[:,1:])
            for gene_name in df2_t.iloc[:,0]:
                x = extract(df1_t, gene_name, cutoff=False)
                y, yd = extract(df2_t, gene_name, cutoff=True, std=std, max=7, density_center=True, boxes=7)
                x, y = drop_nan(x, y)
                mat = densitymap(x, y, [0, 1], yd, xdiscrete=True, sigma=std)
                temp.iloc[key[gene_name]] = mat.flatten()
    else:
        temp = pd.DataFrame(index=range(len(genes)), 
                            columns = [df1.name+'.'+df2.name+'.'+value for value in map(str, range(7*7))])
        df1_t, df2_t = trim(df1, df2)
        if'cn' in args:
            df1_t.iloc[:,1:] = normalize(df1_t.iloc[:,1:])
            std = np.nanstd(df1_t.iloc[:,1:])
            for gene_name in df1_t.iloc[:,0]:
                x, xd = extract(df1_t, gene_name, cutoff=True, std=std, max=7, density_center=True, boxes=7)
                y = extract(df2_t, gene_name, cutoff=False)
                x, y = drop_nan(x, y)
                mat = densitymap(x,y,xd,[0,1,2,3,4,6,8], ydiscrete=True, sigma=std)
                temp.iloc[key[gene_name]] = mat.flatten()
        else:
            df1_t.iloc[:,1:] = normalize(df1_t.iloc[:,1:])
            df2_t.iloc[:,1:] = normalize(df2_t.iloc[:,1:])
            xstd = np.nanstd(df1_t.iloc[:,1:].to_numpy())
            ystd = np.nanstd(df2_t.iloc[:,1:].to_numpy())
            avgstd = avg_std(df1_t.iloc[:,1:].to_numpy(), df2_t.iloc[:,1:].to_numpy())
            for gene_name in df1_t.iloc[:,0]:
                x, xd = extract(df1_t, gene_name, cutoff=True, std=xstd, max=7, density_center=True, boxes=7)
                y, yd = extract(df2_t, gene_name, cutoff=True, std=ystd, max=7, density_center=True, boxes=7)
                x, y = drop_nan(x, y)
                mat = densitymap(x,y,xd,yd,sigma=avgstd)
                temp.iloc[key[gene_name]] = mat.flatten()
    
    temp += 1/len(df1_t.columns)
    temp = temp.applymap(np.log)
    
    return temp

In [13]:
gene_exp = pd.read_csv(r"C:\Users\justi\Coding\Coding_Project\Han Xu\DepMap_data\CCLE_gene_expression_trimmed_Wei.csv")
gene_exp.name = 'gene_exp'

copy_num = pd.read_csv(r"C:\Users\justi\Coding\Coding_Project\Han Xu\DepMap_data\CCLE_gene_cn_trimmed_Wei.csv")
copy_num.iloc[:,1:] *= 2
def take_closest(myList, myNumber):
    pos = bisect_left(myList, myNumber)
    if pos == 0:
        return myList[0]
    if pos == len(myList):
        return myList[-1]
    before = myList[pos - 1]
    after = myList[pos]
    if after - myNumber < myNumber - before:
        return after
    else:
        return before 
# List of values to compare for each element in the DataFrame
values_to_compare = [0, 1, 2, 3, 4, 6, 8]
# Apply the take_closest function to every value in the DataFrame
copy_num.iloc[:,1:] = copy_num.iloc[:,1:].map(lambda x: take_closest(values_to_compare, x))
copy_num.name = 'copy_num'

shRNA = pd.read_csv(r"C:\Users\justi\Coding\Coding_Project\Han Xu\DepMap_data\shRNA_Broad_Trimmed_Wei.csv")
shRNA.name = 'shRNA'

gene_mut = pd.read_csv(r"C:\Users\justi\Coding\Coding_Project\Han Xu\DepMap_data\CCLE_gene_mutation_trimmed_Wei.csv")
gene_mut.drop(columns=gene_mut.columns[0],inplace=True)
gene_mut.name = 'gene_mut'

CRISPR = pd.read_csv(r"C:\Users\justi\Coding\Coding_Project\Han Xu\DepMap_data\Avana_gene_effect_20Q3_Trimmed_Wei.csv")
CRISPR.name = 'CRISPR'

  gene_mut = pd.read_csv(r"C:\Users\justi\Coding\Coding_Project\Han Xu\DepMap_data\CCLE_gene_mutation_trimmed_Wei.csv")


In [82]:
genes = sorted(set(gene_exp.iloc[:,0].str.strip()) | 
               set(copy_num.iloc[:,0].str.strip()) | 
               set(shRNA.iloc[:,0].str.strip()) | 
               set(CRISPR.iloc[:,0].str.strip()))
dataset = pd.DataFrame({'gene name':genes})
dataset = pd.concat([dataset, build(gene_exp, copy_num, 'cn')], axis=1)
dataset = pd.concat([dataset, build(gene_exp, shRNA)], axis=1)
dataset = pd.concat([dataset, build(gene_mut, gene_exp, 'mut')], axis=1)
dataset = pd.concat([dataset, build(gene_exp, CRISPR)], axis=1)
dataset = pd.concat([dataset, build(shRNA, copy_num, 'cn')], axis=1)
dataset = pd.concat([dataset, build(gene_mut, copy_num, 'cn', 'mut')], axis=1)
dataset = pd.concat([dataset, build(CRISPR, copy_num, 'cn')], axis=1)
dataset = pd.concat([dataset, build(gene_mut, shRNA, 'mut')], axis=1)
dataset = pd.concat([dataset, build(shRNA, CRISPR)], axis=1)
dataset = pd.concat([dataset, build(gene_mut, CRISPR, 'mut')], axis=1)

  temp = temp.applymap(np.log)
  temp = temp.applymap(np.log)
  temp = temp.applymap(np.log)
  temp = temp.applymap(np.log)
  temp = temp.applymap(np.log)
  temp = temp.applymap(np.log)
  temp = temp.applymap(np.log)
  temp = temp.applymap(np.log)
  temp = temp.applymap(np.log)
  temp = temp.applymap(np.log)


In [85]:
dataset.dropna(inplace=True)
dataset

Unnamed: 0,gene name,gene_exp.copy_num.0,gene_exp.copy_num.1,gene_exp.copy_num.2,gene_exp.copy_num.3,gene_exp.copy_num.4,gene_exp.copy_num.5,gene_exp.copy_num.6,gene_exp.copy_num.7,gene_exp.copy_num.8,...,gene_mut.CRISPR.4,gene_mut.CRISPR.5,gene_mut.CRISPR.6,gene_mut.CRISPR.7,gene_mut.CRISPR.8,gene_mut.CRISPR.9,gene_mut.CRISPR.10,gene_mut.CRISPR.11,gene_mut.CRISPR.12,gene_mut.CRISPR.13
0,A1BG,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.111943,-5.179541,...,-1.782302,-5.072673,-0.478568,-3.791168,-1.781793,-4.861424,-4.888588,-6.573848,-6.381836,-6.671959
2,A1CF,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165489,-7.101136,...,-1.794366,-4.684001,-0.500070,-3.278549,-1.801564,-4.504519,-4.994704,-6.507721,-6.648477,-6.671731
3,A2M,-7.165493,-7.163255,-6.911970,-6.646370,-7.136520,-7.165470,-7.165493,-7.165456,-6.973452,...,-1.853513,-4.366357,-0.476633,-3.151028,-1.880186,-4.388499,-5.019282,-6.482807,-6.519536,-6.671790
5,A2ML1,-7.165493,-7.164623,-6.992000,-6.602891,-7.111818,-7.165420,-7.165493,-7.165490,-7.117275,...,-1.847606,-3.983835,-0.531421,-2.807655,-1.872339,-4.097288,-4.847229,-6.525058,-6.164724,-6.671956
8,A4GALT,-7.165493,-7.165479,-7.140712,-6.429531,-6.142751,-7.062264,-7.165326,-7.157518,-5.725482,...,-1.736648,-5.568323,-0.471883,-4.612959,-1.740518,-5.543676,-4.816489,-6.636474,-6.357813,-6.672016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28545,ZXDC,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.157736,-6.636592,...,-1.763331,-4.633987,-0.512129,-3.676785,-1.744760,-4.629632,-4.915740,-6.470520,-5.989535,-6.671743
28546,ZYG11A,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165493,-7.165424,-6.952662,...,-1.753182,-4.737137,-0.495659,-3.529987,-1.802538,-4.511773,-5.026294,-6.502216,-5.977413,-6.671866
28548,ZYX,-7.165493,-7.165493,-7.165311,-7.080693,-6.586422,-7.045641,-7.165115,-7.150132,-6.260137,...,-1.744461,-5.071845,-0.523929,-3.823703,-1.670068,-4.936807,-5.122723,-6.600936,-6.665583,-6.671997
28549,ZZEF1,-7.165493,-7.165132,-7.007078,-6.226425,-6.925665,-7.164591,-7.165493,-7.159180,-5.986186,...,-1.738894,-3.969380,-0.629756,-2.763245,-1.758880,-3.873473,-4.495934,-6.170050,-5.960955,-6.669458


In [86]:
dataset.to_csv(r"C:\Users\justi\Coding\Coding_Project\Han Xu\Trimmed data\dataset_trimmed_v3.csv", index=False)