In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from bisect import bisect_left

In [98]:
# trim 2 dataframes to match
def trim(dataframe, genes):
    def process(df):
        df.sort_index(inplace=True)
        temp = df.iloc[:,0].str.strip()
        df.drop(columns=df.columns[0], inplace = True)
        df.sort_index(axis=1, inplace=True)
        df.insert(0,temp.name,temp)
        return df
    
    df = process(dataframe.copy())
    
    # match indexes
    mask = df.iloc[:, 0].isin(genes)
    df = df[mask]
    
    return df

def mut_trim(mutation, genes):
    df = pd.DataFrame(columns=list(sorted(set(mutation.iloc[:,-1]))), index=range(len(genes)+1))
    df.insert(0, 'gene name', list(genes)+['throwaway'])
    # initialize dataframe
    df.insert(len(df.columns), 'throwaway', np.zeros(len(genes)+1))
    df.iloc[:,1:] = 0
    # add mutations
    r = {key: i for i, key in enumerate(df.iloc[:, 0])}
    c = {key: i for i, key in enumerate(df.columns)}
    for i in range(len(mutation)): 
        df.iloc[r.get(mutation.iloc[i, 0], -1), c.get(mutation.iloc[i, -1], -1)] = 1
    
    return df.iloc[:-1, :-1]

# normalize dataframe to 0
def normalize(matrix):
    mat = np.copy(matrix)
    mean = np.nanmean(matrix, axis=1)[:,np.newaxis]
    return mat-mean

In [3]:
# create points 2 standard deviations apart around 0
def density_centers(df, num):
    std = np.nanstd(df.iloc[:,1:])
    return np.linspace(-std*num, std*num, num=num*2, endpoint=False)[1::2]
    

In [4]:
# extracts a specific gene
def extract(df, name):
    return df[df.iloc[:, 0] == name].iloc[:, 1:].to_numpy()[0]

In [5]:
# extracts a specific gene and creates density centers for static_dfation data
def drop_nan(x, y):
    combined = np.array(np.concatenate(([x],[y]), axis=0))
    # drop NaN
    combined = combined[:, ~pd.isna(combined).any(axis=0)]

    return combined[0], combined[1]

In [6]:
# convert 2 vectors into a heatmap
def densitymap(x, y, xDensityCenters, yDensityCenters, xdiscrete=False, ydiscrete=False, sigma=1):
    if len(x) != len(y):
        return "inconsistent size of x and y vectors"
    
    sigma_sq_inv = (1/sigma)**2
    mat = np.zeros((len(yDensityCenters), len(xDensityCenters)))

    if not xdiscrete and not ydiscrete:
        for pt in range(len(x)):
            temp = np.zeros((len(yDensityCenters), len(xDensityCenters)))
            for i, center_x in enumerate(xDensityCenters):
                for j, center_y in enumerate(yDensityCenters):
                    dist_sq = (x[pt] - center_x)**2+(y[pt] - center_y)**2
                    temp[j, i] = np.exp(-0.5*sigma_sq_inv*dist_sq)
                    
            temp /= np.sum(temp)
            mat += temp
            
    elif xdiscrete and ydiscrete:
        for i, center_x in enumerate(xDensityCenters):
            for j, center_y in enumerate(yDensityCenters):
                mat[j, i] += np.sum((x[y==center_y]==center_x))
                
    elif xdiscrete:
        for pt in range(len(x)):
            temp = np.zeros(len(yDensityCenters))
            for i, center_y in enumerate(yDensityCenters):
                dist_sq = (y[pt] - center_y)**2
                temp[i] = np.exp(-0.5*sigma_sq_inv*dist_sq)
                    
            temp /= np.sum(temp)
            mat[:, xDensityCenters.index(x[pt])] += temp
    else:
        for pt in range(len(y)):
            temp = np.zeros(len(xDensityCenters))
            for i, center_x in enumerate(xDensityCenters):
                dist_sq = (x[pt] - center_x)**2
                temp[i] = np.exp(-0.5*sigma_sq_inv*dist_sq)
                    
            temp /= np.sum(temp)
            mat[yDensityCenters.index(y[pt])] += temp
    
    # Normalize the kernel
    mat /= len(x)
    
    return mat

In [7]:
def build_density_map(datasets, pairs, density_points, continuous):

    out = pd.DataFrame({'pair':[f'{p1}.{p2}' for p1, p2 in pairs]})

    for i in range(len(datasets)):
        for j in range(len(datasets)):
            
            # initialize dataframes and variables
            df1 = datasets[i]
            df2 = datasets[j]
            mask = df1.columns.str.strip().isin(df2.columns.str.strip())
            mask[0] = True
            df1 = df1.loc[:, mask]
            mask = df2.columns.str.strip().isin(df1.columns.str.strip())
            mask[0] = True
            df2 = df2.loc[:, mask]
    
            df1_pts = density_points[i]
            df2_pts = density_points[j]

            df1_cont = continuous[i]
            df2_cont = continuous[j]

            temp = pd.DataFrame(index=range(len(out)), 
                                columns=[f'{datasets[i].name}.{datasets[j].name}.{value}' for value in range(len(df1_pts) * len(df2_pts))])

            #calculate bandwidth
            if df1_cont:
                if df2_cont:
                    std = math.sqrt((np.nanstd(df1.iloc[:,1:].to_numpy())**2
                                     +np.nanstd(df2.iloc[:,1:].to_numpy())**2)/2)
                else:
                    std = np.nanstd(df1.iloc[:,1:].to_numpy())
            else:
                std = np.nanstd(df2.iloc[:,1:].to_numpy())

            for index in range(len(pairs)):
                p1, p2 = pairs[index]
                x = extract(df1, p1)
                y = extract(df2, p2)
                x, y = drop_nan(x, y)
                mat = densitymap(x, y, df1_pts, df2_pts, xdiscrete=not df1_cont, ydiscrete=not df2_cont, sigma=std)
                temp.iloc[index] = mat.flatten()
    
            temp += 1/len(df1.columns)
            temp = temp.map(np.log)
            out = pd.concat([out, temp], axis=1)
    
    return out

In [80]:
def trim_pairs(pairs, lim):
    out = np.array([[None,None]])
    for x, y in pairs:
        if x in lim and y in lim:
            out = np.append(out, [[x, y]], axis=0)

    out = np.delete(out, 0, 0)
    return out

In [100]:
gene_exp = pd.read_csv(r"/home/hxu/CCLE_ML_project/CCLE_ML_project/Data/DepMap_Trimmed/Gene_Expression_Trimmed.csv")
gene_exp.name = 'gene_exp'

copy_num = pd.read_csv(r"/home/hxu/CCLE_ML_project/CCLE_ML_project/Data/DepMap_Trimmed/Copy_Number_Trimmed.csv")
copy_num.name = 'copy_num'

shRNA = pd.read_csv(r"/home/hxu/CCLE_ML_project/CCLE_ML_project/Data/DepMap_Trimmed/shRNA_Trimmed.csv")
shRNA.name = 'shRNA'

gene_mut = pd.read_csv(r"/home/hxu/CCLE_ML_project/CCLE_ML_project/Data/DepMap_Trimmed/Gene_Mutation_Trimmed.csv")
gene_mut.name = 'gene_mut'

CRISPR = pd.read_csv(r"/home/hxu/CCLE_ML_project/CCLE_ML_project/Data/DepMap_Trimmed/CRISPR_Trimmed.csv")
CRISPR.name = 'CRISPR'

In [101]:
genes = sorted(set(gene_exp.iloc[:,0].str.strip()) & 
               set(copy_num.iloc[:,0].str.strip()) & 
               set(shRNA.iloc[:,0].str.strip()) & 
               set(CRISPR.iloc[:,0].str.strip()))
genes = [i.strip() for i in genes]

In [102]:
pos1 = pd.read_csv(r"/home/hxu/CCLE_ML_project/CCLE_ML_project/Data/PPI_Pairs/FirstPositiveControl.csv").to_numpy()
neg1 = pd.read_csv(r"/home/hxu/CCLE_ML_project/CCLE_ML_project/Data/PPI_Pairs/FirstNegativeControl.csv").to_numpy()
pos2 = pd.read_csv(r"/home/hxu/CCLE_ML_project/CCLE_ML_project/Data/PPI_Pairs/SecondPositiveControl.csv").to_numpy()
neg2 = pd.read_csv(r"/home/hxu/CCLE_ML_project/CCLE_ML_project/Data/PPI_Pairs/SecondNegativeControl.csv").to_numpy()
pos3 = pd.read_csv(r"/home/hxu/CCLE_ML_project/CCLE_ML_project/Data/PPI_Pairs/ThirdPositiveControl.csv").to_numpy()
neg3 = pd.read_csv(r"/home/hxu/CCLE_ML_project/CCLE_ML_project/Data/PPI_Pairs/ThirdNegativeControl.csv").to_numpy()
pos4 = pd.read_csv(r"/home/hxu/CCLE_ML_project/CCLE_ML_project/Data/PPI_Pairs/FourthPositiveControl.csv").to_numpy()
neg4 = pd.read_csv(r"/home/hxu/CCLE_ML_project/CCLE_ML_project/Data/PPI_Pairs/FourthNegativeControl.csv").to_numpy()
pos5 = pd.read_csv(r"/home/hxu/CCLE_ML_project/CCLE_ML_project/Data/PPI_Pairs/FifthPositiveControl.csv").to_numpy()
neg5 = pd.read_csv(r"/home/hxu/CCLE_ML_project/CCLE_ML_project/Data/PPI_Pairs/FifthNegativeControl.csv").to_numpy()

In [118]:
neg1_t = np.unique(trim_pairs(neg1, genes).astype(str), axis=0)

In [119]:
datasets = gene_exp, copy_num, shRNA, gene_mut, CRISPR
points = density_centers(gene_exp, 7), [0,1,2,3,4,6,8], density_centers(shRNA, 7), [0, 1], density_centers(CRISPR, 7)
cont = True, False, True, False, True

In [120]:
temp_time = time.time()
neg1_feat = build_density_map(datasets, neg1_t, points, cont)
print(time.time()-temp_time)
neg1_feat

  mat /= len(x)


3604.6341824531555


Unnamed: 0,pair,gene_exp.gene_exp.0,gene_exp.gene_exp.1,gene_exp.gene_exp.2,gene_exp.gene_exp.3,gene_exp.gene_exp.4,gene_exp.gene_exp.5,gene_exp.gene_exp.6,gene_exp.gene_exp.7,gene_exp.gene_exp.8,...,CRISPR.CRISPR.39,CRISPR.CRISPR.40,CRISPR.CRISPR.41,CRISPR.CRISPR.42,CRISPR.CRISPR.43,CRISPR.CRISPR.44,CRISPR.CRISPR.45,CRISPR.CRISPR.46,CRISPR.CRISPR.47,CRISPR.CRISPR.48
0,A2M.DSCC1,-7.174724,-7.174554,-7.157760,-7.141341,-7.172309,-7.172085,-7.172412,-7.174719,-7.144849,...,-4.689490,-6.448612,-6.668577,-6.672032,-6.670303,-6.555058,-6.183949,-6.196300,-6.639901,-6.671985
1,AAAS.PSMC1,-7.174393,-7.144082,-7.115925,-7.169203,-7.174505,-7.174724,-7.174724,-7.168663,-6.729593,...,-5.189384,-6.507487,-6.671615,-6.671766,-6.668362,-6.628685,-6.532982,-6.604901,-6.666088,-6.672018
2,ABCB11.MDC1,-7.174724,-7.174717,-7.172125,-7.156777,-7.172229,-7.174697,-7.174720,-7.174724,-7.172381,...,-5.709300,-6.611334,-6.671886,-6.672033,-6.671865,-6.666331,-6.655905,-6.663235,-6.671623,-6.672032
3,ABCC9.SMAD4,-7.174724,-7.172225,-6.730567,-5.971708,-7.001745,-7.174258,-7.174722,-7.174722,-7.147761,...,-5.030053,-6.449968,-6.670390,-6.672008,-6.661660,-6.221731,-5.455820,-5.533175,-6.482656,-6.670980
4,ABCD4.PMS2,-7.174724,-7.174669,-7.172676,-7.168241,-7.173642,-7.174719,-7.174724,-7.174712,-7.158338,...,-5.671319,-6.554111,-6.671556,-6.672018,-6.670745,-6.654263,-6.611147,-6.654258,-6.670790,-6.672028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5512,ZW10.SUB1,-7.174723,-7.174568,-7.169873,-7.159961,-7.173487,-7.174721,-7.174724,-7.174063,-7.106732,...,-5.265754,-6.476497,-6.667716,-6.672002,-6.671684,-6.658056,-6.471247,-6.212852,-6.636834,-6.671987
5513,ZWILCH.WDR26,-7.174722,-7.174414,-7.172621,-7.173610,-7.174620,-7.174723,-7.174724,-7.174267,-7.090641,...,-4.649362,-6.077010,-6.623039,-6.671824,-6.667341,-6.546485,-6.085414,-6.227602,-6.630828,-6.671291
5514,ZWINT.IKBKE,-7.173633,-7.172929,-7.160303,-7.096337,-7.145357,-7.174166,-7.174724,-7.010416,-6.850537,...,-5.546236,-6.408221,-6.669130,-6.648643,-6.482798,-6.629349,-6.650492,-6.585950,-6.658978,-6.671988
5515,ZWINT.ITGA2B,-7.174724,-7.174716,-7.174663,-7.174548,-7.174677,-7.174724,-7.174724,-7.173446,-7.127863,...,-5.834633,-6.508649,-6.519676,-6.671879,-6.670743,-6.641885,-6.644297,-6.662161,-6.670622,-6.671402


In [121]:
neg1_feat.to_csv(r"/home/hxu/CCLE_ML_project/CCLE_ML_project/Data/PPI_Pairs/neg1_features.csv", index=False)