Partial correlation method to find moonlighting gene trios

The data file resulting from this analysis is available as Supplementary Table S1.

Import packages and load functions:

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import sys
import matplotlib.pyplot as plt


In [2]:
# Creates list of gene pairs and their edge value
def ut_as_list(dframe, diag=1, cols=('Row', 'Column', 'Value')):
    """
    Return the upper triangle of a square dataframe as a long-format table.

    Parameters
    ----------
    dframe : pandas.DataFrame
        Square matrix (same number of rows and columns). It is not modified.
    diag : int
        Diagonal offset passed to ``np.triu``.
        diag = 1 (default): ignore the diagonal
        diag = 0: include the diagonal
    cols : sequence of three names
        Names of the output columns: row label, column label, cell value.

    Returns
    -------
    pandas.DataFrame
        One row per retained (row, column) cell, with a fresh 0..n-1 index.
        Cells whose value is missing (NaN) are not listed.
    """
    row_name, col_name, _ = cols
    # Give the two axes distinct names so reset_index() cannot collide when
    # the index and the columns share a name. rename_axis returns a new
    # object, so the caller's dataframe keeps its own axis names.
    named = dframe.rename_axis(index=row_name, columns=col_name)
    keep = np.triu(np.ones(named.shape), k=diag).astype(bool)
    # Cells outside the requested triangle become NaN; the explicit dropna()
    # removes them regardless of whether stack() drops missing values itself.
    pairs = named.where(keep).stack().dropna().reset_index()
    pairs.columns = list(cols)
    return pairs

# Covariance normalization using PCA whitening transformation
def PCA_whitening(X):
    """
    Whiten X: centre each column, rotate onto the eigenvectors of the
    column-by-column covariance matrix, and rescale every rotated component
    by the square root of its eigenvalue.

    X is a 2-D table with observations in rows and variables in columns.
    The result has the same shape as X.
    """
    X_centered = X - np.mean(X, axis=0)
    # np.cov treats rows as variables, hence the transpose.
    # NOTE(review): the covariance matrix is symmetric, so np.linalg.eigh
    # would be the conventional solver; eig is kept so that the component
    # order and signs stay whatever they were in earlier runs.
    eigenvalues, eigenvectors = np.linalg.eig(np.cov(X_centered.T))
    rotated = X_centered @ eigenvectors
    # The small constant keeps the division finite for near-zero eigenvalues.
    component_scale = np.sqrt(eigenvalues + 1e-5)
    return rotated / component_scale

# Get Partial correlation coefficient for two genes after accounting for the effect of a third gene vector.
def partial(x,y,z,cc):
    """
    First-order partial correlation of x and y given z.

    x, y, z : gene (row/column) labels present in cc
    cc      : dataframe; symmetric matrix of pearson correlations

    Returns (r_xy - r_xz * r_yz) / (sqrt(1 - r_xz^2) * sqrt(1 - r_yz^2)).
    """
    r_xy = cc.loc[x, y]
    r_xz = cc.loc[x, z]
    r_yz = cc.loc[y, z]

    numerator = r_xy - r_xz * r_yz
    denominator = np.sqrt(1 - r_xz**2) * np.sqrt(1 - r_yz**2)
    return numerator / denominator

# Get Partial correlation coefficients for two genes after accounting for the effect of all possible third genes
def get_all_partials(g1, g2, cc):
    """
    Partial correlation of g1 and g2 given every other gene in cc.

    g1, g2 : gene (row/column) labels present in cc
    cc     : dataframe; symmetric matrix of pearson correlations

    Returns a dataframe indexed by the third gene (g1 and g2 themselves are
    excluded) with two columns:
      'pc_<g1>_<g2>' : partial correlation of g1 and g2 given that gene
      'ratio'        : squared partial correlation divided by the squared
                       plain correlation of g1 and g2
    Rows are sorted by 'ratio', largest first. cc is not modified.
    """
    pxy = cc.loc[g1, g2]   # scalar; broadcasts against the two row vectors
    pxz = cc.loc[g1]       # correlations of g1 with every gene
    pyz = cc.loc[g2]       # correlations of g2 with every gene
    partials = (pxy - pxz * pyz) / (np.sqrt(1 - pxz**2) * np.sqrt(1 - pyz**2))

    # f-string so that non-string labels (e.g. integer gene ids) also work
    framename = f'pc_{g1}_{g2}'
    # a gene cannot be the conditioning variable for its own pair
    result = partials.to_frame(framename).drop([g1, g2], axis=0)
    result['ratio'] = result[framename]**2 / pxy**2
    return result.sort_values('ratio', ascending=False)

Data files can be downloaded from: https://doi.org/10.6084/m9.figshare.21379761

Load data:

In [3]:
# Read the input table: the first line of the CSV is the header and the
# first column becomes the row index (used as gene labels further down).
data= pd.read_csv('Ceres_dataframe.csv',header=0,index_col=0)
print(data.shape)

(17834, 730)


Apply covariance normalization via PCA whitening transformation to the data:

In [4]:
# Whiten the loaded table with PCA_whitening before computing correlations.
w_data=PCA_whitening(data)

Calculate correlation matrix, and create gene pairs with the edge being the corresponding PCC value.

In [5]:
# Pearson correlation between every pair of rows of the whitened table.
ccvals = np.corrcoef(w_data)
# Self-correlations are stored as 0 rather than 1.
np.fill_diagonal(ccvals, 0)
cc = pd.DataFrame(ccvals, index=data.index.values, columns=data.index.values)

# Long-format list of gene pairs (upper triangle, diagonal excluded),
# ordered by decreasing absolute PCC.
PCC_pairs = ut_as_list(cc, cols=['Gene1', 'Gene2', 'G1G2_PCC'])
PCC_pairs = PCC_pairs.sort_values(by='G1G2_PCC', key=abs, ascending=False)

Filter the pairs to include only those with PCC > 0.15 or PCC < -0.15 (i.e. |PCC| > 0.15)

In [22]:
# Keep pairs whose PCC lies outside [-0.15, 0.15]; between() includes both
# end points by default, so values of exactly +/-0.15 are excluded as well.
filtered_list=PCC_pairs[~PCC_pairs['G1G2_PCC'].between(-0.15,0.15)]

Create new columns in the dataframe to save the partial correlation information

In [23]:
# Append the (still empty) result columns for the third gene and for the
# three partial correlations / ratios, then renumber the rows from 0.
filtered_list = filtered_list.reindex(
    columns=[
        *filtered_list.columns.tolist(),
        'Gene3', "G1G2.G3_Partial", "G1G2.G3_Ratio",
        "G1G3_PCC", "G1G3.G2_Partial", "G1G3.G2_Ratio",
        "G2G3_PCC", "G2G3.G1_Partial", "G2G3.G1_Ratio",
    ]
).reset_index(drop=True)

In [24]:
# Display the candidate pairs; the partial-correlation columns are empty at this point.
filtered_list

Unnamed: 0,Gene1,Gene2,G1G2_PCC,Gene3,G1G2.G3_Partial,G1G2.G3_Ratio,G1G3_PCC,G1G3.G2_Partial,G1G3.G2_Ratio,G2G3_PCC,G2G3.G1_Partial,G2G3.G1_Ratio
0,TSC1,TSC2,0.841137,,,,,,,,,
1,KIDINS220,XPR1,0.785930,,,,,,,,,
2,OTUD5,UBR5,0.755237,,,,,,,,,
3,SUPT20H,TAF5L,0.744304,,,,,,,,,
4,DEPDC5,NPRL2,0.741498,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
49479,ACRV1,HTR3B,0.150004,,,,,,,,,
49480,CBY3,ZNF454,0.150004,,,,,,,,,
49481,CRMP1,NSG1,0.150003,,,,,,,,,
49482,POLQ,WAS,0.150001,,,,,,,,,


Calculate partial correlation coefficients for each pair with respect to all possible genes, and save the gene with the highest effect

Calculate ratios of correlation coefficients as described in methods

In [26]:
# For every (Gene1, Gene2) pair pick the third gene with the largest
# partial/plain correlation ratio and record the PCCs, partial correlations
# and ratios of all three pairings of the trio.
#
# Results are collected row by row and written back as whole columns, so no
# string is ever written into an all-NaN float column and the code does not
# depend on filtered_list having a 0..n-1 index.
trio_columns = ['Gene3', "G1G2.G3_Partial", "G1G2.G3_Ratio",
                "G1G3_PCC", "G1G3.G2_Partial", "G1G3.G2_Ratio",
                "G2G3_PCC", "G2G3.G1_Partial", "G2G3.G1_Ratio"]
trio_rows = []

for g1, g2 in zip(filtered_list['Gene1'], filtered_list['Gene2']):
    # get all partials for g1 g2 (sorted by ratio, largest first)
    partials_g1g2 = get_all_partials(g1, g2, cc)

    # the first row is the third gene with the highest ratio
    g3 = partials_g1g2.index[0]

    pcc_g1g3 = cc.loc[g1, g3]
    pcc_g2g3 = cc.loc[g2, g3]
    partial_g1g3_g2 = partial(g1, g3, g2, cc)
    partial_g2g3_g1 = partial(g2, g3, g1, cc)

    # tuple order must match trio_columns
    trio_rows.append((
        g3,
        partials_g1g2.iloc[0, 0],
        partial(g1, g2, g3, cc)**2 / cc.loc[g1, g2]**2,
        pcc_g1g3,
        partial_g1g3_g2,
        partial_g1g3_g2**2 / pcc_g1g3**2,
        pcc_g2g3,
        partial_g2g3_g1,
        partial_g2g3_g1**2 / pcc_g2g3**2,
    ))

trio_table = pd.DataFrame(trio_rows, columns=trio_columns, index=filtered_list.index)
for column in trio_columns:
    filtered_list[column] = trio_table[column]
    

In [27]:
# Number of candidate pairs and columns after the trio columns were filled.
filtered_list.shape

(49484, 12)

Filter the list such that at least one of the three ratios is >2.

In [34]:
# Keep trios in which at least one of the three ratios exceeds 2,
# then renumber the rows from 0.
moon = filtered_list[
    (filtered_list[['G1G2.G3_Ratio', 'G1G3.G2_Ratio', 'G2G3.G1_Ratio']] > 2).any(axis=1)
].reset_index(drop=True)
moon.shape

(9219, 12)

Remove trios in which two or more of the three PCC values are negative (the trios of interest have two positive PCC values and one negative PCC value):

In [55]:
# Drop every trio in which two or more of the three PCC values are negative.
# A single boolean mask replaces the row-by-row in-place drop, which copied
# the frame on every removal and required a 0..n-1 index.
# NOTE(review): trios with no negative PCC at all also pass this filter;
# confirm that is intended.
negative_pcc_count = (moon[['G1G2_PCC', 'G1G3_PCC', 'G2G3_PCC']] < 0).sum(axis=1)
moon = moon[negative_pcc_count < 2].reset_index(drop=True)
moon.shape

(9144, 13)

Write another column to specify which one is the moonlighting gene in the trio: 

In [54]:
# The moonlighter is the gene that does not take part in the negatively
# correlated pair. The rules are applied from lowest to highest priority so
# that, when several PCCs are negative, the G1G2 rule wins over G1G3, which
# wins over G2G3. Trios without any negative PCC keep the empty string.
moonlighter = pd.Series("", index=moon.index)
for pcc_column, gene_column in [('G2G3_PCC', 'Gene1'),
                                ('G1G3_PCC', 'Gene2'),
                                ('G1G2_PCC', 'Gene3')]:
    moonlighter = moonlighter.mask(moon[pcc_column] < 0, moon[gene_column])

moon['Moonlighter Gene'] = moonlighter
        

Further filter the moonlighting trios such that all ratios are greater than 1.135

In [44]:
# Keep trios in which all three ratios exceed 1.135, then renumber the rows.
stronger_moons = moon[
    (moon[['G1G2.G3_Ratio', 'G1G3.G2_Ratio', 'G2G3.G1_Ratio']] > 1.135).all(axis=1)
].reset_index(drop=True)
stronger_moons.shape

(1741, 13)

Sorting and removing duplicates:

Further filtering and duplicate removal can be done if needed

In [48]:
# sort_values returns a new dataframe, so the result has to be assigned back;
# calling it without assignment leaves the row order unchanged. A stable sort
# keeps the existing order among trios that share a moonlighter gene.
# NOTE(review): the row order decides which row a later drop_duplicates
# keeps, so counts recorded from runs without an effective sort may differ.
stronger_moons = stronger_moons.sort_values(by='Moonlighter Gene', kind='stable')

# Round the three PCC columns to 4 decimals.
for pcc_column in ('G1G2_PCC', 'G1G3_PCC', 'G2G3_PCC'):
    stronger_moons[pcc_column] = stronger_moons[pcc_column].astype(float).round(4)


In [46]:
# Row and column count before duplicate removal.
stronger_moons.shape

(1741, 13)

In [53]:
# Remove repeated trios in three passes. Each pass keeps the first row for
# every combination of one gene and the (rounded) PCC of the other two genes.
stronger_moons = (
    stronger_moons
    .drop_duplicates(subset=['Gene1', 'G2G3_PCC'])
    .drop_duplicates(subset=['Gene2', 'G1G3_PCC'])
    .drop_duplicates(subset=['Gene3', 'G1G2_PCC'])
    .reset_index(drop=True)
)
stronger_moons.shape

(1403, 13)