In [2]:
import pandas as pd
import numpy as np
import numpy.ma as ma
import pickle
from scipy.spatial import distance

# Data Preprocessing

In [3]:
proteomeHD_df = pd.read_csv('./data_sources/ProteomeHD_v1_1.csv')
proteomeHD_feature_names = [col for col in proteomeHD_df.columns if 'Ratio' in col]
proteomeHD_feature_matrix = proteomeHD_df[proteomeHD_feature_names].to_numpy()
# Keep only proteins quantified in at least 95 experiments 
rows_to_keep = [i for i in range(len(proteomeHD_feature_matrix)) if np.sum(~np.isnan(proteomeHD_feature_matrix[i])) >= 95]
proteomeHD_df = proteomeHD_df.iloc[rows_to_keep]
proteomeHD_feature_matrix = proteomeHD_df[proteomeHD_feature_names].to_numpy()

# Pearson Correlation

In [76]:
proteomeHD_pearson_corr = pd.DataFrame(proteomeHD_feature_matrix.T).corr()
proteomeHD_pearson_corr.columns = proteomeHD_df['Gene_names'].to_numpy()
pickle.dump(proteomeHD_pearson_corr, open("proteomeHD_pearson_corr.p", "wb"))

In [77]:
proteomeHD_pearson_corr

Unnamed: 0,RBM47,UBA6,ESYT2,KIAA1598,ARHGAP10,ILVBL,NBAS,DENND3,VWA8,SBNO1,...,MAP3K4,DDX49,CAPN7,WASF2,MAU2,ENPP4,ZHX2,MORC2,IVNS1ABP,SEC23IP
0,1.000000,-0.431411,-0.094415,0.455848,0.071829,0.384105,0.044797,-0.025830,0.036665,-0.214021,...,-0.220737,0.205795,0.278254,0.010129,0.006238,0.425007,0.449022,0.304477,-0.612468,-0.000718
1,-0.431411,1.000000,-0.159706,-0.091524,0.240889,-0.131397,-0.124255,-0.134933,-0.017076,-0.391442,...,0.019252,-0.336302,-0.057930,-0.140297,-0.505185,-0.084101,-0.509618,-0.602308,0.321049,0.334175
2,-0.094415,-0.159706,1.000000,0.138347,0.322340,0.361987,0.319078,0.607401,-0.270201,0.244635,...,-0.070447,-0.004480,-0.136747,0.487370,0.255446,0.222310,-0.152601,0.041395,-0.088946,-0.294995
3,0.455848,-0.091524,0.138347,1.000000,-0.289084,0.287946,0.252261,0.002012,0.078278,0.009210,...,-0.006613,-0.059126,0.368434,-0.031533,0.143856,-0.362575,0.367958,0.160309,-0.105514,0.273601
4,0.071829,0.240889,0.322340,-0.289084,1.000000,-0.011932,0.028757,0.497475,0.286646,-0.345136,...,0.074987,0.008556,-0.099811,-0.025742,-0.063591,0.237251,-0.239788,-0.140437,0.035715,-0.011225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5008,0.425007,-0.084101,0.222310,-0.362575,0.237251,0.046153,0.029913,0.341321,-0.136022,-0.006928,...,-0.011801,-0.079180,0.024860,0.384848,-0.334204,1.000000,-0.235221,-0.231404,-0.328365,-0.507690
5009,0.449022,-0.509618,-0.152601,0.367958,-0.239788,-0.026363,-0.127298,0.006246,-0.177634,0.341963,...,0.006577,0.304447,0.273708,-0.216538,0.168282,-0.235221,1.000000,0.669537,-0.441846,-0.215904
5010,0.304477,-0.602308,0.041395,0.160309,-0.140437,0.115064,0.056445,-0.260511,-0.139058,0.403243,...,0.018868,0.302319,0.070261,0.115117,0.498486,-0.231404,0.669537,1.000000,-0.008747,-0.062775
5011,-0.612468,0.321049,-0.088946,-0.105514,0.035715,-0.237479,0.138608,-0.134134,0.020129,0.393377,...,0.396069,-0.157808,-0.271083,0.373383,0.154335,-0.328365,-0.441846,-0.008747,1.000000,0.335174


# Cosine Similarity

In [4]:
proteomeHD_cosine_sim = np.empty((proteomeHD_feature_matrix.shape[0],proteomeHD_feature_matrix.shape[0]))
for i in range(len(proteomeHD_cosine_sim)):
    g1 = proteomeHD_cosine_sim[i]
    for j in range(len(proteomeHD_cosine_sim)):
        g2 = proteomeHD_cosine_sim[j]
        shared_index = np.where(~np.logical_or(np.isnan(g1),np.isnan(g2)))
        proteomeHD_cosine_sim[i,j] = distance.cosine(g1[shared_index],g2[shared_index])

In [9]:
B = np.array([2, 3, 4, 5.25, np.NaN, 100])
C = np.array([np.NaN,3,5,np.NaN,np.NaN,np.NaN])
shared_index = np.where(~np.logical_or(np.isnan(B),np.isnan(C)))

In [12]:
C[shared_index]

array([3., 5.])

In [68]:
A = [1, 2, 3, 4, 5, np.NaN]
B = [2, 3, 4, 5.25, np.NaN, 100]
C = [np.NaN,3,5,np.NaN,np.NaN,np.NaN]

all_masked = ma.masked_invalid(np.array([A,B,C]))

corr_test = ma.corrcoef(all_masked)
print(corr_test)

[[1.0 8.163509630537018 1.0]
 [8.163509630537018 1.0 1.0]
 [1.0 1.0 1.0]]


In [70]:
corr_test[1,0]

8.163509630537018

In [65]:
all_masked

masked_array(
  data=[[1.0, 2.0, 3.0, 4.0, 5.0, --],
        [2.0, 3.0, 4.0, 5.25, --, 100.0],
        [--, --, --, --, --, --]],
  mask=[[False, False, False, False, False,  True],
        [False, False, False, False,  True, False],
        [ True,  True,  True,  True,  True,  True]],
  fill_value=1e+20)

In [64]:
A = [1, 2, 3, 4, 5, np.NaN]
B = [2, 3, 4, 5.25, np.NaN, 100]

print(ma.corrcoef(ma.masked_invalid(A), ma.masked_invalid(B)))

[[1.0 0.99838143945703]
 [0.99838143945703 1.0]]


In [62]:
all_masked

NameError: name 'all_masked' is not defined

In [52]:
all

array([[  1.  ,   2.  ,   3.  ,   4.  ,   5.  ,    nan],
       [  2.  ,   3.  ,   4.  ,   5.25,    nan, 100.  ],
       [  5.  ,    nan,    nan,    nan,    nan,    nan]])

In [46]:
test_mask = ma.masked_invalid(proteomeHD_feature_matrix)
test_mask

masked_array(
  data=[[-0.27417159739853, --, -0.52595032717654, ..., --, --, --],
        [-0.49353328343526504, 0.660839594607945, -0.229160422469731,
         ..., -0.23352518491260202, -0.447211404750219,
         0.28350680549737295],
        [-0.9973247935546, --, --, ..., 0.12339527179752302,
         -0.336180938653175, 0.47882133187683096],
        ...,
        [0.0014285990454098699, -0.639854352993868, 0.20637246326225,
         ..., -0.38426005804287894, --, --],
        [--, --, --, ..., --, --, --],
        [-0.345849093341339, --, 0.0630522421837596, ...,
         0.150796069705211, -0.35406344171976, -0.542727632709854]],
  mask=[[False,  True, False, ...,  True,  True,  True],
        [False, False, False, ..., False, False, False],
        [False,  True,  True, ..., False, False, False],
        ...,
        [False, False, False, ..., False,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [False,  True, False, ..., False, False, False]]

In [42]:
len(proteomeHD_df['Simplified_protein_ID'].unique())

4976

In [44]:
len(proteomeHD_df['Gene_names'].unique())

4959

In [6]:
unique_gene_names = np.unique(proteomeHD_df['Gene_names'].to_numpy())
unique_protein_names = np.unique(proteomeHD_df['Simplified_protein_ID'].to_numpy())
print(len(unique_gene_names))
print(len(unique_protein_names))

TypeError: '<' not supported between instances of 'float' and 'str'

In [9]:
len(proteomeHD_df['Gene_names'].unique())

9857

In [10]:
len(proteomeHD_df['Simplified_protein_ID'].unique())

9987