In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Project root; every dataset path in this notebook is built as `path + 'Datasets/...'`.
# Defaults to the original EC2 location, but can be redirected without editing the
# notebook by setting the ML_NOTEBOOKS_DIR environment variable.
# os.path.join(..., '') guarantees the trailing separator those concatenations rely on.
path = os.path.join(os.environ.get('ML_NOTEBOOKS_DIR') or '/home/ec2-user/MLNotebooks/', '')

In [3]:
# load the localization label table (tab-separated text file)
labels_file = path + 'Datasets/SubCellBarcode.MCF7.0622.txt'
labels = pd.read_csv(labels_file, sep='\t')

In [4]:
# the label file calls its key column 'Protein'; rename it to 'Gene', the key
# used for the merges further down
labels = labels.rename({'Protein': 'Gene'}, axis='columns')
# preview only: the Gene-indexed copy is displayed, `labels` keeps 'Gene' as a column
labels.set_index('Gene')

Unnamed: 0_level_0,Localization
Gene,Unnamed: 1_level_1
ABR,Cytosol
ABTB1,Cytosol
ACTR1A,Cytosol
ACTR2,Cytosol
ACTR3,Cytosol
...,...
ZNF813,Unclassified
ZNF816,Unclassified
ZNF827,Unclassified
ZSWIM1,Unclassified


In [5]:
# keep only the proteins whose localization is not 'Unclassified'
is_unclassified = labels['Localization'] == 'Unclassified'
labels = labels.loc[~is_unclassified]
print(labels)

        Gene Localization
0        ABR      Cytosol
1      ABTB1      Cytosol
2     ACTR1A      Cytosol
3      ACTR2      Cytosol
4      ACTR3      Cytosol
...      ...          ...
9440  ZNF471    Secretory
9441  ZNF587    Secretory
9442   ZNF93    Secretory
9443   ZNRF1    Secretory
9444    ZW10    Secretory

[9445 rows x 2 columns]


In [6]:
### build the combined proteome + transcriptome matrix used as the training set
kr_proteome = pd.read_csv(path + 'Datasets/kr_pro_z.csv')
kr_transcriptome = pd.read_csv(path + 'Datasets/kr_rna_z.csv')

# inner-join the two tables on Gene (column names present in both tables get the
# _prot / _mrna suffix), index the result by Gene, then drop every row that
# contains a missing value
kr = (
    pd.merge(kr_proteome, kr_transcriptome, how='inner', on='Gene', suffixes=('_prot', '_mrna'))
    .set_index('Gene')
    .dropna(axis=0)
)

kr
# df shape 7460x244 fully quant

Unnamed: 0_level_0,CPT000814_prot,CPT001846_prot,X01BR001_prot,X01BR008_prot,X01BR009_prot,X01BR018_prot,X01BR020_prot,X01BR026_prot,X01BR027_prot,X01BR031_prot,...,X13BR009_mrna,X16BR012_mrna,X18BR003_mrna,X18BR006_mrna,X21BR001_mrna,X06BR006_mrna,X09BR004_mrna,X11BR058_mrna,X11BR059_mrna,X18BR009_mrna
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,-0.987411,0.510677,0.963885,-0.884379,0.408660,0.917514,-0.619700,1.194873,-0.684982,-0.329661,...,-0.018198,0.218828,0.144107,0.764447,0.220329,-0.714911,0.663401,0.910886,-0.389751,-0.077204
A2M,-0.438113,0.522434,0.707772,-0.104417,1.845912,1.202257,-0.671425,1.869524,0.736195,0.067866,...,-1.378654,-0.219278,-0.343996,0.083925,-0.327921,1.889726,-1.360694,0.604746,1.485863,0.909722
A2ML1,1.661429,-0.544436,-0.041102,1.280762,1.663777,1.202683,-0.110906,2.219136,2.370290,0.652244,...,-0.944147,-0.620095,-0.869702,-0.552137,-1.064193,0.569962,-0.232299,0.578424,0.192208,0.950777
AAAS,2.207973,1.134415,0.397335,0.667814,-0.678440,-0.057840,-0.031918,-0.626766,1.597606,1.044369,...,0.772149,0.821988,1.053310,0.108760,1.795527,-0.529808,1.399996,0.475302,0.110150,-0.811366
AACS,-0.245457,-0.604077,-0.235807,-1.317636,-0.629167,-1.635303,-0.258907,-1.174333,-0.816438,-0.356915,...,0.475600,0.164188,-0.910929,-0.391370,0.622088,0.683623,-0.360852,2.476981,0.569896,-1.812279
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZWINT,1.371329,0.083257,-0.171917,0.860499,-0.081454,0.925838,1.335048,-1.007033,0.628282,-0.120866,...,-0.326715,1.527251,0.207676,0.630735,0.411982,-0.698508,-0.812772,-0.435766,1.195212,0.179110
ZYG11B,1.409412,0.078361,0.743565,-0.461876,0.753376,0.562789,-0.954989,1.885959,-0.128470,1.061210,...,0.630116,-0.601207,-0.525623,0.243973,-1.139624,0.258679,-0.750952,0.130281,-0.019938,0.156530
ZYX,-1.176195,2.096937,0.340245,-0.645866,-0.185159,-0.899925,0.091207,1.277302,-0.592755,0.402433,...,-0.214509,0.443438,-0.546257,0.069830,0.649147,-0.131948,0.394178,1.710765,0.612839,0.356367
ZZEF1,-2.011132,-0.237296,-2.132052,-0.001250,-0.153514,-0.034971,-1.830495,0.957046,-1.300320,-0.294340,...,0.971523,0.802856,0.602635,0.299360,2.183494,-0.308316,0.393773,0.362344,-0.108846,0.649340


### Approach 1: randomly shuffle the abundance values within each row (gene)

In [7]:
# Define a function to shuffle the elements of a row
def shuffle_row(row, rng=None):
    """Return a randomly permuted copy of ``row``.

    Parameters
    ----------
    row : array-like
        Values to shuffle. The input itself is not modified.
    rng : numpy.random.Generator, optional
        Random generator to draw the permutation from. When omitted, NumPy's
        global legacy RNG is used (seed it with ``np.random.seed`` to get
        repeatable output).

    Returns
    -------
    numpy.ndarray
        The values of ``row`` in random order.
    """
    if rng is None:
        return np.random.permutation(row)
    return rng.permutation(row)

# Seed NumPy's global RNG (the one shuffle_row draws from by default) so the
# scrambled matrix, and the CSV exported from it, is identical on every run.
np.random.seed(42)

# Shuffle each row (gene) independently and collect the results in one 2-D array.
# This replaces the row-wise DataFrame.apply + .apply(pd.Series) round-trip, which
# built one Series per row only to unpack it again.
# reshape(kr.shape) keeps the array 2-D even if kr has no rows.
shuffled_values = np.array(
    [shuffle_row(gene_values) for gene_values in kr.to_numpy()]
).reshape(kr.shape)

# put the shuffled values back into a df with generic value_<n> column names
kr_random = pd.DataFrame(shuffled_values)
kr_random.columns = [f'value_{i}' for i in range(1, kr_random.shape[1] + 1)]

# add gene column back to kr_random (same row order as kr)
kr_random['Gene'] = kr.index

# merge the label; the Gene-indexed frame on the last line is only displayed,
# kr_random_labeled itself keeps 'Gene' as a regular column
kr_random_labeled = kr_random.merge(labels, how='inner', on='Gene')
kr_random_labeled.set_index('Gene')
# 6173x245

Unnamed: 0_level_0,value_1,value_2,value_3,value_4,value_5,value_6,value_7,value_8,value_9,value_10,...,value_236,value_237,value_238,value_239,value_240,value_241,value_242,value_243,value_244,Localization
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,-0.684982,1.487665,-0.873946,-0.372234,-2.368934,0.218828,-2.368934,0.929104,0.764447,-0.753597,...,-0.016364,-0.087714,-0.477134,-0.128742,-0.615642,-0.161130,0.526370,1.596059,0.315470,Cytosol
A2ML1,-1.107029,-1.073807,-0.530867,1.565693,1.826199,0.047460,1.807365,-0.407958,1.515778,-0.111325,...,1.360732,0.426785,0.695144,-0.385205,0.519065,-0.552137,-0.833298,-0.700035,-0.129245,Secretory
AAAS,-1.469593,0.267608,0.888665,0.322297,0.017423,-0.233839,0.064879,-2.761225,-0.400969,0.040059,...,0.540231,0.716552,-3.560761,-0.607247,-1.043570,0.886321,-0.261554,-1.775363,0.258673,Secretory
AACS,0.073879,-0.892244,-0.725788,-1.001737,-0.745449,-0.711003,-1.291848,0.315784,0.059430,-0.990152,...,1.275181,-0.950995,-1.635303,0.646752,-0.263913,-0.519672,-0.346527,0.955299,-1.226926,Cytosol
AAGAB,-0.267219,0.017855,-1.201778,0.355585,0.345680,-0.343336,-1.093600,1.840910,-0.753951,2.106567,...,-1.517207,0.814337,-0.798370,-0.010722,0.493326,0.255390,-1.034628,-2.386296,0.893765,Cytosol
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZWINT,0.364198,-0.251383,-0.090364,-1.580633,-0.382703,-1.433946,0.000762,0.584856,0.067344,1.215688,...,0.897713,0.626282,-0.120866,0.310535,-1.268548,0.763695,-0.991879,0.753180,-1.104949,Cytosol
ZYG11B,-0.267268,0.436254,0.498939,-0.169000,0.054381,0.684861,0.243973,0.188853,0.515633,-0.268072,...,-0.351545,-0.893423,-0.128470,-0.153554,-0.926039,1.061210,0.761900,0.078258,-0.025698,Cytosol
ZYX,0.430146,-0.163284,2.525005,-0.517436,-1.176195,0.077809,-0.592755,-0.602798,1.519581,-2.204408,...,1.275467,0.438003,1.277302,-0.813483,0.129833,-0.630090,0.876755,0.748244,0.469062,Cytosol
ZZEF1,0.316014,0.521794,-2.226008,-0.001160,-0.809214,1.964616,-1.524631,0.563834,-0.331606,1.322033,...,2.183494,-0.315731,0.362344,-0.836102,-0.453288,-2.041139,-0.466955,-0.179343,-0.059386,Cytosol


In [8]:
# write the row-scrambled, labeled table to disk
# NOTE: kr_random_labeled still carries its default integer index at this point
# (the set_index('Gene') call above is not assigned), so to_csv writes that index
# as the first, unnamed column of the file.
scrambled_csv = path + "Datasets/kr_ScrambledRow_0922.csv"
kr_random_labeled.to_csv(scrambled_csv)

### Approach 2: Keep the original patient (sample) order

In [9]:
# attach the localization label to the unshuffled matrix
kr_keep_labeled = pd.merge(kr, labels, how='inner', on='Gene')
# preview with Gene as the index; kr_keep_labeled itself is not modified
kr_keep_labeled.set_index('Gene')
# 6173x245

Unnamed: 0_level_0,CPT000814_prot,CPT001846_prot,X01BR001_prot,X01BR008_prot,X01BR009_prot,X01BR018_prot,X01BR020_prot,X01BR026_prot,X01BR027_prot,X01BR031_prot,...,X16BR012_mrna,X18BR003_mrna,X18BR006_mrna,X21BR001_mrna,X06BR006_mrna,X09BR004_mrna,X11BR058_mrna,X11BR059_mrna,X18BR009_mrna,Localization
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,-0.987411,0.510677,0.963885,-0.884379,0.408660,0.917514,-0.619700,1.194873,-0.684982,-0.329661,...,0.218828,0.144107,0.764447,0.220329,-0.714911,0.663401,0.910886,-0.389751,-0.077204,Cytosol
A2ML1,1.661429,-0.544436,-0.041102,1.280762,1.663777,1.202683,-0.110906,2.219136,2.370290,0.652244,...,-0.620095,-0.869702,-0.552137,-1.064193,0.569962,-0.232299,0.578424,0.192208,0.950777,Secretory
AAAS,2.207973,1.134415,0.397335,0.667814,-0.678440,-0.057840,-0.031918,-0.626766,1.597606,1.044369,...,0.821988,1.053310,0.108760,1.795527,-0.529808,1.399996,0.475302,0.110150,-0.811366,Secretory
AACS,-0.245457,-0.604077,-0.235807,-1.317636,-0.629167,-1.635303,-0.258907,-1.174333,-0.816438,-0.356915,...,0.164188,-0.910929,-0.391370,0.622088,0.683623,-0.360852,2.476981,0.569896,-1.812279,Cytosol
AAGAB,-0.583678,-0.826627,-1.386869,-1.335151,-2.354911,-0.718082,-0.080367,-0.536340,-0.133962,-1.627420,...,-0.683927,0.689205,-0.241635,0.163983,-2.243191,-2.240611,-1.676689,1.603091,-0.974182,Cytosol
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZWINT,1.371329,0.083257,-0.171917,0.860499,-0.081454,0.925838,1.335048,-1.007033,0.628282,-0.120866,...,1.527251,0.207676,0.630735,0.411982,-0.698508,-0.812772,-0.435766,1.195212,0.179110,Cytosol
ZYG11B,1.409412,0.078361,0.743565,-0.461876,0.753376,0.562789,-0.954989,1.885959,-0.128470,1.061210,...,-0.601207,-0.525623,0.243973,-1.139624,0.258679,-0.750952,0.130281,-0.019938,0.156530,Cytosol
ZYX,-1.176195,2.096937,0.340245,-0.645866,-0.185159,-0.899925,0.091207,1.277302,-0.592755,0.402433,...,0.443438,-0.546257,0.069830,0.649147,-0.131948,0.394178,1.710765,0.612839,0.356367,Cytosol
ZZEF1,-2.011132,-0.237296,-2.132052,-0.001250,-0.153514,-0.034971,-1.830495,0.957046,-1.300320,-0.294340,...,0.802856,0.602635,0.299360,2.183494,-0.308316,0.393773,0.362344,-0.108846,0.649340,Cytosol


In [10]:
# write the labeled table with the original column order to disk
# NOTE: the set_index('Gene') call above is not assigned, so kr_keep_labeled is
# written together with whatever index the merge produced.
keep_order_csv = path + "Datasets/kr_KeepPatientOrder_0922.csv"
kr_keep_labeled.to_csv(keep_order_csv)

### Approach 3: use linear regression residuals to represent the relationship between transcript and protein

In [11]:
# split independent X and dependent values Y
# kr holds the proteome columns first and the transcriptome columns after them
# (left/right order of the merge that built kr). The split point is therefore the
# number of non-key proteome columns, i.e. every kr_proteome column except 'Gene',
# rather than a hard-coded 122.
n_prot_cols = kr_proteome.shape[1] - 1
X = kr.iloc[:, n_prot_cols:]  # transcriptome (mRNA) columns
y = kr.iloc[:, :n_prot_cols]  # proteome columns

In [12]:
import statsmodels.api as sm

# least-squares fit of the protein columns (y) on the mRNA columns (X);
# no constant column is added to X here
# [prot ~ mRNA + resid]
ols_model = sm.OLS(endog=y, exog=X)
reg = ols_model.fit()

In [13]:
# residuals of the fitted regression `reg`; the rendered output below shows one
# row per Gene and one column per *_prot sample
kr_residual = reg.resid
kr_residual

Unnamed: 0_level_0,CPT000814_prot,CPT001846_prot,X01BR001_prot,X01BR008_prot,X01BR009_prot,X01BR018_prot,X01BR020_prot,X01BR026_prot,X01BR027_prot,X01BR031_prot,...,X13BR009_prot,X16BR012_prot,X18BR003_prot,X18BR006_prot,X21BR001_prot,X06BR006_prot,X09BR004_prot,X11BR058_prot,X11BR059_prot,X18BR009_prot
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,-0.428397,0.632741,0.714860,-0.216814,0.003049,1.571048,-0.104028,0.939949,-0.299244,0.309520,...,-0.364714,-0.404542,0.447681,-0.708321,-0.783076,0.413488,-0.548017,-1.168481,0.602587,0.393177
A2M,0.484823,0.680691,0.882404,0.083628,1.542729,1.160896,-0.060047,0.635415,0.379577,0.058821,...,0.088124,-0.886540,0.893283,0.077576,-0.499725,0.133153,0.348082,-1.745231,-0.956153,0.428676
A2ML1,0.177501,-0.385697,-0.798055,0.211434,0.567466,0.122445,-0.379474,1.563342,0.802457,-0.186343,...,0.483736,-1.632262,0.374781,-0.821216,-0.030467,0.068524,-0.403067,0.060489,-0.972529,0.491907
AAAS,2.747478,1.864533,0.846761,1.005282,-0.457167,0.810961,-0.323901,-0.805313,1.744198,1.524762,...,-0.015438,0.316267,-1.218437,-1.197569,1.607351,-0.346654,0.954192,0.053741,0.429967,0.554172
AACS,-0.067016,0.006335,0.064977,-0.382847,-0.908316,-0.497448,-0.616533,-0.699774,-0.225334,-0.409042,...,-0.394950,0.432241,0.440479,-0.537804,-0.147735,-0.109245,0.397111,2.050887,-0.302141,-0.590884
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZWINT,0.801067,0.855827,0.729429,-0.205681,0.174815,0.284026,-0.138617,-0.796412,0.336887,-0.387944,...,-1.076471,-0.387472,-0.110248,-0.309184,-1.229598,-0.268905,1.576081,-0.469565,0.935214,0.055569
ZYG11B,2.584546,-0.087429,1.138499,0.232528,0.936461,0.699567,-0.961456,1.254038,1.067808,0.307376,...,1.279602,0.086629,-0.747837,0.790458,0.356393,0.544993,2.490441,0.096187,-2.925766,0.420583
ZYX,0.050541,1.777402,0.333086,-0.515380,0.019564,-0.319402,0.346951,0.617725,-0.347883,0.053035,...,0.214857,-0.353297,-0.823292,0.501815,-0.842878,0.017274,-0.179888,0.224492,0.163504,-0.380900
ZZEF1,-0.986692,0.350519,-1.923514,0.081334,-0.572563,0.202120,-1.675649,0.844309,-0.690597,-0.578376,...,1.594051,0.405646,1.317365,0.219886,1.768327,-1.989569,0.569998,0.388572,0.124167,0.028789


In [14]:
# attach the localization label to the residual table
kr_residual_labeled = pd.merge(kr_residual, labels, how='inner', on='Gene')
# preview with Gene as the index; kr_residual_labeled itself is not modified
kr_residual_labeled.set_index('Gene')
# 6173x123

Unnamed: 0_level_0,CPT000814_prot,CPT001846_prot,X01BR001_prot,X01BR008_prot,X01BR009_prot,X01BR018_prot,X01BR020_prot,X01BR026_prot,X01BR027_prot,X01BR031_prot,...,X16BR012_prot,X18BR003_prot,X18BR006_prot,X21BR001_prot,X06BR006_prot,X09BR004_prot,X11BR058_prot,X11BR059_prot,X18BR009_prot,Localization
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,-0.428397,0.632741,0.714860,-0.216814,0.003049,1.571048,-0.104028,0.939949,-0.299244,0.309520,...,-0.404542,0.447681,-0.708321,-0.783076,0.413488,-0.548017,-1.168481,0.602587,0.393177,Cytosol
A2ML1,0.177501,-0.385697,-0.798055,0.211434,0.567466,0.122445,-0.379474,1.563342,0.802457,-0.186343,...,-1.632262,0.374781,-0.821216,-0.030467,0.068524,-0.403067,0.060489,-0.972529,0.491907,Secretory
AAAS,2.747478,1.864533,0.846761,1.005282,-0.457167,0.810961,-0.323901,-0.805313,1.744198,1.524762,...,0.316267,-1.218437,-1.197569,1.607351,-0.346654,0.954192,0.053741,0.429967,0.554172,Secretory
AACS,-0.067016,0.006335,0.064977,-0.382847,-0.908316,-0.497448,-0.616533,-0.699774,-0.225334,-0.409042,...,0.432241,0.440479,-0.537804,-0.147735,-0.109245,0.397111,2.050887,-0.302141,-0.590884,Cytosol
AAGAB,0.924279,0.111016,-0.000608,-1.078924,-1.369541,-0.914368,-0.104404,0.034012,0.218586,-1.288338,...,-0.153103,0.419460,0.093975,-1.169247,-0.580612,-0.253193,0.952608,1.052753,-0.412946,Cytosol
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZWINT,0.801067,0.855827,0.729429,-0.205681,0.174815,0.284026,-0.138617,-0.796412,0.336887,-0.387944,...,-0.387472,-0.110248,-0.309184,-1.229598,-0.268905,1.576081,-0.469565,0.935214,0.055569,Cytosol
ZYG11B,2.584546,-0.087429,1.138499,0.232528,0.936461,0.699567,-0.961456,1.254038,1.067808,0.307376,...,0.086629,-0.747837,0.790458,0.356393,0.544993,2.490441,0.096187,-2.925766,0.420583,Cytosol
ZYX,0.050541,1.777402,0.333086,-0.515380,0.019564,-0.319402,0.346951,0.617725,-0.347883,0.053035,...,-0.353297,-0.823292,0.501815,-0.842878,0.017274,-0.179888,0.224492,0.163504,-0.380900,Cytosol
ZZEF1,-0.986692,0.350519,-1.923514,0.081334,-0.572563,0.202120,-1.675649,0.844309,-0.690597,-0.578376,...,0.405646,1.317365,0.219886,1.768327,-1.989569,0.569998,0.388572,0.124167,0.028789,Cytosol


In [15]:
# write the labeled residual table to disk
# NOTE: the set_index('Gene') call above is not assigned, so kr_residual_labeled
# is written together with whatever index the merge produced.
residuals_csv = path + "Datasets/kr_Residuals_0922.csv"
kr_residual_labeled.to_csv(residuals_csv)