# Correlation between the fission yeast transcriptome and proteome

In [251]:
%matplotlib inline
import re

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
plt.style.use('ggplot')

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Loading data

In [252]:
# locations of the two input tables
path_data_mRNA = "data/pat1_average_modified.txt"
path_data_prot = "data/wtratioall.csv"

# mRNA table: tab-delimited file without a header row
raw_data_mRNA = pd.read_csv(path_data_mRNA, delimiter='\t', header=None)
# protein table: comma-delimited file, first column becomes the row index
raw_data_prot = pd.read_csv(path_data_prot, index_col=0)

In [253]:
# first five rows of the raw mRNA table
raw_data_mRNA.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,aap1,AAP1,SPBC1652.02 SPBC16A3.20C,1.0,22.703531,0.96036,0.54446,0.896232,1.160251,1.252143,2.309153,2.31075,2.449327,2.706837,3.064384
1,abc1: C2D10.18,ABC1,SPBC2D10.18,1.0,1.043491,0.622283,0.663191,0.619409,0.648403,0.662236,0.655953,0.580761,0.606205,0.752322,0.798264
2,abc1: C9E9.12c,ABC1,SPAC9E9.12C,1.0,2.112546,0.906822,0.565746,0.53522,1.224398,10.170195,7.380023,4.455168,2.804587,1.657471,1.31063
3,abp1,CBP1 ABP1,SPBC1105.04C,1.0,1.351277,0.884985,0.705397,0.401055,0.202862,0.17536,0.82992,0.959749,1.121036,1.122949,1.103435
4,abp2,ABP2,SPBC1861.02,1.0,0.816301,1.969788,1.736116,1.057102,0.494822,0.372226,0.533438,0.851486,0.989702,1.115665,1.159631


In [254]:
# first five rows of the raw protein table
raw_data_prot.head(n=5)

Unnamed: 0,A_00,A_01,A_02,A_03,A_04,A_05,A_06,A_07,A_08,A_09,...,C_01,C_02,C_03,C_04,C_05,C_06,C_07,C_08,C_09,C_10
SPAC1002.02,1,1.451188,3.734827,6.181905,7.607555,5.215707,3.439898,2.598756,2.18323,1.81129,...,1.555395,3.67654,6.030714,6.63578,5.153358,3.210663,2.511956,1.853247,1.525539,1.33268
SPAC1002.03c,1,0.922745,0.763064,0.74945,0.742595,0.771641,0.819817,0.859544,0.849914,0.838128,...,0.901151,0.828585,0.779982,0.767321,0.813157,0.830826,0.875965,0.938763,0.819587,0.850455
SPAC1002.04c,1,1.063774,1.109541,1.105289,1.256196,1.193269,0.834514,0.698355,0.71428,0.716728,...,0.834014,0.82909,0.911089,1.179868,1.106116,0.8248,0.652223,0.654474,0.618405,0.651751
SPAC1002.07c,1,0.946289,1.270676,1.577819,2.059521,2.07945,2.026755,1.934475,1.886784,1.820797,...,1.145764,1.396319,1.819543,2.141494,2.219369,2.134959,2.000173,1.98348,1.959289,1.90266
SPAC1002.09c,1,0.909058,0.871595,0.90703,0.941867,0.963464,0.963832,0.978117,1.026633,1.043288,...,0.917147,0.879789,0.881237,0.897882,0.903383,0.907655,0.899867,0.909359,0.945886,0.975173


Processing protein data

In [255]:
# Split the protein table into 3 consecutive blocks of 11 columns, taken by
# column position, and strip the labels so the blocks add up element-wise.
# NOTE(review): the preview only shows the A_* and C_* headers; the middle
# block is presumably the B series -- confirm against the file.
# `.iloc` is used instead of `.ix`, which was deprecated in pandas 0.20 and
# removed in pandas 1.0; for these integer slices on string-labelled columns
# both select the same positions.
data_prot_A = pd.DataFrame(raw_data_prot.iloc[:, 0:11].values)
data_prot_B = pd.DataFrame(raw_data_prot.iloc[:, 11:22].values)
data_prot_C = pd.DataFrame(raw_data_prot.iloc[:, 22:33].values)

# element-wise average of the 3 data sets (the sum is already a new frame,
# so no extra copy is needed)
data_prot = (data_prot_A + data_prot_B + data_prot_C) / 3.0

# name the columns plt0..plt10 and reuse the row index of the raw table
data_prot.columns = ['plt' + str(k) for k in range(0, 11)]
data_prot.index = raw_data_prot.index

# show beginning of dataframe
data_prot.head()

Unnamed: 0,plt0,plt1,plt2,plt3,plt4,plt5,plt6,plt7,plt8,plt9,plt10
SPAC1002.02,1.0,1.393371,3.257203,5.591645,6.932151,5.479337,3.574659,2.726113,2.114409,1.764191,1.439884
SPAC1002.03c,1.0,0.910152,0.785974,0.750362,0.746,0.770242,0.800074,0.843706,0.872157,0.819914,0.839864
SPAC1002.04c,1.0,0.990121,0.980455,1.050549,1.248047,1.217455,0.880529,0.704999,0.712103,0.701598,0.795477
SPAC1002.07c,1.0,1.046894,1.307253,1.640936,2.070811,2.12554,2.078482,1.9979,1.900363,1.857979,1.780422
SPAC1002.09c,1.0,0.904997,0.85532,0.872414,0.904201,0.923575,0.93211,0.93983,0.961271,0.991512,1.018891


Processing mRNA data

In [256]:
# column labels for the 12 remaining value columns
mRNA_columns = ['mlt%d' % k for k in range(12)]

# discard columns 0 and 1 and use column 2 as the row index
# (TODO: need to check if dropping columns 0 and 1 removes useful names)
data_mRNA = raw_data_mRNA.drop([0, 1], axis=1).set_index(2)
data_mRNA.index.name = None
data_mRNA.columns = mRNA_columns

# keep only the rows that hold at least one value
data_mRNA = data_mRNA.dropna(how='all')

# show beginning of dataframe
data_mRNA.head()

Unnamed: 0,mlt0,mlt1,mlt2,mlt3,mlt4,mlt5,mlt6,mlt7,mlt8,mlt9,mlt10,mlt11
SPBC1652.02 SPBC16A3.20C,1.0,22.703531,0.96036,0.54446,0.896232,1.160251,1.252143,2.309153,2.31075,2.449327,2.706837,3.064384
SPBC2D10.18,1.0,1.043491,0.622283,0.663191,0.619409,0.648403,0.662236,0.655953,0.580761,0.606205,0.752322,0.798264
SPAC9E9.12C,1.0,2.112546,0.906822,0.565746,0.53522,1.224398,10.170195,7.380023,4.455168,2.804587,1.657471,1.31063
SPBC1105.04C,1.0,1.351277,0.884985,0.705397,0.401055,0.202862,0.17536,0.82992,0.959749,1.121036,1.122949,1.103435
SPBC1861.02,1.0,0.816301,1.969788,1.736116,1.057102,0.494822,0.372226,0.533438,0.851486,0.989702,1.115665,1.159631


Some identifiers appear more than once in the mRNA data; these duplicated entries need to be dealt with.

In [257]:
# index labels that occur more than once in the processed mRNA table
repeated_ids = data_mRNA.index[data_mRNA.index.duplicated()]

# every raw row whose column 2 carries one of those labels, ordered by column 2
is_repeated = raw_data_mRNA[2].isin(repeated_ids)
duplicated_data_mRNA = raw_data_mRNA[is_repeated].sort_values(by=2)

# save duplicated mRNA data
duplicated_data_mRNA.to_csv('data/duplicated_mRNA.csv')

# show preview
duplicated_data_mRNA.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
3590,cox4,COX4,SPAC1296.02,1.0,1.180228,0.838821,0.896886,0.820098,0.888293,1.23681,0.459174,0.457562,0.383835,0.39315,0.420465
3815,I18_cox4,COX4,SPAC1296.02,1.0,2.53896,1.66195,1.57096,1.67528,1.84294,2.96642,1.721756,,1.75828,1.69303,1.982149
507,C13G7.02c.B,,SPAC13G7.02C,1.0,0.480851,1.82155,2.200407,2.647362,3.204608,2.962069,0.816963,0.817025,0.790903,1.049258,1.244611
505,C13G7.02c,,SPAC13G7.02C,,,,,,,,,,,,
506,C13G7.02c.A,,SPAC13G7.02C,1.0,1.139014,5.82592,7.595293,19.73755,24.382723,27.60216,7.971517,6.919817,9.305873,12.082448,14.4608


Extracting data with common index between the protein and mRNA data

In [286]:
# Build one alternation pattern out of all protein index labels.
# Each label is escaped first: the labels contain '.', which would otherwise
# act as the regex wildcard and match any character instead of a literal dot.
pattern = '|'.join(re.escape(name) for name in data_prot.index.values)

# True for every mRNA row whose index label contains at least one protein
# label (case-insensitive substring match; missing labels count as no match)
filter_ = data_mRNA.index.str.contains(pattern, case=False, na=False)

# keep only the mRNA rows with a corresponding protein label
data_mRNA = data_mRNA[filter_]

# show preview
data_mRNA.head()

Unnamed: 0,mlt0,mlt1,mlt2,mlt3,mlt4,mlt5,mlt6,mlt7,mlt8,mlt9,mlt10,mlt11
SPBC2D10.18,1.0,1.043491,0.622283,0.663191,0.619409,0.648403,0.662236,0.655953,0.580761,0.606205,0.752322,0.798264
SPBC1105.04C,1.0,1.351277,0.884985,0.705397,0.401055,0.202862,0.17536,0.82992,0.959749,1.121036,1.122949,1.103435
SPBC32H8.12C,1.0,0.621846,0.325697,0.390388,0.419048,0.435039,0.539601,0.403898,0.337072,0.281851,0.265122,0.281973
SPAC630.03,1.0,0.945575,0.576106,0.698916,0.638058,0.42313,0.352959,0.746192,0.82919,0.843786,0.889201,0.842393
SPBC106.04,1.0,0.871751,0.869876,1.023361,0.727945,0.261486,0.14392,0.437443,0.805628,0.984534,0.974699,1.019219


In [287]:
# comparing size of protein data and filtered mRNA data
print('Size of the protein data:', data_prot.shape)
print('Size of the filtered mRNA data:', data_mRNA.shape)
print('Lost entries:', data_prot.shape[0]-data_mRNA.shape[0])
# The recorded output of this cell ends with a uniqueness line that none of
# the statements above print; this statement makes the cell reproduce it.
print('Is the key of data_mRNA unique ?', data_mRNA.index.is_unique)

Size of the protein data: (2969, 11)
Size of the filtered mRNA data: (2898, 12)
Lost entries: 71
Is the key of data_mRNA unique ? False


In [310]:
# first 20 mRNA rows selected by the positions stored in `idxname`
# NOTE(review): `idxname` is assigned by the matching loop in a cell that
# appears further down in this notebook, so running the cells top to bottom
# on a fresh kernel raises NameError here.
data_mRNA.iloc[idxname.tolist()].head(n=20)

Unnamed: 0,mlt0,mlt1,mlt2,mlt3,mlt4,mlt5,mlt6,mlt7,mlt8,mlt9,mlt10,mlt11
SPAC1002.02,1.0,1.0353,2.905877,4.153496,4.78312,2.603015,1.64706,1.48123,1.616345,1.347246,1.122512,1.064692
SPAC1002.03C,1.0,0.958027,0.676712,0.703215,0.747138,0.86194,1.257074,1.235414,0.950157,0.988842,1.161624,1.166776
SPAC1002.04C,1.0,1.133477,1.140642,1.189899,1.622498,2.308321,3.020603,1.853997,1.714919,1.682955,1.530989,1.539161
SPAC1002.07C,1.0,0.783431,1.362667,1.602399,1.836938,1.938045,1.969526,0.570085,0.593492,0.524142,0.43759,0.482167
SPAC1002.09C,1.0,0.728322,0.701878,1.35481,1.140213,0.637736,0.368306,0.276762,0.236933,0.258554,0.260154,0.297513
SPAC1002.12C,1.0,1.608574,0.740508,1.210665,1.416733,1.500539,1.063208,0.820899,0.702938,0.905588,1.085427,1.362559
SPAC1002.13C,1.0,0.672564,1.84892,1.496198,0.884597,0.854808,0.301855,0.361461,0.48657,0.681512,0.751734,0.785421
SPAC1002.15C,1.0,0.93002,0.991516,1.05032,1.073042,1.011421,1.288473,1.118586,1.053621,0.920575,0.941023,0.936172
SPAC1002.17C,1.0,3.202583,0.174271,0.305421,0.565029,1.280423,6.943263,5.443547,5.201326,2.353243,1.082166,0.837624
SPAC1006.04C,1.0,2.456557,5.559843,10.215177,17.291292,19.61267,46.32715,7.68241,7.362668,5.751654,3.342538,2.470042


In [311]:
# first 20 rows of the averaged protein table
data_prot.head(n=20)

Unnamed: 0,plt0,plt1,plt2,plt3,plt4,plt5,plt6,plt7,plt8,plt9,plt10
SPAC1002.02,1.0,1.393371,3.257203,5.591645,6.932151,5.479337,3.574659,2.726113,2.114409,1.764191,1.439884
SPAC1002.03c,1.0,0.910152,0.785974,0.750362,0.746,0.770242,0.800074,0.843706,0.872157,0.819914,0.839864
SPAC1002.04c,1.0,0.990121,0.980455,1.050549,1.248047,1.217455,0.880529,0.704999,0.712103,0.701598,0.795477
SPAC1002.07c,1.0,1.046894,1.307253,1.640936,2.070811,2.12554,2.078482,1.9979,1.900363,1.857979,1.780422
SPAC1002.09c,1.0,0.904997,0.85532,0.872414,0.904201,0.923575,0.93211,0.93983,0.961271,0.991512,1.018891
SPAC1002.12c,1.0,0.902816,0.869738,0.91538,1.045233,1.067834,1.061747,1.104626,1.150851,1.169323,1.300029
SPAC1002.13c,1.0,1.438538,1.335639,1.184806,1.008284,0.962637,1.012037,0.981468,1.082934,1.045613,0.880193
SPAC1002.15c,1.0,1.053569,1.240176,1.473709,1.753027,1.80134,1.707459,1.453056,1.395625,1.254985,1.360234
SPAC1002.17c,1.0,0.740731,0.585468,0.46623,0.391166,0.321337,0.279159,0.25792,0.235647,0.214654,0.196099
SPAC1006.04c,1.0,1.350476,2.732501,6.64345,12.557676,14.02191,7.81393,6.476014,5.715364,5.434059,5.058818


71 entries of the protein data have no counterpart in the filtered mRNA data. Five of them might correspond to the last five entries of the protein data; the others come from entries for which no data is available in the mRNA data.

In [288]:
# Collect, for every protein label, the position of its mRNA row so that both
# tables can be put in the same order before merging. A protein label is kept
# only when exactly one mRNA index label contains it.
matched_positions = []
matched_names = []
for name in data_prot.index.values:
    # regex=False: the label is matched as a literal, case-insensitive
    # substring; as a regex its '.' would match any character.
    filter_ = data_mRNA.index.str.contains(name, case=False, na=False, regex=False)

    # positions of the matching mRNA rows
    positions = np.flatnonzero(filter_)
    if len(positions) == 1:
        matched_positions.append(positions[0])
        matched_names.append(name)

# Convert once at the end instead of calling np.append inside the loop, which
# copies the whole array on every iteration. Types are unchanged: an integer
# array of row positions and an object array of protein labels.
idxname = np.array(matched_positions, dtype=int)
listname = np.array(matched_names, dtype=object)

In [306]:
# take the matched mRNA rows in the order given by `idxname` and label them
# with the corresponding entries of `listname`
data_mRNA_sorted = data_mRNA.iloc[list(idxname)].set_index([listname.tolist()])
data_mRNA_sorted.head()

Unnamed: 0,mlt0,mlt1,mlt2,mlt3,mlt4,mlt5,mlt6,mlt7,mlt8,mlt9,mlt10,mlt11
SPAC1002.02,1.0,1.0353,2.905877,4.153496,4.78312,2.603015,1.64706,1.48123,1.616345,1.347246,1.122512,1.064692
SPAC1002.03c,1.0,0.958027,0.676712,0.703215,0.747138,0.86194,1.257074,1.235414,0.950157,0.988842,1.161624,1.166776
SPAC1002.04c,1.0,1.133477,1.140642,1.189899,1.622498,2.308321,3.020603,1.853997,1.714919,1.682955,1.530989,1.539161
SPAC1002.07c,1.0,0.783431,1.362667,1.602399,1.836938,1.938045,1.969526,0.570085,0.593492,0.524142,0.43759,0.482167
SPAC1002.09c,1.0,0.728322,0.701878,1.35481,1.140213,0.637736,0.368306,0.276762,0.236933,0.258554,0.260154,0.297513


In [300]:
# keep only the protein rows whose label is listed in `listname`, in that order
data_prot_filtered = data_prot.loc[listname.tolist()]
data_prot_filtered.head()

Unnamed: 0,plt0,plt1,plt2,plt3,plt4,plt5,plt6,plt7,plt8,plt9,plt10
SPAC1002.02,1.0,1.393371,3.257203,5.591645,6.932151,5.479337,3.574659,2.726113,2.114409,1.764191,1.439884
SPAC1002.03c,1.0,0.910152,0.785974,0.750362,0.746,0.770242,0.800074,0.843706,0.872157,0.819914,0.839864
SPAC1002.04c,1.0,0.990121,0.980455,1.050549,1.248047,1.217455,0.880529,0.704999,0.712103,0.701598,0.795477
SPAC1002.07c,1.0,1.046894,1.307253,1.640936,2.070811,2.12554,2.078482,1.9979,1.900363,1.857979,1.780422
SPAC1002.09c,1.0,0.904997,0.85532,0.872414,0.904201,0.923575,0.93211,0.93983,0.961271,0.991512,1.018891


In [297]:
# report the shape of both matched tables
for label, frame in (('Size of the filtered protein data:', data_prot_filtered),
                     ('Size of the filtered and sorted mRNA data:', data_mRNA_sorted)):
    print(label, frame.shape)

Size of the filtered protein data: (2830, 11)
Size of the filtered and sorted mRNA data: (2830, 12)


Joining the two tables

In [304]:
# put the mRNA and protein columns side by side, aligned on the row index
# (left join: the mRNA rows define the result)
data = data_mRNA_sorted.join(data_prot_filtered, how='left')
print('Size of joined data:', data.shape)
data.head()

Size of joined data: (2830, 23)


Unnamed: 0,mlt0,mlt1,mlt2,mlt3,mlt4,mlt5,mlt6,mlt7,mlt8,mlt9,...,plt1,plt2,plt3,plt4,plt5,plt6,plt7,plt8,plt9,plt10
SPAC1002.02,1.0,1.0353,2.905877,4.153496,4.78312,2.603015,1.64706,1.48123,1.616345,1.347246,...,1.393371,3.257203,5.591645,6.932151,5.479337,3.574659,2.726113,2.114409,1.764191,1.439884
SPAC1002.03c,1.0,0.958027,0.676712,0.703215,0.747138,0.86194,1.257074,1.235414,0.950157,0.988842,...,0.910152,0.785974,0.750362,0.746,0.770242,0.800074,0.843706,0.872157,0.819914,0.839864
SPAC1002.04c,1.0,1.133477,1.140642,1.189899,1.622498,2.308321,3.020603,1.853997,1.714919,1.682955,...,0.990121,0.980455,1.050549,1.248047,1.217455,0.880529,0.704999,0.712103,0.701598,0.795477
SPAC1002.07c,1.0,0.783431,1.362667,1.602399,1.836938,1.938045,1.969526,0.570085,0.593492,0.524142,...,1.046894,1.307253,1.640936,2.070811,2.12554,2.078482,1.9979,1.900363,1.857979,1.780422
SPAC1002.09c,1.0,0.728322,0.701878,1.35481,1.140213,0.637736,0.368306,0.276762,0.236933,0.258554,...,0.904997,0.85532,0.872414,0.904201,0.923575,0.93211,0.93983,0.961271,0.991512,1.018891
