In [84]:
%matplotlib inline
import re

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
plt.style.use('ggplot')

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Data Preprocessing

## 0 - Loading data

In [85]:
# set data path
path_data_prot = "data/ratiowt.csv"
path_data_ribo = "data/summary_pat1_CDS_density_data_normalized_RPKM.txt"

# read data from files
# protein table: comma-separated (read_csv default), first column used as the row index
raw_data_prot = pd.read_csv(path_data_prot, index_col=0)
# ribo table: tab-separated; no index_col is passed here
# NOTE(review): the ribo preview shows gene names as the row index, so pandas presumably
# infers the index from the file layout -- confirm against the file header
raw_data_ribo = pd.read_csv(path_data_ribo, sep='\t')

## 1 - Datasets description

### a) Proteins data

In [86]:
# preview of the raw protein data: (rows, columns) first, then the first rows
print(raw_data_prot.shape)
raw_data_prot.head()

(3281, 33)


Unnamed: 0,WT4_T00,WT4_T01,WT4_T02,WT4_T03,WT4_T04,WT4_T05,WT4_T06,WT4_T07,WT4_T08,WT4_T09,...,WT7_T01,WT7_T02,WT7_T03,WT7_T04,WT7_T05,WT7_T06,WT7_T07,WT7_T08,WT7_T09,WT7_T10
CON__P00761,13.150281,9.407338,24.657872,10.264727,15.709685,28.330217,15.511812,9.484966,22.430577,7.535227,...,23.298076,111.804298,70.269131,46.431722,71.108583,59.157596,67.168189,18.576311,10.721906,30.374825
CON__P04264,5.610728,8.141996,11.312985,7.446016,4.232088,8.690362,5.36711,6.333122,3.213471,4.681867,...,7.684033,7.00035,5.52975,3.083089,5.549082,8.059966,4.581272,2.864099,6.615507,2.820079
CON__P13645,3.163556,3.937628,7.50469,3.456978,6.605892,1.881963,0.889442,6.892749,0.561703,1.760192,...,7.459903,10.434928,7.730365,5.783356,16.064515,4.615527,2.237787,8.932559,6.195019,7.190623
CON__P15636,20.156007,6.966699,4.828119,13.614333,16.8087,9.455371,12.878798,21.559158,14.028197,7.663422,...,10.987683,7.503564,5.772006,17.601295,9.832842,24.968789,9.896091,17.334928,22.422027,24.981888
CON__P35527,6.656903,3.573726,3.286339,3.203588,1.679995,1.941107,2.87282,3.753331,4.186027,2.490412,...,2.876456,0.33095,0.935541,3.802426,3.579611,10.761714,4.365478,1.076577,5.539552,3.279979


### b) Ribosomes data

In [87]:
# preview of the raw ribo data: (rows, columns) first, then the first rows
print(raw_data_ribo.shape)
raw_data_ribo.head()

(5175, 14)


Unnamed: 0,pat1_exp_ribo,pat1_0h_ribo,pat1_1h_ribo,pat1_3h_ribo,pat1_5h_ribo,pat1_7h_ribo,pat1_10h_ribo,pat1_exp_mRNA,pat1_0h_mRNA,pat1_1h_mRNA,pat1_3h_mRNA,pat1_5h_mRNA,pat1_7h_mRNA,pat1_10h_mRNA
SPBC337.03,22.92542,59.81845,8.296752,119.440621,31.742899,8.128106,8.342521,14.632602,9.707561,22.316031,27.109601,5.067587,5.536374,5.420566
SPAC1B3.15c,11.164113,14.9187,3.467528,21.511157,12.437912,22.807897,13.553825,15.801948,5.01384,2.830356,24.499555,0.971048,28.39174,18.452541
SPBCPT2R1.10,27.763147,60.265964,24.730075,12.767686,2.055611,1.062212,5.125196,3.556112,6.04384,7.94905,2.94647,0.213956,2.206976,15.975507
SPBC21B10.13c,5.825523,37.656982,115.101522,84.159902,4.018543,45.787367,13.21013,17.350575,13.882374,41.554121,240.125592,0.692884,29.218661,34.22962
SPAC19E9.03,15.11732,62.077835,44.578471,11.424813,2.356965,34.778644,5.435827,56.480683,89.356403,36.815561,30.734372,0.756267,16.420532,48.105336


## 2 - Datasets preprocessing

### a) Proteins data

Our first preprocessing step consists in averaging the triplicates at each time point. <br/>
Indeed, it is simpler to compute the correlation between ribo and protein concentrations if we have a single value at each time point.

In [88]:
# split protein data into 3 sets: three consecutive groups of 11 columns.
# Positional slicing uses .iloc (.ix was deprecated and then removed from pandas).
# Going through .values discards the row/column labels, so the three frames share
# the same default integer labels and add up position by position.
data_prot_A = pd.DataFrame(raw_data_prot.iloc[:, 0:11].values)
data_prot_B = pd.DataFrame(raw_data_prot.iloc[:, 11:22].values)
data_prot_C = pd.DataFrame(raw_data_prot.iloc[:, 22:33].values)

# compute average of the 3 data sets (the sum is already a new frame, no copy needed)
data_prot = (data_prot_A + data_prot_B + data_prot_C) / 3.0

In order to get measures at the same time points in both datasets, we keep only the time points that are also present in the ribo dataset (0, 1, 3, 5, 7 and 10 h) and drop the others (2, 4, 6, 8 and 9 h).

In [89]:
# column positions of the time points that are kept
kept_positions = [0, 1, 3, 5, 7, 10]
data_prot = data_prot.iloc[:, kept_positions]

Finally, we choose to use the protein's name as index, and name the features as the time of the measurement.

In [90]:
# label the columns by measurement hour and the rows by protein name
hours = (0, 1, 3, 5, 7, 10)
data_prot.columns = ['h{}'.format(hour) for hour in hours]
data_prot.index = raw_data_prot.index

# show the beginning of the dataframe
print(data_prot.shape)
data_prot.head()

(3281, 6)


Unnamed: 0,h0,h1,h3,h5,h7,h10
CON__P00761,62.703297,14.396267,34.796653,38.255215,43.786914,29.96938
CON__P04264,6.844208,7.315701,7.077574,6.154198,5.591905,6.5859
CON__P13645,5.565627,7.004613,3.964237,7.828472,4.175028,5.982843
CON__P15636,13.634706,9.466882,9.90749,11.852809,14.847343,16.075755
CON__P35527,6.987298,4.176773,2.505684,2.966327,3.154968,4.038704


### b) Ribosomes data

In [91]:
# keep only the columns at positions 1 to 6 (the `1:7` slice excludes position 7);
# per the raw ribo preview these are the pat1_0h ... pat1_10h ribo columns, i.e. the
# pat1_exp_ribo column and all the mRNA columns are left out
data_ribo = raw_data_ribo.iloc[:,1:7]
data_ribo.head()

Unnamed: 0,pat1_0h_ribo,pat1_1h_ribo,pat1_3h_ribo,pat1_5h_ribo,pat1_7h_ribo,pat1_10h_ribo
SPBC337.03,59.81845,8.296752,119.440621,31.742899,8.128106,8.342521
SPAC1B3.15c,14.9187,3.467528,21.511157,12.437912,22.807897,13.553825
SPBCPT2R1.10,60.265964,24.730075,12.767686,2.055611,1.062212,5.125196
SPBC21B10.13c,37.656982,115.101522,84.159902,4.018543,45.787367,13.21013
SPAC19E9.03,62.077835,44.578471,11.424813,2.356965,34.778644,5.435827


In [92]:
# label the columns by measurement hour, matching the protein table
hours = (0, 1, 3, 5, 7, 10)
data_ribo.columns = ['h{}'.format(hour) for hour in hours]

print(data_ribo.shape)
data_ribo.head()

(5175, 6)


Unnamed: 0,h0,h1,h3,h5,h7,h10
SPBC337.03,59.81845,8.296752,119.440621,31.742899,8.128106,8.342521
SPAC1B3.15c,14.9187,3.467528,21.511157,12.437912,22.807897,13.553825
SPBCPT2R1.10,60.265964,24.730075,12.767686,2.055611,1.062212,5.125196
SPBC21B10.13c,37.656982,115.101522,84.159902,4.018543,45.787367,13.21013
SPAC19E9.03,62.077835,44.578471,11.424813,2.356965,34.778644,5.435827


## 3 - Merging the ribo and protein data sets

We've got 3281 entries in our proteins dataset and 5175 entries in the ribo one.<br/>
We'll first reduce the 2 datasets to their common entries, and then join the 2 tables.

### a) Filtering data

As ribo names can be composed of several names, we extract the ribo entries whose name contains the name of one of our proteins.

In [99]:
# create the pattern and the filter
# Each protein name is escaped before being joined with '|': the names contain '.',
# which would otherwise act as a regex wildcard and match any character.
pattern = '|'.join(re.escape(name) for name in data_prot.index.values)
filter_ = data_ribo.index.str.contains(pattern, case=False, na=False)

# extract ribo data with corresponding protein name
data_ribo = data_ribo[filter_]

# remove duplicates: drop_duplicates compares the row values only (the index labels
# are not part of the comparison) and keeps the first occurrence
data_ribo = data_ribo.drop_duplicates()

# show the size of the filtered table
print(data_ribo.shape)

(3235, 6)


### b) Ending with ribo duplicates and Filtering proteins data

Now, we have more proteins (3281) than we have ribo entries (3235). Before extracting the common entries from the protein dataset, let's check if we have remaining duplicates :

In [100]:
# Collect the ribo labels that are matched by a protein name matching more than one ribo entry.
duplicate_labels = []

for name in data_prot.index.values:
    # regex=False: the protein name is searched as a literal substring, so the '.'
    # it contains is not interpreted as a regex wildcard
    filter_ = data_ribo.index.str.contains(name, case=False, na=False, regex=False)
    matches = data_ribo.index[filter_]

    if len(matches) > 1:
        print(matches, name)
        duplicate_labels.extend(matches)

# same container as before (1-D object array) so the following cells can use it unchanged
duplicates = np.array(duplicate_labels, dtype=object)

We still have one case of protein potentially produced by 2 distinct ribo. Let's aggregate it as we did previously :

In [101]:
# Sum the ribo rows flagged as duplicates into a single row named after the protein.
# NOTE(review): every label in `duplicates` is summed into this one row, so this assumes
# they all belong to the single protein 'SPAC20H4.10' -- confirm from the printed matches.
last_duplicate = data_ribo.loc[duplicates]
last_duplicate_sum = last_duplicate.sum()
last_duplicate_sum.name = 'SPAC20H4.10'

# Only rewrite the table when duplicates were actually found: summing an empty selection
# yields all zeros, which would otherwise be added as a spurious row.
if len(last_duplicate) > 0:
    # pd.concat replaces DataFrame.append, which no longer exists in recent pandas;
    # the Series becomes a one-row frame labelled with its name
    data_ribo = pd.concat([data_ribo.drop(duplicates), last_duplicate_sum.to_frame().T])

Now, we can extract the common entries (the mRNA positions corresponding to protein names) :

In [102]:
# get list of indexes and names to sort
positions = []  # row position in data_ribo of the first entry matching each protein
names = []      # protein names that have at least one matching ribo entry

for name in data_prot.index.values:
    # regex=False: literal substring search, so '.' in the name is not a wildcard
    filter_ = data_ribo.index.str.contains(name, case=False, na=False, regex=False)
    hits = np.flatnonzero(filter_)

    if len(hits) > 0:
        # take the position straight from the mask; Index.get_loc would return a
        # slice or a boolean mask instead of an integer if the label were not unique
        positions.append(hits[0])  # ribo numeric positions
        names.append(name)         # Protein index

# same containers as before so the following cells can use them unchanged
idxname = np.array(positions, dtype=int)
listname = np.array(names, dtype=object)

Let's filter the proteins dataset :

In [103]:
# keep only the proteins for which a matching ribo entry was found
# (their names were collected in `listname`), in that order
data_prot = data_prot.loc[listname]
print(data_prot.shape)
data_prot.head()

(3236, 6)


Unnamed: 0,h0,h1,h3,h5,h7,h10
SPAC1002.01,0.559502,0.797365,0.908648,1.149922,1.056883,1.21708
SPAC1002.02,0.294818,0.409764,1.71349,1.714607,0.844587,0.423492
SPAC1002.03c,1.199934,1.09828,0.900738,0.933986,1.017834,0.996403
SPAC1002.04c,1.228882,1.183849,1.283379,1.406864,0.863794,0.997622
SPAC1002.07c,0.626058,0.687943,1.01554,1.316786,1.251533,1.105592


### c) Joining the 2 datasets

Before merging the 2 datasets, we have to sort the ribo dataset in the same order as the protein one :

In [104]:
# pick the ribo rows in protein order, then relabel them with the protein names
ribo_in_protein_order = data_ribo.iloc[list(idxname)]
data_ribo = ribo_in_protein_order.set_index([listname.tolist()])
data_ribo.head()

Unnamed: 0,h0,h1,h3,h5,h7,h10
SPAC1002.01,94.726178,43.214894,115.583562,100.703495,75.098539,7.917638
SPAC1002.02,111.177815,228.375045,1362.110144,69.428636,135.632005,48.754813
SPAC1002.03c,66.422367,25.939848,40.718125,33.354452,103.578822,15.223206
SPAC1002.04c,35.052994,59.509223,398.995782,88.526348,181.437065,72.384948
SPAC1002.07c,130.411811,85.784796,1121.422371,505.515368,23.491893,44.830133


Now, we can join the 2 datasets :

In [105]:
# put each table under its own top-level column label ("ribo" / "prot")
data_ribo = pd.concat([data_ribo], axis=1, keys=["ribo"])
data_prot = pd.concat([data_prot], axis=1, keys=["prot"])

# join on the row index: protein columns first, ribo columns after
data = data_prot.join(data_ribo)
print('Size of joined data:', data.shape)
data.head()

Size of joined data: (3236, 12)


Unnamed: 0_level_0,prot,prot,prot,prot,prot,prot,ribo,ribo,ribo,ribo,ribo,ribo
Unnamed: 0_level_1,h0,h1,h3,h5,h7,h10,h0,h1,h3,h5,h7,h10
SPAC1002.01,0.559502,0.797365,0.908648,1.149922,1.056883,1.21708,94.726178,43.214894,115.583562,100.703495,75.098539,7.917638
SPAC1002.02,0.294818,0.409764,1.71349,1.714607,0.844587,0.423492,111.177815,228.375045,1362.110144,69.428636,135.632005,48.754813
SPAC1002.03c,1.199934,1.09828,0.900738,0.933986,1.017834,0.996403,66.422367,25.939848,40.718125,33.354452,103.578822,15.223206
SPAC1002.04c,1.228882,1.183849,1.283379,1.406864,0.863794,0.997622,35.052994,59.509223,398.995782,88.526348,181.437065,72.384948
SPAC1002.07c,0.626058,0.687943,1.01554,1.316786,1.251533,1.105592,130.411811,85.784796,1121.422371,505.515368,23.491893,44.830133


Drop the rows containing a 0 value (log2 of 0 is not finite, and the log2 is taken in the next section)

In [205]:
# Keep only the rows without any exact 0, in the prot or the ribo columns.
# A boolean row mask replaces the former positions -> labels -> drop round trip,
# so only the offending rows themselves are removed.
nonzero_rows = (data != 0).all(axis=1)
data = data.loc[nonzero_rows]

# rebuild the per-table frames from the filtered data, keeping their top-level column label
data_ribo = pd.concat({"ribo": pd.DataFrame(data.ribo)}, axis=1)
data_prot = pd.concat({"prot": pd.DataFrame(data.prot)}, axis=1)

## 4 - Data variants

### a) Data in log2

In [206]:
# element-wise base-2 logarithm of the protein, ribo and joined tables
data_prot_log2, data_ribo_log2, data_log2 = (
    np.log2(frame) for frame in (data_prot, data_ribo, data)
)

### b) Data in difference of log2

In [207]:
def create_diff_log2(df):
    """Return the differences between consecutive columns of ``df``.

    Each output column holds a column of ``df`` minus the column to its left.
    The column ``'h0'`` is removed from the result and the remaining five
    columns are renamed ``d1, d3, d5, d7, d10``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a column ``'h0'`` and exactly five other columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is left untouched.
    """
    # diff() already builds a new frame, so the input is never modified
    step_changes = df.diff(axis=1).drop('h0', axis=1)
    step_changes.columns = ['d{}'.format(hour) for hour in (1, 3, 5, 7, 10)]

    return step_changes

In [208]:
# log2 differences of each table, wrapped again under its top-level column label
prot_steps = create_diff_log2(data_prot_log2['prot'])
ribo_steps = create_diff_log2(data_ribo_log2['ribo'])

data_ribo_geom = pd.concat([ribo_steps], axis=1, keys=["ribo"])
data_prot_geom = pd.concat([prot_steps], axis=1, keys=["prot"])

# join on the row index: protein columns first, ribo columns after
data_geom = data_prot_geom.join(data_ribo_geom)

### c) Data Standardization (log2 and z-score)

As we will need to cluster our data, it could be useful to standardize them.<br/>
First of all, we define a suitable standardization function :

In [209]:
def standardize_by_row(df):
    """Z-score every row of ``df`` independently.

    Each row has its own mean subtracted and is then divided by its own
    standard deviation, both computed across the columns with the pandas
    defaults of ``DataFrame.mean`` and ``DataFrame.std``.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table; rows are standardized one by one.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same labels as ``df``.
    """
    row_means = df.mean(axis=1)
    row_stds = df.std(axis=1)

    # axis=0 aligns the per-row statistics on the row index
    return df.sub(row_means, axis=0).div(row_stds, axis=0)

We standardize the 6 measures of each ribo and protein entry independently :

In [210]:
# z-score every row of the log2 protein and ribo tables across its columns
data_prot_norm = standardize_by_row(data_prot_log2)
data_ribo_norm = standardize_by_row(data_ribo_log2)

And finally join the standardized datasets :

In [211]:
# join on the row index: standardized protein columns first, ribo columns after
data_norm = data_prot_norm.join(data_ribo_norm)

### Saving datasets

In [212]:
# (destination file, table) pairs, written in this order
csv_outputs = [
    # Unstandardized data
    ('data2/data_prot.csv', data_prot),
    ('data2/data_ribo.csv', data_ribo),
    ('data2/data.csv', data),
    # Standardized data
    ('data2/data_prot_log2_zscore.csv', data_prot_norm),
    ('data2/data_ribo_log2_zscore.csv', data_ribo_norm),
    ('data2/data_log2_zscore.csv', data_norm),
    # Log2 data
    ('data2/data_prot_log2.csv', data_prot_log2),
    ('data2/data_ribo_log2.csv', data_ribo_log2),
    ('data2/data_log2.csv', data_log2),
    # Diff Log2 data
    ('data2/data_prot_diff_log2.csv', data_prot_geom),
    ('data2/data_ribo_diff_log2.csv', data_ribo_geom),
    ('data2/data_diff_log2.csv', data_geom),
]

for csv_path, frame in csv_outputs:
    frame.to_csv(csv_path)