# The following notebook merges PolyA and RiboD datasets
* Datasets are unbalanced in terms of disease
* The PolyA dataset has many more samples than the RiboD dataset
* Here, diseases are not balanced
* However, a comparable number of PolyA samples has been randomly selected from the PolyA expression dataset
* This random subset has roughly the same number of samples as the RiboD expression dataset (361 PolyA vs. 296 RiboD)
    * File is named Poly_reduced.tsv
* Therefore, this merging of datasets does not account for disease prevalence


In [1]:
import pandas as pd
import numpy as np

## Loading PolyA and RiboD gene expression data

In [2]:
# Load both expression tables. read_table parses tab-separated text by
# default, and index_col=0 turns the first column into the row index.
poly_reduced = pd.read_table('./data/Poly_reduced.tsv', index_col=0)
ribo = pd.read_table('./data/Ribo.tsv', index_col=0)

In [3]:
# Dimensions of the reduced PolyA expression table as (rows, columns).
poly_reduced.shape

(361, 25924)

In [4]:
# Dimensions of the RiboD expression table as (rows, columns).
ribo.shape

(296, 25924)

## Loading Clinical Metadata for both PolyA and RiboD data


In [5]:
# Load the clinical metadata tables (tab-separated, first column as index).
# NOTE(review): neither table is referenced by the cells visible below -
# confirm whether they are needed for this merge.
poly_reduced_clinical = pd.read_table('./data/Poly_clinical_reduced.tsv', index_col=0)
ribo_clinical = pd.read_table('./data/Ribo_clinical.tsv', index_col=0)

## Labelling:
* Ribo samples = 1
* Poly samples = 0

In [6]:
# One label per row of each table: 1 marks a RiboD row, 0 marks a PolyA row.
ribo_labels = [1] * len(ribo)
poly_labels = [0] * len(poly_reduced)

In [5]:
# RiboD labels come first, so the RiboD frame must also come first wherever
# these labels are attached to the stacked expression data.
all_labels = ribo_labels + poly_labels

# Put the RiboD gene columns in the same order as the PolyA columns so the two
# tables line up when stacked. Selecting columns directly avoids transposing
# the whole matrix twice (two full copies) and keeps each column's dtype.
# A gene present in poly_reduced but missing from ribo raises KeyError.
ribo = ribo.loc[:, poly_reduced.columns]

## Merging Expression data and labels

In [6]:
# Stack the RiboD rows on top of the PolyA rows. This order has to match
# all_labels, which lists the RiboD labels before the PolyA labels.
merged_data = pd.concat([ribo, poly_reduced], axis='index')

# Single-column label table ('Ribo') sharing the merged row index.
merged_labels = pd.DataFrame({'Ribo': all_labels}, index=merged_data.index)

## Saving to disk

In [8]:
# Write the merged expression matrix and its labels as tab-separated files;
# the row index is written as the first column of each file.
merged_data.to_csv(path_or_buf='./data/MergedData_reduced.tsv', sep='\t')
merged_labels.to_csv(path_or_buf='./data/MergedLabels_reduced.tsv', sep='\t')

In [9]:
# Preview the first rows of the RiboD table (rows are sample IDs, columns are genes).
ribo.head()

Unnamed: 0,TP53TG1,RP11-115C21.2,LCMT1,NDUFS4,RP11-215P8.3,TRPM4,PPP1R13B,ASAH1,MFN1,OLFML3,...,FAM104B,WBP4,NCF1B,TOMM70A,RP4-813D12.3,PTPRG-AS1,RP11-403I13.10,ATP6V1F,LRRC16B,ARHGEF26-AS1
TH38_1328_S01,1.682573,1.144046,3.477677,4.652486,0.454176,1.035624,3.695994,5.324811,3.626439,2.720278,...,3.41007,3.203201,0.163499,4.449561,0.263034,2.83996,0.214125,4.083213,1.560715,3.196922
THR19_0437_S01,0.970854,0.275007,1.250962,2.678072,0.15056,0.411426,1.189034,4.264536,1.480265,1.887525,...,0.956057,0.970854,0.214125,1.87578,0.056584,0.344828,0.124328,3.001802,0.464668,1.339137
TH01_0715_S01,1.636915,1.531069,3.0054,4.234195,0.111031,1.910733,0.669027,4.078097,2.82985,2.589763,...,2.06695,1.851999,0.389567,3.528571,0.042644,0.831877,0.584963,4.995032,1.176323,2.111031
TH01_0718_S01,1.847997,1.867896,3.480265,4.161081,0.321928,3.193772,1.713696,4.358959,3.496974,6.33985,...,2.017922,1.843984,0.704872,3.613532,0.0,0.910733,1.327687,5.1152,0.163499,0.378512
THR14_1221_S01,2.807355,0.367371,2.961623,4.099295,0.333424,1.304511,1.761285,4.114367,2.07382,1.956057,...,1.929791,1.589763,0.15056,2.440952,0.575312,0.823749,0.097611,4.498251,0.111031,2.257011


In [10]:
# Preview the first rows of the merged table; these are RiboD rows because
# `ribo` is the first frame passed to pd.concat.
merged_data.head()

Unnamed: 0,TP53TG1,RP11-115C21.2,LCMT1,NDUFS4,RP11-215P8.3,TRPM4,PPP1R13B,ASAH1,MFN1,OLFML3,...,FAM104B,WBP4,NCF1B,TOMM70A,RP4-813D12.3,PTPRG-AS1,RP11-403I13.10,ATP6V1F,LRRC16B,ARHGEF26-AS1
TH38_1328_S01,1.682573,1.144046,3.477677,4.652486,0.454176,1.035624,3.695994,5.324811,3.626439,2.720278,...,3.41007,3.203201,0.163499,4.449561,0.263034,2.83996,0.214125,4.083213,1.560715,3.196922
THR19_0437_S01,0.970854,0.275007,1.250962,2.678072,0.15056,0.411426,1.189034,4.264536,1.480265,1.887525,...,0.956057,0.970854,0.214125,1.87578,0.056584,0.344828,0.124328,3.001802,0.464668,1.339137
TH01_0715_S01,1.636915,1.531069,3.0054,4.234195,0.111031,1.910733,0.669027,4.078097,2.82985,2.589763,...,2.06695,1.851999,0.389567,3.528571,0.042644,0.831877,0.584963,4.995032,1.176323,2.111031
TH01_0718_S01,1.847997,1.867896,3.480265,4.161081,0.321928,3.193772,1.713696,4.358959,3.496974,6.33985,...,2.017922,1.843984,0.704872,3.613532,0.0,0.910733,1.327687,5.1152,0.163499,0.378512
THR14_1221_S01,2.807355,0.367371,2.961623,4.099295,0.333424,1.304511,1.761285,4.114367,2.07382,1.956057,...,1.929791,1.589763,0.15056,2.440952,0.575312,0.823749,0.097611,4.498251,0.111031,2.257011


In [11]:
# Display the label table: one 'Ribo' value per merged row (1 = RiboD, 0 = PolyA).
merged_labels

Unnamed: 0,Ribo
TH38_1328_S01,1
THR19_0437_S01,1
TH01_0715_S01,1
TH01_0718_S01,1
THR14_1221_S01,1
THR19_0418_S01,1
TH01_0664_S01,1
THR21_0525_S01,1
THR14_1191_S01,1
THR14_1202_S01,1
