In [1]:
from helpers.utilities import *
%run helpers/notebook_setup.ipynb

In [2]:
from numpy import nan

The inputs are read from, and the output is saved to, the paths defined below:

In [3]:
# Input: trimmed copy of the raw sample list.
raw_sample_list_path = "data/raw/SampleListHead.txt"

# Inputs: cleaned molecular data tables; their column labels are compared
# against the sample list further down.
protein_levels_path = "data/clean/protein/levels.csv"
rna_seq_path = "data/clean/rna/all_samples.csv"

# Output: the reformatted sample table written by the last cell.
clean_sample_list_path = "data/clean/samples_list.csv"

Possibly due to a copying error, my copy of the `SampleList.txt` file had over a million (1032213) lines. To avoid committing an excessively long file, I trimmed it with:

`!head -n 50 data/raw/SampleList.txt > data/raw/SampleListHead.txt`

In [4]:
# Read the trimmed sample list from the path declared in the inputs cell
# (instead of repeating the literal) and drop rows where every column is empty.
sample_list = read_table(raw_sample_list_path).dropna(how='all')

In [5]:
# Preview the loaded table (bare last expression, rendered by the notebook).
sample_list

Unnamed: 0,RNA,Protein,RNA-only,Protein-only
0,149.TMD,149.TMD,136.TMD,007.TMD
1,001.TMD,001.TMD,168.TMD,064.TMD
2,151.TMD,151.TMD,241.TMD,093.TMD
3,170.TMD,170.TMD,242.TMD,248.TMD
4,083.TMD,083.TMD,185.TMD,175.TMD
...,...,...,...,...
41,,,,177.HC
42,,,,189.HC
43,,,,217.HC
44,,,,221.HC


### Loading the data

In [6]:
# Protein levels: the first four CSV columns are used as the row index;
# the remaining column labels are compared with the sample list below.
protein = read_csv(protein_levels_path, index_col=list(range(4)))

# RNA-seq: the first two CSV columns are used as the row index.
rna_seq = read_csv(rna_seq_path, index_col=list(range(2)))

### Checking that all the data are present

In [7]:
# IDs of every sample expected to have protein data: those listed under
# `Protein` plus those listed under `Protein-only` (empty cells excluded).
all_samples_with_protein = (
    set(sample_list['Protein'].dropna())
    | set(sample_list['Protein-only'].dropna())
)

In [8]:
# The sample list and the protein table must name exactly the same samples.
# On failure, report the IDs present on one side only so the mismatch can be
# tracked down without re-running the comparison by hand.
assert all_samples_with_protein == set(protein.columns), (
    'Sample list and protein levels disagree on samples: '
    f'{all_samples_with_protein ^ set(protein.columns)}'
)

In [9]:
# IDs of every sample expected to have RNA data: those listed under `RNA`
# plus those listed under `RNA-only` (empty cells excluded).
all_samples_with_rna = (
    set(sample_list['RNA'].dropna())
    | set(sample_list['RNA-only'].dropna())
)

In [10]:
# The sample list and the RNA-seq table must name exactly the same samples.
# On failure, report the IDs present on one side only so the mismatch can be
# tracked down without re-running the comparison by hand.
assert all_samples_with_rna == set(rna_seq.columns), (
    'Sample list and RNA-seq data disagree on samples: '
    f'{all_samples_with_rna ^ set(rna_seq.columns)}'
)

### Reformatting to normal form

In [11]:
# Stack the four ID columns end to end (same column order as before), then
# discard the empty cells and keep the first occurrence of each sample ID.
sample_ids = Series([
    sample_id
    for column in ('RNA', 'RNA-only', 'Protein', 'Protein-only')
    for sample_id in sample_list[column]
]).dropna().drop_duplicates()

In [12]:
# One row per unique sample ID; the flag columns are added in the next cells.
samples = DataFrame({'sample_id': sample_ids})

In [13]:
# Flag each sample by data type, reusing the ID sets built for the checks above
# (those sets were asserted to equal the column labels of the loaded tables)
# rather than rebuilding them from the raw columns.
samples['has_rna'] = samples['sample_id'].isin(all_samples_with_rna)
samples['has_protein'] = samples['sample_id'].isin(all_samples_with_protein)

In [14]:
# A sample has molecular data if it has RNA and/or protein data. Every ID here
# was taken from one of the four RNA/protein columns, so this is expected to be
# True for all rows.
samples['has_molecular_data'] = samples['has_rna'] | samples['has_protein']

In [15]:
# Move `sample_id` from a column to the row index, then display the result.
# NOTE(review): not idempotent — once `sample_id` is the index it is no longer a
# column, so re-running this cell alone is expected to fail; re-run from the cell
# that builds `samples` instead.
samples = samples.set_index('sample_id')
samples

Unnamed: 0_level_0,has_rna,has_protein,has_molecular_data
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
149.TMD,True,True,True
001.TMD,True,True,True
151.TMD,True,True,True
170.TMD,True,True,True
083.TMD,True,True,True
...,...,...,...
177.HC,False,True,True
189.HC,False,True,True
217.HC,False,True,True
221.HC,False,True,True


In [16]:
# Write the table (indexed by sample_id) to the output path declared in the inputs cell.
samples.to_csv(path_or_buf=clean_sample_list_path)