In [1]:
import pandas as pd
import numpy as np


In [2]:
file_name = "table.txt"

df = pd.read_table(file_name, sep='\t')
print(df.columns) # check the column names, we want library_name to be there!


Index(['study_accession', 'sample_accession', 'secondary_sample_accession',
       'experiment_accession', 'run_accession', 'tax_id', 'scientific_name',
       'instrument_model', 'library_name', 'library_layout', 'fastq_bytes',
       'fastq_ftp'],
      dtype='object')


In [3]:
patient_ids = df['library_name'].astype(str).str[0:6]
patients = set(patient_ids)
print(sorted(patients)) 
# there are 21 patients, but there are supposed to be only 20 patients according to the paper
# KTN609 does not appear in the appendix of the paper: https://www.cell.com/cms/attachment/2119295259/2091819478/mmc1.pdf

['KTN102', 'KTN115', 'KTN126', 'KTN129', 'KTN132', 'KTN134', 'KTN147', 'KTN152', 'KTN155', 'KTN206', 'KTN210', 'KTN215', 'KTN302', 'KTN304', 'KTN310', 'KTN316', 'KTN317', 'KTN501', 'KTN609', 'KTN612', 'KTN615']


In [4]:
# for each patient, print out the number of data points
s1 = 0
s2 = 0
for patient_id in sorted(patients):
    length = len(df[patient_ids == patient_id])
    print(patient_id, length)
    if length > 100:
        s1 += length
    else:
        s2 += length
print(s1, s2, 6862 - 900)

KTN102 511
KTN115 4
KTN126 831
KTN129 979
KTN132 764
KTN134 4
KTN147 3
KTN152 1035
KTN155 3
KTN206 3
KTN210 3
KTN215 3
KTN302 976
KTN304 4
KTN310 3
KTN316 3
KTN317 4
KTN501 3
KTN609 3
KTN612 3
KTN615 606
5702 46 5962


In [5]:
patient_id1 = ["KTN126", "KTN129", "KTN206", "KTN302"]
patient_id2 = ["KTN102", "KTN132", "KTN152", "KTN615"]
patient_id3 = patients - set(patient_id1) - set(patient_id2)
print(patient_id3)

{'KTN304', 'KTN501', 'KTN317', 'KTN147', 'KTN134', 'KTN210', 'KTN215', 'KTN155', 'KTN310', 'KTN612', 'KTN115', 'KTN609', 'KTN316'}


In [6]:
ss1 = list()
ss2 = list()
ss3 = list()
for pid1 in patient_id1:
    df2 = df.loc[patient_ids == pid1]
    df2 = df2.dropna(axis=0, how='any')
    ret = list(map(lambda row: np.asarray(str(row).split(";"), dtype=int), list(df2['fastq_bytes'])))
    num_files = np.array(list(map(lambda row: row.shape[0], ret)))
    s1 = sorted(df2[num_files == 1]["library_name"])
    s2 = sorted(df2[num_files == 2]["library_name"])
    s3 = sorted(df2[num_files == 3]["library_name"])    
    ss1.append(s1)
    ss2.append(s2)
    ss3.append(s3)
    print(pid1, len(s1), len(s2), len(s3))
print(ss3)
np.sum(list(map(len, ss1))) + np.sum(list(map(len, ss2))) + np.sum(list(map(len, ss3)))

KTN126 0 827 4
KTN129 0 975 4
KTN206 0 0 3
KTN302 406 563 4
[['KTN1260', 'KTN1260cell62', 'KTN126Blood', 'KTN126OP'], ['KTN1290', 'KTN1292', 'KTN129Blood', 'KTN129OP'], ['KTN2060', 'KTN206Blood', 'KTN206OP'], ['KTN3020', 'KTN3022', 'KTN302Blood', 'KTN302OP']]


2786

In [7]:
ss1 = list()
ss2 = list()
ss3 = list()
for pid2 in patient_id2:
    df2 = df.loc[patient_ids == pid2]
    df2 = df2.dropna(axis=0, how='any')
    ret = list(map(lambda row: np.asarray(str(row).split(";"), dtype=int), list(df2['fastq_bytes'])))
    num_files = np.array(list(map(lambda row: row.shape[0], ret)))
    s1 = sorted(df2[num_files == 1]["library_name"])
    s2 = sorted(df2[num_files == 2]["library_name"])
    s3 = sorted(df2[num_files == 3]["library_name"])    
    ss1.append(s1)
    ss2.append(s2)
    ss3.append(s3)
    print(pid2, len(s1), len(s2), len(s3))
print(ss3[1:4])
np.sum(list(map(len, ss1))) + np.sum(list(map(len, ss2))) + np.sum(list(map(len, ss2)))

KTN102 143 275 93
KTN132 93 667 4
KTN152 1030 0 4
KTN615 91 511 4
[['KTN1320', 'KTN1322', 'KTN132Blood', 'KTN132OP'], ['KTN1520', 'KTN1522', 'KTN152Blood', 'KTN152OP'], ['KTN6150', 'KTN6152', 'KTN6152cell76', 'KTN615Blood']]


4263

In [241]:
ss1 = list()
ss2 = list()
ss3 = list()
for pid3 in patient_id3:
    df2 = df.loc[patient_ids == pid3]
    df2 = df2.dropna(axis=0, how='any')
    ret = list(map(lambda row: np.asarray(str(row).split(";"), dtype=int), list(df2['fastq_bytes'])))
    num_files = np.array(list(map(lambda row: row.shape[0], ret)))
    s1 = sorted(df2[num_files == 1]["library_name"])
    s2 = sorted(df2[num_files == 2]["library_name"])
    s3 = sorted(df2[num_files == 3]["library_name"])    
    ss1.append(s1)
    ss2.append(s2)
    ss3.append(s3)
    print(pid3, len(s1), len(s2), len(s3))
print(ss3)
np.sum(list(map(len, ss1))) + np.sum(list(map(len, ss2))) + np.sum(list(map(len, ss3)))

KTN215 0 0 3
KTN210 0 0 3
KTN134 0 0 4
KTN612 0 0 3
KTN609 0 0 3
KTN304 0 0 4
KTN147 0 0 3
KTN155 0 0 3
KTN310 0 0 3
KTN501 0 0 3
KTN316 0 0 3
KTN115 0 0 4
KTN317 0 0 4
[['KTN215-0', 'KTN215Blood', 'KTN215OP'], ['KTN2100', 'KTN210Blood', 'KTN210OP'], ['KTN1340', 'KTN1342', 'KTN134Blood', 'KTN134OP'], ['KTN6120', 'KTN612Blood', 'KTN612OP'], ['KTN6090', 'KTN609Blood', 'KTN609OP'], ['KTN3040', 'KTN3042', 'KTN304Blood', 'KTN304OP'], ['KTN1470', 'KTN147Blood', 'KTN147OP'], ['KTN1550', 'KTN155Blood', 'KTN155OP'], ['KTN3100', 'KTN3102', 'KTN310Blood'], ['KTN5010', 'KTN501Blood', 'KTN501OP'], ['KTN3160', 'KTN316Blood', 'KTN316OP'], ['KTN1150', 'KTN1152', 'KTN115Blood', 'KTN115OP'], ['KTN3170', 'KTN3172', 'KTN317Blood', 'KTN317OP']]


43