In [1]:
import pandas as pd
import re

In this notebook we:
1. Generate a list of the VeriTas labeled filenames.
2. Generate a data frame including median values for only labeled files.
3. Generate a list of markers used in the experiment. Ensure the list uses uniform formatting.

### Generate list of VeriTas labeled filenames

In [2]:
# Load the per-file median intensities and index the table by FCS filename.
median_data = pd.read_csv(
    '../data/primary/161219_HIMC_Veritas_Screen_Vericell_only_intensity_medians.csv'
).set_index('FCS Filename')
print(median_data.shape)
median_data.head()

(688, 8)


Unnamed: 0_level_0,Medians of Ho165Di (PE) for B cells,Medians of Ho165Di (PE) for CD4 T cells,Medians of Ho165Di (PE) for CD8 T cells,Medians of Ho165Di (PE) for NK cells,Medians of Ho165Di (PE) for CD14hi monocytes,Medians of Ho165Di (PE) for CD16hi monocytes,Medians of Ho165Di (PE) for CD1c DCs,Medians of Ho165Di (PE) for pDCs
FCS Filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
161219_HIMC_Veritas_Screen_P1A01_FMO_VeriTas.fcs,4.732788,1.928152,1.649153,1.162573,6.228203,4.765468,5.105804,3.865644
161219_HIMC_Veritas_Screen_P1A01_FMO_Veri_nonlabeled.fcs,5.05777,1.889331,1.492631,0.726811,7.587465,6.734652,4.95174,3.051684
161219_HIMC_Veritas_Screen_P1A02_CD1a_VeriTas.fcs,13.089861,6.911612,7.016669,10.20764,25.348644,28.226051,19.358488,25.164061
161219_HIMC_Veritas_Screen_P1A02_CD1a_Veri_nonlabeled.fcs,12.878913,10.016663,9.529917,9.388466,19.264851,22.108725,28.400972,11.494913
161219_HIMC_Veritas_Screen_P1A03_CD1b_VeriTas.fcs,7.207995,5.860216,5.298354,7.442468,21.829674,21.126211,22.072254,22.924406


In [3]:
# Every FCS filename in the table (the index), as an array; preview the first four.
fcs_filenames_all = median_data.index.values
fcs_filenames_all[:4]

array(['161219_HIMC_Veritas_Screen_P1A01_FMO_VeriTas.fcs',
       '161219_HIMC_Veritas_Screen_P1A01_FMO_Veri_nonlabeled.fcs',
       '161219_HIMC_Veritas_Screen_P1A02_CD1a_VeriTas.fcs',
       '161219_HIMC_Veritas_Screen_P1A02_CD1a_Veri_nonlabeled.fcs'],
      dtype=object)

In [4]:
# Keep only the VeriTas-labeled files, i.e. drop every filename containing 'nonlabeled'.
fcs_filenames_veritas = [
    fcs_name
    for fcs_name in fcs_filenames_all
    if not ('nonlabeled' in fcs_name)
]
print(len(fcs_filenames_veritas))

344


### Generate dataframe including median values for only labeled cells

In [5]:
# Restrict the medians table to the rows whose filename is in the VeriTas-labeled list.
median_data_veritas = median_data.loc[
    median_data.index.isin(fcs_filenames_veritas)
]
print(median_data_veritas.shape)
median_data_veritas.head()

(344, 8)


Unnamed: 0_level_0,Medians of Ho165Di (PE) for B cells,Medians of Ho165Di (PE) for CD4 T cells,Medians of Ho165Di (PE) for CD8 T cells,Medians of Ho165Di (PE) for NK cells,Medians of Ho165Di (PE) for CD14hi monocytes,Medians of Ho165Di (PE) for CD16hi monocytes,Medians of Ho165Di (PE) for CD1c DCs,Medians of Ho165Di (PE) for pDCs
FCS Filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
161219_HIMC_Veritas_Screen_P1A01_FMO_VeriTas.fcs,4.732788,1.928152,1.649153,1.162573,6.228203,4.765468,5.105804,3.865644
161219_HIMC_Veritas_Screen_P1A02_CD1a_VeriTas.fcs,13.089861,6.911612,7.016669,10.20764,25.348644,28.226051,19.358488,25.164061
161219_HIMC_Veritas_Screen_P1A03_CD1b_VeriTas.fcs,7.207995,5.860216,5.298354,7.442468,21.829674,21.126211,22.072254,22.924406
161219_HIMC_Veritas_Screen_P1A04_CD1c_VeriTas.fcs,19.833889,5.624355,4.260443,6.553532,36.203098,18.008711,92.064938,2.275819
161219_HIMC_Veritas_Screen_P1A05_CD1d_VeriTas.fcs,18.794231,6.496009,6.794955,8.896626,99.334129,38.983028,124.732323,28.038233


### Generate list of markers:
The marker name is the text after the 5th underscore and before `_Veri` in each filename.

In [6]:
def extract_marker(filename, first_field=5, stop_prefix='Veri'):
    """Return the marker name embedded in an FCS filename.

    The filename is split on '_'. The marker consists of the fields from
    position ``first_field`` up to (not including) the first field at or
    after that position that starts with ``stop_prefix``, re-joined with '_'
    so that markers containing underscores are preserved.

    Args:
        filename: FCS filename, e.g.
            '161219_HIMC_Veritas_Screen_P1A02_CD1a_VeriTas.fcs'.
        first_field: index of the first '_'-separated field of the marker.
        stop_prefix: prefix of the field that terminates the marker.

    Returns:
        The marker name as a string ('CD1a' for the example above).

    Raises:
        ValueError: if no field at or after ``first_field`` starts with
            ``stop_prefix``. The earlier while-loop ran past the end of the
            list and failed with a bare IndexError in that case.
    """
    fields = filename.split('_')
    # Bounded search: never index past the end of the split filename.
    for position in range(first_field, len(fields)):
        if fields[position].startswith(stop_prefix):
            return '_'.join(fields[first_field:position])
    raise ValueError(
        "Cannot extract marker from {!r}: no field starting with {!r} "
        "at or after position {}".format(filename, stop_prefix, first_field)
    )


# One marker per VeriTas-labeled file, in the same order as the filenames.
veritas_markers = [extract_marker(filename) for filename in fcs_filenames_veritas]

print(len(veritas_markers))
veritas_markers[0:3]

344


['FMO', 'CD1a', 'CD1b']

#### Fix outlier names
We rename the outlier marker 'CD318-()' to 'CD318'. We also replace '.' with '_' in marker names to prevent potential bugs in the future.

In [7]:
# Normalise the one outlier name: 'CD318-()' becomes 'CD318'; every other marker is kept as-is.
# The comparison must use `==` (string equality). The previous `is` tested object identity,
# which is False for strings built at runtime, so the rename never took effect.
# NOTE(review): if a plain 'CD318' marker also exists, this rename creates a duplicate - confirm.
veritas_markers_filt = ['CD318' if marker == 'CD318-()' else marker for marker in veritas_markers]
veritas_markers_filt[0:3]

['FMO', 'CD1a', 'CD1b']

In [8]:
# Swap every '.' in a marker name for '_' (one output name per input name, order kept).
veritas_markers_filt = [
    marker_name.replace('.', '_')
    for marker_name in veritas_markers_filt
]
veritas_markers_filt[:3]

['FMO', 'CD1a', 'CD1b']

In [9]:
# Sanity check: the clean-up steps rename markers one-for-one, so the list length should be unchanged.
len(veritas_markers_filt)

344

#### Note: FMO is duplicated in our list. It corresponds to both the first and last filename.
After speaking with Adeeb, we keep only the first FMO (also known as FMO1 in other files).

In [10]:
# Number of distinct marker names; a value below the list length means some marker appears more than once.
len(set(veritas_markers_filt))

343

In [11]:
# Drop the trailing duplicate 'FMO' entry (the last filename), keeping the first one.
# The guard makes this cell safe to re-run: an unconditional `[0:-1]` would remove one more
# (real) marker on every execution, because the cell reassigns the list it reads from.
if veritas_markers_filt[-1:] == ['FMO'] and 'FMO' in veritas_markers_filt[:-1]:
    veritas_markers_filt = veritas_markers_filt[:-1]
print(len(veritas_markers_filt))
print(len(set(veritas_markers_filt)))

343
343
