In [23]:
import pandas as pd
import os
import glob
from FlowCytometryTools import FCMeasurement
import numpy as np

# Helper functions

In [24]:
# Short names of the scattergram channels kept for analysis (the raw Sysmex
# column names are renamed to these when the scattergrams are loaded).
FACS_COLS = [
    "SFL",
    "FSC",
    "SSC",
]

In [36]:
def ID_exp_from_filename(f):
    """Split the second-to-last bracketed field of a file name on underscores.

    For a name ending in ``...[C19XNH_ADP][PLT-F].fcs`` the result is
    ``["C19XNH", "ADP"]``; the caller unpacks it as ``(ID, exp)``.

    :param f: file name or path containing ``[...]`` delimited fields
    :return: list of the underscore-separated parts of that field
    :raises IndexError: if the name has fewer than two ``]`` characters
    """
    # Splitting on "]" leaves the wanted field third from the end
    # (it is followed by "[PLT-F" and ".fcs" in the example above).
    field = f.split("]")[-3]
    # Keep only what follows the last opening bracket.
    label = field.rpartition("[")[2]
    return label.split("_")

def _drop_rbc_tail(df, bin_size=200, baseline_bins=3, fsc_jump=30):
    """Cut an acquisition at the first bin whose mean forward scatter jumps.

    Rows are grouped, in row order, into consecutive bins of ``bin_size``
    rows. The baseline is the mean of the first ``baseline_bins`` bin means of
    the "Forward Scatter Signal" column. The first bin whose mean exceeds
    ``baseline + fsc_jump`` is the cutoff: that bin and all later ones are
    dropped.

    NOTE(review): when no bin exceeds the threshold the cutoff is the last
    bin, so the last bin is still dropped. This reproduces the original
    behaviour of this notebook -- confirm it is intended.

    :param df: dataframe with a "Forward Scatter Signal" column
    :return: a new dataframe; ``df`` itself is not modified
    """
    bins = np.arange(df.shape[0]) // bin_size
    mean_by_bin = df["Forward Scatter Signal"].groupby(bins).mean()
    threshold = mean_by_bin.iloc[:baseline_bins].mean() + fsc_jump
    # Bin labels are 0..k in order, so positions and labels coincide.
    jumps = np.flatnonzero((mean_by_bin > threshold).values)
    if jumps.size:
        cutoff = jumps[0]
    else:
        cutoff = bins[-1] if bins.size else 0
    # Explicit copy so that adding columns later does not write into a slice.
    return df.loc[bins < cutoff].copy()


def load_agonised_sysmex(datadir, root_dir="CBR 159", remove_rbc=True,
                         bin_size=200, baseline_bins=3, fsc_jump=30):
    """
    Load every ``*PLT-F].fcs`` file found recursively under ``root_dir``.

    The ID and experiment of each file are parsed from its name with
    ``ID_exp_from_filename``. When several files share the same (ID,
    experiment) pair, only the first one returned by ``glob`` is loaded; the
    others are reported on stdout and skipped without being read.

    Side effect: the process working directory is changed to ``datadir`` and
    stays changed after the call (the glob pattern is relative to it).

    :param datadir: directory to change into before searching
    :param root_dir: sub-directory of ``datadir`` searched recursively
    :param remove_rbc: if True, truncate each file at the first bin of
        ``bin_size`` rows whose mean forward scatter exceeds the baseline
        (mean of the first ``baseline_bins`` bins) by more than ``fsc_jump``
    :param bin_size: number of consecutive rows per bin
    :param baseline_bins: number of leading bins used as the baseline
    :param fsc_jump: increase over the baseline that triggers the cutoff
    :return: dataframe of all valid measurements (not only platelets), with
        added "ID" and "EXP" columns and rows containing any NaN removed
    :raises ValueError: if no file could be loaded
    """
    os.chdir(datadir)
    wb = glob.glob("%s/**/*PLT-F].fcs" % root_dir, recursive=True)
    print("%d candidates Sysmex files" % len(wb))
    dfs = []
    IDs = {}
    for f in wb:
        ID, exp = ID_exp_from_filename(f)

        # Check for duplicates before touching the file: no point reading and
        # filtering a measurement that is going to be discarded.
        seen = IDs.setdefault(exp, {})
        if ID in seen:
            print("Two files for ID %s exp %s" % (ID,exp))
            print("\t %s" % f)
            print("\t %s" % seen[ID])
            continue
        seen[ID] = f

        df = FCMeasurement(ID='Test Sample', datafile=f).data

        # Filter out those measured along with RBC
        if remove_rbc:
            df = _drop_rbc_tail(df, bin_size=bin_size,
                                baseline_bins=baseline_bins,
                                fsc_jump=fsc_jump)
        df["ID"] = ID
        df["EXP"] = exp
        dfs.append(df)

    if not dfs:
        raise ValueError("No Sysmex PLT-F file found under %r"
                         % os.path.join(datadir, root_dir))
    return pd.concat(dfs, copy=False).dropna(axis=0, how="any")

# Read agonised sysmex scattergrams

In [37]:
# Directory that load_agonised_sysmex changes into before searching for files.
# Set the SYSMEX_DATA_DIR environment variable to run the notebook on another
# machine; the historical location is used otherwise. The path is made
# absolute so it stays valid whatever the current working directory is.
data_dir = os.path.abspath(os.environ.get(
    "SYSMEX_DATA_DIR",
    "/home/hv270/rds/rds-who1000-cbrc/user/wja24/shared/hv270/data_home"))

In [49]:
# Build the scattergram table in one chain: every step returns a new frame,
# so nothing is assigned onto a column subset of another frame.
sys_sct = (
    load_agonised_sysmex(data_dir, root_dir="CBR 159")
    .rename(columns={"Side Fluorescence Signal": "SFL",
                     "Forward Scatter Signal": "FSC",
                     "Forward Scatter Pulse Width Signal": "FSCW",
                     "Side Scatter Signal": "SSC"})
    # Keep the identifiers and the channels listed in FACS_COLS only.
    .loc[:, ["ID", "EXP"] + FACS_COLS]
    # This is to fix typos in file names...
    .assign(EXP=lambda d: d["EXP"].replace({"PRP": "CRP",
                                            "CPP": "CRP",
                                            "Resting": "REST",
                                            "RESTING": "REST",
                                            "RESET": "REST"}))
    .astype({col: int for col in FACS_COLS})
)

629 candidates Sysmex files
Two files for ID C19XNH exp ADP
	 CBR 159/x17092018_2x/FCS/[XN-20^12829][00-22A (Build 7)][Fcs][17092018_115032][C19XNH_ADP][PLT-F].fcs
	 CBR 159/x17092018_2x/FCS/[XN-20^12829][00-22A (Build 7)][Fcs][17092018_115148][C19XNH_ADP][PLT-F].fcs


In [50]:
# EXP is the condition under which the scattergram was measured:
#   REST = not agonised, ADP = agonised with ADP, CRP = agonised with CRP
sys_sct.sample(n=10)

Unnamed: 0,ID,EXP,SFL,FSC,SSC
835,C19H64,ADP,153,240,201
555,C19PCY,CRP,34,67,26
1242,C1A0BH,REST,49,30,32
1885,C19RCQ,REST,41,45,41
142,C19TGA,REST,92,89,48
943,C19H64,REST,155,157,75
285,C19M0X,ADP,76,71,39
311,C1A0SK,CRP,18,66,128
19592,C19X5G,CRP,77,68,35
1710,C19YT1,CRP,80,72,36


In [51]:
# Number of rows per condition
sys_sct["EXP"].value_counts()

CRP     475800
REST    413800
ADP     327400
Name: EXP, dtype: int64

In [52]:
# Number of distinct IDs per condition
sys_sct.groupby("EXP").ID.nunique()

EXP
ADP     208
CRP     209
REST    208
Name: ID, dtype: int64

# Read gold standard FC-measured PF phenotypes

In [64]:
# Anchor the path on data_dir explicitly: the bare relative path only resolved
# because load_agonised_sysmex had changed the working directory to data_dir.
fc = pd.read_csv(
    os.path.join(data_dir, "metadata_PF", "PLATELET_FUNCTION_KD_11062019.txt"),
    sep="\t")
# IDs are used as merge keys below, so make them strings.
fc["SAMPLE_ID"] = fc["SAMPLE_ID"].astype(str)
print(fc.shape)

(2142, 28)


In [65]:
# Peek at a few random rows
fc.sample(n=5)

Unnamed: 0,PROJECT,SAMPLE_ID,DATE,TIME,REST_FIB_1,REST_FIB_2,REST_PSEL_1,REST_PSEL_2,ADP_FIB_1,ADP_FIB_2,...,PAR1_PSEL_1,PAR1_PSEL_2,PAR4_FIB_1,PAR4_FIB_2,PAR4_PSEL_1,PAR4_PSEL_2,TRAP_FIB_1,TRAP_FIB_2,TRAP_PSEL_1,TRAP_PSEL_2
1932,PFC_4,C19HYM,25/05/18,09:44:11,0.22,0.2,16.36,16.08,67.97,70.31,...,80.36,83.28,18.71,19.48,73.31,73.22,2.74,1.68,39.56,37.46
1399,PFC_3,C0B210,14/04/14,10:56:43,1.23,,10.94,,7.76,7.18,...,92.14,91.51,28.53,27.93,91.26,91.09,7.98,8.07,65.02,65.66
1542,PFC_3,C0DNCQ,14/07/14,10:18:48,2.41,,8.61,,10.39,8.92,...,95.5,95.69,35.27,37.53,88.53,87.45,21.51,22.82,76.34,76.47
1608,PFC_4_U,C0WX7T,12/01/16,12:03:07,1.23,2.13,23.33,25.89,69.98,68.87,...,84.55,84.91,19.7,21.05,71.76,70.12,4.45,4.71,45.66,47.63
146,PFC_1,100010100059,25/07/05,08:09:00,11.2,2.59,,,19.08,20.72,...,,,,,,,,,,


# Read "Sysmex phenotypes" (PLT, MPV & co)

In [66]:
# Anchor the path on data_dir explicitly: the bare relative path only resolved
# because load_agonised_sysmex had changed the working directory to data_dir.
sys_phen = pd.read_csv(
    os.path.join(data_dir, "metadata_PF",
                 "PLATELET_FUNCTION_METADATA_KD_11062019.txt"),
    sep="\t")
# Convert the IDs of this very frame (the previous code read them from `meta`,
# a name that is never defined in the notebook).
sys_phen["SAMPLE_ID"] = sys_phen["SAMPLE_ID"].astype(str)
print(sys_phen.shape)

(2197, 6)


In [60]:
# Peek at a few random rows
sys_phen.sample(n=5)

Unnamed: 0,Cohort,SAMPLE_ID,SEX,AGE,PLT,MPV
652,PFC_3,C05QC4,F,59.0,299.0,10.1
240,PFC_4,C18JCT,F,33.0,225.0,10.5
405,PFC_4,C19FEX,F,76.0,193.0,11.6
1614,PFC_2,1312,F,62.0,377.0,7.6
230,PFC_4,C18FSD,M,27.0,231.0,10.6


# Merge 

In [67]:
# fc and sys_phen -> features
# Inner join on SAMPLE_ID: only samples present in both tables are kept.
# (`sys_phen` replaces `meta`, which is not defined anywhere in the notebook.)
features = pd.merge(fc, sys_phen, on="SAMPLE_ID", how="inner")
print(features.shape)

(2091, 33)


In [71]:
# IDs present both in the scattergrams (sys_sct) and in features
scattergram_ids = set(sys_sct.ID.unique().tolist())
feature_ids = set(features.SAMPLE_ID.unique().tolist())
good_IDs = scattergram_ids & feature_ids
print(len(good_IDs))

205


In [77]:
# Cohort of the shared samples: all of them are from the PFC_4 project
features.loc[features.SAMPLE_ID.isin(good_IDs), "Cohort"].value_counts()

PFC_4    205
Name: Cohort, dtype: int64

In [75]:
# Non-missing values per column for the shared samples:
# there seem to be 205 individuals on which to train!
features.loc[features.SAMPLE_ID.isin(good_IDs)].notnull().sum()

PROJECT        205
SAMPLE_ID      205
DATE           205
TIME           205
REST_FIB_1     205
REST_FIB_2     205
REST_PSEL_1    205
REST_PSEL_2    205
ADP_FIB_1      205
ADP_FIB_2      205
CRP_FIB_1      205
CRP_FIB_2      205
ADP_PSEL_1     205
ADP_PSEL_2     205
CRP_PSEL_1     205
CRP_PSEL_2     205
PAR1_FIB_1     205
PAR1_FIB_2     205
PAR1_PSEL_1    205
PAR1_PSEL_2    205
PAR4_FIB_1     205
PAR4_FIB_2     205
PAR4_PSEL_1    205
PAR4_PSEL_2    205
TRAP_FIB_1     205
TRAP_FIB_2     205
TRAP_PSEL_1    205
TRAP_PSEL_2    205
Cohort         205
SEX            205
AGE            205
PLT            205
MPV            205
dtype: int64