In [1]:
%matplotlib inline
import pandas as pd, numpy as np, seaborn as sns, matplotlib.pyplot as plt, qiime2 as q2
from biom import load_table, Table
from biom.util import biom_open
from skbio import OrdinationResults
from emperor import Emperor, nbinstall; nbinstall()

def load_mf(fn, index='#SampleID'):
    """Load a tab-separated sample metadata (mapping) file as text.

    Every column is read with ``dtype='str'`` and NA detection is turned
    off (``na_values=[]``, ``keep_default_na=False``), so values such as
    ``'NA'`` or ``'none'`` stay as the literal text found in the file.

    Parameters
    ----------
    fn : str, path-like or file-like
        Location of the tab-separated file; passed to ``pandas.read_csv``.
    index : str, optional
        Name of the column to use as the index. Defaults to ``'#SampleID'``.

    Returns
    -------
    pandas.DataFrame
        The metadata table indexed by the ``index`` column.

    Raises
    ------
    KeyError
        If ``index`` is not one of the columns in the file.
    """
    _df = pd.read_csv(fn, sep='\t', dtype='str', na_values=[],
                      keep_default_na=False)
    # Return the re-indexed frame rather than mutating with ``inplace=True``.
    return _df.set_index(index)

In [5]:
# Sample metadata, indexed by the sample_name column.
mf = load_mf('data/11546_20190118-122918.txt', index='sample_name')

In [6]:
# Number of samples recorded under each surgery_type value.
mf['surgery_type'].value_counts()

none                 244
not applicable       125
colectomy             52
ileocolonic           51
partial_colectomy      5
missing                2
Name: surgery_type, dtype: int64

# For UC patients

In [7]:
# Per-sample surgery_type tally, shown again as the starting point for the
# UC relabelling that follows.
mf['surgery_type'].value_counts()

none                 244
not applicable       125
colectomy             52
ileocolonic           51
partial_colectomy      5
missing                2
Name: surgery_type, dtype: int64

In [8]:
# host_subject_id -> surgery subtype override. Every subject listed here is
# relabelled as a colectomy with j-pouch.
surgery_subtype_remapper = dict.fromkeys(
    ['100119', '100125', '100029', '100191', '100064'],
    'colectomy with j-pouch',
)

def update_surgery_subtype(row, remapper):
    """Choose the surgery subtype for a single metadata row.

    Parameters
    ----------
    row : object
        Anything exposing ``host_subject_id`` and ``surgery_type``
        attributes (for example a row passed by ``DataFrame.apply``).
    remapper : mapping
        Overrides keyed by ``host_subject_id``.

    Returns
    -------
    object
        ``remapper[row.host_subject_id]`` when the subject has an override,
        otherwise the row's existing ``surgery_type``.
    """
    subject = row.host_subject_id
    return remapper[subject] if subject in remapper else row.surgery_type
    
# New column: surgery_type, except for the subjects overridden in
# surgery_subtype_remapper.
mf['surgery_subtype'] = mf.apply(
    lambda row: update_surgery_subtype(row, surgery_subtype_remapper),
    axis=1,
)

We noticed that subject 100038 had been mislabeled as 100033

In [9]:
# Correct the subject id on this one sample (it was recorded as 100033).
# NOTE(review): surgery_subtype was derived from host_subject_id in the cell
# above; neither 100033 nor 100038 is a key of surgery_subtype_remapper, so
# that column does not need to be recomputed after this fix.
mf.loc['11546.stool.100038.baseline', 'host_subject_id'] = '100038'

In [10]:
# host_subject_id -> disease_subtype override.
disease_subtype_remapper = {
    **dict.fromkeys(['100064'], 'pouchitis'),
    # new assignments: these subjects develop Crohn's within their pouch
    **dict.fromkeys(
        ['100067', '100013', '100122', '100023', '100031'],
        'pouch_cd',
    ),
}

def update_disease_subtype(row, remapper):
    """Choose the disease subtype for a single metadata row.

    Parameters
    ----------
    row : object
        Anything exposing ``host_subject_id`` and ``disease_subtype``
        attributes (for example a row passed by ``DataFrame.apply``).
    remapper : mapping
        Overrides keyed by ``host_subject_id``.

    Returns
    -------
    object
        ``remapper[row.host_subject_id]`` when the subject has an override,
        otherwise the row's existing ``disease_subtype``.
    """
    subject = row.host_subject_id
    return remapper[subject] if subject in remapper else row.disease_subtype
    
# Overwrite disease_subtype in place: subjects listed in
# disease_subtype_remapper get the override, all others keep their value.
mf['disease_subtype'] = mf.apply(
    lambda row: update_disease_subtype(row, disease_subtype_remapper),
    axis=1,
)

Validate

In [11]:
# Samples flagged as having a pouch whose recorded disease is CD.
mf.loc[
    (mf['pouch'] == 'y') & (mf['disease'] == 'cd'),
    ['host_subject_id', 'disease_subtype'],
]

Unnamed: 0_level_0,host_subject_id,disease_subtype
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1
11546.stool.100013.12month,100013,pouch_cd
11546.stool.100013.18month,100013,pouch_cd
11546.stool.100013.1month,100013,pouch_cd
11546.stool.100013.6month,100013,pouch_cd
11546.stool.100013.baseline,100013,pouch_cd
11546.stool.100023.6month,100023,pouch_cd
11546.stool.100023.baseline,100023,pouch_cd
11546.stool.100031.12month,100031,pouch_cd
11546.stool.100031.6month,100031,pouch_cd
11546.stool.100031.baseline,100031,pouch_cd


In [12]:
# Samples relabelled as colectomy with j-pouch, with their disease subtype.
mf.loc[
    mf['surgery_subtype'] == 'colectomy with j-pouch',
    ['host_subject_id', 'disease_subtype'],
]

Unnamed: 0_level_0,host_subject_id,disease_subtype
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1
11546.stool.100029.12month,100029,pouchitis
11546.stool.100029.18month,100029,pouchitis
11546.stool.100029.6month,100029,pouchitis
11546.stool.100029.baseline,100029,pouchitis
11546.stool.100064.12month,100064,pouchitis
11546.stool.100064.18month,100064,pouchitis
11546.stool.100064.19month,100064,pouchitis
11546.stool.100064.6month,100064,pouchitis
11546.stool.100064.baseline,100064,pouchitis
11546.stool.100119.baseline,100119,pouch


# For CD

In [13]:
# Surgery and disease labels for subjects 100038 and 100176.
mf.loc[
    mf['host_subject_id'].isin(['100038', '100176']),
    ['surgery_type', 'surgery_subtype', 'disease_subtype', 'disease'],
]

Unnamed: 0_level_0,surgery_type,surgery_subtype,disease_subtype,disease
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
11546.stool.100038.12month,partial_colectomy,partial_colectomy,ileocolonic,cd
11546.stool.100038.18month,partial_colectomy,partial_colectomy,ileocolonic,cd
11546.stool.100038.6month,partial_colectomy,partial_colectomy,ileocolonic,cd
11546.stool.100038.baseline,partial_colectomy,partial_colectomy,ileocolonic,cd
11546.stool.100176.6month,partial_colectomy,partial_colectomy,ileocolonic,cd


Need to combine the partial colectomies with ileocolonic.

In [14]:
# Fold the partial colectomies (100038, 100176) into the ileocolonic subtype.
mf.loc[
    mf['host_subject_id'].isin(['100038', '100176']),
    'surgery_subtype',
] = 'ileocolonic'

In [15]:
# Relabel these three subjects as colectomy with ileorectal.
mf.loc[
    mf['host_subject_id'].isin(['100060', '100144', '100054']),
    'surgery_subtype',
] = 'colectomy with ileorectal'

In [16]:
# Relabel these four subjects as colectomy with ileostomy.
mf.loc[
    mf['host_subject_id'].isin(['100074', '100127', '100140', '100174']),
    'surgery_subtype',
] = 'colectomy with ileostomy'

In [17]:
# Check the labels now carried by the four ileostomy subjects.
mf.loc[
    mf['host_subject_id'].isin(['100074', '100127', '100140', '100174']),
    ['surgery_type', 'surgery_subtype', 'disease_subtype', 'disease', 'host_subject_id'],
]

Unnamed: 0_level_0,surgery_type,surgery_subtype,disease_subtype,disease,host_subject_id
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
11546.stool.100074.12month,colectomy,colectomy with ileostomy,ileocolonic,cd,100074
11546.stool.100074.6month,colectomy,colectomy with ileostomy,ileocolonic,cd,100074
11546.stool.100074.baseline,colectomy,colectomy with ileostomy,ileocolonic,cd,100074
11546.stool.100127.6month,colectomy,colectomy with ileostomy,ileal,cd,100127
11546.stool.100127.baseline,colectomy,colectomy with ileostomy,ileal,cd,100127
11546.stool.100140.18month,colectomy,colectomy with ileostomy,ileocolonic,cd,100140
11546.stool.100140.6month,colectomy,colectomy with ileostomy,ileocolonic,cd,100140
11546.stool.100140.baseline,colectomy,colectomy with ileostomy,ileocolonic,cd,100140
11546.stool.100174.baseline,colectomy,colectomy with ileostomy,ileocolonic,cd,100174


Validate

In [18]:
# Sample counts per surgery_type within each disease group.
mf.groupby('disease')['surgery_type'].value_counts()

disease         surgery_type     
cd              none                 114
                ileocolonic           51
                colectomy             38
                partial_colectomy      5
missing         missing                2
not applicable  not applicable       125
uc              none                 130
                colectomy             14
Name: surgery_type, dtype: int64

In [19]:
# Sample counts per surgery_type within each disease group.
# NOTE(review): this is the same expression as the preceding cell; if the
# intent was to validate the relabelling, surgery_subtype may have been meant
# here — confirm.
mf.groupby('disease')['surgery_type'].value_counts()

disease         surgery_type     
cd              none                 114
                ileocolonic           51
                colectomy             38
                partial_colectomy      5
missing         missing                2
not applicable  not applicable       125
uc              none                 130
                colectomy             14
Name: surgery_type, dtype: int64

# Comparisons we need to make 

- UC no surgery vs CD no surgery

- UC vs CD (all)

- UC:
    - no surgery vs colectomy with pouch
    - no surgery vs colectomy with pouchitis
    - no surgery vs colectomy with cd pouch (this info is in disease subtype)
    
- CD:    
    - no surgery vs ileocolonic
    - no surgery vs colectomy
    - no surgery vs colectomy ileorectal
    - no surgery vs colectomy ileostomy

In [20]:
# Columns written to the updates file below.
update_cols = [
    'surgery_type',
    'surgery_subtype',
    'disease_subtype',
    'host_subject_id',
]

In [22]:
# Write only the updated columns (plus the sample_name index) as a TSV.
mf.loc[:, update_cols].to_csv(
    '../../old-files/qiita-files/qiita-updates-post-draft.2.tsv',
    sep='\t',
)

In [2]:
# Load a different metadata file. This rebinds `mf`, so none of the in-memory
# edits made to the previous frame carry over to the cells below.
mf = load_mf('data/11546_20190819-143944.txt', index='sample_name')

UC:
post surgery (will include additional CD patients who developed CD of the
pouch)

   1. colectomy with pouch — 10 patients (column AC = yes)
      1. subsequent normal pouch (100119, 100125)
      2. pouchitis (100029, 100191, 100064)
      3. UC with subsequent development of CD (100023, 100031, 100013,
      100122, 100067)
   2. colectomy with end ileostomy —> none
   3. Control: UC without surgery


CD
post surgery

   1. ileocolonic resection (within patients with CD ileocolonic + partial
   colectomy: 100038, 100176)
   2. ileorectal anastomosis (100060, 100144, 100054)
   3. end ileostomy (CD + colectomy without pouch: 100140,100174, 100074,
   100127) --> this would be the Crohn's disease with colectomy no pouch
   4. control: CD without surgery
   

We have a series of comparisons that need to be made, and these can't be based on any of the existing metadata categories because the data is encoded in a complicated manner. The easiest approach is to add dedicated categories: the first one reflects the disease used for analysis, and the following ones describe how samples are tagged within that disease status.

In [88]:
# category_1 starts out as the recorded disease; category_2 and category_3
# hold a 'TBD' placeholder until the cells below fill them in.
mf['category_1'] = mf['disease']
mf['category_2'] = mf['category_3'] = 'TBD'

## UC

In [89]:
# category_3 label -> subjects, for everyone who had a colectomy with pouch.
pouch_outcomes = {
    'Subsequent normal pouch': ['100119', '100125'],
    'Pouchitis': ['100029', '100191', '100064'],
    'Subsequent dev of CD': ['100023', '100031', '100013', '100122', '100067'],
}

for outcome, subjects in pouch_outcomes.items():
    mf.loc[mf['host_subject_id'].isin(subjects), 'category_3'] = outcome

# category_2 is shared by every subject in any of the three groups above.
all_pouch = [s for subjects in pouch_outcomes.values() for s in subjects]
mf.loc[mf['host_subject_id'].isin(all_pouch), 'category_2'] = 'Colectomy with pouch'

# These subjects developed CD of the pouch later on, so for the analysis
# they are grouped under UC.
subset = pouch_outcomes['Subsequent dev of CD']
mf.loc[mf['host_subject_id'].isin(subset), 'category_1'] = 'uc'

Subjects that don't have surgery and have UC are controls.

In [90]:
# UC samples with has_surgery == 'n' are the UC controls at both levels.
uc_controls = mf.query("disease == 'uc' and has_surgery == 'n'").index
mf.loc[uc_controls, ['category_2', 'category_3']] = 'Control'

## CD

There are only three subjects with ileorectal anastomosis.

In [91]:
# Same label at both levels for the ileorectal anastomosis subjects.
subset = ['100060', '100144', '100054']
mf.loc[
    mf['host_subject_id'].isin(subset),
    ['category_2', 'category_3'],
] = 'Ileorectal anastomosis'

None of the following subjects have a pouch.

In [92]:
# Same label at both levels for the end ileostomy subjects.
# NOTE(review): 100129 is not among the end-ileostomy subjects listed in the
# notes earlier in this notebook (100140, 100174, 100074, 100127) — confirm
# that it belongs in this group.
subset = ['100140', '100174', '100074', '100127', '100129']
mf.loc[
    mf['host_subject_id'].isin(subset),
    ['category_2', 'category_3'],
] = 'End ileostomy'

Subjects with CD and without surgery are controls:

In [93]:
# CD samples with has_surgery == 'n' are the CD controls at both levels.
cd_controls = mf.query("disease == 'cd' and has_surgery == 'n'").index
mf.loc[cd_controls, ['category_2', 'category_3']] = 'Control'

Lastly all the other subjects that have surgery, have CD and are not the previous ones should be labeled as "ileocolonic resection". These comprise the ileocolonic surgery with partial colectomy.

In [94]:
# Subjects already tagged as ileorectal anastomosis or end ileostomy.
subset = ['100060', '100144', '100054', '100140', '100174', '100074', '100127', '100129']

# surgery_type tally for the remaining CD samples that had surgery.
# `@subset` lets pandas read the list directly, instead of splicing its
# repr into the query string.
mf.query(
    'category_1 == "cd" and has_surgery == "y" '
    'and host_subject_id not in @subset'
)['surgery_type'].value_counts()

ileocolonic          51
partial_colectomy     5
none                  3
Name: surgery_type, dtype: int64

Note subject 100105 is coming into this category, even though the surgery type says none.

In [95]:
# Subjects already tagged as ileorectal anastomosis or end ileostomy.
subset = ['100060', '100144', '100054', '100140', '100174', '100074', '100127', '100129']

# Every other CD sample that had surgery. `@subset` lets pandas read the
# list directly, instead of splicing its repr into the query string.
samples = mf.query(
    'category_1 == "cd" and has_surgery == "y" '
    'and host_subject_id not in @subset'
).index

mf.loc[samples, ['category_2', 'category_3']] = 'Ileocolonic resection'

# Must run after the assignment above: these two subjects get a more
# specific category_3 label.
mf.loc[mf['host_subject_id'].isin(['100038', '100176']), 'category_3'] = 'Partial colectomy'

In [96]:
# Tally host_subject_id values within each category_1/2/3 combination.
# NOTE(review): the saved output under this cell is indexed by has_surgery,
# not host_subject_id, so it appears to come from a different version of this
# expression — re-run the cell to refresh it.
mf.groupby(['category_1', 'category_2', 'category_3']).host_subject_id.value_counts()

category_1      category_2              category_3               has_surgery   
cd              Control                 Control                  n                 111
                End ileostomy           End ileostomy            y                  11
                Ileocolonic resection   Ileocolonic resection    y                  54
                                        Partial colectomy        y                   5
                Ileorectal anastomosis  Ileorectal anastomosis   y                   9
missing         TBD                     TBD                      missing             2
not applicable  TBD                     TBD                      not applicable    125
uc              Colectomy with pouch    Pouchitis                y                  12
                                        Subsequent dev of CD     y                  18
                                        Subsequent normal pouch  y                   2
                Control                 Control   

In [98]:
# Number of distinct subjects in each category_1/2/3 combination.
mf.groupby(['category_1', 'category_2', 'category_3'])['host_subject_id'].nunique()

category_1      category_2              category_3             
cd              Control                 Control                     46
                End ileostomy           End ileostomy                5
                Ileocolonic resection   Ileocolonic resection       21
                                        Partial colectomy            2
                Ileorectal anastomosis  Ileorectal anastomosis       3
missing         TBD                     TBD                          1
not applicable  TBD                     TBD                        125
uc              Colectomy with pouch    Pouchitis                    3
                                        Subsequent dev of CD         5
                                        Subsequent normal pouch      2
                Control                 Control                     45
Name: host_subject_id, dtype: int64

In [97]:
# Save the whole metadata table, including the category_1/2/3 columns added
# above, as a tab-separated file.
mf.to_csv('data/new-categories.1.tsv', sep='\t')