In [None]:
%matplotlib inline
from __future__ import division

from qiime.parse import parse_mapping_file
from qiime.format import format_mapping_file
from skbio.io.util import open_file
from itertools import product
from os.path import join
from biom import load_table

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Apply matplotlib's 'ggplot' style sheet to the figures drawn after this point.
plt.style.use('ggplot')

def load_mf(fn):
    """Read a mapping file into a DataFrame indexed by 'SampleID'.

    Parameters
    ----------
    fn : str
        Location of the mapping file; handed as-is to ``open_file``.

    Returns
    -------
    pd.DataFrame
        The parsed mapping data, with the parsed header as column names and
        the 'SampleID' column used as the index.
    """
    with open_file(fn, 'U') as handle:
        # the third value returned by parse_mapping_file is not used here
        rows, columns, _ = parse_mapping_file(handle)
        table = pd.DataFrame(rows, columns=columns)

    return table.set_index('SampleID')

def write_mf(f, _df):
    """Write a DataFrame to disk in mapping file format.

    Parameters
    ----------
    f : str
        Destination of the mapping file; handed as-is to ``open_file``.
    _df : pd.DataFrame
        Table to serialize. Its index is written as the first column, under
        the 'SampleID' header.
    """
    with open_file(f, 'w') as out:
        header = ['SampleID'] + _df.columns.tolist()
        # itertuples() yields the index value first, followed by the row's
        # column values, which lines up with the header built above
        rows = list(_df.itertuples())
        contents = format_mapping_file(header, rows)
        out.write(contents + '\n')

In [None]:
from qiime.util import compute_days_since_epoch

mf = load_mf('metadata-2/mapping_file.fixed.alpha.cleaned.txt')
bt = load_table('analysis/beta/otu_table.ids.nocontaminants.noblanks_even7400.biom')

# Disabled: would restrict the metadata to the sample ids present in `bt`.
# mf = mf.loc[bt.ids()]

# Free-text values of the SURGERY column that are collapsed into 'Yes'. The
# keys are matched literally against the metadata, so their spelling must be
# left exactly as it is.
SURGERY_DESCRIPTIONS = {
    'ileocecal resection': 'Yes',
    'colonresection, ileorectal anastomosis': 'Yes',
    'small bowelbowel resections(2x)': 'Yes',
    'Ileocolectomy': 'Yes',
    'Ilececal and ileal resection': 'Yes',
    'Ilecectomy': 'Yes',
}

# Assign the replaced column back instead of mutating `mf.SURGERY` in place:
# an in-place update of a column pulled out of the frame only reaches the
# frame as a side effect of pandas internals and can silently be a no-op.
mf['SURGERY'] = mf['SURGERY'].replace(SURGERY_DESCRIPTIONS)

def funk(line):
    """Merge the 'IBD' and 'SURGERY' values of one sample into a group label.

    NOTE: the name ``funk`` is reused by another function defined further down
    in this cell, so this version is only available until that point.

    Parameters
    ----------
    line : mapping-like
        A row of the mapping file; must provide the 'IBD' and 'SURGERY' keys.

    Returns
    -------
    str
        'Crohns (surgery)' for Crohns subjects with SURGERY 'Yes', 'Crohns'
        for Crohns subjects with SURGERY 'No' or 'NA', and 'Controls' for
        'Healthy Controls' with SURGERY 'NA'.

    Raises
    ------
    ValueError
        For any other combination of the two values.
    """
    diagnosis = line['IBD']

    if diagnosis == 'Crohns':
        surgery = line['SURGERY']
        if surgery == 'Yes':
            return 'Crohns (surgery)'
        # We checked with Hans and none of the patients that were reported as
        # NA had surgery, so NA is treated the same as 'No'
        if surgery == 'No' or surgery == 'NA':
            return 'Crohns'
    elif diagnosis == 'Healthy Controls' and line['SURGERY'] == 'NA':
        return 'Controls'

    raise ValueError('Something bad happened')

# Label every row with its combined IBD/surgery group. This has to run before
# `funk` is redefined below, since the later definition replaces this one.
mf['SURGERY_AND_IBD'] = mf.apply(funk, axis=1, reduce=False)

def funk(line):
    """Derive the family number of one sample from its 'Description' value.

    NOTE: this definition replaces the earlier ``funk`` from this cell.

    Parameters
    ----------
    line : mapping-like
        A row of the mapping file; must provide the 'Description' key.

    Returns
    -------
    str
        The integer found after the last '-' of the description, formatted
        back as a string (so leading zeros are dropped). Descriptions without
        a '-' yield 'Index subject without family members'.

    Raises
    ------
    ValueError
        If the text after the last '-' cannot be parsed by ``int``.
    """
    description = line['Description']
    if '-' not in description:
        return 'Index subject without family members'

    # only the text after the last dash matters
    family = description.rsplit('-', 1)[-1]
    return str(int(family))

# Derive the family number of every row from its Description value.
mf['FAMILY_NUMBER'] = mf.apply(funk, axis=1, reduce=False)

# Hans was able to confirm that none of these subjects underwent surgery.
# Series.replace returns a new Series rather than modifying the column, so
# the result must be assigned back for 'NA' to actually become 'No' in the
# mapping file written at the end of this cell.
mf['SURGERY'] = mf['SURGERY'].replace({'NA': 'No'})

# Add a days-since-epoch column. Dates containing '/' are split into three
# fields, presumably month/day/two-digit year (the last field gets a '20'
# prefix); any other value is recorded as missing.
x = []
for d in mf.COLLECTION_DATE:
    if '/' not in d:
        x.append(np.nan)
        continue
    mo, da, ye = d.split('/')
    x.append(compute_days_since_epoch(da, mo, '20' + ye))

# make sure we have the right number of rows
assert len(x) == len(mf)

# The column is stored as text, presumably because the mapping file writer
# expects string fields -- TODO confirm.
# NOTE(review): when at least one date is missing, np.array() builds a float
# array, so the values are written like '16000.0' (and 'nan'); with no
# missing dates they are written as plain integers.
mf['DAYS_SINCE_EPOCH'] = np.array(x).astype(str)

write_mf('metadata-2/mapping_file.fixed.alpha.cleaned.cleaned.txt', mf)