In [2]:
# import libraries
import pandas as pd
import scipy.stats
import statsmodels.stats.multitest
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# disable all warnings; use with caution, since this also hides deprecation and dtype warnings
import warnings
warnings.filterwarnings('ignore')

# project specific libs
import os
import labtools.statistics

In [3]:
# set path: project root directory used as the prefix for all input/output files.
# The AMPAIM_PATH environment variable, when set, overrides the default so the
# notebook is not tied to a single user's machine. os.path.join(..., '')
# guarantees the trailing separator that later `path + 'inputs/...'`
# concatenations rely on.
path = os.path.join(
    os.environ.get('AMPAIM_PATH', '/Users/KevinBu/Desktop/clemente_lab/Projects/ampaim/'),
    '',
)

##### Metadata #####

In [4]:
# load the tab-delimited sample sheet, using its first column as the index
df = pd.read_csv(path + 'inputs/samples.txt', sep='\t', index_col=0)

# keep only the part of each sample id that precedes the first '_S'
df.index = df.index.map(lambda sample_id: sample_id.split('_S')[0])

# keep the first row for each repeated id, name the index 'SampleID',
# and remove the '#q2:types' row
df = (
    df.loc[lambda frame: ~frame.index.duplicated(keep='first')]
    .rename_axis('SampleID')
    .drop('#q2:types')
)

# autofill diagnoses
def get_dx(x):
    """Map a sample ID to a diagnosis label by substring matching.

    Tokens are tested in a fixed order and the first token found in ``x``
    determines the label, so earlier tokens take priority (an ID containing
    both 'psa' and 'ra' is labelled 'PsA').

    Parameters
    ----------
    x : str
        Sample ID to classify. Matching is case-sensitive.

    Returns
    -------
    str
        One of 'PsA', 'RA', 'PsO', 'SLE', 'SjD', 'IBD', 'Control', or 'NA'
        when none of the tokens occurs in ``x``.
    """
    # Order matters: the first matching token wins.
    # NOTE(review): 'ra' is a very short token and matches any ID that merely
    # contains those two letters (e.g. '...-microra') -- confirm this is intended.
    token_to_dx = (
        ('psa', 'PsA'),
        ('ra', 'RA'),
        ('pso', 'PsO'),
        ('lockit', 'SLE'),
        ('ellipss', 'PsO'),
        ('sle', 'SLE'),
        ('stamp', 'SjD'),
        ('ibd', 'IBD'),
        ('MOC', 'Control'),
    )
    for token, dx in token_to_dx:
        if token in x:
            return dx

    # The original `elif 'MOC':` tested a non-empty string literal, which is
    # always truthy, so every unmatched ID was labelled 'Control' and this
    # fallback could never be reached.
    return 'NA'

# label every sample from its id, persist the metadata table, then show
# how many samples fall under each diagnosis
df['Diagnosis'] = df.index.map(get_dx)
df.to_csv(path + 'inputs/ampaim_metadata.txt', sep='\t')
df['Diagnosis'].value_counts()

Diagnosis
PsA        144
SjD         85
RA          72
PsO         53
SLE         22
IBD          4
Control      3
Name: count, dtype: int64

In [21]:
# RB metadata: load the mapping file and filter it in one pipeline.
# Step order is significant: all-NA columns are dropped before the row
# filters are applied.
df_rb = (
    pd.read_csv(path + 'inputs/Mapping file_yonghua_021925v5_rbb.txt', sep='\t', index_col=0)
    .rename_axis('SampleID')
    # rows without a Subject_ID are discarded
    .dropna(subset='Subject_ID')
    # columns that are entirely NA are discarded
    .dropna(how='all', axis=1)
    # drop halo; 356 samples all seem to be accounted for :) in Adam's downloads
    .loc[lambda frame: frame['Project'] != 'Halo']
    # drop rows explicitly excluded from AMP AIM analyses
    .loc[lambda frame: frame['Include in AMP AIM analyses?'] != 'NO']
)

# sample ids use '.' here but '-' in the fastq names, so swap them
df_rb.index = df_rb.index.map(lambda sample_id: sample_id.replace('.', '-'))

df_rb.head()

Unnamed: 0_level_0,Subject_ID,Include in AMP AIM analyses?,AMP AIM,Project,Diagnosis,Plate,Number_Sample_Plate,Template_DNA_well,Amplicon_Well,Amp_Well_Plate,...,Seq_Barcode,Plates_For_March_Sequencing_2014,BARCODE_PLATE_GTC,Loc_Barcode_Plate_GTC,Barcode_Label_GTC,Seq_Barcode_GTC,Unnamed: 44,Unnamed: 45,MiSeq_Folder,Notes
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NEG-4,NEG.4,,,neg,neg,348,75,C10,C10,348_75,...,515rcbc513,TCCTAGGTCCGA,6,C10,515rcbc513,TCCTAGGTCCGA,1/23/25,MSQ152,1/23/25,https://genome.med.nyu.edu/results/segallab/20...
MOC-348,MOC.348,,,moc,moc,348,76,D10,D10,348_76,...,515rcbc525,CTCTTCTGATCA,6,D10,515rcbc525,CTCTTCTGATCA,1/23/25,MSQ152,1/23/25,https://genome.med.nyu.edu/results/segallab/20...
209-pso,209.pso,YES,,atrisk,pso,349,41,A6,A6,349_41,...,515rcbc485,GAGAGGGATCAC,6,A6,515rcbc485,GAGAGGGATCAC,1/23/25,MSQ152,1/23/25,https://genome.med.nyu.edu/results/segallab/20...
241-pso,241.pso,YES,,atrisk,pso,349,42,B6,B6,349_42,...,515rcbc497,TTGCGACAAAGT,6,B6,515rcbc497,TTGCGACAAAGT,1/23/25,MSQ152,1/23/25,https://genome.med.nyu.edu/results/segallab/20...
260-pso,260.pso,YES,,atrisk,pso,349,43,C6,C6,349_43,...,515rcbc509,CAGTGTCATGAA,6,C6,515rcbc509,CAGTGTCATGAA,1/23/25,MSQ152,1/23/25,https://genome.med.nyu.edu/results/segallab/20...


In [24]:
# compare sample ids between the RB mapping file (rb) and our run (ac)
rb = [sample_id for sample_id in df_rb.index.values]
ac = [sample_id for sample_id in df.index.values]

# ids present in both tables
inter = set(rb) & set(ac)
print(len(inter))

# ids only in the RB mapping file
rb_diff = set(rb) - set(ac)
print(rb_diff)

# ids only in our run
ac_diff = set(ac) - set(rb)
print(ac_diff)


267
{'134-b-HC-microra', '188-b-HC-microra', '183-b-HC-microra', '99-b-HC-microra', '100-b-HC-microra', 'NEG-4', '186-b-HC-microra', '158-b-HC-microra', '168-b-HC-microra', 'MOC-348', '160-b-HC-microra', 'NEG-5', 'NEG-6', '123-b-HC-microra', '150-b-HC-microra'}
{'PA1018-24-dipsa', 'PA1016-00-dipsa', '520-0-twin-psa', '557-0-twin-psa', 'PA1020-00-dipsa', '561905-12-dipsa', 'PA1001-12-dipsa', 'PA1018-12-dipsa', 'PA1004-12-dipsa', '541-0-twin-psa', '544-0-twin-psa', '556-0-twin-psa', 'PA1005-12-dipsa', 'PA1016-24-dipsa', '553-0-twin-psa', '521-0-twin-psa', '532-0-twin-psa', 'PA1020-24-dipsa', '276250-24-dipsa', '528-0-twin-psa', '547-0-twin-psa', '519-0-twin-psa', '571666-12-dipsa', '276250-00-dipsa', 'PA1001-24-dipsa', '552-0-twin-psa', 'PA1002-00-dipsa', 'PA1023-12-dipsa', 'PA1019-00-dipsa', '174086-00-dipsa', '554460-24-dipsa', '561859-24-dipsa', '193246-24-dipsa', '538-0-twin-psa', '525-0-twin-psa', '276250-12-dipsa', '539-0-twin-psa', 'PA1014-00-dipsa', 'PA1023-24-dipsa', '534-0-twin