# Fill sampletable/config.yaml for the chipseq-wf
It would be really useful to have a script that parses out the information needed for these files: filling them in by hand is a huge pain, and we have a lot of samples to run.

All the information comes from this spreadsheet:

**Note:** when saving this Excel file, choose the 'CSV UTF-8' format.


In [1]:
import pandas as pd
import glob
import os
import ruamel.yaml as YAML

In [2]:
# Load the ChIP-seq metadata table exported from the spreadsheet and preview it.
spreadsheet = pd.read_csv(
    filepath_or_buffer='../output/chip/20171103_s2cell_chip-seq.csv',
)
spreadsheet.head()

Unnamed: 0,srx,cell_line,geo,paper,strategy,antibody,srr,chromatin,input
0,SRX191907,S2 DRSC: Schneider's line 2,GSM1015404,"['24055367', '24055367']",ChIP-Seq,CP190 antibody (rabbit),SRR580337,0,SRX191915
1,SRX191908,S2 DRSC: Schneider's line 2,GSM1015405,"['24055367', '24055367']",ChIP-Seq,CP190 antibody (rabbit),SRR580338,0,SRX191916
2,SRX191909,S2 DRSC: Schneider's line 2,GSM1015406,"['24055367', '24055367']",ChIP-Seq,Su(Hw) antibody 406 (rabbit),SRR580339,0,SRX191915
3,SRX191910,S2 DRSC: Schneider's line 2,GSM1015407,"['24055367', '24055367']",ChIP-Seq,Su(Hw) antibody 406 (rabbit),SRR580340,0,SRX191916
4,SRX191911,S2 DRSC: Schneider's line 2,GSM1015408,"['24055367', '24055367']",ChIP-Seq,Mod(mdg4)2.2 antibody (rabbit),SRR580341,0,SRX191915


### Sample table: 
- The sample table should have the columns `samplename`, `antibody`, `biological_material`, `replicate`, `label`, `orig_filename`.
- Provide the list of SRRs you want in the sample table (for example, all SRRs that aren't chromatin).
- The table is written out to "sampletable_test.tsv".

In [3]:
# Split the metadata on the `chromatin` flag column (1 vs. 0).
all_chromatin = spreadsheet.loc[spreadsheet['chromatin'] == 1]
no_chromatin = spreadsheet.loc[spreadsheet['chromatin'] == 0]

In [4]:
# Raw `srr` cells of the chromatin-flagged rows, as a NumPy array.
chrom_srr_list = all_chromatin['srr'].values

In [13]:
# Build the chipseq-wf sample table: one row per selected run, plus one row for
# every input (control) run that those samples point at.
# TODO: paired-end runs still put a two-element list into `orig_filename`;
# decide how the workflow should consume paired-end data.

# A real list (not a NumPy slice) is required because input runs are appended below.
SRR_list = list(chrom_srr_list[0:25])
PREALN_SAMPLES = '/data/MiegNCBI/ncbi_remap/prealn-wf/output/samples/'
NO_INPUT = 'no input?'  # sentinel found in the spreadsheet's `input` column


def _has_token(column, token):
    """Return a boolean mask of cells whose '|'-separated entries include exactly `token`."""
    return column.astype(str).apply(
        lambda cell: token in [part.strip() for part in cell.split('|')])


def _is_nonempty(path):
    """Return True when `path` exists and holds at least one byte."""
    return os.path.exists(path) and os.path.getsize(path) > 0


def _pick_fastq(sample_dir, srr):
    """Choose the FASTQ path(s) for one run from its LAYOUT flag file.

    Returns a single path when LAYOUT reads 'SE' (mate 2 if it is non-empty,
    otherwise mate 1) and a [mate1, mate2] list for any other layout.
    Raises FileNotFoundError if LAYOUT is missing and ValueError if it is empty.
    """
    read1 = sample_dir + '/' + srr + '_1.fastq.gz'
    read2 = sample_dir + '/' + srr + '_2.fastq.gz'
    with open(sample_dir + '/LAYOUT') as handle:
        # Strip so a trailing newline cannot make 'SE' compare unequal.
        layout = handle.read().strip()
    if not layout:
        raise ValueError('empty LAYOUT file in ' + sample_dir)
    if layout != 'SE':
        return [read1, read2]
    # Single-end: use whichever mate file actually holds the reads.
    if _is_nonempty(read2):
        return read2
    return read1


table = []
# SRR_list grows while it is being iterated, so appended input runs are visited too.
for val in SRR_list:
    for srr in val.split('|'):
        SRR = srr.strip()
        row = spreadsheet[_has_token(spreadsheet.srr, SRR)]
        if row.empty:
            raise ValueError('no spreadsheet row lists run ' + repr(SRR))
        antibody = row.antibody.values[0].replace(' ', '-')
        srx = row.srx.values[0]
        inpt = row.input.values[0]
        PATH = PREALN_SAMPLES + srx + '/' + SRR
        filename = _pick_fastq(PATH, SRR)
        table.append([SRR, antibody, 's2cell', '1', srx, filename])

        # Queue the matching input run(s) so they get their own rows.
        if pd.isna(inpt) or inpt == NO_INPUT:
            continue
        for input_srx in str(inpt).split('|'):
            input_srx = input_srx.strip()
            inpt_row = spreadsheet[_has_token(spreadsheet.srx, input_srx)]
            if inpt_row.empty:
                raise ValueError('input experiment ' + repr(input_srx) +
                                 ' is not in the spreadsheet')
            inpt_srr = inpt_row.srr.values[0]
            if inpt_srr not in SRR_list:
                SRR_list.append(inpt_srr)

my_sampletable = pd.DataFrame(
    table,
    columns=['samplename', 'antibody', 'biological_material', 'replicate', 'label',
             'orig_filename'])
# write out
my_sampletable.to_csv('../chipseq-wf/config/sampletable_test.tsv', sep='\t', index=False)

In [14]:
# Preview the first five rows of the generated sample table.
my_sampletable.head(n=5)

Unnamed: 0,samplename,antibody,biological_material,replicate,label,orig_filename
0,SRR580343,dCTCF-antibody-68-(rat),s2cell,1,SRX191913,/data/MiegNCBI/ncbi_remap/prealn-wf/output/sam...
1,SRR580344,dCTCF-antibody-68-(rat),s2cell,1,SRX191914,/data/MiegNCBI/ncbi_remap/prealn-wf/output/sam...
2,SRR580345,input,s2cell,1,SRX191915,/data/MiegNCBI/ncbi_remap/prealn-wf/output/sam...
3,SRR580346,input,s2cell,1,SRX191916,/data/MiegNCBI/ncbi_remap/prealn-wf/output/sam...
4,SRR585041,input,s2cell,1,SRX193308,/data/MiegNCBI/ncbi_remap/prealn-wf/output/sam...


### Config file
- The config file is YAML.
- For each sample, fill in `label`, `algorithm`, `ip`, `control`, and `extra`.
- The result is written out to "copy_config.yaml".

In [7]:
# Build one peak-calling block per non-input sample, append them to the
# `chipseq: peak_calling` list of config.yaml, and write the result to copy_config.yaml.
# Keep algorithm and extra fixed for now.
# TODO: a second `extra` parameter set for samples that are not chromatin?
algorithm = 'macs2'
extra = '-g dm --bdg --broad --slocal 5000'
SRR_list = chrom_srr_list[0:25]

block_list = []
for SRR in SRR_list:
    # SRR is a raw cell of the `srr` column, so compare whole cells; a regex
    # substring match would treat '|' in multi-run cells as alternation.
    row = spreadsheet[spreadsheet.srr.astype(str) == str(SRR)]
    # .any() tolerates duplicate rows, unlike Series.bool().
    if row.antibody.astype(str).str.contains('input').any():
        continue
    srx = row.srx.values[0]
    # NOTE(review): `input` can hold the 'no input?' placeholder, which is
    # written as the control unchanged - confirm how the workflow should treat it.
    control = row.input.values[0]
    block_list.append({
        'label': srx,
        'algorithm': algorithm,
        'ip': [srx],
        'control': [control],
        'extra': YAML.scalarstring.SingleQuotedScalarString(extra),
    })

# Read and extend the existing config before opening the output file, so a
# read/parse failure cannot leave a truncated copy_config.yaml behind.
# NOTE(review): newer ruamel.yaml releases deprecate the function-style
# load/dump helpers in favour of YAML() instances - confirm the pinned version.
with open('../chipseq-wf/config/config.yaml', 'r') as c:
    page = YAML.load(c, Loader=YAML.RoundTripLoader, preserve_quotes=True)
for block in block_list:
    page['chipseq']['peak_calling'].append(block)

with open('../chipseq-wf/config/copy_config.yaml', 'w') as outfile:
    YAML.dump(page, outfile, Dumper=YAML.RoundTripDumper)