In [368]:
import os 
import re
import sys
sys.path.append('/projects/CARDIPS/database/Cardips/')
import django
django.setup()
import pandas as pd
from data.models import ATACS as data_atacs 
from subject.models import Subject as subject_subject
from django_pandas.io import read_frame

def get_data_atacs_cell_type(data_atacs_id):
    """
    Determine the celltype for the data_atacs_id sample.
    
    Parameters
    ----------
    data_atacs_id : str
        uuid from the data_atacs table.
        
    Returns
    -------
    celltype : str
        the celltype as entered in database. 
    """
    data_atacs_obj = data_atacs.objects.get(id=data_atacs_id)
    
    # Samples which have been linked 
    if data_atacs_obj.sample != None: 
        if data_atacs_obj.sample._meta.app_label in ['timecourse', 'data', 'family1070']:
            celltype = data_atacs_obj.sample.tissue.cell
            
        elif data_atacs_obj.sample._meta.app_label == 'product':
            # The product_pellets has a many to many relationship for the tissue for which 
            # you have to check congruency. 
            tissues = data_atacs_obj.sample.tissue.all()
            if len(tissues) == 0:
                celltype = 'Cannot determine. product_atacs object does not have an tissues assigned.'
            elif len(tissues) == 1:
                celltype = tissues[0].cell
            else:
                flags = [True if x.cell.strip() == tissues[0].cell.strip() else False for x in tissues]
                if all(flags):
                    celltype = tissues[0].cell
                else:
                    celltype = 'Cannot determine. product_atacs object has multiple tissues with contradicting cell type.'
                    
        else:
            celltype = 'Cannot determine. Trouble shoot the product_atacs_object.'
            
    # Samples which have not been linked 
    else:
        type1_re = re.search('(HEK|HeLa|ISL)', data_atacs_obj.name)
        type2_re = re.search('(FD[1-9])', data_atacs_obj.name)
        type3_re = re.search('(FD1[1-9])', data_atacs_obj.name)
        if type1_re:
            celltype = type1_re.groups()[0]
        elif type2_re:
            celltype = 'K562'
        elif type3_re:
            celltype = 'CM'  
        else:
            celltype = 'Cannot determine. data_atacs object does not have a sample --> tissue relationship.'
    return celltype 

def calculate_peak_qc_stats(data_atacs_id, bam, peak, variant_coverage, region_coverage, peak_qc_metrics, celltype, num_input_reads):
    """
    Calculate the quality control statistics for the atac's peak results.
    
    Parameters
    ----------
    data_atacs_id: str
        the data_atacs_id generate inside of the CARDiPS database. 
    bam : str
        the sort_rmdup bam file after aligning and processing the atac sequencing results. 
    peak : str
        the narrowpeak file produced by macs2. 
    variant_coverage : str
        produced as a result of running _____script_name_____.
    region_coverage : str
        produced as a result of running _____script_name_____.
    peak_qc_metrics : str
        name of the output qc file. 
    celltype : str
        the cell type of the sample with the corresponding Data_atacs ID. 
    num_input_reads : int
        the number of input reads determined by picard. 
    
    """
    
    dummy_fn = './testing_peak_qc_metrics.sh'
    
    lines = []
    lines.append('python /frazer01/home/joreyna/repos/cdpipelines/cdpipelines/scripts/calculate_peak_enrichment.py')
    lines.append('-dataID {}'.format(data_atacs_id))
    lines.append('-bam {}'.format(bam))
    lines.append('-peak {}'.format(peak))
    lines.append('-var_cov {}'.format(variant_coverage))
    lines.append('-reg_cov {}'.format(region_coverage))
    lines.append('-output {}'.format(peak_qc_metrics))
    lines.append('--cellType {}'.format(celltype))
    lines.append('--NumInputReads {}'.format(num_input_reads))
    
    with open(dummy_fn, 'w') as f:
        for i, line in enumerate(lines):
            if i == 0:
                f.write('{} \\\n'.format(line))
            elif i < len(lines) - 1:
                f.write('\t{} \\\n'.format(line))
            else:
                f.write('\t{}'.format(line))    
    #     with open(self.filename, 'a') as f:
    #         f.write('\t'.join(lines))
        
    return variant_coverage, region_coverage, peak_qc_metrics

In [384]:
data_atacs_id = '5e149150-ee49-462c-85ec-c95291ddeebf'
bam = '/frazer01/projects/CARDIPS/pipeline/ATACseq/sample/5e149150-ee49-462c-85ec-c95291ddeebf/alignment/5e149150-ee49-462c-85ec-c95291ddeebf_sorted_rmdup.bam'
peak = '/frazer01/projects/CARDIPS/pipeline/ATACseq/sample/5e149150-ee49-462c-85ec-c95291ddeebf/macs2/5e149150-ee49-462c-85ec-c95291ddeebf_peaks.narrowPeak'
variant_coverage = '/frazer01/projects/CARDIPS/pipeline/ATACseq/sample/5e149150-ee49-462c-85ec-c95291ddeebf/sh/{}_variant.coverage'.format(data_atacs_id)
region_coverage = '/frazer01/projects/CARDIPS/pipeline/ATACseq/sample/5e149150-ee49-462c-85ec-c95291ddeebf/sh/{}_region.coverage'.format(data_atacs_id)
peak_qc_metrics = '/frazer01/projects/CARDIPS/pipeline/ATACseq/sample/5e149150-ee49-462c-85ec-c95291ddeebf/sh/{}_peak_qc_metrics.tsv'.format(data_atacs_id)
peak_qc_metrics = '/frazer01/home/joreyna/work/20161010_bill/{}_peak_qc_metrics.tsv'.format(data_atacs_id)

# Extracting Number of input reads from STAR log file. 
aln_metrics = pd.read_csv( \
    '/projects/CARDIPS/pipeline/ATACseq/sample/{0}/alignment/{0}_Log.final.out'.format(data_atacs_id), \
      header=None, sep='|', index_col=0, skiprows=[4, 7, 22, 27])
aln_metrics.iloc[:, 0] = aln_metrics.iloc[:, 0].apply(lambda x: x.strip())
aln_metrics.index = aln_metrics.index.str.strip()
num_input_reads = int(aln_metrics.ix['Number of input reads', 1])

# Extracting celltype 
celltype = get_data_atacs_cell_type(data_atacs_id)

In [385]:
calculate_peak_qc_stats(data_atacs_id, bam, peak, variant_coverage, region_coverage, peak_qc_metrics, celltype, num_input_reads)

('/frazer01/projects/CARDIPS/pipeline/ATACseq/sample/5e149150-ee49-462c-85ec-c95291ddeebf/sh/5e149150-ee49-462c-85ec-c95291ddeebf_variant.coverage',
 '/frazer01/projects/CARDIPS/pipeline/ATACseq/sample/5e149150-ee49-462c-85ec-c95291ddeebf/sh/5e149150-ee49-462c-85ec-c95291ddeebf_region.coverage',
 '/frazer01/home/joreyna/work/20161010_bill/5e149150-ee49-462c-85ec-c95291ddeebf_peak_qc_metrics.tsv')

In [386]:
print open('./testing_peak_qc_metrics.sh').read()

python /frazer01/home/joreyna/repos/cdpipelines/cdpipelines/scripts/calculate_peak_enrichment.py \
	-dataID 5e149150-ee49-462c-85ec-c95291ddeebf \
	-bam /frazer01/projects/CARDIPS/pipeline/ATACseq/sample/5e149150-ee49-462c-85ec-c95291ddeebf/alignment/5e149150-ee49-462c-85ec-c95291ddeebf_sorted_rmdup.bam \
	-peak /frazer01/projects/CARDIPS/pipeline/ATACseq/sample/5e149150-ee49-462c-85ec-c95291ddeebf/macs2/5e149150-ee49-462c-85ec-c95291ddeebf_peaks.narrowPeak \
	-var_cov /frazer01/projects/CARDIPS/pipeline/ATACseq/sample/5e149150-ee49-462c-85ec-c95291ddeebf/sh/5e149150-ee49-462c-85ec-c95291ddeebf_variant.coverage \
	-reg_cov /frazer01/projects/CARDIPS/pipeline/ATACseq/sample/5e149150-ee49-462c-85ec-c95291ddeebf/sh/5e149150-ee49-462c-85ec-c95291ddeebf_region.coverage \
	-output /frazer01/home/joreyna/work/20161010_bill/5e149150-ee49-462c-85ec-c95291ddeebf_peak_qc_metrics.tsv \
	--cellType CM \
	--NumInputReads 28899238


In [387]:
query = data_atacs.objects.all()
df = read_frame(query, index_col='id')
ts = [get_data_atacs_cell_type(index) for index, sr in df.iterrows()]
df['celltype'] = ts

In [389]:
g = df.groupby('celltype')

In [391]:
l = g.groups.keys()

In [392]:
l.sort; l

['Cannot determine. data_atacs object does not have a sample --> tissue relationship.',
 u'CM',
 u'CC',
 u'iPSC',
 'K562',
 u'MP',
 u'ISL',
 u'HeLa',
 u'CP',
 u'HEK']

In [393]:
len(l)

10

In [398]:
celltype_errors_df = pd.concat([df[df.celltype.str.contains('^Cannot')],
    df[df.celltype.str.contains('MP')],
    df[df.celltype.str.contains('^CC')],
    df[df.celltype.str.contains('^CP')]])

In [406]:
celltype_errors_df.index.name = 'data_atacs_id'

In [407]:
celltype_errors_df

Unnamed: 0_level_0,sample_type,sample_id,sequence,name,status,comment,celltype
data_atacs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
c13d47ee-f2e9-4a4a-92f7-d0ee2c2413a1,,,160820_Salk,T103_CM_C5P30_UDID073_ATAC_R01L01S02,Processing,,Cannot determine. data_atacs object does not h...
c2d3d7cc-3ece-423f-81d9-5f37fc002357,,,160820_Salk,T103_CM_C4P37_UDID056_ATAC_R01L01S02,Processing,,Cannot determine. data_atacs object does not h...
e7021f4e-90a7-456a-af8d-ed488cc405b5,,,160820_Salk,T104_CM_C6P25_UDID058_ATAC_R01L01S02,Processing,,Cannot determine. data_atacs object does not h...
f2d5469b-395f-44e1-9f12-511aff7c2ea1,,,160820_Salk,T211_CM_C3P30_UDID108_ATAC_R02L01S02,Processing,,Cannot determine. data_atacs object does not h...
ffbe88c6-49c0-4fce-8542-56bb2930ee4a,,,160820_Salk,S07002_CM_C7P29_UDID053_ATAC_R01L01S02,Processing,,Cannot determine. data_atacs object does not h...
0807363e-03bb-4e93-812f-8d4239c5f48c,,32.0,141205_D00611_0040_BC67L7ANXX,S07009_d2_1,Active,,MP
1b90c5f8-4245-419a-82fd-7944b14588f7,,12.0,141205_D00611_0040_BC67L7ANXX,S07002_d2_3,Active,,MP
21de5e31-8409-4fb8-a764-f3210eb87a7f,,7.0,141205_D00611_0040_BC67L7ANXX,S07002_d2_2,Active,,MP
4bf4760a-52b8-45b9-89c6-c068b952e74d,,22.0,141205_D00611_0040_BC67L7ANXX,S07003_d2_2,Active,,MP
4d75c71d-a7d6-4280-9248-36ac6712ff14,,2.0,141205_D00611_0040_BC67L7ANXX,S07002_d2_1,Active,,MP


In [412]:
fn = '/frazer01/home/joreyna/work/20161011_db/celltype_errors_df.csv'

In [413]:
celltype_errors_df.to_csv(fn); 

In [414]:
!cp $fn /nas/Personal\ folders/Joaquin

<h2>Finding out why Sample are not Properly Linked</h2>

In [425]:
# Samples which are not properly linked 
df2 = df[df.celltype.str.contains('^Cannot')]

In [493]:
# Aga's ATAC master linking table
aga_master = pd.read_excel('ATAC_MASTER_TABLE_20161004_ADC_20161006_0109.xlsx')

In [495]:
# Checking to see if ANY of the sample_name's can be found in aga's master table; it cannot 
aga_master[aga_master['Sample.Name'].isin(df2.name.tolist())]

Unnamed: 0,Sample.Name,Flow.Cell,Lane,Project,Library_Sequencing_Manifest_date,Assay_Type,ATACseq.ID,UDID,Subject,ClonePassage,...,Date.harvested,Date.tagmented,Operator.Tag,Tn5.enzyme.lot.,Tagm.buffer.lot,Qubit.ng.ul,Avg.MW,Percentage.150.1000bp,nM,nM.adjusted


In [None]:
# Samples with S02
df3 = read_frame(data_atacs.objects.filter(name__in=df2.name.tolist()))
df3.sort_values('name', inplace=True)
df3['merge'] = df3.name.str.replace('S02', 'S0')

In [471]:
# Samples with S01 
df4 = read_frame(data_atacs.objects.filter(name__in=[x.replace('S02', 'S01') for x in df2.name.tolist()]))
df4.sort_values('name', inplace=True)
df4['merge'] = df4.name.str.replace('S01', 'S0')
df5 = pd.merge(df3, df4, on='merge', suffixes=['_160820_Salk', '_160525_K00180_0185_AH7W7NBBXX_IGM'])

In [472]:
fn = '/frazer01/home/joreyna/work/20161011_db/missing_samples_df.csv'

In [473]:
df5.to_csv(fn)

In [474]:
!cp $fn /nas/Personal\ folders/Joaquin

In [478]:
df2.name.tolist()

[u'T103_CM_C5P30_UDID073_ATAC_R01L01S02',
 u'T103_CM_C4P37_UDID056_ATAC_R01L01S02',
 u'T104_CM_C6P25_UDID058_ATAC_R01L01S02',
 u'T211_CM_C3P30_UDID108_ATAC_R02L01S02',
 u'S07002_CM_C7P29_UDID053_ATAC_R01L01S02']

In [482]:
df3

Unnamed: 0,sample_type,sample_id,sequence,id,name,status,comment,merge
4,,,160820_Salk,ffbe88c6-49c0-4fce-8542-56bb2930ee4a,S07002_CM_C7P29_UDID053_ATAC_R01L01S02,Processing,,S07002_CM_C7P29_UDID053_ATAC_R01L01S0
1,,,160820_Salk,c2d3d7cc-3ece-423f-81d9-5f37fc002357,T103_CM_C4P37_UDID056_ATAC_R01L01S02,Processing,,T103_CM_C4P37_UDID056_ATAC_R01L01S0
0,,,160820_Salk,c13d47ee-f2e9-4a4a-92f7-d0ee2c2413a1,T103_CM_C5P30_UDID073_ATAC_R01L01S02,Processing,,T103_CM_C5P30_UDID073_ATAC_R01L01S0
2,,,160820_Salk,e7021f4e-90a7-456a-af8d-ed488cc405b5,T104_CM_C6P25_UDID058_ATAC_R01L01S02,Processing,,T104_CM_C6P25_UDID058_ATAC_R01L01S0
3,,,160820_Salk,f2d5469b-395f-44e1-9f12-511aff7c2ea1,T211_CM_C3P30_UDID108_ATAC_R02L01S02,Processing,,T211_CM_C3P30_UDID108_ATAC_R02L01S0


In [None]:
# Checking UUID's that are part of the fast

In [503]:
df5

Unnamed: 0,sample_type_160820_Salk,sample_id_160820_Salk,sequence_160820_Salk,id_160820_Salk,name_160820_Salk,status_160820_Salk,comment_160820_Salk,merge,sample_type_160525_K00180_0185_AH7W7NBBXX_IGM,sample_id_160525_K00180_0185_AH7W7NBBXX_IGM,sequence_160525_K00180_0185_AH7W7NBBXX_IGM,id_160525_K00180_0185_AH7W7NBBXX_IGM,name_160525_K00180_0185_AH7W7NBBXX_IGM,status_160525_K00180_0185_AH7W7NBBXX_IGM,comment_160525_K00180_0185_AH7W7NBBXX_IGM
0,,,160820_Salk,ffbe88c6-49c0-4fce-8542-56bb2930ee4a,S07002_CM_C7P29_UDID053_ATAC_R01L01S02,Processing,,S07002_CM_C7P29_UDID053_ATAC_R01L01S0,atacs,76,160525_K00180_0185_AH7W7NBBXX,f1e0d609-fe23-4a50-9316-e59094738d07,S07002_CM_C7P29_UDID053_ATAC_R01L01S01,Processing,
1,,,160820_Salk,c2d3d7cc-3ece-423f-81d9-5f37fc002357,T103_CM_C4P37_UDID056_ATAC_R01L01S02,Processing,,T103_CM_C4P37_UDID056_ATAC_R01L01S0,atacs,82,160525_K00180_0185_AH7W7NBBXX,08281563-cef0-4526-b865-b21f7d1a2a03,T103_CM_C4P37_UDID056_ATAC_R01L01S01,Processing,
2,,,160820_Salk,c13d47ee-f2e9-4a4a-92f7-d0ee2c2413a1,T103_CM_C5P30_UDID073_ATAC_R01L01S02,Processing,,T103_CM_C5P30_UDID073_ATAC_R01L01S0,atacs,129,160525_K00180_0185_AH7W7NBBXX,1e54bc10-33dd-4758-aff1-fb9d50db1c0d,T103_CM_C5P30_UDID073_ATAC_R01L01S01,Processing,
3,,,160820_Salk,e7021f4e-90a7-456a-af8d-ed488cc405b5,T104_CM_C6P25_UDID058_ATAC_R01L01S02,Processing,,T104_CM_C6P25_UDID058_ATAC_R01L01S0,atacs,86,160525_K00180_0185_AH7W7NBBXX,08ee2e14-a5fd-45fc-8ab3-617809befbf3,T104_CM_C6P25_UDID058_ATAC_R01L01S01,Processing,
4,,,160820_Salk,f2d5469b-395f-44e1-9f12-511aff7c2ea1,T211_CM_C3P30_UDID108_ATAC_R02L01S02,Processing,,T211_CM_C3P30_UDID108_ATAC_R02L01S0,atacs,162,160525_K00180_0185_AH7W7NBBXX,7fa51fb8-13e0-4c96-b4b3-35a789e394b7,T211_CM_C3P30_UDID108_ATAC_R02L01S01,Processing,
