In [1]:
import pandas as pd
import numpy as np

In [2]:
fpath = '/idi/moc_ec/MOC/MKJoin_files/SCR-0001.1//Klebs_1_KeyMetrics.txt'

In [3]:
pd.set_option('display.max_columns', 500)

In [4]:
# sample-wise data frame
df_sample = pd.read_csv(fpath, sep='\t')

In [53]:
pool_wise_dict = {
    'MOCP_ID': [],
    'Project_ID': [],
    'Plate/Box_ID': [],
    'Pool_ID': [],
    'pool_total_reads': [],
    'pool_total_frags_counted': [],
    'pool_pcnt_aligned': [],
    'pool_pcnt_properly_mapped_pairs': [],
    'pool_average_insert_len': [],
    'pool_pcnt_sense': [],
    'pool_CDS_total_counts_for_replicon': [],
    'pool_CDS_pcnt_of_counted': [],
    'pool_CDS_pcnt_sense': [],
    'pool_rRNA_pcnt_of_counted': [],
    'pool_rRNA_pcnt_sense': [],
    'pool_misc_RNA_pcnt_of_counted': [],
    'pool_misc_RNA_pcnt_sense': [],
    'pool_tRNA_pcnt_of_counted': [],
    'pool_tRNA_pcnt_sense': [],
    'pool_IGR_pcnt_of_counted': [],
}

# columns that are generic to all samples in the pool
generic_cols = ['MOCP_ID', 'Project_ID', 'Plate/Box_ID', 'Pool_ID']
for pool_id, pool_rows in df_sample.groupby('Plate/Box_ID'):
    for col in generic_cols:
        pool_wise_dict[col].append(pool_rows[col].values[0])
    
    # 'Pcnt_bc_in_pool' represents the percent of sequences in the pool that are mapped to a corresponding sample. 
    # This mapping is based on the inline/barcode sequence. 
    # Since there are sequences with barcodes that do not map to any recognized inline seq (ambiguous/no match), 'Pcnt_bc_in_pool' does not sum up to 1
    pcnt_bc_in_pool = pool_rows['Pcnt_bc_in_pool'].values
    
    # Normalizing this column, so that it adds to 1
    pcnt_bc_in_pool_normalized = pcnt_bc_in_pool*100 / np.sum(pcnt_bc_in_pool)
    
    # As explained above, since 'Pcnt_bc_in_pool' represents the percent of sequences in the pool that are mapped to a corresponding sample. 
    pcnt_sample_in_pool_normalized = pcnt_bc_in_pool_normalized
    
    pool_total_reads = np.sum(pool_rows['Total_reads'].values)
    pool_total_frags_counted = np.sum(pool_rows['Total_frags_counted'].values)
    
    pool_pcnt_aligned = np.average(pool_rows['pcnt_aligned'].values, weights=pcnt_sample_in_pool_normalized)
    
    # The below two metrics are not calculated accurately. The weights given are %reads in the sample, we rather need %fragments in the sample
    pool_pcnt_properly_mapped_pairs = np.average(pool_rows['pcnt_properly_mapped_pairs'].values, weights=pcnt_sample_in_pool_normalized)
    pool_average_insert_len = np.average(pool_rows['average_insert_len'].values, weights=pcnt_sample_in_pool_normalized)
    
    # number of aligned reads per sample
    aligned_reads = (pool_rows['Total_reads'].values * pool_rows['pcnt_aligned'].values / 100).astype(np.int64)
    
    # number of aligned sense reads per sample
    aligned_sense_reads = (aligned_reads * pool_rows['pcnt_sense'].values / 100).astype(np.int64)
    
    # percent of aligned reads that are in sense direction
    pool_pcnt_sense = 100 * np.sum(aligned_sense_reads).astype(np.float) / np.sum(aligned_reads)
    
    # CDS
    pool_CDS_total_counts_for_replicon = np.sum(pool_rows['CDS_total_counts_for_replicon'].values)
    
    pool_CDS_pcnt_of_counted = 100 * float(pool_CDS_total_counts_for_replicon) / pool_total_frags_counted
    
    num_CDS_sense = (pool_rows['CDS_pcnt_sense'].values * pool_rows['CDS_total_counts_for_replicon'].values / 100).astype(np.int64)
    pool_CDS_pcnt_sense = 100 * np.sum(num_CDS_sense).astype(np.float) / pool_CDS_total_counts_for_replicon
    
    # rRNA
    num_rRNA_frags = (pool_rows['rRNA_pcnt_of_counted'].values * pool_rows['Total_frags_counted'].values / 100).astype(np.int64)
    pool_rRNA_pcnt_of_counted = 100 * np.sum(num_rRNA_frags).astype(np.float) / pool_total_frags_counted
    
    num_rRNA_sense = (pool_rows['rRNA_pcnt_sense'].values * num_rRNA_frags / 100).astype(np.int64)
    pool_rRNA_pcnt_sense = 100 * np.sum(num_rRNA_sense).astype(np.float) / np.sum(num_rRNA_frags)
    
    # misc_RNA
    num_misc_RNA_frags = (pool_rows['misc_RNA_pcnt_of_counted'].values * pool_rows['Total_frags_counted'].values / 100).astype(np.int64)
    pool_misc_RNA_pcnt_of_counted = 100 * np.sum(num_misc_RNA_frags).astype(np.float) / pool_total_frags_counted
    
    num_misc_RNA_sense = (pool_rows['misc_RNA_pcnt_sense'].values * num_misc_RNA_frags / 100).astype(np.int64)
    pool_misc_RNA_pcnt_sense = 100 * np.sum(num_misc_RNA_sense).astype(np.float) / np.sum(num_misc_RNA_frags)
    
    # tRNA
    num_tRNA_frags = (pool_rows['tRNA_pcnt_of_counted'].values * pool_rows['Total_frags_counted'].values / 100).astype(np.int64)
    pool_tRNA_pcnt_of_counted = 100 * np.sum(num_tRNA_frags).astype(np.float) / pool_total_frags_counted
    
    num_tRNA_sense = (pool_rows['tRNA_pcnt_sense'].values * num_tRNA_frags / 100).astype(np.int64)
    pool_tRNA_pcnt_sense = 100 * np.sum(num_tRNA_sense).astype(np.float) / np.sum(num_tRNA_frags)
    
    # IGR
    num_IGR_frags = (pool_rows['IGR_pcnt_of_counted'].values * pool_rows['Total_frags_counted'].values / 100).astype(np.int64)
    pool_IGR_pcnt_of_counted = 100 * np.sum(num_IGR_frags).astype(np.float) / pool_total_frags_counted

    # add pool information to dict
    pool_wise_dict['pool_total_reads'].append(pool_total_reads)
    pool_wise_dict['pool_total_frags_counted'].append(pool_total_frags_counted)
    pool_wise_dict['pool_pcnt_aligned'].append(pool_pcnt_aligned)
    pool_wise_dict['pool_pcnt_properly_mapped_pairs'].append(pool_pcnt_properly_mapped_pairs)
    pool_wise_dict['pool_average_insert_len'].append(pool_average_insert_len)
    pool_wise_dict['pool_pcnt_sense'].append(pool_pcnt_sense)
    pool_wise_dict['pool_CDS_total_counts_for_replicon'].append(pool_CDS_total_counts_for_replicon)
    pool_wise_dict['pool_CDS_pcnt_of_counted'].append(pool_CDS_pcnt_of_counted)
    pool_wise_dict['pool_CDS_pcnt_sense'].append(pool_CDS_pcnt_sense)
    pool_wise_dict['pool_rRNA_pcnt_of_counted'].append(pool_rRNA_pcnt_of_counted)
    pool_wise_dict['pool_rRNA_pcnt_sense'].append(pool_rRNA_pcnt_sense)
    pool_wise_dict['pool_misc_RNA_pcnt_of_counted'].append(pool_misc_RNA_pcnt_of_counted)
    pool_wise_dict['pool_misc_RNA_pcnt_sense'].append(pool_misc_RNA_pcnt_sense)
    pool_wise_dict['pool_tRNA_pcnt_of_counted'].append(pool_tRNA_pcnt_of_counted)
    pool_wise_dict['pool_tRNA_pcnt_sense'].append(pool_tRNA_pcnt_sense)
    pool_wise_dict['pool_IGR_pcnt_of_counted'].append(pool_IGR_pcnt_of_counted)



In [56]:
# pool-wise data frame
df_pool = pd.DataFrame(pool_wise_dict)

cols = ['MOCP_ID','Project_ID','Plate/Box_ID','Pool_ID','pool_total_reads','pool_total_frags_counted','pool_pcnt_aligned','pool_pcnt_properly_mapped_pairs','pool_average_insert_len','pool_pcnt_sense','pool_CDS_total_counts_for_replicon','pool_CDS_pcnt_of_counted','pool_CDS_pcnt_sense','pool_rRNA_pcnt_of_counted','pool_rRNA_pcnt_sense','pool_misc_RNA_pcnt_of_counted','pool_misc_RNA_pcnt_sense','pool_tRNA_pcnt_of_counted','pool_tRNA_pcnt_sense','pool_IGR_pcnt_of_counted']

# Rearrange the columns
df_pool = df_pool[cols]
df_pool

Unnamed: 0,MOCP_ID,Project_ID,Plate/Box_ID,Pool_ID,pool_total_reads,pool_total_frags_counted,pool_pcnt_aligned,pool_pcnt_properly_mapped_pairs,pool_average_insert_len,pool_pcnt_sense,pool_CDS_total_counts_for_replicon,pool_CDS_pcnt_of_counted,pool_CDS_pcnt_sense,pool_rRNA_pcnt_of_counted,pool_rRNA_pcnt_sense,pool_misc_RNA_pcnt_of_counted,pool_misc_RNA_pcnt_sense,pool_tRNA_pcnt_of_counted,pool_tRNA_pcnt_sense,pool_IGR_pcnt_of_counted
0,SCR-0001.1,Klebs_1,P1,SCR-0001.1p1,437700,205094,83.387253,87.041631,352.888165,97.461816,191222,93.236272,97.471002,0.333506,94.590643,0.0,,0.160902,99.393939,3.759739
1,SCR-0001.1,Klebs_1,P2,SCR-0001.1p2,518494,236409,87.897062,95.671187,,97.29136,223094,94.367812,97.290828,0.247453,97.094017,0.0,,0.137897,100.0,2.55574
2,SCR-0001.1,Klebs_1,P3,SCR-0001.1p3,95378,42609,82.008426,89.879012,345.38494,97.462994,39895,93.630454,97.536032,0.227651,85.56701,0.0,,0.089183,100.0,3.442935


In [None]:
df_pool.write_csv(fpath.replace())