In [1]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline

In [2]:
# Widen the notebook container to 95% of the browser window so wide outputs
# are not squeezed into the default narrow column.
# `display` and `HTML` come from the public IPython.display module; importing
# them from IPython.core.display is a deprecated path.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [3]:
# Name of the sub-folder of ./data/ that every input is read from and every
# corrected report is written under.
# NOTE(review): the trailing commented-out values look like earlier data drops
# (presumably MMDDYY dates) — confirm before switching between them.
data_directory = '042018'#'032118' #'020318' #'122717'

In [4]:
# Water-control background tables. Each CSV is read with its own header row
# discarded, its first column used as the row index, and its value column named
# 'subtract'. The correction cells below look rows up by genus, so the index is
# presumably the genus name — confirm against the CSV files.
combo_corrections = pd.read_csv(
    './data/' + data_directory + '/water_controls/direct_water_subtraction.concordant.csv',
    index_col=0, names=['subtract'], header=0)

rna_corrections = pd.read_csv(
    './data/' + data_directory + '/water_controls/direct_water_subtraction_RNA.concordant.csv',
    index_col=0, names=['subtract'], header=0)

dna_corrections = pd.read_csv(
    './data/' + data_directory + '/water_controls/direct_water_subtraction_DNA.concordant.csv',
    index_col=0, names=['subtract'], header=0)


# Sample metadata; used here only to pair each DNA report with its RNA report.
metadata = pd.read_csv(
    './data/' + data_directory +
    '/NEB-TA-ONLY-metadata-03.29.18withFilenames.KKadaptation_v2.csv',
    header=0,
    index_col=0)

# Keep only the samples that have both a DNA and an RNA file name.
metadata = metadata.dropna(subset=['DNAfilename', 'RNAfilename'])

# Forward lookup: DNA file name -> RNA file name of the same sample.
file_pairs_dict = dict(zip(metadata['DNAfilename'], metadata['RNAfilename']))

# Reverse lookup: RNA file name -> DNA file name. If two DNA files share an RNA
# file name, the last pair wins (same as building it with a loop).
file_pairs_dict_rev = {
    rna_name: dna_name for dna_name, rna_name in file_pairs_dict.items()
}

# Sub-folder of ./data/<data_directory>/ that receives the water-corrected reports.
water_corrected_output_dir = 'BM_4WC'

In [5]:
# Water-correct every paired RNA/DNA report, using the RNA-specific and the
# DNA-specific water-control tables respectively.
#
# For each report, every genus that has a usable entry in the matching
# corrections table gets its 'NT Genus rM' value reduced by that entry and
# floored at 0; genera without a usable entry are left untouched. The corrected
# report is written to ./data/<data_directory>/<water_corrected_output_dir>/
# under the same file name as the input.
#
# NOTE(review): water_corrected_output_dir is a notebook-level variable that a
# later cell reassigns; re-running this cell after that one writes into the
# other directory — check its value before running cells out of order.
for bm in ['BM_4']:
    report_dir = './data/' + data_directory + '/' + bm
    output_dir = './data/' + data_directory + '/' + water_corrected_output_dir

    # to_csv does not create missing directories; without this every write
    # would fail and be reported below as a read failure.
    os.makedirs(output_dir, exist_ok=True)

    # file_pairs_dict maps DNA file name -> RNA file name.
    for dna_name, rna_name in file_pairs_dict.items():
        rna = report_dir + '/RNA/' + rna_name + '.report.csv'
        dna = report_dir + '/DNA/' + dna_name + '.report.csv'
        print(rna)
        print(dna)

        # RNA is handled first, then DNA. A failure abandons the rest of the
        # pair, so a failed RNA report also skips its DNA partner.
        for label, report_path, corrections in (('RNA', rna, rna_corrections),
                                                ('DNA', dna, dna_corrections)):
            try:
                try:
                    # pandas >= 1.3: skip malformed rows and warn about them.
                    report = pd.read_csv(report_path, on_bad_lines='warn')
                except TypeError:
                    # Older pandas does not accept on_bad_lines; use the
                    # equivalent legacy option.
                    report = pd.read_csv(report_path, error_bad_lines=False)

                # Row labels per genus, computed once before any value is changed.
                rows_by_genus = report.groupby('Genus', sort=False).groups
                for genus, row_labels in rows_by_genus.items():
                    try:
                        # Assumes every row of a genus carries the same
                        # genus-level value; the first row's value is used.
                        corrected = (
                            float(report.loc[row_labels[0], 'NT Genus rM'])
                            - float(corrections.loc[genus, 'subtract']))
                    except (KeyError, TypeError, ValueError):
                        # No (or no usable) correction for this genus: keep as is.
                        continue
                    report.loc[row_labels, 'NT Genus rM'] = max(corrected, 0)

                report.to_csv(output_dir + '/' + report_path.split('/')[-1], index=False)

            except Exception:
                print('failed to read ' + label + ' file: ' + report_path)
                print(sys.exc_info()[0])
                break

./data/042018/BM_4/RNA/mBAL-202-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-202-DNA-TA1-B10.report.csv
./data/042018/BM_4/RNA/mBAL-205-RNA-TA1-QIA-61917.report.csv
./data/042018/BM_4/DNA/mBAL-205-DNA-TA1-B8.report.csv
./data/042018/BM_4/RNA/mBAL-208-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-208-DNA-TA1-B10.report.csv
./data/042018/BM_4/RNA/mBAL-209-RNA-TA1-QIA-61917.report.csv
./data/042018/BM_4/DNA/mBAL-209-DNA-TA1-B8.report.csv
./data/042018/BM_4/RNA/mBAL-211-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-211-DNA-TA1-B10.report.csv
./data/042018/BM_4/RNA/mBAL-212-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-212-DNA-TA1-ZYM-6717-B7.report.csv
./data/042018/BM_4/RNA/mBAL-213-RNA-TA1-QIA-62317.report.csv
./data/042018/BM_4/DNA/mBAL-213-DNA-TA1-B8.report.csv
./data/042018/BM_4/RNA/mBAL-215-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-215-DNA-TA1-B10.report.csv
./data/042018/BM_4/RNA/mBAL-216-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-216-DNA-TA1-B1

b'Skipping line 673: expected 14 fields, saw 15\n'


./data/042018/BM_4/RNA/mBAL-252-RNA-TA1-QIA-61917.report.csv
./data/042018/BM_4/DNA/mBAL-252-DNA-TA1-ZYM-6717-B7.report.csv


b'Skipping line 132: expected 14 fields, saw 15\n'


./data/042018/BM_4/RNA/mBAL-254-RNA-TA1-QIA-61217.report.csv
./data/042018/BM_4/DNA/mBAL-254-DNA-TA1-QIA-6517-B7.report.csv
./data/042018/BM_4/RNA/mBAL-256-RNA-TA2-QIA-60517.report.csv
./data/042018/BM_4/DNA/mBAL-256-DNA-TA2-QIA-6517-B7.report.csv
./data/042018/BM_4/RNA/mBAL-257-RNA-B5B.report.csv
./data/042018/BM_4/DNA/mBAL-257-DNA-TA1-ZYM-6717-B7.report.csv
./data/042018/BM_4/RNA/mBAL-258-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-258-DNA-TA1-B10.report.csv
./data/042018/BM_4/RNA/mBAL-261-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-261-DNA-TA2-QIA-6517-B7.report.csv
./data/042018/BM_4/RNA/mBAL-264-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-264-DNA-TA1-B10.report.csv
./data/042018/BM_4/RNA/mBAL-268-RNA-TA2-QIA-60517.report.csv
./data/042018/BM_4/DNA/mBAL-268-DNA-TA2-QIA-6517-B7.report.csv
./data/042018/BM_4/RNA/mBAL-270-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-270-DNA-TA-B12.report.csv
./data/042018/BM_4/RNA/mBAL-272-RNA-TA1-B10.report.csv
./data/04201

b'Skipping line 687: expected 14 fields, saw 15\n'


./data/042018/BM_4/RNA/mBAL-298-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-298-DNA-TA1-B8.report.csv


b'Skipping line 54: expected 14 fields, saw 15\nSkipping line 172: expected 14 fields, saw 15\n'


./data/042018/BM_4/RNA/mBAL-301-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-301-DNA-TA1-B10.report.csv
./data/042018/BM_4/RNA/mBAL-304-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-304-DNA-B6.report.csv
./data/042018/BM_4/RNA/mBAL-307-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-307-DNA-TA1-ZYM-6117-B7.report.csv
./data/042018/BM_4/RNA/mBAL-310-RNA-B5B.report.csv
./data/042018/BM_4/DNA/mBAL-310-DNA-TA1-ZYM-6117-B7.report.csv
./data/042018/BM_4/RNA/mBAL-311-RNA-B5B.report.csv
./data/042018/BM_4/DNA/mBAL-311-DNA-B6.report.csv
./data/042018/BM_4/RNA/mBAL-313-RNA-B5.report.csv
./data/042018/BM_4/DNA/mBAL-313-DNA-TA-041818.report.csv
./data/042018/BM_4/RNA/mBAL-314-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-314-DNA-B6.report.csv
./data/042018/BM_4/RNA/mBAL-315-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-315-DNA-B6.report.csv
./data/042018/BM_4/RNA/mBAL-316-RNA-B5.report.csv
./data/042018/BM_4/DNA/mBAL-316-DNA-TA1-ZYM-6117-B7.report.csv
./data/042018/BM_4/RNA

In [6]:
# Water-correct an additional, hand-listed set of RNA reports with the
# RNA-specific water-control table.
#
# For each report, every genus that has a usable entry in rna_corrections gets
# its 'NT Genus rM' value reduced by that entry and floored at 0; other genera
# are left untouched. The result is written to
# ./data/<data_directory>/<water_corrected_output_dir>/ under the input's name.
#
# NOTE(review): the saved output of this cell shows FileNotFoundError for the
# first two names in the list — confirm they are spelled as on disk.
# NOTE(review): water_corrected_output_dir is reassigned by a later cell;
# check its value before running cells out of order.
other_rna = ['mBAL-308-RNA-B5','mBAL-312-RNA-B5B','mBAL-313-RNA-B5','mBAL-317-RNA-B5','mBAL-327-RNA-B5','mBAL-339-RNA-TA1-B10']
for bm in ['BM_4']:
    output_dir = './data/' + data_directory + '/' + water_corrected_output_dir

    # to_csv does not create missing directories; without this every write
    # would fail and be reported below as a read failure.
    os.makedirs(output_dir, exist_ok=True)

    for rna_name in other_rna:
        rna = './data/' + data_directory + '/' + bm + '/RNA/' + rna_name + '.report.csv'
        print(rna)

        try:
            try:
                # pandas >= 1.3: skip malformed rows and warn about them.
                report = pd.read_csv(rna, on_bad_lines='warn')
            except TypeError:
                # Older pandas does not accept on_bad_lines; use the
                # equivalent legacy option.
                report = pd.read_csv(rna, error_bad_lines=False)

            # Row labels per genus, computed once before any value is changed.
            rows_by_genus = report.groupby('Genus', sort=False).groups
            for genus, row_labels in rows_by_genus.items():
                try:
                    # Assumes every row of a genus carries the same genus-level
                    # value; the first row's value is used.
                    corrected = (
                        float(report.loc[row_labels[0], 'NT Genus rM'])
                        - float(rna_corrections.loc[genus, 'subtract']))
                except (KeyError, TypeError, ValueError):
                    # No (or no usable) correction for this genus: keep as is.
                    continue
                report.loc[row_labels, 'NT Genus rM'] = max(corrected, 0)

            report.to_csv(output_dir + '/' + rna.split('/')[-1], index=False)

        except Exception:
            print('failed to read RNA file: ' + rna)
            print(sys.exc_info()[0])
            continue
            
            
        

./data/042018/BM_4/RNA/mBAL-308-RNA-B5.report.csv
failed to read RNA file: ./data/042018/BM_4/RNA/mBAL-308-RNA-B5.report.csv
<class 'FileNotFoundError'>
./data/042018/BM_4/RNA/mBAL-312-RNA-B5B.report.csv
failed to read RNA file: ./data/042018/BM_4/RNA/mBAL-312-RNA-B5B.report.csv
<class 'FileNotFoundError'>
./data/042018/BM_4/RNA/mBAL-313-RNA-B5.report.csv
./data/042018/BM_4/RNA/mBAL-317-RNA-B5.report.csv
./data/042018/BM_4/RNA/mBAL-327-RNA-B5.report.csv
./data/042018/BM_4/RNA/mBAL-339-RNA-TA1-B10.report.csv


### If you want to subtract just the combined mean

In [7]:
# From here on, corrected reports go to a separate sub-folder so they do not
# overwrite the RNA/DNA-specific results.
water_corrected_output_dir = 'BM_4WC_combo'

# Water-correct every paired RNA/DNA report, using the single combined
# water-control table (combo_corrections) for both the RNA and the DNA report.
#
# For each report, every genus that has a usable entry in combo_corrections
# gets its 'NT Genus rM' value reduced by that entry and floored at 0; other
# genera are left untouched. The result is written to
# ./data/<data_directory>/<water_corrected_output_dir>/ under the input's name.
for bm in ['BM_4']:
    report_dir = './data/' + data_directory + '/' + bm
    output_dir = './data/' + data_directory + '/' + water_corrected_output_dir

    # to_csv does not create missing directories; without this every write
    # would fail and be reported below as a read failure.
    os.makedirs(output_dir, exist_ok=True)

    # file_pairs_dict maps DNA file name -> RNA file name.
    for dna_name, rna_name in file_pairs_dict.items():
        rna = report_dir + '/RNA/' + rna_name + '.report.csv'
        dna = report_dir + '/DNA/' + dna_name + '.report.csv'
        print(rna)
        print(dna)

        # RNA is handled first, then DNA. A failure abandons the rest of the
        # pair, so a failed RNA report also skips its DNA partner.
        for label, report_path in (('RNA', rna), ('DNA', dna)):
            try:
                try:
                    # pandas >= 1.3: skip malformed rows and warn about them.
                    report = pd.read_csv(report_path, on_bad_lines='warn')
                except TypeError:
                    # Older pandas does not accept on_bad_lines; use the
                    # equivalent legacy option.
                    report = pd.read_csv(report_path, error_bad_lines=False)

                # Row labels per genus, computed once before any value is changed.
                rows_by_genus = report.groupby('Genus', sort=False).groups
                for genus, row_labels in rows_by_genus.items():
                    try:
                        # Assumes every row of a genus carries the same
                        # genus-level value; the first row's value is used.
                        corrected = (
                            float(report.loc[row_labels[0], 'NT Genus rM'])
                            - float(combo_corrections.loc[genus, 'subtract']))
                    except (KeyError, TypeError, ValueError):
                        # No (or no usable) correction for this genus: keep as is.
                        continue
                    report.loc[row_labels, 'NT Genus rM'] = max(corrected, 0)

                report.to_csv(output_dir + '/' + report_path.split('/')[-1], index=False)

            except Exception:
                print('failed to read ' + label + ' file: ' + report_path)
                print(sys.exc_info()[0])
                break
        
        
        
        
        
        
# Water-correct an additional, hand-listed set of RNA reports with the combined
# water-control table (combo_corrections).
#
# For each report, every genus that has a usable entry in combo_corrections
# gets its 'NT Genus rM' value reduced by that entry and floored at 0; other
# genera are left untouched. The result is written to
# ./data/<data_directory>/<water_corrected_output_dir>/ under the input's name.
other_rna = ['mBAL-312-RNA-B5B','mBAL-313-RNA-B5','mBAL-317-RNA-B5','mBAL-327-RNA-B5','mBAL-339-RNA-TA1-B10']
for bm in ['BM_4']:
    output_dir = './data/' + data_directory + '/' + water_corrected_output_dir

    # to_csv does not create missing directories; without this every write
    # would fail and be reported below as a read failure.
    os.makedirs(output_dir, exist_ok=True)

    for rna_name in other_rna:
        rna = './data/' + data_directory + '/' + bm + '/RNA/' + rna_name + '.report.csv'
        print(rna)

        try:
            try:
                # pandas >= 1.3: skip malformed rows and warn about them.
                report = pd.read_csv(rna, on_bad_lines='warn')
            except TypeError:
                # Older pandas does not accept on_bad_lines; use the
                # equivalent legacy option.
                report = pd.read_csv(rna, error_bad_lines=False)

            # Row labels per genus, computed once before any value is changed.
            rows_by_genus = report.groupby('Genus', sort=False).groups
            for genus, row_labels in rows_by_genus.items():
                try:
                    # Assumes every row of a genus carries the same genus-level
                    # value; the first row's value is used.
                    corrected = (
                        float(report.loc[row_labels[0], 'NT Genus rM'])
                        - float(combo_corrections.loc[genus, 'subtract']))
                except (KeyError, TypeError, ValueError):
                    # No (or no usable) correction for this genus: keep as is.
                    continue
                report.loc[row_labels, 'NT Genus rM'] = max(corrected, 0)

            report.to_csv(output_dir + '/' + rna.split('/')[-1], index=False)

        except Exception:
            print('failed to read RNA file: ' + rna)
            print(sys.exc_info()[0])
            continue
            
            
        

./data/042018/BM_4/RNA/mBAL-202-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-202-DNA-TA1-B10.report.csv
./data/042018/BM_4/RNA/mBAL-205-RNA-TA1-QIA-61917.report.csv
./data/042018/BM_4/DNA/mBAL-205-DNA-TA1-B8.report.csv
./data/042018/BM_4/RNA/mBAL-208-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-208-DNA-TA1-B10.report.csv
./data/042018/BM_4/RNA/mBAL-209-RNA-TA1-QIA-61917.report.csv
./data/042018/BM_4/DNA/mBAL-209-DNA-TA1-B8.report.csv
./data/042018/BM_4/RNA/mBAL-211-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-211-DNA-TA1-B10.report.csv
./data/042018/BM_4/RNA/mBAL-212-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-212-DNA-TA1-ZYM-6717-B7.report.csv
./data/042018/BM_4/RNA/mBAL-213-RNA-TA1-QIA-62317.report.csv
./data/042018/BM_4/DNA/mBAL-213-DNA-TA1-B8.report.csv
./data/042018/BM_4/RNA/mBAL-215-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-215-DNA-TA1-B10.report.csv
./data/042018/BM_4/RNA/mBAL-216-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-216-DNA-TA1-B1

b'Skipping line 673: expected 14 fields, saw 15\n'


./data/042018/BM_4/RNA/mBAL-252-RNA-TA1-QIA-61917.report.csv
./data/042018/BM_4/DNA/mBAL-252-DNA-TA1-ZYM-6717-B7.report.csv


b'Skipping line 132: expected 14 fields, saw 15\n'


./data/042018/BM_4/RNA/mBAL-254-RNA-TA1-QIA-61217.report.csv
./data/042018/BM_4/DNA/mBAL-254-DNA-TA1-QIA-6517-B7.report.csv
./data/042018/BM_4/RNA/mBAL-256-RNA-TA2-QIA-60517.report.csv
./data/042018/BM_4/DNA/mBAL-256-DNA-TA2-QIA-6517-B7.report.csv
./data/042018/BM_4/RNA/mBAL-257-RNA-B5B.report.csv
./data/042018/BM_4/DNA/mBAL-257-DNA-TA1-ZYM-6717-B7.report.csv
./data/042018/BM_4/RNA/mBAL-258-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-258-DNA-TA1-B10.report.csv
./data/042018/BM_4/RNA/mBAL-261-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-261-DNA-TA2-QIA-6517-B7.report.csv
./data/042018/BM_4/RNA/mBAL-264-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-264-DNA-TA1-B10.report.csv
./data/042018/BM_4/RNA/mBAL-268-RNA-TA2-QIA-60517.report.csv
./data/042018/BM_4/DNA/mBAL-268-DNA-TA2-QIA-6517-B7.report.csv
./data/042018/BM_4/RNA/mBAL-270-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-270-DNA-TA-B12.report.csv
./data/042018/BM_4/RNA/mBAL-272-RNA-TA1-B10.report.csv
./data/04201

b'Skipping line 687: expected 14 fields, saw 15\n'


./data/042018/BM_4/RNA/mBAL-298-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-298-DNA-TA1-B8.report.csv


b'Skipping line 54: expected 14 fields, saw 15\nSkipping line 172: expected 14 fields, saw 15\n'


./data/042018/BM_4/RNA/mBAL-301-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-301-DNA-TA1-B10.report.csv
./data/042018/BM_4/RNA/mBAL-304-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-304-DNA-B6.report.csv
./data/042018/BM_4/RNA/mBAL-307-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-307-DNA-TA1-ZYM-6117-B7.report.csv
./data/042018/BM_4/RNA/mBAL-310-RNA-B5B.report.csv
./data/042018/BM_4/DNA/mBAL-310-DNA-TA1-ZYM-6117-B7.report.csv
./data/042018/BM_4/RNA/mBAL-311-RNA-B5B.report.csv
./data/042018/BM_4/DNA/mBAL-311-DNA-B6.report.csv
./data/042018/BM_4/RNA/mBAL-313-RNA-B5.report.csv
./data/042018/BM_4/DNA/mBAL-313-DNA-TA-041818.report.csv
./data/042018/BM_4/RNA/mBAL-314-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-314-DNA-B6.report.csv
./data/042018/BM_4/RNA/mBAL-315-RNA-TA1-B10.report.csv
./data/042018/BM_4/DNA/mBAL-315-DNA-B6.report.csv
./data/042018/BM_4/RNA/mBAL-316-RNA-B5.report.csv
./data/042018/BM_4/DNA/mBAL-316-DNA-TA1-ZYM-6117-B7.report.csv
./data/042018/BM_4/RNA