In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [16]:
# 4/30/18: Fixing DADA biom files

# Load biom (skiprows=1 skips the first line of the file so the next line supplies the header):
biom_file = pd.read_csv('../Seq_data/Updated_files/biom/filtered_biom.txt', sep='\t', skiprows=1, index_col=0)

# Samples to discard, based on Loren's table:
names_to_filter = ['NA.Plate1B3', 'NA.Plate1D1','NA.Plate1D3','NA.Plate3C6', 'W1.D7.A', 
                   'W2.D3.rB', 'W3.D1.rB', 'W3.D4.rB', 'W3.D7.A', 'W3.D7.rB','W4.D2.rB', 'W4.D3.rB', 'W4.D4.A', 'W4.D4.rB', 'W4.D7.rB', 'W7.D6.rA', 'W7.D6.rB']

# Drop columns (with pandas' default errors='raise' a name missing from the table raises KeyError):
dada_biom_dropped_missing = biom_file.drop(columns=names_to_filter)


def _normalize_sample_name(name):
    """Upper-case a sample ID and strip the stray 'r' that prefixes its last field.

    Example: 'W3.D7.rB' -> 'W3.D7.B'.

    Only an 'R' that starts the final dot-separated field (and is followed by at
    least one more character) is removed. A blanket ``replace('R', '')`` would
    also delete any legitimate 'R' elsewhere in the ID.
    """
    head, dot, tail = name.upper().rpartition('.')
    if dot and len(tail) > 1 and tail.startswith('R'):
        tail = tail[1:]
    return head + dot + tail


# update column names cuz the old ones have random 'r' and capitalization:
col_list = dada_biom_dropped_missing.columns.values.tolist()
new_cols = [_normalize_sample_name(x) for x in col_list]
dada_biom_dropped_missing.columns = new_cols

In [29]:
# Read the level-7 summary and flip it in one step, so the file's column headers
# become the row labels; the final expression displays those labels.
lvl7_summary = pd.read_csv(
    '../Seq_data/Updated_files/biom/biom_summary/level-7.csv',
    index_col=0,
).T
lvl7_summary.index

Index(['Unassigned;__;__;__;__;__;__', 'k__Archaea;__;__;__;__;__;__',
       'k__Archaea;p__Crenarchaeota;__;__;__;__;__',
       'k__Archaea;p__Crenarchaeota;c__;o__;f__;g__;s__',
       'k__Archaea;p__Crenarchaeota;c__MBGA;o__;f__;g__;s__',
       'k__Archaea;p__Crenarchaeota;c__MBGA;o__NRP-J;f__;g__;s__',
       'k__Archaea;p__Crenarchaeota;c__MBGB;o__;f__;g__;s__',
       'k__Archaea;p__Crenarchaeota;c__MCG;o__;f__;g__;s__',
       'k__Archaea;p__Crenarchaeota;c__MCG;o__pGrfC26;f__;g__;s__',
       'k__Archaea;p__Crenarchaeota;c__Thaumarchaeota;o__AK31;f__;g__;s__',
       ...
       'Pb_Con', 'CU_Conc', 'Fe_Conc', 'Zn_Conc', 'Ca_Conc', 'K_conc',
       'Mg_Conc', 'P_Conc', 'Na_Conc', 'Description'],
      dtype='object', length=1031)

In [2]:
# I need to edit my tax and env tables so they exactly match the order and format for phyloseq.
# This cell splits each 'Taxon' string into one value per taxonomic rank.
# NOTE: fix_greengenes_missing_data is defined in a later cell of this notebook, so on a
# fresh kernel that cell has to be run before this one.
tax_table = pd.read_csv('../Seq_data/Updated_files/IBP_taxonomy.txt', sep='\t', index_col=0)
tax_series = tax_table['Taxon']

columns = ['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']

rows_list = []

# Series.items() is the supported spelling; Series.iteritems() was removed in pandas 2.0.
for index, value in tax_series.items():
    # Split the lineage on spaces and drop the ';' separators left on each term.
    taxa = [x.strip(';') for x in value.split(' ')]
    # Pad lineages that stop before the last rank with blank placeholders.
    taxa = fix_greengenes_missing_data(taxa)

    # One record per OTU: its ID plus one entry per rank named in `columns`.
    row_dict = {'OTU': index}
    for t_ind in range(len(columns)):
        row_dict[columns[t_ind]] = taxa[t_ind]

    rows_list.append(row_dict)
  

In [4]:
def fix_greengenes_missing_data(taxa_list):
    """Pad a truncated lineage with blank rank placeholders, up to seven ranks.

    Parameters
    ----------
    taxa_list : list of str
        Rank terms ordered from kingdom downwards. The list is extended in place.

    Returns
    -------
    list of str
        The same list object. If it held fewer than seven terms, the blank
        placeholders for the missing trailing ranks ('c__', ..., 's__') have been
        appended; a list that already has seven or more terms is returned unchanged.
    """
    blank_taxa_terms = ['k__', 'p__', 'c__', 'o__', 'f__', 'g__', 's__']
    # Slice from the first missing rank. A negative offset breaks on a complete
    # lineage: blank_taxa_terms[-0:] is the whole list, which would append seven
    # extra placeholders instead of none.
    taxa_list.extend(blank_taxa_terms[len(taxa_list):])
    return taxa_list

In [5]:
# Make a dataframe from rows (one row per OTU, one column per rank) and keep only the
# OTUs that are present in the biom table.
# NOTE(review): the ">= 10 counts" filter is presumably applied upstream when
# filtered_biom.txt is produced; nothing in this cell filters on counts. TODO confirm.
taxa_df = pd.DataFrame(rows_list).set_index('OTU')[columns]

# Select by the biom's own labels so the rows come out in biom order. A boolean
# isin() mask on taxa_df would keep the taxonomy file's order instead. The labels are
# passed as a plain list so the result keeps taxa_df's 'OTU' index name.
biom_otus = dada_biom_dropped_missing.index
shared_otus = biom_otus[biom_otus.isin(taxa_df.index)].tolist()
taxa_df_subset = taxa_df.loc[shared_otus]

5000
10000
15000
20000
25000
30000
35000
40000
45000


In [26]:
# Now do the same for environmental variables: keep only the samples that are columns
# of the biom table, in the biom's column order.
env_data = pd.read_csv('../Metadata/ibp_metadata_4.17.18.txt', sep='\t', index_col=0)

# Select by the biom's sample labels so the rows follow the biom's column order. A
# boolean isin() mask on env_data would keep the metadata file's order instead. The
# labels are passed as a plain list so the result keeps env_data's index name.
biom_samples = dada_biom_dropped_missing.columns
shared_samples = biom_samples[biom_samples.isin(env_data.index)].tolist()
env_data_subset = env_data.loc[shared_samples]

In [37]:
# Save the three tables as tab-separated text, in the same order as before:
# metadata, taxonomy, then the biom table.
tables_to_write = [
    (env_data_subset, '../Seq_data/Updated_files/env_data_ordered_like_biom.txt'),
    (taxa_df_subset, '../Seq_data/Updated_files/taxa_file_ordered_like_biom.txt'),
    (dada_biom_dropped_missing, '../Seq_data/Updated_files/biom_dropped_missing.txt'),
]
for table, out_path in tables_to_write:
    table.to_csv(out_path, sep='\t')

In [38]:
# Preview only the first rows instead of rendering the whole metadata table.
env_data_subset.head(10)

Unnamed: 0_level_0,BarcodeSequence,LinkerPrimerSequence,Well,WLWNum,MicroNum,cum_depth,z,Soil_type_color_id,Soil_type_and_color,USDA_soil_series,...,Pb,Cu,Fe,Zn,Ca,K,Mg,P,Na,Description
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
W1.D1.A,ATGTTAGGGAAT,GTGTGYCAGCMGCCGCGGTAA,A1,1,1,0.027,184.849,1.0,Black CL,Selma loam,...,13.056,7.003,1497.839,59.173,307.061000,307.061000,782.507,148.882,37.823000,W1.D1.A
W1.D1.B,GTTCGGTGTCCA,GTGTGYCAGCMGCCGCGGTAA,A1,1,1,0.027,184.849,1.0,Black CL,Selma loam,...,13.056,7.003,1497.839,59.173,307.061000,307.061000,782.507,148.882,37.823000,W1.D1.A
W1.D10.A,GCGTGGTCATTA,GTGTGYCAGCMGCCGCGGTAA,A10,1,10,1.000,183.876,9.0,Gray CH,Selma loam,...,6.623,11.617,6232.421,35.205,2091.943000,2091.943000,4268.215,282.540,93.032000,W1.D10.A
W1.D10.B,ATTTGAAGAGGT,GTGTGYCAGCMGCCGCGGTAA,A10,1,10,1.000,183.876,9.0,Gray CH,Selma loam,...,6.623,11.617,6232.421,35.205,2091.943000,2091.943000,4268.215,282.540,93.032000,W1.D10.A
W1.D11.A,GATTCTTTAGAT,GTGTGYCAGCMGCCGCGGTAA,A11,1,11,1.100,183.776,8.0,Brown Sandy CL,Selma loam,...,4.199,8.925,4278.976,25.024,2357.742000,2357.742000,4101.744,161.330,96.229000,W1.D11.A
W1.D12.A,CGGATCTAGTGT,GTGTGYCAGCMGCCGCGGTAA,A12,1,12,1.200,183.676,8.0,Brown Sandy CL,Selma loam,...,5.920,9.459,7419.726,26.475,3898.543000,3898.543000,4436.305,260.419,134.976000,W1.D12.A
W1.D2.A,GCTAGTTATGGA,GTGTGYCAGCMGCCGCGGTAA,A2,1,2,0.164,184.712,1.0,Black CL,Selma loam,...,9.425,8.761,2542.052,30.112,299.339000,299.339000,882.723,125.267,41.396000,W1.D2.A
W1.D2.B,CTACCGATTGCG,GTGTGYCAGCMGCCGCGGTAA,A2,1,2,0.164,184.712,1.0,Black CL,Selma loam,...,9.425,8.761,2542.052,30.112,299.339000,299.339000,882.723,125.267,41.396000,W1.D2.A
W1.D3.B,GAGAGTCCACTT,GTGTGYCAGCMGCCGCGGTAA,A3,1,3,0.300,184.576,1.0,Black CL,Selma loam,...,4.870,8.809,2348.802,14.429,1093.514000,1093.514000,908.794,104.192,34.887000,W1.D3.A
W1.D4.A,ATTTGGCTCTTA,GTGTGYCAGCMGCCGCGGTAA,A4,1,4,0.393,184.483,1.0,Black CL,Selma loam,...,3.685,5.739,3228.200,12.863,961.704000,961.704000,931.216,77.119,39.043000,W1.D4.A


In [41]:
# Flip the biom table so its current column labels become the row labels.
dada_biom_transpose = dada_biom_dropped_missing.T

In [43]:
# Save the transposed biom table as tab-separated text.
dada_biom_transpose.to_csv(
    '../Seq_data/Updated_files/dada_biom_transpose.txt',
    sep='\t',
)