Preprocessing data to fit Microbiome Analyst formatting

In [1]:
import pandas as pd

In [6]:
def count_process(filename, outputfilename):
    df = pd.read_csv('../metag_topic_modeling/data_sets/' + filename + '.csv')
    
    # transpose df to match desired format
    df = df.T
    df.columns = ['#NAME'] + list(df.columns[1:])
    df.to_csv('../metag_topic_modeling/data_sets/' + outputfilename + '.csv', index=False)

In [15]:
def meta_process(filename, outputfilename):
    df = pd.read_csv('../metag_topic_modeling/data_sets/' + filename + '.csv')
    
    # keep only the necessary columns
    df = df[['PSN', 'HMP_BODY_SITE']]

    # rename the columns
    df.rename(columns={'PSN': '#NAME', 'HMP_BODY_SITE': 'SampleType'}, inplace=True)
    df.to_csv('../metag_topic_modeling/data_sets/' + outputfilename + '.csv', index=False)

In [18]:
def taxonomy_process(filename, outputfilename):
    df = pd.read_csv('../metag_topic_modeling/data_sets/' + filename + '.csv')
    
    # rename the columns
    df.rename(columns={'OTU_ID': '#TAXONOMY'}, inplace=True)
    df.to_csv('../metag_topic_modeling/data_sets/' + outputfilename + '.csv', index=False)

In [9]:
def family_process(filename, outputfilename):
    df = pd.read_csv('../metag_topic_modeling/data_sets/' + filename + '.csv')

    # Remove the 'GENUS' and 'OTU_ID' columns
    df = df.drop(columns=['GENUS', 'OTU_ID'])

    # Move 'FAMILY' to the first column
    cols = ['FAMILY'] + [col for col in df.columns if col != 'FAMILY']
    df = df[cols]

    # Remove duplicate rows based on the 'FAMILY' column
    df = df.drop_duplicates(subset='FAMILY', keep='first')

    df.rename(columns={'FAMILY': '#TAXONOMY'}, inplace=True)
    df.to_csv('../metag_topic_modeling/data_sets/' + outputfilename + '.csv', index=False)

In [None]:
count_process('HMP_V13_OTU_counts', 'OTU_processed')

In [7]:
count_process('HMP_V13_family_counts', 'family_processed')

In [17]:
meta_process('HMP_V13_participant_data', 'meta_data_processed')

In [20]:
taxonomy_process('HMP_V13_taxonomy_fix', 'taxonomy_processed')

In [11]:
family_process('HMP_V13_taxonomy_fix', 'family_taxa_processed')

In [12]:
meta_df = pd.read_csv('../metag_topic_modeling/data_sets/meta_data_processed.csv')
data_df = pd.read_csv('../metag_topic_modeling/data_sets/family_processed.csv')

print(meta_df.shape)
print(data_df.shape)

(2898, 2)
(2898, 161)
