In [1]:
import pandas as pd

from pandas import DataFrame as df
import numpy as np
import csv
import pickle


In [19]:
# Load the un-trimmed OTU count table (tab-separated; the first column becomes the row index).
# Later cells sum over axis 0 to get per-sample totals, so columns are treated as samples.
otu_notrim = pd.read_csv(
    '../data/AG_new/03-otus/03-otus/notrim/gg-13_8-97-percent/otu_table_notrim.txt',
    index_col=0,
    sep='\t',
)
print(otu_notrim.shape)

(9162, 7744)


In [43]:
# Display the dimensions (rows, columns) of the loaded OTU table.
# NOTE(review): the output recorded for this cell differs from the shape printed by the
# loading cell, and the execution counts are out of order — re-run from a fresh kernel
# to confirm which table is actually in memory.
otu_notrim.shape

(35511, 19491)

In [33]:
# Total read count per sample (column sums of the OTU table).
# The cells below use these totals to keep only samples with more than 5000 reads.
sample_sums = otu_notrim.sum(axis=0)

In [41]:
# Preview the per-sample boolean mask: True where the sample has more than 5000 reads.
(sample_sums > 5000).values

array([ True,  True,  True, ..., False, False, False])

In [42]:
# Keep only the samples (columns) whose total read count exceeds 5000.
deep_enough = (sample_sums > 5000).values
otu_filter = otu_notrim.loc[:, deep_enough]
print(otu_filter.shape)

(35511, 15475)


In [44]:
# Drop taxa (rows) that have no reads left in the retained samples.
taxa_sums = otu_filter.sum(axis=1)
has_reads = (taxa_sums > 0).values
otu_filter = otu_filter.loc[has_reads, :]
print(otu_filter.shape)

(35275, 15475)


In [45]:
# Read in the sample metadata (mapping file) so we can decide which samples to include.
mapping = pd.read_csv("../data/AG_new/AG_mapping.txt", sep = "\t", index_col=0)

# Align the metadata rows with the samples present in the OTU table.
# `.reindex` is used instead of `.loc[list]`: `.loc` with any label missing from the index
# raises KeyError in current pandas, while `.reindex` keeps every requested sample and
# fills samples that have no metadata with NaN (what the deprecated `.loc` call did).
map_clean = mapping.reindex(otu_filter.columns.values)

# Put both tables in the same sorted sample order, so row k of map_clean describes
# column k of otu_clean.
map_clean = map_clean.reindex(sorted(map_clean.index.values))
otu_clean = otu_filter.reindex(sorted(otu_filter.columns.values), axis = 1)

  interactivity=interactivity, compiler=compiler, result=result)
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  This is separate from the ipykernel package so we can avoid doing imports until


In [46]:
# Compare dimensions: map_clean has one row per sample (it was indexed by the OTU table's
# column labels), otu_clean has one column per sample.
print(map_clean.shape)
otu_clean.shape

(15475, 527)


(35275, 15475)

# Decide which samples to include in the embedding calculations, and name the output files accordingly

In [48]:
#Select all body sites (alternative file names; currently disabled)

#otu_table_name = "data/AG_new/otu_filtered_AG_02perc_allbodysites.csv"
#otu_train_table_name = "data/AG_new/otu_filtered_train_AG_02perc_allbodysites.csv"
#otu_test_table_name = "data/AG_new/otu_filtered_test_AG_02perc_allbodysites.csv"
#test_sample_file = "data/AG_new/AG_test_samples_allbodysites.obj"

# Sanity check: the metadata rows and the OTU columns must list the same samples in the
# same order, because later cells select OTU columns with masks built from the metadata.
# The check is enforced with an assertion instead of relying on someone comparing the
# displayed count with the number of samples by eye.
n_samples = otu_clean.shape[1]
n_aligned = sum(m == o for m, o in zip(map_clean.index.values, otu_clean.columns.values))
assert map_clean.shape[0] == n_samples and n_aligned == n_samples, (
    "map_clean rows and otu_clean columns are not in the same sample order")
n_aligned

15475

In [235]:
# File locations for the feces-only run.
# NOTE(review): "07perc" presumably refers to the 0.0007 prevalence threshold used later
# in this notebook — confirm.
# The three OTU table names are not referenced by any cell visible in this notebook.
otu_table_name = "../data/AG_new/feces/otu_filtered_AG_07perc_feces.csv"
otu_train_table_name = "../data/AG_new/feces/otu_filtered_train_AG_07perc_feces.csv"
otu_test_table_name = "../data/AG_new/feces/otu_filtered_test_AG_07perc_feces.csv"
# Pickled collection of held-out (test) sample IDs; read with pickle.load further down.
test_sample_file = "../data/AG_new/feces/AG_test_samples_feces.obj"
# Tab-separated GloVe input file written by the last cell of the notebook.
glove_output_file = "../data/AG_new/feces/glove_input_07perc_feces_sampledata.txt"

In [199]:
# Restrict both tables to samples whose BODY_SITE is feces.
# The mask is indexed by sample ID, so it selects OTU columns and metadata rows alike.
is_feces = map_clean["BODY_SITE"] == "UBERON:feces"
otu_use = otu_clean.loc[:, is_feces]
map_use = map_clean.loc[is_feces, :]



In [200]:
def cleanBySampleData(otu_clean, map_clean, cat_criteria, number_criteria):
    """Keep only the samples that have usable metadata for every requested field.

    Parameters
    ----------
    otu_clean : DataFrame with one column per sample, in the same order as the rows
        of ``map_clean``.
    map_clean : DataFrame of sample metadata, one row per sample.
    cat_criteria : list of categorical column names; a value counts as usable when it
        is a string other than "Unknown", "Unspecified", "other" or "unspecified".
    number_criteria : list of numeric column names; a value counts as usable when
        ``hf.is_number`` accepts it.

    Returns
    -------
    (otu_keep, map_keep) : ``otu_keep`` is the filtered OTU table transposed (one row
        per retained sample); ``map_keep`` holds the retained samples restricted to
        the ``cat_criteria + number_criteria`` columns.

    Prints the number of samples before filtering and after each filtering stage.
    """
    uninformative = ("Unknown", "Unspecified", "other", "unspecified")

    def _is_informative(value):
        # Non-string entries (e.g. NaN for a missing field) are never usable.
        return isinstance(value, str) and value not in uninformative

    keep = [True] * map_clean.shape[0]
    print("Samples originally: " + str(sum(keep)))

    # A sample survives only if every categorical field carries a usable label.
    for criteria in cat_criteria:
        has_label = map(_is_informative, map_clean[criteria])
        keep = [(so_far and ok) for (so_far, ok) in zip(keep, has_label)]
    print("Samples after categorical filter: " + str(sum(keep)))

    # ... and every numeric field holds something hf.is_number accepts.
    for criteria in number_criteria:
        is_numeric = map(hf.is_number, map_clean[criteria])
        keep = [so_far and ok for (so_far, ok) in zip(keep, is_numeric)]
    print("Samples after numerical filter: " + str(sum(keep)))

    wanted_columns = cat_criteria + number_criteria
    otu_keep = otu_clean.loc[:, keep]
    map_keep = map_clean.loc[keep, wanted_columns]
    # Callers receive the OTU table with samples as rows.
    otu_keep = otu_keep.T
    return (otu_keep, map_keep)

In [201]:
# Metadata fields for which every retained sample must have a usable value.
# No numeric fields are required in this run.
number_criteria = []
cat_criteria = [
    "IBD",
    "EXERCISE_FREQUENCY",
    "SEX",
    "ONE_LITER_OF_WATER_A_DAY_FREQUENCY",
    "SEAFOOD_FREQUENCY",
    "PROBIOTIC_FREQUENCY",
    "OLIVE_OIL",
    "FRUIT_FREQUENCY",
    "SLEEP_DURATION",
    "SUGAR_SWEETENED_DRINK_FREQUENCY",
    "MILK_CHEESE_FREQUENCY",
    "RED_MEAT_FREQUENCY",
    "MEAT_EGGS_FREQUENCY",
    "VEGETABLE_FREQUENCY",
]

# NOTE(review): this overwrites otu_use with the transposed table (samples as rows) that
# cleanBySampleData returns, so re-running this cell without first re-running the
# feces-selection cell feeds the already-transposed table back in.
otu_use, map_use = cleanBySampleData(otu_use, map_use, cat_criteria, number_criteria)
# Make sure the sample IDs used as row labels are plain strings.
otu_use.index = otu_use.index.map(str)

Samples originally: 9132
Samples after categorical filter: 5980
Samples after numerical filter: 5980


In [202]:
# Prevalence filter: keep taxa detected in at least `thresh` samples.
# 0.0007 is a fraction of the samples, i.e. 0.07 % (not 1 %); in the recorded run with
# 5980 samples the cut-off is 4.186, so a taxon must be present in 5 or more samples.
min_prevalence = 0.0007
thresh = min_prevalence * otu_use.shape[0]
print("Only keep taxa present in greater than " + str(thresh) + " samples")
binary = otu_use > 0  # presence/absence: rows are samples, columns are taxa
keep = binary.sum(axis = 0) >= thresh  # one boolean per taxon (the comparison is >=)
binary = binary.loc[:, keep]
print("We will keep " + str(keep.sum()) +" taxa")


Only keep taxa present in greater than 4.186 samples
We will keep 9230 taxa


In [144]:
# Dimensions of the presence/absence table after the prevalence filter
# (rows are samples, columns are the retained taxa).
binary.shape

(5980, 9230)

In [203]:
# Load the held-out (test) sample IDs.
# The `with` block closes the file even if unpickling fails.
# NOTE(security): pickle.load can execute arbitrary code contained in the file — only
# load pickle files that were produced by a trusted source.
with open(test_sample_file, "rb") as f:
    test_samples = pickle.load(f)


In [204]:
#Break out train and test
# Restrict the held-out IDs to samples that survived the metadata filter.
keep = [(sample in otu_use.index.values) for sample in test_samples]
test_samples = test_samples[keep]

# One boolean per row of `binary`: True when the sample is not held out.
# `binary` and `otu_use` share the same row labels, so one mask serves both tables.
is_train = [(sample not in test_samples) for sample in binary.index.values]
binary_train = binary.loc[is_train, :]
otu_train = otu_use.loc[is_train, :]
otu_test = otu_use.loc[test_samples, :]
map_train = map_use.loc[otu_train.index.values, :]
map_test = map_use.loc[test_samples, :]

# NOTE(review): in the recorded output the train and test row counts (5068 + 983) do not
# add up to the 5980 filtered samples — check test_samples for duplicated IDs.
for frame in (binary_train, otu_train, otu_test, map_train, map_test):
    print(frame.shape)

(5068, 9230)
(5068, 35275)
(983, 35275)
(5068, 14)
(983, 14)


In [205]:
#Change sample data to one-hot
# NOTE(review): `hf` is not imported in any cell visible here — it has to be in the
# kernel namespace already for this cell to run.
map_train_num, map_test_num = hf.makeMappingNumeric(map_train, map_test, number_criteria, cat_criteria)

#Transform to one hot: each metadata field becomes a group of indicator columns
#prefixed with the field name
train_dummies = [pd.get_dummies(map_train_num[field], prefix = field) for field in map_train_num]
map_train_1hot = pd.concat(train_dummies, axis=1)

# NOTE(review): train and test are encoded independently, so their column sets can
# differ when a value occurs in only one of the two splits.
test_dummies = [pd.get_dummies(map_test_num[field], prefix = field) for field in map_test_num]
map_test_1hot = pd.concat(test_dummies, axis=1)

EXERCISE_FREQUENCY
ONE_LITER_OF_WATER_A_DAY_FREQUENCY
SEAFOOD_FREQUENCY
PROBIOTIC_FREQUENCY
OLIVE_OIL
FRUIT_FREQUENCY
SUGAR_SWEETENED_DRINK_FREQUENCY
MILK_CHEESE_FREQUENCY
RED_MEAT_FREQUENCY
MEAT_EGGS_FREQUENCY
VEGETABLE_FREQUENCY
SLEEP_DURATION
SEX
IBD


In [214]:
# Put the one-hot metadata columns next to the taxa presence/absence columns, so each
# training sample is described by its taxa plus its metadata indicators.
concat = pd.concat((binary_train, map_train_1hot), axis = 1)
for frame in (binary_train, map_train_1hot, concat):
    print(frame.shape)

(5068, 9230)
(5068, 64)
(5068, 9294)


In [233]:
# Peek at the column names that are set for one sample, then list all column names.
# The row position is given explicitly: the previous version used a loop variable `i`
# left over from another cell, which is undefined on a fresh kernel (NameError).
taxa_share = concat.columns[concat.iloc[0,:] == True]
concat.columns

Index([                1081058,                  359105,
                       1134664,                  311173,
                       2700687,                  842284,
                        546305,                 3450454,
                       1909053,                  362389,
       ...
       'MEAT_EGGS_FREQUENCY_0', 'MEAT_EGGS_FREQUENCY_1',
       'MEAT_EGGS_FREQUENCY_2', 'MEAT_EGGS_FREQUENCY_3',
       'MEAT_EGGS_FREQUENCY_4', 'VEGETABLE_FREQUENCY_0',
       'VEGETABLE_FREQUENCY_1', 'VEGETABLE_FREQUENCY_2',
       'VEGETABLE_FREQUENCY_3', 'VEGETABLE_FREQUENCY_4'],
      dtype='object', length=9294)

In [234]:
# For every sample (row of `concat`), collect the names of the columns that are set:
# the taxa present in the sample plus its one-hot metadata indicators.
file_text = []
n_rows = concat.shape[0]
for i in range(n_rows):
    is_set = concat.iloc[i,:] == True
    taxa_share = concat.columns[is_set]
    file_text.append(taxa_share)

In [236]:
# Number of entries per sample, and which sample has the most.
# Each entry list holds every column of `concat` that is set for the sample, so the
# counts include the one-hot metadata columns as well as taxa.
lengths = [len(entries) for entries in file_text]
max_val = max(lengths)
max_ind = np.argmax(lengths)
print("Max number of taxa in any given sample is: " + str(max_val))
print("Found at " + str(max_ind))


Max number of taxa in any given sample is: 1784
Found at 1746


In [241]:
# Spot-check: first five column names recorded for the second sample.
file_text[1][0:5]

Index([4480529, 4442508, 4409730, 4450194, 4467447], dtype='object')

In [239]:
#Write Glove input file
# One line per sample: the tab-separated names of all columns set for that sample.
# `escapechar` is left at its default (None): an empty string is rejected by the csv
# module on Python >= 3.11, and older versions treated it the same as "no escape character".
# With QUOTE_NONE and no escape character, the writer raises csv.Error if a name contains
# the delimiter instead of silently altering it.
with open(glove_output_file, mode = 'w', newline='') as file:
    writer = csv.writer(file, delimiter = "\t", quoting = csv.QUOTE_NONE)
    for taxa_list in file_text:
        writer.writerow(taxa_list)
# No explicit close() is needed: the `with` block has already closed the file.
