In [97]:
# Import Packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [101]:
# Read the three source tables from the data folder: the sample metadata
# (labels), the feature matrix, and the table of feature column names.
labels = pd.read_csv('Source Data//metadata_by_jgi_spid.tsv', sep='\t', header=0)
features = pd.read_csv('Source Data//features.csv', header=None)
columns = pd.read_csv('Source Data//feature_column_names.tsv', sep='\t', header=None)

# Report the dimensions of each table.
for caption, table in (
    ("Labels data shape: ", labels),
    ("features data shape: ", features),
    ("columns data shape: ", columns),
):
    print(caption, table.shape)

# Preview the first rows of the feature matrix (header not yet assigned).
display(features.head())

Labels data shape:  (1983, 22)
features data shape:  (1785, 16307)
columns data shape:  (16306, 3)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16297,16298,16299,16300,16301,16302,16303,16304,16305,16306
0,1000607,0,0,0,2649,14350,1225,0,0,2214,...,0,0,0,0,0,1,0,0,0,0
1,1000613,0,0,0,662,3805,293,0,0,515,...,0,0,0,0,0,0,0,0,0,0
2,1000616,0,0,0,1411,7054,515,0,0,997,...,0,0,0,0,0,0,0,0,0,0
3,1000619,0,0,0,53,394,18,0,4,39,...,0,0,0,0,0,0,0,0,0,0
4,1000622,0,0,0,665,5608,537,0,10,739,...,0,0,0,0,0,0,0,0,0,0


In [103]:
# Assemble the header for the features table: the first column of `columns`
# supplies the feature names, and the leading column of `features` (kept aside
# as `samples_ids`) is labelled 'sample_id'.
first_column = columns.iloc[:, 0].to_numpy().tolist()
samples_ids = features.iloc[:, 0].to_numpy().tolist()
column_names = ['sample_id'] + first_column

In [106]:
# Replace the default integer header with the assembled column names, then
# preview the relabelled table.
features.columns = pd.Index(column_names)
display(features.head())

Unnamed: 0,sample_id,PF00001.19,PF00002.22,PF00003.20,PF00004.27,PF00005.25,PF00006.23,PF00007.20,PF00008.25,PF00009.25,...,PF17216.1,PF17217.1,PF17218.1,PF17219.1,PF17220.1,PF17221.1,PF17222.1,PF17223.1,PF17224.1,PF17225.1
0,1000607,0,0,0,2649,14350,1225,0,0,2214,...,0,0,0,0,0,1,0,0,0,0
1,1000613,0,0,0,662,3805,293,0,0,515,...,0,0,0,0,0,0,0,0,0,0
2,1000616,0,0,0,1411,7054,515,0,0,997,...,0,0,0,0,0,0,0,0,0,0
3,1000619,0,0,0,53,394,18,0,4,39,...,0,0,0,0,0,0,0,0,0,0
4,1000622,0,0,0,665,5608,537,0,10,739,...,0,0,0,0,0,0,0,0,0,0


In [107]:
# Align the labels with the features data.
# Filtering with `isin` alone keeps the right rows but leaves them in the
# metadata file's own order, while X and Y are later paired up *by position*
# in train_test_split. Selecting by `samples_ids` both drops samples that have
# no features and puts the remaining rows in exactly the same order as the
# rows of `features`. A sample id missing from the metadata raises a KeyError
# here instead of silently shifting every label.
labels = (
    labels
    .set_index('jgi_spid', drop=False)  # keep 'jgi_spid' as a regular column too
    .loc[samples_ids]
    .reset_index(drop=True)             # positional index, matching `features`
)
# Guard against duplicated ids in the metadata, which would add extra rows.
assert labels['jgi_spid'].tolist() == samples_ids, \
    "labels rows are not one-to-one with features rows"
labels.shape

(1785, 22)

In [108]:
# Assign features and labels.
# X is every column except the sample identifier; selecting it this way avoids
# hard-coding the first and last feature column names, which would break if
# the feature-name file changes.
X = features.drop(columns='sample_id')
# Y is the EMPO level-3 class label of each sample.
Y = labels['EMPO_3']


In [116]:
# Split into training and test datasets with a 70/30 ratio (test_size=0.3),
# stratified by class label so every class keeps its proportion in both
# subsets. random_state is fixed so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1,
    stratify=Y,
)

In [118]:
# Tally how many samples of the entire dataset fall in each EMPO_3 class,
# listing classes in order of first appearance in `labels`.
empo3_labels = labels['EMPO_3'].unique()
Y_bin = [
    (class_name, Y.loc[Y == class_name].count())
    for class_name in empo3_labels
]
Y_bin

[('Hypersaline (saline)', 22),
 ('Water (saline)', 414),
 ('Soil (non-saline)', 433),
 ('Subsurface (non-saline)', 206),
 ('Water (non-saline)', 350),
 ('Sediment (saline)', 20),
 ('Animal proximal gut', 55),
 ('Plant rhizosphere', 137),
 ('Animal corpus', 6),
 ('Aerosol (non-saline)', 11),
 ('Sediment (non-saline)', 105),
 ('Plant surface', 8),
 ('Surface (non-saline)', 7),
 ('Surface (saline)', 9),
 ('Plant corpus', 2)]

In [119]:
# Count labels by class in the training dataset.
# The mask is built from Y_train itself; a mask built from the full Y only
# works through implicit index alignment and fails if the indexes diverge.
y_train_bin = [(i, Y_train[Y_train == i].count()) for i in empo3_labels]
y_train_bin

[('Hypersaline (saline)', 15),
 ('Water (saline)', 290),
 ('Soil (non-saline)', 303),
 ('Subsurface (non-saline)', 144),
 ('Water (non-saline)', 245),
 ('Sediment (saline)', 14),
 ('Animal proximal gut', 39),
 ('Plant rhizosphere', 96),
 ('Animal corpus', 4),
 ('Aerosol (non-saline)', 8),
 ('Sediment (non-saline)', 73),
 ('Plant surface', 6),
 ('Surface (non-saline)', 5),
 ('Surface (saline)', 6),
 ('Plant corpus', 1)]

In [120]:
# Count labels by class in the test dataset.
# The mask is built from Y_test itself; a mask built from the full Y only
# works through implicit index alignment and fails if the indexes diverge.
y_test_bin = [(i, Y_test[Y_test == i].count()) for i in empo3_labels]
y_test_bin

[('Hypersaline (saline)', 7),
 ('Water (saline)', 124),
 ('Soil (non-saline)', 130),
 ('Subsurface (non-saline)', 62),
 ('Water (non-saline)', 105),
 ('Sediment (saline)', 6),
 ('Animal proximal gut', 16),
 ('Plant rhizosphere', 41),
 ('Animal corpus', 2),
 ('Aerosol (non-saline)', 3),
 ('Sediment (non-saline)', 32),
 ('Plant surface', 2),
 ('Surface (non-saline)', 2),
 ('Surface (saline)', 3),
 ('Plant corpus', 1)]