# Example modeling process

### Imports

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from helpers import (get_training_observations, 
                     get_training_labels, 
                     get_protein_proportions)
import pandas as pd
import numpy as np

# Model specific imports
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.metrics import accuracy_score
from sklearn.tree import plot_tree

### Load training data and labels

In [2]:
x_train = get_training_observations()
y_train = get_training_labels()

Getting all training observations from 'metagenome_classification.db'...
Getting all training labels from 'metagenome_classification.db'...


In [3]:
x_train.head(10)

index,PF00001.19,PF00002.22,PF00003.20,PF00004.27,PF00005.25,PF00006.23,PF00007.20,PF00008.25,PF00009.25,PF00010.24,...,PF17216.1,PF17217.1,PF17218.1,PF17219.1,PF17220.1,PF17221.1,PF17222.1,PF17223.1,PF17224.1,PF17225.1
0,0.0,0.0,0.0,0.00402,0.006243,0.001039,0.0,0.0,0.003265,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.003575,0.013739,0.001026,0.0,0.0,0.002235,7e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.37688e-07,2.18844e-07,0.0,0.001619,0.016218,0.000916,0.0,2e-06,0.001711,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.002737,0.019874,0.001785,0.0,0.0,0.00357,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.001992,0.012389,0.001154,0.0,1.3e-05,0.002286,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.001684,0.016435,0.000931,0.0,0.0,0.001806,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.001404,0.023274,0.000845,0.0,0.0,0.001497,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.002301,0.015752,0.000925,0.0,0.0,0.001843,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.001885,0.014797,0.001012,0.0,0.0,0.001689,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.004574,0.013601,0.000995,0.0,0.0,0.002402,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
y_train.head()

index,EMPO_1,EMPO_2,EMPO_3
0,Free-living,Saline,Hypersaline (saline)
1,Free-living,Saline,Water (saline)
2,Host-associated,Plant,Plant rhizosphere
3,Free-living,Non-saline,Soil (non-saline)
4,Free-living,Saline,Water (saline)


### Data manipulation
Validation splits, dimensionality reduction, etc

In [5]:
# Split into train/validation if not CV
X_tr, X_val, Y_tr, Y_val = train_test_split(x_train, y_train, test_size=0.2) #, random_state=1)

### Model training

In [12]:
model1 = dtc(criterion = 'entropy', max_depth = 10)
model1.fit(X_tr, Y_tr['EMPO_1'])

model2 = dtc(criterion = 'entropy', max_depth = 10)
model2.fit(X_tr, Y_tr['EMPO_2'])

model3 = dtc(criterion = 'entropy', max_depth = 10)
model3.fit(X_tr, Y_tr['EMPO_3'])

model3_100 = dtc(criterion = 'entropy', max_depth = 100)
model3_100.fit(X_tr, Y_tr['EMPO_3'])

### Model evaluation

In [11]:
# Scoring model
pred_model1 = model1.predict(X_val)
acc1 = accuracy_score(Y_val['EMPO_1'], pred_model1)
print(f"Accuracy of model1 is {acc1}")

pred_model2 = model2.predict(X_val)
acc2 = accuracy_score(Y_val['EMPO_2'], pred_model2)
print(f"Accuracy of model2 is {acc2}")

pred_model3 = model3.predict(X_val)
acc3 = accuracy_score(Y_val['EMPO_3'], pred_model3)
print(f"Accuracy of model3 is {acc3}")

pred_model3_100 = model3.predict(X_val)
acc3_100 = accuracy_score(Y_val['EMPO_3'], pred_model3_100)
print(f"Accuracy of model3 is {acc3_100}")

Accuracy of model1 is 0.956
Accuracy of model2 is 0.924
Accuracy of model3 is 0.848


### Retrain best model
After experimenting with models, retrain your favorite model using entire training set (including validation) before saving

In [8]:
model_final = dtc(criterion = 'entropy', max_depth = 10)
model_final.fit(x_train, y_train['EMPO_3'])

### Save fitted model

In [9]:
# Save best model as joblib or pkl file to 'model_joblibs' folder
from joblib import dump

dump(clf, '../model_joblibs/example_multiclass_logreg.joblib')

['../model_joblibs/example_multiclass_logreg.joblib']