# Using TPOT to Identify the Best Model for VOC Data

## Importing Relevant Packages and Datasets

In [95]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier

In [2]:
# Load the "Wide" sheet of the raw workbook into a DataFrame.
data = pd.read_excel(
    io='data/raw_data.xlsx',
    sheet_name="Wide",
)

## Restructuring the data

The data contains null values in its columns, and the real column names are held in a row of the dataframe rather than in the header.
That row needs to be extracted and set as the column names for the remaining data.

In [6]:
# Row at position 2 holds the real column names.
columns = data.iloc[2, :]

In [7]:
# Everything from position 3 onwards is treated as the actual data rows.
data_content = data.iloc[3:, :]

In [9]:
# Use the extracted header row as the column labels of the data rows.
data_content.columns = columns

## Creating test and train features

The target label is the strain, and the features used to train the model exclude the strain, the species and the sample.
The species and the sample are removed from the training features due to their link to the strain.

In [114]:
# The first three columns are kept aside as target/identifier columns;
# every column after them is used as a model feature.
id_column_count = 3
targets = data_content.iloc[:, :id_column_count]
features = data_content.iloc[:, id_column_count:]
# The strain label is read from the second of the identifier columns.
strain_targets = targets.iloc[:, 1]

In [116]:
# Replace missing feature values with zero.
features = features.fillna(value=0)

In [117]:
## checking that every feature column has a numeric dtype
numeric_features = features.select_dtypes(include=np.number)
numeric_features.shape[1] == features.shape[1]

True

In [118]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
train_features, test_features, train_targets, test_targets = train_test_split(
    features,
    strain_targets,
    random_state=42,
    test_size=0.33,
)

In [119]:
# Report (rows, columns) for the train split, then the test split.
for split_features in (train_features, test_features):
    print(split_features.shape)

(56, 67)
(28, 67)


## Converting the training and testing targets to numpy arrays

In [131]:
# Copy each split's targets into a flat 1-D NumPy array (train first, then test).
training_targets, testing_targets = (
    np.array(split_targets).reshape(-1)
    for split_targets in (train_targets, test_targets)
)

## Adjusting the target values to be integers

In [140]:
strain_integer_conversion = {
    'SA_A' : 1,
    'SA_B' : 2,
    'PA_A' : 3,
    'PA_B' : 4,
    'EC_A' : 5,
    'EC_B' : 6
}


def encode_strains(labels, conversion):
    """Return `labels` as a new integer NumPy array using `conversion`.

    Parameters
    ----------
    labels : iterable
        Strain names (keys of `conversion`) and/or integer codes that are
        already values of `conversion`.
    conversion : dict
        Mapping from strain name to integer code.

    Returns
    -------
    numpy.ndarray
        1-D array with an integer dtype, one code per input label.

    Raises
    ------
    ValueError
        If any label is neither a known strain name nor a known code.
    """
    valid_codes = set(conversion.values())
    encoded = []
    unmapped = set()
    for label in labels:
        # Already-encoded values pass through unchanged, so re-running this
        # cell after a first conversion is harmless.
        code = conversion.get(label, label)
        if code in valid_codes:
            encoded.append(int(code))
        else:
            unmapped.add(label)
    if unmapped:
        # A mix of strings and integers cannot be used as classification targets,
        # so fail loudly instead of leaving unmapped labels in place.
        raise ValueError(
            "Unmapped strain labels: " + ", ".join(sorted(repr(label) for label in unmapped))
        )
    # Build a fresh array with a real integer dtype; assigning ints into the
    # original string/object array would not change its dtype.
    return np.array(encoded, dtype=int)


training_targets = encode_strains(training_targets, strain_integer_conversion)
testing_targets = encode_strains(testing_targets, strain_integer_conversion)

## Using TPOT

In [146]:
from sklearn.metrics import make_scorer

def my_custom_accuracy(y_true, y_pred):
    """Return the fraction of predictions that exactly match the true labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, one per entry of ``y_true``.

    Returns
    -------
    float
        Number of matching positions divided by the number of labels.

    Raises
    ------
    ValueError
        If the two inputs do not hold the same number of labels.
    ZeroDivisionError
        If the inputs are empty.
    """
    # Flatten both inputs so that lists, Series and column vectors are compared
    # position by position rather than depending on how `==` behaves for each
    # container type (list == list yields a single bool; (n, 1) == (n,) broadcasts).
    true_labels = np.asarray(y_true).ravel()
    predicted_labels = np.asarray(y_pred).ravel()
    if true_labels.shape != predicted_labels.shape:
        raise ValueError(
            "y_true and y_pred must contain the same number of labels "
            "(got %d and %d)" % (true_labels.size, predicted_labels.size)
        )
    matches = np.count_nonzero(true_labels == predicted_labels)
    return float(matches) / len(true_labels)

# Wrap the accuracy function as a scikit-learn scorer; higher scores are better.
my_custom_scorer = make_scorer(my_custom_accuracy, greater_is_better=True)

# Create a tpot object with a few parameters.
# random_state seeds TPOT's stochastic pipeline search so repeated runs start from
# the same point; because the search is capped by max_time_mins, the number of
# generations reached can still vary with machine speed.
tpot = TPOTClassifier(scoring = my_custom_scorer, 
                    max_time_mins = 12, 
                    n_jobs = -1,
                    verbosity = 2,
                    cv = 5,
                    random_state = 42)

In [141]:
# Create a tpot object with a few parameters, scored with scikit-learn's built-in accuracy.
# NOTE: this rebinds `tpot`; when the notebook is run top to bottom, this object
# (not the custom-scorer one) is what the later cells fit and export.
# random_state seeds TPOT's stochastic pipeline search so repeated runs start from
# the same point; because the search is capped by max_time_mins, the number of
# generations reached can still vary with machine speed.
tpot = TPOTClassifier(scoring = 'accuracy', 
                    max_time_mins = 12, 
                    n_jobs = -1,
                    verbosity = 2,
                    cv = 5,
                    random_state = 42)

In [147]:
# Fit the tpot model on the training data.
# The features come from the `train_features` frame produced by train_test_split,
# passed as a NumPy array to match the array-form targets.
tpot.fit(np.asarray(train_features), training_targets)

Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.5696969696969697

Generation 2 - Current best internal CV score: 0.5696969696969697

Generation 3 - Current best internal CV score: 0.6287878787878788

Generation 4 - Current best internal CV score: 0.6287878787878788

Generation 5 - Current best internal CV score: 0.7484848484848485

Generation 6 - Current best internal CV score: 0.7484848484848485

Generation 7 - Current best internal CV score: 0.7484848484848485

Generation 8 - Current best internal CV score: 0.7484848484848485

Generation 9 - Current best internal CV score: 0.75

Generation 10 - Current best internal CV score: 0.7666666666666667

Generation 11 - Current best internal CV score: 0.7666666666666667

Generation 12 - Current best internal CV score: 0.7666666666666667

Generation 13 - Current best internal CV score: 0.7666666666666667

Generation 14 - Current best internal CV score: 0.7666666666666667

12.04 minutes have elapsed. TPOT will close down.
TPOT closed during e

TPOTClassifier(max_time_mins=12, n_jobs=-1,
               scoring=make_scorer(my_custom_accuracy), verbosity=2)

### Output of TPOT Classifier with Custom Scorer

Generation 14 - Current best internal CV score: 0.7666666666666667

12.04 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.
WARNING: TPOT may not provide a good pipeline if TPOT is stopped/interrupted in a early generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: XGBClassifier(Normalizer(input_matrix, norm=l1), learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=100, n_jobs=1, subsample=0.8500000000000001, verbosity=0)
TPOTClassifier(max_time_mins=12, n_jobs=-1,
               scoring=make_scorer(my_custom_accuracy), verbosity=2)

### Output of TPOT Classifier with Negative Mean Absolute Error

Generation 90 - Current best internal CV score: -0.1787878787878788

120.04 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.
WARNING: TPOT may not provide a good pipeline if TPOT is stopped/interrupted in a early generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: XGBClassifier(Normalizer(CombineDFs(input_matrix, RobustScaler(ZeroCount(Normalizer(input_matrix, norm=l1)))), norm=max), learning_rate=0.1, max_depth=1, min_child_weight=1, n_estimators=100, n_jobs=1, subsample=0.7000000000000001, verbosity=0)
TPOTClassifier(max_time_mins=120, n_jobs=-1, scoring='neg_mean_absolute_error',
               verbosity=2)

### Exporting the model

In [None]:
# Show the final model: the pipeline stored on the fitted TPOT object.
# Requires `tpot.fit(...)` to have been run first.
print(tpot.fitted_pipeline_)

In [None]:
# Export the pipeline as a python script file.
# The file is written relative to the current working directory.
tpot.export('tpot_exported_pipeline.py')