# Using TPOT to Identify the Best Model for VOC Data

## Importing Relevant Packages and Datasets

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier



In [3]:
# Load the "Wide" sheet of the raw workbook; the header row embedded in the
# sheet is handled in the restructuring section below.
data = pd.read_excel(io='data/raw_data.xlsx', sheet_name="Wide")

## Restructuring the data

The raw sheet contains null values in some columns, and the real column names are stored in one of the rows of the dataframe rather than in its header.
That row of column names needs to be extracted and set as the column names of the remaining data.

In [4]:
# Positional row 2 of the raw frame holds the real column names.
columns = data.iloc[2, :]

In [5]:
# Everything from positional row 3 onwards is kept as the data rows.
data_content = data.iloc[3:, :]

In [6]:
# Promote the extracted header row to be the column labels of the data rows.
data_content.columns = columns

## Creating features and targets

The target label is the strain, and the features used to train the model exclude the strain, the species and the sample.
The species and the sample are removed from the training features due to their link to the strain.

In [7]:
# The first three columns are the label columns; everything after them is a feature.
targets = data_content.iloc[:, :3]
features = data_content.iloc[:, 3:]

# The second label column is the one used as the prediction target.
strain_targets = targets.iloc[:, 1]

In [8]:
# Replace missing measurements with 0, then explicitly convert object columns to
# numeric dtypes. Relying on `fillna` to downcast object columns implicitly is
# deprecated in pandas, and without the conversion the numeric check below
# fails on versions that no longer downcast.
features = features.fillna(0).infer_objects()

In [9]:
# Every feature column must be numeric: compare the total number of columns
# with the number that survive a numeric-only selection.
len(features.columns) == len(features.select_dtypes(include=np.number).columns)

True

In [11]:
# Sanity check: the feature table and the target vector should have the same number of rows.
for table in (features, strain_targets):
    print(table.shape)

(84, 67)
(84,)


## Converting training data to numpy arrays

In [12]:
# The feature table stays a DataFrame (that is what `tpot.fit` receives);
# only the strain labels are copied into a flat 1-D numpy array.
targets_array = np.array(strain_targets).reshape(-1)

In [15]:
# Total number of feature values (rows x columns). Read it from `features`
# directly: `features_array` is never created in this notebook, so
# `len(features_array)` raises NameError on a fresh kernel.
features.size

5628

In [16]:
# Display the flat array of strain labels to confirm the conversion.
targets_array

array(['SA_A', 'SA_A', 'SA_A', 'SA_A', 'SA_A', 'SA_B', 'SA_B', 'SA_B',
       'SA_B', 'SA_B', 'SA_A', 'SA_A', 'SA_A', 'SA_A', 'SA_A', 'SA_B',
       'SA_B', 'SA_B', 'SA_B', 'SA_B', 'SA_A', 'SA_A', 'SA_A', 'SA_A',
       'SA_A', 'SA_B', 'SA_B', 'SA_B', 'SA_B', 'SA_B', 'PA_A', 'PA_A',
       'PA_A', 'PA_A', 'PA_A', 'PA_B', 'PA_B', 'PA_B', 'PA_B', 'PA_B',
       'PA_A', 'PA_A', 'PA_A', 'PA_A', 'PA_A', 'PA_B', 'PA_B', 'PA_B',
       'PA_B', 'PA_B', 'PA_A', 'PA_A', 'PA_A', 'PA_A', 'PA_B', 'PA_B',
       'PA_B', 'EC_A', 'EC_A', 'EC_A', 'EC_A', 'EC_B', 'EC_B', 'EC_B',
       'EC_A', 'EC_A', 'EC_A', 'EC_A', 'EC_A', 'EC_B', 'EC_B', 'EC_B',
       'EC_B', 'EC_B', 'EC_A', 'EC_A', 'EC_A', 'EC_A', 'EC_A', 'EC_B',
       'EC_B', 'EC_B', 'EC_B', 'EC_B'], dtype=object)

## Adjusting the target values to be integers

In [17]:
# Integer code assigned to each strain label.
strain_integer_conversion = {
    'SA_A' : 1,
    'SA_B' : 2,
    'PA_A' : 3,
    'PA_B' : 4,
    'EC_A' : 5,
    'EC_B' : 6
}

# Fail loudly on labels without an integer code instead of silently leaving
# strings mixed in with the integers.
unmapped_strains = [strain for strain in set(strain_targets)
                    if strain not in strain_integer_conversion]
if unmapped_strains:
    raise ValueError(f"No integer code defined for strain label(s): {unmapped_strains}")

# Build a genuine integer array from the original labels. Overwriting the
# elements of the object-dtype array in place would leave dtype=object, which
# scikit-learn reports as target type 'unknown' rather than 'multiclass'.
# Deriving the result from `strain_targets` also keeps this cell safe to re-run.
targets_array = np.array(
    [strain_integer_conversion[strain] for strain in strain_targets],
    dtype=int,
)

## Using TPOT

In [18]:
from sklearn.metrics import make_scorer

def my_custom_accuracy(y_true, y_pred):
    """Return the fraction of predictions that exactly match the true labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_true``.

    Returns
    -------
    float
        Number of matching positions divided by the number of samples.
    """
    # Coerce to arrays so plain lists are compared element-wise as well,
    # and let numpy do the counting instead of Python's builtin `sum`.
    matches = np.asarray(y_pred) == np.asarray(y_true)
    return float(np.mean(matches))

# Higher accuracy is better, so the scorer must not negate the value.
my_custom_scorer = make_scorer(my_custom_accuracy, greater_is_better=True)

# Create a tpot object with a few parameters.
# `random_state` seeds the evolutionary search so that runs can be compared.
# NOTE(review): with a wall-clock budget (`max_time_mins`) the number of
# generations reached still depends on the machine, so results may not be
# bit-for-bit identical between runs.
tpot = TPOTClassifier(scoring = my_custom_scorer,
                    max_time_mins = 240,
                    n_jobs = -1,
                    verbosity = 2,
                    cv = 5,
                    random_state = 42)

In [141]:
# Alternative configuration, kept for reference (disabled): built-in accuracy scorer with a 12-minute budget
#tpot = TPOTClassifier(scoring = 'accuracy', 
#                    max_time_mins = 12, 
#                    n_jobs = -1,
#                    verbosity = 2,
#                    cv = 5)

In [None]:
# Run the TPOT pipeline search on the full feature table and the strain targets
# (no separate train/test split is made in this notebook).
# The search is time-boxed by the `max_time_mins` value given when `tpot` was created.
tpot.fit(features, targets_array)

Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]

### Best algorithm based on accuracy and running for 4 hours

Generation 100 - Current best internal CV score: 0.8030303030303031

Best pipeline: XGBClassifier(SelectPercentile(Normalizer(input_matrix, norm=max), percentile=44), learning_rate=0.1, max_depth=7, min_child_weight=1, n_estimators=100, n_jobs=1, subsample=0.6000000000000001, verbosity=0)
TPOTClassifier(max_time_mins=240, n_jobs=-1,
               scoring=make_scorer(my_custom_accuracy), verbosity=2)

### Output of TPOT Classifier with Custom Scorer

Generation 14 - Current best internal CV score: 0.7666666666666667

12.04 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.
WARNING: TPOT may not provide a good pipeline if TPOT is stopped/interrupted in a early generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: XGBClassifier(Normalizer(input_matrix, norm=l1), learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=100, n_jobs=1, subsample=0.8500000000000001, verbosity=0)
TPOTClassifier(max_time_mins=12, n_jobs=-1,
               scoring=make_scorer(my_custom_accuracy), verbosity=2)

### Output of TPOT Classifier with Negative Mean Absolute Error

Generation 90 - Current best internal CV score: -0.1787878787878788

120.04 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.
WARNING: TPOT may not provide a good pipeline if TPOT is stopped/interrupted in a early generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: XGBClassifier(Normalizer(CombineDFs(input_matrix, RobustScaler(ZeroCount(Normalizer(input_matrix, norm=l1)))), norm=max), learning_rate=0.1, max_depth=1, min_child_weight=1, n_estimators=100, n_jobs=1, subsample=0.7000000000000001, verbosity=0)
TPOTClassifier(max_time_mins=120, n_jobs=-1, scoring='neg_mean_absolute_error',
               verbosity=2)

### Exporting the model

In [None]:
# Show the final model: the best pipeline kept by the TPOT search after `fit` has run.
print(tpot.fitted_pipeline_)

In [None]:
# Export the best pipeline as a standalone Python script file
# named 'tpot_exported_pipeline.py'.
tpot.export('tpot_exported_pipeline.py')