# Using TPOT to Identify the Best Model for VOC Data

## Importing Relevant Packages and Datasets

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier



In [2]:
data = pd.read_excel('data/raw_data.xlsx', sheet_name="Wide")

## Restructuring the data

The data contains null values for columns, and the real column names are contained within a row in the dataframe.
The row of column names need to be extracted, and added as the column names for the remaining data.

In [3]:
columns = data.iloc[2]

In [4]:
data_content = data.iloc[3:]

In [5]:
data_content.columns = columns

# Creating test and train features

The target label is the strain, and the features used to train the model exclude the strain, the species and the sample.
The species and the sample are removed from the training features due to their link to the strain.

In [6]:
features = data_content.iloc[:,3:]
targets = data_content.iloc[:,:3]
strain_targets = targets.iloc[:,1]

In [None]:
#features = features.fillna(0)

In [7]:
## checking that all columns are numeric
features.shape[1] == features.select_dtypes(include=np.number).shape[1]

False

In [None]:
print(features.shape)
print(strain_targets.shape)

## Normalising the feature vectors (if all were numeric)

In [None]:
type(features)

In [None]:
## mean normalised 
normalized_features=(features-features.mean())/features.std()

In [None]:
normalized_features.iloc[:,0]

In [None]:
min_max_normalized_features=(features-features.min())/(features.max()-features.min())

## Normalizing the feature vectors (nulls still included)

In [None]:
for c

In [8]:
for column in features.columns:
    features[column]=(features[column]-features[column].min())/(features[column].max()-features[column].min())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features[column]=(features[column]-features[column].min())/(features[column].max()-features[column].min())


In [9]:
features

2,Ethyl Acetate,Ethanol,"Propanoic acid, ethyl ester",2-Pentanone,Decane,Methyl Isobutyl Ketone,Amylene hydrate,"Butanoic acid, 2-methyl-, methyl ester",Isobutyl acetate,Methyl isovalerate,...,1-Dodecanol,Methyl tetradecanoate,2-Pentadecanone,"Tetradecanoic acid, ethyl ester",Hexadecanal,n-Tridecan-1-ol,1-Tetradecanol,n-Pentadecanol,1-Hexadecanol,Indole
3,0.3484,0.095694,,0.280738,0.267534,0.247824,0.096187,,,0.0,...,,,,,,,,,,
4,0.144602,0.09786,,0.109738,0.098475,0.144856,0.0,,,0.0,...,,,,,,,,,,
5,0.301918,0.172296,,0.254583,0.000005,0.174291,0.096902,,,0.0,...,,,,,,,,,,
6,0.097199,0.478675,,0.419277,0.041532,0.0,0.248146,,,0.0,...,,,,,,,,,,
7,0.08767,0.318686,,0.053714,0.0,0.0,0.0,,,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,0.089323,0.127058,0.0,0.036897,0.0,,0.0,,0.0,,...,0.005973,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.488614
83,0.096124,0.13512,0.0,0.078258,0.0,,0.0,,0.0,,...,0.064019,0.0,0.0,0.0,0.0,0.0,0.032103,0.0,0.0,0.952934
84,0.231334,0.526166,0.0,0.190671,0.0,,0.143815,,0.0,,...,0.029894,0.0,0.0,0.0,0.0,0.0,0.034451,0.0,0.0,0.603602
85,0.410589,0.334106,0.0,0.138788,0.0,,0.08014,,0.0,,...,0.316324,0.0,0.0,0.0,0.0,0.0,0.05972,0.0,0.0,0.732048


In [10]:
features = features.fillna(-1)

In [11]:
## checking that all columns are numeric
features.shape[1] == features.select_dtypes(include=np.number).shape[1]

True

## Converting target data to numpy arrays

In [12]:
targets_array = np.array(strain_targets).reshape((-1,))

## Adjusting the target values to be integers

In [13]:
strain_integer_conversion = {
    'SA_A' : 1,
    'SA_B' : 2,
    'PA_A' : 3,
    'PA_B' : 4,
    'EC_A' : 5,
    'EC_B' : 6
}

for target in range(0,len(targets_array)):
    if targets_array[target] in strain_integer_conversion.keys():
        targets_array[target] = strain_integer_conversion[targets_array[target]]

## Using TPOT

In [14]:
from sklearn.metrics import make_scorer

def my_custom_accuracy(y_true, y_pred):
    return float(sum(y_pred == y_true)) / len(y_true)

my_custom_scorer = make_scorer(my_custom_accuracy, greater_is_better=True)

# Create a tpot object with a few parameters
tpot = TPOTClassifier(scoring = my_custom_scorer, 
                    max_time_mins = 60, 
                    n_jobs = -1,
                    verbosity = 2,
                    cv = 5)

In [16]:
# Fit the tpot model on the training data
tpot.fit(features, targets_array)

Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.5573529411764706

Generation 2 - Current best internal CV score: 0.5573529411764706

Generation 3 - Current best internal CV score: 0.5573529411764706

Generation 4 - Current best internal CV score: 0.5595588235294118

Generation 5 - Current best internal CV score: 0.5705882352941176

Generation 6 - Current best internal CV score: 0.5823529411764706

Generation 7 - Current best internal CV score: 0.5823529411764706

Generation 8 - Current best internal CV score: 0.5941176470588235

Generation 9 - Current best internal CV score: 0.5941176470588235

Generation 10 - Current best internal CV score: 0.6051470588235295

Generation 11 - Current best internal CV score: 0.6051470588235295

Generation 12 - Current best internal CV score: 0.6080882352941177

Generation 13 - Current best internal CV score: 0.6316176470588235

Generation 14 - Current best internal CV score: 0.6316176470588235

Generation 15 - Current best internal CV score: 0.631617

TPOTClassifier(max_time_mins=60, n_jobs=-1,
               scoring=make_scorer(my_custom_accuracy), verbosity=2)

### Best algorithm using min-max-normalised features (excluding nulls) after 60 mins



### Best Algorithm using min-max-normalised features after 60 mins

Generation 56 - Current best internal CV score: 0.7286764705882353

60.11 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.
WARNING: TPOT may not provide a good pipeline if TPOT is stopped/interrupted in a early generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: XGBClassifier(Binarizer(XGBClassifier(SelectFwe(Normalizer(input_matrix, norm=l2), alpha=0.018000000000000002), learning_rate=1.0, max_depth=10, min_child_weight=1, n_estimators=100, n_jobs=1, subsample=0.7500000000000001, verbosity=0), threshold=0.65), learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100, n_jobs=1, subsample=0.9000000000000001, verbosity=0)
TPOTClassifier(max_time_mins=60, n_jobs=-1,
               scoring=make_scorer(my_custom_accuracy), verbosity=2)

### Best algorithm using min-max-normalised features after 5 minutes

Generation 6 - Current best internal CV score: 0.6051470588235295

5.01 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.
WARNING: TPOT may not provide a good pipeline if TPOT is stopped/interrupted in a early generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: XGBClassifier(MinMaxScaler(VarianceThreshold(input_matrix, threshold=0.05)), learning_rate=0.5, max_depth=8, min_child_weight=4, n_estimators=100, n_jobs=1, subsample=0.8500000000000001, verbosity=0)
TPOTClassifier(max_time_mins=5, n_jobs=-1,
               scoring=make_scorer(my_custom_accuracy), verbosity=2)

### Best algorithm using mean-normalised features

Generation 6 - Current best internal CV score: 0.5823529411764705

5.24 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.
WARNING: TPOT may not provide a good pipeline if TPOT is stopped/interrupted in a early generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: XGBClassifier(input_matrix, learning_rate=0.5, max_depth=8, min_child_weight=1, n_estimators=100, n_jobs=1, subsample=0.9000000000000001, verbosity=0)
TPOTClassifier(max_time_mins=5, n_jobs=-1,
               scoring=make_scorer(my_custom_accuracy), verbosity=2)

### Best algorithm after using entire dataset to train (un-normalised)

Generation 89 - Current best internal CV score: 0.7139705882352941

240.33 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.
WARNING: TPOT may not provide a good pipeline if TPOT is stopped/interrupted in a early generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: XGBClassifier(VarianceThreshold(Normalizer(input_matrix, norm=max), threshold=0.0005), learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=100, n_jobs=1, subsample=0.9500000000000001, verbosity=0)
TPOTClassifier(max_time_mins=240, n_jobs=-1,
               scoring=make_scorer(my_custom_accuracy), verbosity=2)

### Best algorithm based on accuracy and running for 4 hours

Generation 100 - Current best internal CV score: 0.8030303030303031

Best pipeline: XGBClassifier(SelectPercentile(Normalizer(input_matrix, norm=max), percentile=44), learning_rate=0.1, max_depth=7, min_child_weight=1, n_estimators=100, n_jobs=1, subsample=0.6000000000000001, verbosity=0)
TPOTClassifier(max_time_mins=240, n_jobs=-1,
               scoring=make_scorer(my_custom_accuracy), verbosity=2)

### Output of TPOT Classifier with Custom Scorer

Generation 14 - Current best internal CV score: 0.7666666666666667

12.04 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.
WARNING: TPOT may not provide a good pipeline if TPOT is stopped/interrupted in a early generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: XGBClassifier(Normalizer(input_matrix, norm=l1), learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=100, n_jobs=1, subsample=0.8500000000000001, verbosity=0)
TPOTClassifier(max_time_mins=12, n_jobs=-1,
               scoring=make_scorer(my_custom_accuracy), verbosity=2)

### Output of TPOT Classifier with Negative Mean Squared Error

Generation 90 - Current best internal CV score: -0.1787878787878788

120.04 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.
WARNING: TPOT may not provide a good pipeline if TPOT is stopped/interrupted in a early generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: XGBClassifier(Normalizer(CombineDFs(input_matrix, RobustScaler(ZeroCount(Normalizer(input_matrix, norm=l1)))), norm=max), learning_rate=0.1, max_depth=1, min_child_weight=1, n_estimators=100, n_jobs=1, subsample=0.7000000000000001, verbosity=0)
TPOTClassifier(max_time_mins=120, n_jobs=-1, scoring='neg_mean_absolute_error',
               verbosity=2)

### Exporting the model

In [None]:
# Show the final model
print(tpot.fitted_pipeline_)

In [None]:
# Export the pipeline as a python script file
tpot.export('tpot_exported_pipeline.py')