In [1]:
import pandas as pd
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split

In [2]:
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

In [3]:
combined_set = pd.concat([train_data, test_data])
combined_set['combined_var'] = (combined_set.hair_length * .40) + (combined_set.has_soul * .40)

# Replace categorical variables with numbers
def label_encoding(df, col):
    label_map = { key: float(n) for n, key in enumerate(df[col].unique()) }
    label_reverse_map = { label_map[key]: key for key in label_map }
    df[col] = df[col].apply(lambda x: label_map[x])
    return df, label_map, label_reverse_map

combined_set = pd.get_dummies(combined_set, columns=['color'])
combined_set

train_set = combined_set[:len(train_data.index)]
test_set = combined_set[len(train_data.index):]

In [4]:
train_cols = ['combined_var', 'rotting_flesh', 'bone_length', 'has_soul', 'hair_length']
target_var = ['type']
selected_cols = train_cols + target_var

In [5]:
train_set, type_label_map, type_label_reverse_map = label_encoding(train_set, 'type')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [6]:
p_train,val = train_test_split(train_set, train_size=.75, test_size=.25)

In [7]:
p_train.shape, val.shape

((278, 13), (93, 13))

In [8]:
p_train[train_cols].head().values

array([[ 0.28384169,  0.661752  ,  0.3183162 ,  0.23313558,  0.47646866],
       [ 0.48313942,  0.6306694 ,  0.5545096 ,  0.61536431,  0.59248423],
       [ 0.56893579,  0.45852253,  0.4165772 ,  0.65400642,  0.76833305],
       [ 0.57739325,  0.65731673,  0.52378864,  0.62450389,  0.81897924],
       [ 0.56649552,  0.64908767,  0.55982507,  0.65694341,  0.75929538]])

In [9]:
tpot = TPOTClassifier(verbosity=3)

In [10]:
help(tpot.fit)

Help on method fit in module tpot.base:

fit(features, classes) method of tpot.tpot.TPOTClassifier instance
    Fits a machine learning pipeline that maximizes classification score
    on the provided data
    
    Uses genetic programming to optimize a machine learning pipeline that
    maximizes classification score on the provided features and classes.
    Performs an internal stratified training/testing cross-validaton split
    to avoid overfitting on the provided data.
    
    Parameters
    ----------
    features: array-like {n_samples, n_features}
        Feature matrix
    classes: array-like {n_samples}
        List of class labels for prediction
    
    Returns
    -------
    None



In [12]:
tpot.fit(pd.np.array(p_train[train_cols]), pd.np.array(p_train[target_var]))

Optimization Progress:   3%|▎         | 288/10100 [00:00<00:10, 907.81pipeline/s]

Generation 1 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 2 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 3 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:   5%|▌         | 514/10100 [00:00<00:12, 787.16pipeline/s]

Generation 4 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 5 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:   7%|▋         | 723/10100 [00:00<00:13, 701.05pipeline/s]

Generation 6 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 7 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:   9%|▉         | 890/10100 [00:01<00:13, 688.02pipeline/s]

Generation 8 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 9 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  11%|█         | 1101/10100 [00:01<00:12, 706.28pipeline/s]

Generation 10 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 11 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  13%|█▎        | 1334/10100 [00:01<00:11, 734.64pipeline/s]

Generation 12 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 13 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  15%|█▍        | 1494/10100 [00:02<00:12, 715.21pipeline/s]

Generation 14 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 15 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  17%|█▋        | 1727/10100 [00:02<00:11, 743.26pipeline/s]

Generation 16 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 17 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  19%|█▊        | 1891/10100 [00:02<00:10, 767.78pipeline/s]

Generation 18 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 19 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  21%|██        | 2107/10100 [00:02<00:12, 627.76pipeline/s]

Generation 20 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 21 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  23%|██▎       | 2315/10100 [00:03<00:11, 652.12pipeline/s]

Generation 22 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 23 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  25%|██▍       | 2489/10100 [00:03<00:10, 710.11pipeline/s]

Generation 24 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 25 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  27%|██▋       | 2691/10100 [00:03<00:11, 649.68pipeline/s]

Generation 26 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 27 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  29%|██▉       | 2911/10100 [00:04<00:10, 681.44pipeline/s]

Generation 28 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 29 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  31%|███       | 3101/10100 [00:04<00:09, 761.87pipeline/s]

Generation 30 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 31 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  33%|███▎      | 3288/10100 [00:04<00:09, 728.63pipeline/s]

Generation 32 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 33 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  35%|███▍      | 3494/10100 [00:04<00:09, 667.18pipeline/s]

Generation 34 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 35 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  37%|███▋      | 3701/10100 [00:05<00:09, 665.99pipeline/s]

Generation 36 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 37 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  39%|███▉      | 3924/10100 [00:05<00:08, 700.66pipeline/s]

Generation 38 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 39 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  41%|████      | 4093/10100 [00:05<00:08, 688.62pipeline/s]

Generation 40 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 41 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  43%|████▎     | 4300/10100 [00:06<00:08, 720.78pipeline/s]

Generation 42 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 43 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  45%|████▍     | 4529/10100 [00:06<00:07, 718.60pipeline/s]

Generation 44 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 45 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  46%|████▋     | 4693/10100 [00:06<00:07, 703.69pipeline/s]

Generation 46 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 47 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  48%|████▊     | 4894/10100 [00:06<00:07, 682.95pipeline/s]

Generation 48 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 49 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  51%|█████     | 5116/10100 [00:07<00:06, 713.04pipeline/s]

Generation 50 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 51 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  52%|█████▏    | 5300/10100 [00:07<00:06, 781.43pipeline/s]

Generation 52 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 53 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  55%|█████▍    | 5543/10100 [00:07<00:05, 784.76pipeline/s]

Generation 54 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 55 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  56%|█████▋    | 5704/10100 [00:07<00:05, 734.78pipeline/s]

Generation 56 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 57 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  59%|█████▉    | 5949/10100 [00:08<00:05, 786.92pipeline/s]

Generation 58 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 59 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  61%|██████    | 6116/10100 [00:08<00:04, 805.38pipeline/s]

Generation 60 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 61 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  62%|██████▏   | 6293/10100 [00:08<00:05, 676.92pipeline/s]

Generation 62 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 63 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  64%|██████▍   | 6491/10100 [00:09<00:05, 677.59pipeline/s]

Generation 64 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 65 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  66%|██████▋   | 6701/10100 [00:09<00:05, 675.91pipeline/s]

Generation 66 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 67 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  68%|██████▊   | 6891/10100 [00:09<00:04, 717.83pipeline/s]

Generation 68 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 69 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  70%|███████   | 7101/10100 [00:09<00:04, 702.29pipeline/s]

Generation 70 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 71 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  73%|███████▎  | 7335/10100 [00:10<00:03, 736.30pipeline/s]

Generation 72 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 73 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  74%|███████▍  | 7500/10100 [00:10<00:03, 767.41pipeline/s]

Generation 74 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 75 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  76%|███████▌  | 7690/10100 [00:10<00:03, 679.45pipeline/s]

Generation 76 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 77 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  78%|███████▊  | 7893/10100 [00:11<00:03, 697.25pipeline/s]

Generation 78 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 79 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  80%|████████  | 8108/10100 [00:11<00:02, 722.46pipeline/s]

Generation 80 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 81 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  82%|████████▏ | 8323/10100 [00:11<00:02, 684.72pipeline/s]

Generation 82 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 83 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  85%|████████▍ | 8537/10100 [00:11<00:02, 698.48pipeline/s]

Generation 84 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 85 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  86%|████████▌ | 8689/10100 [00:12<00:02, 680.94pipeline/s]

Generation 86 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 87 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  88%|████████▊ | 8901/10100 [00:12<00:01, 695.18pipeline/s]

Generation 88 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 89 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  90%|█████████ | 9101/10100 [00:12<00:01, 603.67pipeline/s]

Generation 90 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 91 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  92%|█████████▏| 9314/10100 [00:13<00:01, 664.71pipeline/s]

Generation 92 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 93 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  94%|█████████▍| 9542/10100 [00:13<00:00, 717.84pipeline/s]

Generation 94 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 95 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  96%|█████████▌| 9701/10100 [00:13<00:00, 703.59pipeline/s]

Generation 96 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 97 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



Optimization Progress:  98%|█████████▊| 9922/10100 [00:14<00:00, 718.43pipeline/s]

Generation 98 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)

Generation 99 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)





Generation 100 - Current Pareto front scores:
5000	inf	GradientBoostingClassifier(input_matrix, 0.93000000000000005, 0.92000000000000004)



ValueError: There was an error in the TPOT optimization process. This could be because the data was not formatted properly, or because data for a regression problem was provided to the TPOTClassifier object. Please make sure you passed the data to TPOT correctly.

In [21]:
p_train[train_cols].head()

Unnamed: 0,combined_var,rotting_flesh,bone_length,has_soul,hair_length
219,0.531771,0.415485,0.489446,0.503475,0.825953
262,0.387865,0.497112,0.209997,0.327212,0.642451
6,0.434517,0.568952,0.399331,0.467901,0.618391
359,0.421345,0.172182,0.626017,0.644941,0.408422
220,0.223747,0.648866,0.168909,0.25544,0.303927


In [23]:
p_train[target_var].head()

Unnamed: 0,type
219,0.0
262,1.0
6,1.0
359,0.0
220,2.0
