# Imports

## Standards

In [None]:
# analysis
import pandas as pd
import numpy as np
from scipy import stats

# visuals
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

# network
import networkx as nx
import community

from itertools import combinations

%matplotlib inline

## Machine Learning

### Preprocessing

In [None]:
# base
from sklearn.base import TransformerMixin, BaseEstimator

# pipeline
from sklearn.pipeline import Pipeline, FeatureUnion

# compose
from sklearn.compose import ColumnTransformer

# preprocessing
from sklearn.preprocessing import MinMaxScaler, RobustScaler, PolynomialFeatures, OneHotEncoder

# feature selection
from sklearn.feature_selection import VarianceThreshold, SelectPercentile, mutual_info_classif, f_classif, RFE, RFECV

# decomposition
from sklearn.decomposition import PCA, TruncatedSVD, SparsePCA

# manifold
from sklearn.manifold import Isomap, TSNE, MDS, SpectralEmbedding

### Modeling

In [None]:
# cluster
from sklearn.cluster import OPTICS, AffinityPropagation, AgglomerativeClustering, DBSCAN, KMeans, MeanShift

# ensemble
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, RandomForestClassifier

# logistic regression
from sklearn.linear_model import LogisticRegression, SGDClassifier

# naive bayes
from sklearn.naive_bayes import MultinomialNB

# process classifier
from sklearn.gaussian_process import GaussianProcessClassifier

# neighbors
from sklearn.neighbors import KNeighborsClassifier

# neural networks
from sklearn.neural_network import MLPClassifier

# support vector machines
from sklearn.svm import LinearSVC, SVC

# multiclass
from sklearn.multiclass import OneVsRestClassifier

# train test split, tuning, and score validation
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV

# classification report
from sklearn.metrics import classification_report

# Classifiers

In [None]:
# Baseline classifiers, keyed by their own class name.
# Every key below is exactly the class name of the instance it maps to, so the
# mapping is derived from the instances instead of being spelled out twice.
classifiers = {
    type(model).__name__: model
    for model in (
        AdaBoostClassifier(random_state=0),
        GradientBoostingClassifier(random_state=0),
        ExtraTreesClassifier(n_estimators=100, random_state=0),
        RandomForestClassifier(n_estimators=100, random_state=0),
        LogisticRegression(solver='lbfgs', multi_class='auto', random_state=0, max_iter=500),
        MultinomialNB(),
        GaussianProcessClassifier(random_state=0),
        KNeighborsClassifier(),
        MLPClassifier(random_state=0, max_iter=1500),
        LinearSVC(random_state=0, max_iter=2000),
        SVC(gamma='scale', random_state=0),
        SGDClassifier(random_state=0),
    )
}

# Ensembles

In [None]:
# Tree-ensemble subset of the classifiers, again keyed by class name.
# These are fresh instances, not shared with any other mapping.
ensembles = {
    type(model).__name__: model
    for model in (
        AdaBoostClassifier(random_state=0),
        GradientBoostingClassifier(random_state=0),
        ExtraTreesClassifier(n_estimators=100, random_state=0),
        RandomForestClassifier(n_estimators=100, random_state=0),
    )
}

# Data

In [None]:
# read training data
train = pd.read_csv("train.csv")

In [None]:
train.info()

In [None]:
train.describe()

In [None]:
train.head()

# Color

In [None]:
# One-hot encode `color` and append the indicator columns to `train`.
# Indicator columns left over from a previous run of this cell are dropped
# first, so re-running the cell no longer fails with "columns overlap" in
# `join`. On a first run nothing is dropped and the result is unchanged.
color_dummies = pd.get_dummies(train.color)
stale_dummy_columns = color_dummies.columns.intersection(train.columns)
train = train.drop(columns=stale_dummy_columns).join(color_dummies)

In [None]:
# # create empty list to hold combinations of colors
# c = []

# # create combinations of color features (min 2, max nunique colors - 1)
# for i in range(2, train.color.nunique()):
#     els = [list(x) for x in combinations(train.color.unique(), i)]
#     c.extend(els)

# # sum the combinations and add new column to train
# for i in c:
#     train['_'.join(i)] = train[i].sum(axis=1)

# EDA

## `train_test_split`

In [None]:
# Target: the creature type to be predicted.
y = train['type']

# Features: every remaining column except the row id, the raw colour string
# and the target itself.
X = train.drop(['id', 'color', 'type'], axis=1)

# Hold out 30% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.3,
)

## Pipe

### Cleaner

In [None]:
# Cleaning stage, applied in this order:
#   poly   - polynomial / interaction feature expansion
#   scaler - robust scaling of the expanded features
#   vt     - removal of features whose variance does not exceed a threshold
pipe_cleaner = Pipeline(
    steps=[
        ('poly', PolynomialFeatures()),
        ('scaler', RobustScaler()),
        ('vt', VarianceThreshold()),
    ],
)

# Search space for the cleaning stage. Keys carry the `cleaner__` prefix, the
# name this stage is given in the full classification pipeline.
cleaner_params = {}

# Polynomial expansion: integer degree drawn from {2, 3, 4}.
cleaner_params['cleaner__poly__degree'] = stats.randint(2, 5)
cleaner_params['cleaner__poly__include_bias'] = [True, False]
cleaner_params['cleaner__poly__interaction_only'] = [True, False]

# Scaler options are currently disabled:
#     'cleaner__scaler__with_centering' : [True, False],
#     'cleaner__scaler__with_scaling' : [True, False],
#     'cleaner__scaler__quantile_range' : [(25.0, 75.0), (0.0, 100.0), (10.0, 90.0)],

# Variance threshold: uniform on [0, 0.02] (scipy's uniform takes loc, scale).
cleaner_params['cleaner__vt__threshold'] = stats.uniform(0.0, 0.02)

### Decomposition

In [None]:
# Decomposition stage: PCA and truncated SVD are fitted on the same input and
# their outputs are concatenated column-wise by the FeatureUnion.
pipe_decomp = Pipeline(
    steps=[
        (
            'union',
            FeatureUnion(
                transformer_list=[
                    ('pca', PCA(random_state=0)),
                    ('svd', TruncatedSVD(random_state=0)),
                ],
                n_jobs=-1,
            ),
        ),
    ],
)

# Search space for the decomposition stage. Keys carry the `decomp__` prefix,
# the name this stage is given in the full classification pipeline.
# scipy's `uniform(loc, scale)` samples from [loc, loc + scale].
decomp_params = {}

# --- PCA ---
decomp_params['decomp__union__pca__n_components'] = stats.uniform(0.5, 0.5)
decomp_params['decomp__union__pca__whiten'] = [True, False]
# Solver choice is currently disabled:
#     'decomp__union__pca__svd_solver' : ['auto', 'full', 'arpack', 'randomized'],
decomp_params['decomp__union__pca__tol'] = stats.uniform(0.0, 0.5)

# --- Truncated SVD ---
decomp_params['decomp__union__svd__n_components'] = stats.randint(2, 50)
decomp_params['decomp__union__svd__algorithm'] = ['randomized', 'arpack']
decomp_params['decomp__union__svd__n_iter'] = stats.randint(5, 15)
decomp_params['decomp__union__svd__tol'] = stats.uniform(0.0, 0.5)

### Unsupervised Clustering

In [None]:
class ModelTransformer(TransformerMixin, BaseEstimator):
    """Expose the labels produced by a wrapped model as a transformer output.

    This lets a clustering model sit inside a ``Pipeline`` / ``FeatureUnion``:
    ``fit`` fits the wrapped model and ``transform`` returns a one-column
    ``DataFrame`` holding the label assigned to each row of ``X``.

    Parameters
    ----------
    model : estimator
        Object providing ``fit`` and at least one of ``predict`` or
        ``fit_predict``.
    """

    def __init__(self, model):
        self.model = model

    def fit(self, *args, **kwargs):
        """Fit the wrapped model with the given arguments and return ``self``."""
        self.model.fit(*args, **kwargs)
        return self

    def transform(self, X, **transform_params):
        """Return the wrapped model's labels for ``X`` as a one-column DataFrame.

        When the wrapped model has a ``predict`` method, ``X`` is labelled with
        the state learned in ``fit``, so labels keep the same meaning for every
        dataset passed through the fitted transformer. Models without
        ``predict`` fall back to ``fit_predict``, which re-fits the model on
        ``X`` (the previous behaviour for every model).

        ``transform_params`` is accepted for signature compatibility and is
        not used.
        """
        if hasattr(self.model, 'predict'):
            labels = self.model.predict(X)
        else:
            # No out-of-sample labelling available: the labels come from a
            # fresh fit on X and are not comparable across calls.
            labels = self.model.fit_predict(X)
        return pd.DataFrame(labels)

# Cluster-label features: each wrapped clustering model contributes one column
# of cluster ids, and the ids are then one-hot encoded.
pipe_cluster = Pipeline([
    ('union', FeatureUnion([
        ('optics', ModelTransformer(OPTICS(n_jobs=-1))),
        ('dbscan', ModelTransformer(DBSCAN(n_jobs=-1))),
        ('kmeans', ModelTransformer(KMeans(random_state=0, n_jobs=-1))),
        ('meanshift', ModelTransformer(MeanShift(n_jobs=-1))),
        ('agglom', ModelTransformer(AgglomerativeClustering())),
        ('affinity', ModelTransformer(AffinityPropagation()))
    ], n_jobs=-1)),
    # Cluster ids produced at transform time are not guaranteed to have been
    # seen while fitting the encoder. With the default setting an unseen id
    # raises; 'ignore' encodes it as an all-zero block instead.
    ('ohe', OneHotEncoder(handle_unknown='ignore'))
])

# Candidate values for DBSCAN `p` and MeanShift `bandwidth` are random draws.
# They used to come from the unseeded global NumPy generator, so the search
# space itself changed on every run; a dedicated seeded generator makes the
# candidate lists repeatable, matching the `random_state=0` used elsewhere.
_cluster_param_rng = np.random.RandomState(0)
_dbscan_p_choices = [None] + list(_cluster_param_rng.uniform(1e-3, 5, size=20))
_meanshift_bandwidth_choices = [None] + list(_cluster_param_rng.uniform(1e-3, 5, size=20))

# Search space for the clustering stage. Keys carry the `cluster__` prefix, the
# name this stage is given in the full classification pipeline; `__model__`
# reaches the estimator wrapped by each ModelTransformer.
# scipy's `uniform(loc, scale)` samples from [loc, loc + scale].
cluster_params = {
    # NOTE(review): `eps` and `max_eps` are sampled independently - confirm
    # OPTICS accepts eps > max_eps when cluster_method='dbscan'.
    'cluster__union__optics__model__min_samples' : stats.distributions.randint(2, 11),
    'cluster__union__optics__model__max_eps' : stats.distributions.uniform(1e-3, 3),
    'cluster__union__optics__model__p' : stats.distributions.randint(1,3),
    'cluster__union__optics__model__cluster_method' : ['xi', 'dbscan'],
    'cluster__union__optics__model__eps' : stats.distributions.uniform(1e-3, 3),
    'cluster__union__optics__model__xi' : stats.distributions.uniform(0, 1),
    'cluster__union__optics__model__predecessor_correction' : [True, False],
    'cluster__union__optics__model__min_cluster_size' : [None] + list(range(2, 11)),
    'cluster__union__optics__model__algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'cluster__union__optics__model__leaf_size' : stats.distributions.randint(20, 41),
    'cluster__union__dbscan__model__eps' : stats.distributions.uniform(1e-3, 3),
    'cluster__union__dbscan__model__min_samples' : stats.distributions.randint(2, 11),
    'cluster__union__dbscan__model__algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'cluster__union__dbscan__model__leaf_size' : stats.distributions.randint(20, 41),
    'cluster__union__dbscan__model__p' : _dbscan_p_choices,
    'cluster__union__kmeans__model__n_clusters' : stats.distributions.randint(2, 12),
    'cluster__union__kmeans__model__n_init' : stats.distributions.randint(10, 101),
    'cluster__union__kmeans__model__max_iter' : stats.distributions.randint(300, 501),
    'cluster__union__kmeans__model__tol' : stats.distributions.uniform(1e-6, 1e-1),
    'cluster__union__kmeans__model__algorithm' : ['auto', 'full', 'elkan'],
    'cluster__union__meanshift__model__bandwidth' : _meanshift_bandwidth_choices,
    'cluster__union__meanshift__model__cluster_all' : [True, False],
    'cluster__union__meanshift__model__max_iter' : stats.distributions.randint(300, 501),
    # NOTE(review): affinity and linkage are sampled independently - confirm
    # that 'ward' is valid with the non-euclidean affinities listed here.
    'cluster__union__agglom__model__affinity' : ['euclidean', 'l1', 'l2', 'manhattan', 'cosine'],
    'cluster__union__agglom__model__compute_full_tree' : ['auto', True, False],
    'cluster__union__agglom__model__linkage' : ['ward', 'complete', 'average', 'single'],
    'cluster__union__affinity__model__damping' : stats.distributions.uniform(0.5, 0.5),
    'cluster__union__affinity__model__max_iter' : stats.distributions.randint(200, 401),
    'cluster__union__affinity__model__convergence_iter' : stats.distributions.randint(10, 31),
    'cluster__ohe__sparse' : [False],
}

### Classifier Swapping

In [None]:
class ClfSwap(BaseEstimator):
    """Thin wrapper whose wrapped classifier can be replaced via ``set_params``.

    Every method delegates to ``self.estimator``. Because ``estimator`` is an
    ordinary constructor parameter, a parameter search can swap it (and tune
    its nested parameters) like any other hyper-parameter.
    """

    def __init__(self, estimator = SGDClassifier()):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """
        # NOTE: the default SGDClassifier is created once, when the class is
        # defined, and is shared by every ClfSwap() built without an argument.
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped estimator and return ``self``.

        Extra keyword arguments are forwarded to the wrapped estimator's
        ``fit``; previously they were accepted but silently dropped.
        """
        self.estimator.fit(X, y, **kwargs)
        return self

    def predict(self, X, y=None):
        """Return the wrapped estimator's predictions for ``X`` (``y`` is unused)."""
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return the wrapped estimator's class probabilities for ``X``."""
        return self.estimator.predict_proba(X)

    def decision_function(self, X):
        """Return the wrapped estimator's decision scores for ``X``.

        Without this method a caller that needs per-sample scores can only use
        ``predict_proba``, which margin-based classifiers may not provide.
        Raises ``AttributeError`` when the wrapped estimator has no
        ``decision_function``.
        """
        return self.estimator.decision_function(X)

    def score(self, X, y):
        """Return the wrapped estimator's score on ``(X, y)``."""
        return self.estimator.score(X, y)

In [None]:
# Preprocessing search space shared by every candidate classifier: the three
# stage-level mappings merged into one.
pre_params = dict(cleaner_params)
pre_params.update(decomp_params)
pre_params.update(cluster_params)

# Full pipeline: clean -> decompose -> cluster-encode -> one-vs-rest classifier.
# `ClfSwap()` is a placeholder; each search space below replaces the estimator
# it wraps through the 'clf__estimator__estimator' key.
pipe_clf = Pipeline(
    steps=[
        ('cleaner', pipe_cleaner),
        ('decomp', pipe_decomp),
        ('cluster', pipe_cluster),
        ('clf', OneVsRestClassifier(ClfSwap(), n_jobs=-1)),
    ],
)

# Candidates with their own tuned hyper-parameters.
# scipy's `uniform(loc, scale)` samples from [loc, loc + scale].
adaboost_space = {
    'clf__estimator__estimator' : [AdaBoostClassifier(random_state=0)],
    **pre_params,
    'clf__estimator__estimator__n_estimators' : stats.randint(50, 200),
    'clf__estimator__estimator__learning_rate' : stats.uniform(0.5, 1),
    'clf__estimator__estimator__algorithm' : ['SAMME', 'SAMME.R'],
}

gradient_boosting_space = {
    'clf__estimator__estimator' : [GradientBoostingClassifier(random_state=0)],
    **pre_params,
    'clf__estimator__estimator__loss' : ['deviance', 'exponential'],
    'clf__estimator__estimator__learning_rate' : stats.uniform(0.5, 1),
    'clf__estimator__estimator__n_estimators' : stats.randint(100, 400),
    'clf__estimator__estimator__subsample' : stats.uniform(0.5, 0.5),
    'clf__estimator__estimator__criterion' : ['friedman_mse', 'mse', 'mae'],
    'clf__estimator__estimator__min_samples_split' : stats.randint(2, 4),
    'clf__estimator__estimator__min_samples_leaf' : stats.randint(1, 4),
    'clf__estimator__estimator__max_depth' : stats.randint(1, 10),
    'clf__estimator__estimator__max_features' : ['sqrt', 'log2', None],
}

# Candidates searched over the preprocessing space only.
untuned_candidates = [
    ExtraTreesClassifier(random_state=0),
    RandomForestClassifier(random_state=0),
    LogisticRegression(random_state=0),
    MultinomialNB(),
    GaussianProcessClassifier(random_state=0),
    KNeighborsClassifier(),
    MLPClassifier(random_state=0),
    LinearSVC(random_state=0),
    SVC(random_state=0),
    SGDClassifier(random_state=0),
]

# One search space per candidate, in the same order as before.
clf_params = [adaboost_space, gradient_boosting_space]
clf_params.extend(
    {'clf__estimator__estimator' : [candidate], **pre_params}
    for candidate in untuned_candidates
)

### `RandomizedSearchCV`

In [None]:
# Randomised search over the full pipeline: 10 sampled candidates, each
# evaluated with 10-fold cross-validation, using all available cores.
search_settings = dict(
    n_iter=10,
    cv=10,
    n_jobs=-1,
    random_state=0,
    verbose=5,
)

rcv = RandomizedSearchCV(pipe_clf, clf_params, **search_settings)

In [None]:
rcv.fit(X_train, y_train)

In [None]:
rcv.best_score_

In [None]:
# pipe = Pipeline([
#     ('cleaner', pipe_cleaner),
#     ('decomp', pipe_decomp),
#     ('cluster', pipe_cluster),
#     ('clf', pipe_clf),
# ], verbose=True)

In [None]:
# pipe.set_params(**params)

# _ = pipe.fit_transform(X)

In [None]:
# fig, ax = plt.subplots(figsize=(16,8))

# sns.scatterplot(
#     data=pd.DataFrame(_).join(train.type),
#     x=0,
#     y=1,
#     hue='type',
#     ax=ax,
#     s=100
# );

In [None]:
# This cell used to plot `_`, whose only assignment lives in a commented-out
# cell; on a fresh kernel `_` is just the previous cell's output. The plotted
# matrix is now computed explicitly.
# NOTE(review): this uses the preprocessing steps of the best pipeline found
# by the search (every step except the final classifier) - confirm this is
# the projection that was meant to be shown.
best_preprocessing = rcv.best_estimator_[:-1]
projected = pd.DataFrame(best_preprocessing.transform(X))

fig, ax = plt.subplots(figsize=(16,8))

# First two output columns, coloured by creature type (joined on row index).
sns.scatterplot(
    data=projected.join(train.type),
    x=0,
    y=1,
    hue='type',
    ax=ax,
    s=100
);

## `bone_length`

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

# Arguments shared by both layers: `bone_length` per creature type, drawn on
# the axes created above.
by_type = dict(data=train, x='type', y='bone_length', ax=ax)

# Desaturated letter-value boxes in the background ...
sns.boxenplot(saturation=0.2, **by_type)

# ... with the individual observations drawn on top.
sns.swarmplot(**by_type);

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

# One cumulative density curve of `bone_length` per creature type, all drawn
# on the same axes and labelled with the type.
for creature_type in train['type'].unique():
    values_for_type = train.loc[train['type'] == creature_type, 'bone_length']

    sns.kdeplot(
        data=values_for_type,
        label=creature_type,
        cumulative=True,
        cut=0,
        ax=ax,
    )

In [None]:
# Smallest `bone_length` observed for each creature type, in ascending order.
lower_bone_length = (
    train
    .groupby(['type'])['bone_length']
    .min()
    .sort_values()
)

lower_bone_length

In [None]:
# Flag rows whose `bone_length` is below the second-smallest per-type minimum.
bone_length_floor = lower_bone_length.iloc[1]
train['bone_length_lower'] = train['bone_length'].lt(bone_length_floor)

In [None]:
# Largest `bone_length` observed for each creature type, in ascending order.
upper_bone_length = (
    train
    .groupby(['type'])['bone_length']
    .max()
    .sort_values()
)

upper_bone_length

In [None]:
# Flag rows whose `bone_length` exceeds the per-type maximum at position 1 of
# the ascending list.
# NOTE(review): position 1 is the second-smallest maximum; it is also the
# second-largest only when there are exactly three types - confirm intent.
bone_length_ceiling = upper_bone_length.iloc[1]
train['bone_length_upper'] = train['bone_length'].gt(bone_length_ceiling)

## `rotting_flesh`

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

sns.boxenplot(
    data=train,
    x='type',
    y='rotting_flesh',
    saturation=0.2
)

sns.swarmplot(
    data=train,
    x='type',
    y='rotting_flesh',
);

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

for ggg in train.type.unique():

    sns.kdeplot(
        data=train.loc[train.type == ggg, 'rotting_flesh'],
        cumulative=True,
        cut=0,
        label=ggg,
        ax=ax,
    )

In [None]:
lower_rotting_flesh = train.groupby([
    'type',
]).rotting_flesh.min().sort_values()

lower_rotting_flesh

In [None]:
train['rotting_flesh_lower'] = train.rotting_flesh < lower_rotting_flesh.iloc[1]

In [None]:
upper_rotting_flesh = train.groupby([
    'type',
]).rotting_flesh.max().sort_values()

upper_rotting_flesh

In [None]:
train['rotting_flesh_upper'] = train.rotting_flesh > upper_rotting_flesh.iloc[1]

## `hair_length`

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

sns.boxenplot(
    data=train,
    x='type',
    y='hair_length',
    saturation=0.2
)

sns.swarmplot(
    data=train,
    x='type',
    y='hair_length',
);

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

for ggg in train.type.unique():

    sns.kdeplot(
        data=train.loc[train.type == ggg, 'hair_length'],
        cumulative=True,
        cut=0,
        label=ggg,
        ax=ax,
    )

In [None]:
lower_hair_length = train.groupby([
    'type',
]).hair_length.min().sort_values()

lower_hair_length

In [None]:
train['hair_length_lower'] = train.hair_length < lower_hair_length.iloc[1]

In [None]:
upper_hair_length = train.groupby([
    'type',
]).hair_length.max().sort_values()

upper_hair_length

In [None]:
train['hair_length_upper'] = train.hair_length > upper_hair_length.iloc[1]

## `has_soul`

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

sns.boxenplot(
    data=train,
    x='type',
    y='has_soul',
    saturation=0.2
)

sns.swarmplot(
    data=train,
    x='type',
    y='has_soul',
);

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

for ggg in train.type.unique():

    sns.kdeplot(
        data=train.loc[train.type == ggg, 'has_soul'],
        cumulative=True,
        cut=0,
        label=ggg,
        ax=ax,
    )

In [None]:
lower_has_soul = train.groupby([
    'type',
]).has_soul.min().sort_values()

lower_has_soul

In [None]:
train['has_soul_lower'] = train.has_soul < lower_has_soul.iloc[1]

In [None]:
upper_has_soul = train.groupby([
    'type',
]).has_soul.max().sort_values()

upper_has_soul

In [None]:
train['has_soul_upper'] = train.has_soul > upper_has_soul.iloc[1]

## Color Counts

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

# Number of distinct ids per (type, color), laid out as type rows x color columns.
ids_per_type_color = train.groupby(['type', 'color'])['id'].nunique().unstack()

# Turn each row into percentages of that type's total, rounded to 4 decimal
# places before scaling by 100.
row_totals = ids_per_type_color.sum(axis=1)
color_share_pct = ids_per_type_color.div(row_totals, axis=0).round(4) * 100

sns.heatmap(
    data=color_share_pct,
    annot=True,
    annot_kws={'fontsize' : 12},
    fmt='g',
    cmap='Reds',
    ax=ax,
);

### `bone_length`

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

# Arguments shared by both layers: `bone_length` per colour, split by type.
by_color_and_type = dict(data=train, x='color', y='bone_length', hue='type', ax=ax)

# Desaturated letter-value boxes in the background ...
sns.boxenplot(saturation=0.2, **by_color_and_type)

# ... with the individual observations on top, dodged so each type sits over
# its own box.
sns.swarmplot(dodge=True, **by_color_and_type);

In [None]:
# Smallest `bone_length` for every (color, type) pair; the result is indexed by
# color first, then type.
lower_color_bone_length = (
    train
    .groupby(['color', 'type'])['bone_length']
    .min()
)

lower_color_bone_length

In [None]:
# Per colour: flag rows of that colour whose `bone_length` is below the
# second-smallest of the colour's per-type minima.
for color in lower_color_bone_length.index.levels[0]:
    minima_by_type = lower_color_bone_length.loc[color].sort_values()
    cutoff = minima_by_type.iloc[1]
    is_color = train['color'].eq(color)
    below_cutoff = train['bone_length'].lt(cutoff)
    train[f'{color}_bone_length_lower'] = is_color & below_cutoff

In [None]:
# Largest `bone_length` for every (color, type) pair; the result is indexed by
# color first, then type.
upper_color_bone_length = (
    train
    .groupby(['color', 'type'])['bone_length']
    .max()
)

upper_color_bone_length

In [None]:
# Per colour: flag rows of that colour whose `bone_length` exceeds the
# per-type maximum at position 1 of the colour's ascending list of maxima.
# NOTE(review): position 1 is the second-smallest maximum; it matches the
# second-largest only when the colour has exactly three types - confirm intent.
for color in upper_color_bone_length.index.levels[0]:
    maxima_by_type = upper_color_bone_length.loc[color].sort_values()
    cutoff = maxima_by_type.iloc[1]
    is_color = train['color'].eq(color)
    above_cutoff = train['bone_length'].gt(cutoff)
    train[f'{color}_bone_length_upper'] = is_color & above_cutoff

### `rotting_flesh`

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

sns.boxenplot(
    data=train,
    x='color',
    y='rotting_flesh',
    hue='type',
    ax=ax,
    saturation=0.2,
)

sns.swarmplot(
    data=train,
    x='color',
    y='rotting_flesh',
    hue='type',
    ax=ax,
    dodge=True,
);

In [None]:
lower_color_rotting_flesh = train.groupby([
    'color',
    'type',
]).rotting_flesh.min()

lower_color_rotting_flesh

In [None]:
# Per colour: flag rows of that colour whose `rotting_flesh` is below the
# second-smallest of the colour's per-type minima.
for color in lower_color_rotting_flesh.index.levels[0]:
    minima_by_type = lower_color_rotting_flesh.loc[color].sort_values()
    cutoff = minima_by_type.iloc[1]
    is_color = train['color'].eq(color)
    below_cutoff = train['rotting_flesh'].lt(cutoff)
    train[f'{color}_rotting_flesh_lower'] = is_color & below_cutoff

In [None]:
upper_color_rotting_flesh = train.groupby([
    'color',
    'type',
]).rotting_flesh.max()

upper_color_rotting_flesh

In [None]:
# Per colour: flag rows of that colour whose `rotting_flesh` exceeds the
# per-type maximum at position 1 of the colour's ascending list of maxima.
# NOTE(review): position 1 is the second-smallest maximum; it matches the
# second-largest only when the colour has exactly three types - confirm intent.
for color in upper_color_rotting_flesh.index.levels[0]:
    maxima_by_type = upper_color_rotting_flesh.loc[color].sort_values()
    cutoff = maxima_by_type.iloc[1]
    is_color = train['color'].eq(color)
    above_cutoff = train['rotting_flesh'].gt(cutoff)
    train[f'{color}_rotting_flesh_upper'] = is_color & above_cutoff

### `hair_length`

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

sns.boxenplot(
    data=train,
    x='color',
    y='hair_length',
    hue='type',
    ax=ax,
    saturation=0.2,
)

sns.swarmplot(
    data=train,
    x='color',
    y='hair_length',
    hue='type',
    ax=ax,
    dodge=True,
);

In [None]:
lower_color_hair_length = train.groupby([
    'color',
    'type',
]).hair_length.min()

lower_color_hair_length

In [None]:
# Per colour: flag rows of that colour whose `hair_length` is below the
# second-smallest of the colour's per-type minima.
for color in lower_color_hair_length.index.levels[0]:
    minima_by_type = lower_color_hair_length.loc[color].sort_values()
    cutoff = minima_by_type.iloc[1]
    is_color = train['color'].eq(color)
    below_cutoff = train['hair_length'].lt(cutoff)
    train[f'{color}_hair_length_lower'] = is_color & below_cutoff

In [None]:
upper_color_hair_length = train.groupby([
    'color',
    'type',
]).hair_length.max()

upper_color_hair_length

In [None]:
# Per colour: flag rows of that colour whose `hair_length` exceeds the
# per-type maximum at position 1 of the colour's ascending list of maxima.
# NOTE(review): position 1 is the second-smallest maximum; it matches the
# second-largest only when the colour has exactly three types - confirm intent.
for color in upper_color_hair_length.index.levels[0]:
    maxima_by_type = upper_color_hair_length.loc[color].sort_values()
    cutoff = maxima_by_type.iloc[1]
    is_color = train['color'].eq(color)
    above_cutoff = train['hair_length'].gt(cutoff)
    train[f'{color}_hair_length_upper'] = is_color & above_cutoff

### `has_soul`

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

sns.boxenplot(
    data=train,
    x='color',
    y='has_soul',
    hue='type',
    ax=ax,
    saturation=0.2,
)

sns.swarmplot(
    data=train,
    x='color',
    y='has_soul',
    hue='type',
    ax=ax,
    dodge=True,
);

In [None]:
lower_color_has_soul = train.groupby([
    'color',
    'type',
]).has_soul.min()

lower_color_has_soul

In [None]:
# Per colour: flag rows of that colour whose `has_soul` is below the
# second-smallest of the colour's per-type minima.
for color in lower_color_has_soul.index.levels[0]:
    minima_by_type = lower_color_has_soul.loc[color].sort_values()
    cutoff = minima_by_type.iloc[1]
    is_color = train['color'].eq(color)
    below_cutoff = train['has_soul'].lt(cutoff)
    train[f'{color}_has_soul_lower'] = is_color & below_cutoff

In [None]:
upper_color_has_soul = train.groupby([
    'color',
    'type',
]).has_soul.max()

upper_color_has_soul

In [None]:
# Per colour: flag rows of that colour whose `has_soul` exceeds the per-type
# maximum at position 1 of the colour's ascending list of maxima.
# NOTE(review): position 1 is the second-smallest maximum; it matches the
# second-largest only when the colour has exactly three types - confirm intent.
for color in upper_color_has_soul.index.levels[0]:
    maxima_by_type = upper_color_has_soul.loc[color].sort_values()
    cutoff = maxima_by_type.iloc[1]
    is_color = train['color'].eq(color)
    above_cutoff = train['has_soul'].gt(cutoff)
    train[f'{color}_has_soul_upper'] = is_color & above_cutoff

## Continuous Interactions

### 2-way

In [None]:
# Pairwise products of the four continuous features (all six combinations).

# bone_length x {rotting_flesh, hair_length, has_soul}
train['bone_length_rotting_flesh'] = train['bone_length'].mul(train['rotting_flesh'])
train['bone_length_hair_length'] = train['bone_length'].mul(train['hair_length'])
train['bone_length_has_soul'] = train['bone_length'].mul(train['has_soul'])

# rotting_flesh x {hair_length, has_soul}
train['rotting_flesh_hair_length'] = train['rotting_flesh'].mul(train['hair_length'])
train['rotting_flesh_has_soul'] = train['rotting_flesh'].mul(train['has_soul'])

# hair_length x has_soul
train['hair_length_has_soul'] = train['hair_length'].mul(train['has_soul'])

#### `bone_length_rotting_flesh`

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

sns.boxenplot(
    data=train,
    x='type',
    y='bone_length_rotting_flesh',
    saturation=0.2
)

sns.swarmplot(
    data=train,
    x='type',
    y='bone_length_rotting_flesh',
);

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

for ggg in train.type.unique():

    sns.kdeplot(
        data=train.loc[train.type == ggg, 'bone_length_rotting_flesh'],
        cumulative=True,
        cut=0,
        label=ggg,
        ax=ax,
    )

In [None]:
lower_bone_length_rotting_flesh = train.groupby([
    'type',
]).bone_length_rotting_flesh.min().sort_values()

lower_bone_length_rotting_flesh

In [None]:
train['bone_length_rotting_flesh_lower'] = train.bone_length_rotting_flesh < lower_bone_length_rotting_flesh.iloc[1]

In [None]:
upper_bone_length_rotting_flesh = train.groupby([
    'type',
]).bone_length_rotting_flesh.max().sort_values()

upper_bone_length_rotting_flesh

In [None]:
train['bone_length_rotting_flesh_upper'] = train.bone_length_rotting_flesh > upper_bone_length_rotting_flesh.iloc[1]

#### `bone_length_hair_length`

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

sns.boxenplot(
    data=train,
    x='type',
    y='bone_length_hair_length',
    saturation=0.2
)

sns.swarmplot(
    data=train,
    x='type',
    y='bone_length_hair_length',
);

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

for ggg in train.type.unique():

    sns.kdeplot(
        data=train.loc[train.type == ggg, 'bone_length_hair_length'],
        cumulative=True,
        cut=0,
        label=ggg,
        ax=ax,
    )

In [None]:
lower_bone_length_hair_length = train.groupby([
    'type',
]).bone_length_hair_length.min().sort_values()

lower_bone_length_hair_length

In [None]:
train['bone_length_hair_length_lower'] = train.bone_length_hair_length < lower_bone_length_hair_length.iloc[1]

In [None]:
upper_bone_length_hair_length = train.groupby([
    'type',
]).bone_length_hair_length.max().sort_values()

upper_bone_length_hair_length

In [None]:
train['bone_length_hair_length_upper'] = train.bone_length_hair_length > upper_bone_length_hair_length.iloc[1]

#### `bone_length_has_soul`

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

sns.boxenplot(
    data=train,
    x='type',
    y='bone_length_has_soul',
    saturation=0.2
)

sns.swarmplot(
    data=train,
    x='type',
    y='bone_length_has_soul',
);

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

for ggg in train.type.unique():

    sns.kdeplot(
        data=train.loc[train.type == ggg, 'bone_length_has_soul'],
        cumulative=True,
        cut=0,
        label=ggg,
        ax=ax,
    )

In [None]:
lower_bone_length_has_soul = train.groupby([
    'type',
]).bone_length_has_soul.min().sort_values()

lower_bone_length_has_soul

In [None]:
train['bone_length_has_soul_lower'] = train.bone_length_has_soul < lower_bone_length_has_soul.iloc[1]

In [None]:
upper_bone_length_has_soul = train.groupby([
    'type',
]).bone_length_has_soul.max().sort_values()

upper_bone_length_has_soul

In [None]:
train['bone_length_has_soul_upper'] = train.bone_length_has_soul > upper_bone_length_has_soul.iloc[1]

#### `rotting_flesh_hair_length`

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

sns.boxenplot(
    data=train,
    x='type',
    y='rotting_flesh_hair_length',
    saturation=0.2
)

sns.swarmplot(
    data=train,
    x='type',
    y='rotting_flesh_hair_length',
);

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

for ggg in train.type.unique():

    sns.kdeplot(
        data=train.loc[train.type == ggg, 'rotting_flesh_hair_length'],
        cumulative=True,
        cut=0,
        label=ggg,
        ax=ax,
    )

In [None]:
lower_rotting_flesh_hair_length = train.groupby([
    'type',
]).rotting_flesh_hair_length.min().sort_values()

lower_rotting_flesh_hair_length

In [None]:
train['rotting_flesh_hair_length_lower'] = train.rotting_flesh_hair_length < lower_rotting_flesh_hair_length.iloc[1]

In [None]:
upper_rotting_flesh_hair_length = train.groupby([
    'type',
]).rotting_flesh_hair_length.max().sort_values()

upper_rotting_flesh_hair_length

In [None]:
train['rotting_flesh_hair_length_upper'] = train.rotting_flesh_hair_length > upper_rotting_flesh_hair_length.iloc[1]

#### `rotting_flesh_has_soul`

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

sns.boxenplot(
    data=train,
    x='type',
    y='rotting_flesh_has_soul',
    saturation=0.2
)

sns.swarmplot(
    data=train,
    x='type',
    y='rotting_flesh_has_soul',
);

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

for ggg in train.type.unique():

    sns.kdeplot(
        data=train.loc[train.type == ggg, 'rotting_flesh_has_soul'],
        cumulative=True,
        cut=0,
        label=ggg,
        ax=ax,
    )

In [None]:
lower_rotting_flesh_has_soul = train.groupby([
    'type',
]).rotting_flesh_has_soul.min().sort_values()

lower_rotting_flesh_has_soul

In [None]:
train['rotting_flesh_has_soul_lower'] = train.rotting_flesh_has_soul < lower_rotting_flesh_has_soul.iloc[1]

In [None]:
upper_rotting_flesh_has_soul = train.groupby([
    'type',
]).rotting_flesh_has_soul.max().sort_values()

upper_rotting_flesh_has_soul

In [None]:
train['rotting_flesh_has_soul_upper'] = train.rotting_flesh_has_soul > upper_rotting_flesh_has_soul.iloc[1]

#### `hair_length_has_soul`

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

sns.boxenplot(
    data=train,
    x='type',
    y='hair_length_has_soul',
    saturation=0.2
)

sns.swarmplot(
    data=train,
    x='type',
    y='hair_length_has_soul',
);

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

for ggg in train.type.unique():

    sns.kdeplot(
        data=train.loc[train.type == ggg, 'hair_length_has_soul'],
        cumulative=True,
        cut=0,
        label=ggg,
        ax=ax,
    )

In [None]:
lower_hair_length_has_soul = train.groupby([
    'type',
]).hair_length_has_soul.min().sort_values()

lower_hair_length_has_soul

In [None]:
train['hair_length_has_soul_lower'] = train.hair_length_has_soul < lower_hair_length_has_soul.iloc[1]

In [None]:
upper_hair_length_has_soul = train.groupby([
    'type',
]).hair_length_has_soul.max().sort_values()

upper_hair_length_has_soul

In [None]:
train['hair_length_has_soul_upper'] = train.hair_length_has_soul > upper_hair_length_has_soul.iloc[1]

### 3-way

In [None]:
# Three-way products, each built from an existing pairwise product and one
# further continuous feature (all four combinations).
train['bone_length_rotting_flesh_hair_length'] = train['bone_length_rotting_flesh'].mul(train['hair_length'])
train['bone_length_rotting_flesh_has_soul'] = train['bone_length_rotting_flesh'].mul(train['has_soul'])
train['bone_length_hair_length_has_soul'] = train['bone_length_hair_length'].mul(train['has_soul'])
train['rotting_flesh_hair_length_has_soul'] = train['rotting_flesh_hair_length'].mul(train['has_soul'])

#### `bone_length_rotting_flesh_hair_length`

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

sns.boxenplot(
    data=train,
    x='type',
    y='bone_length_rotting_flesh_hair_length',
    saturation=0.2
)

sns.swarmplot(
    data=train,
    x='type',
    y='bone_length_rotting_flesh_hair_length',
);

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

for ggg in train.type.unique():

    sns.kdeplot(
        data=train.loc[train.type == ggg, 'bone_length_rotting_flesh_hair_length'],
        cumulative=True,
        cut=0,
        label=ggg,
        ax=ax,
    )

In [None]:
lower_bone_length_rotting_flesh_hair_length = train.groupby([
    'type',
]).bone_length_rotting_flesh_hair_length.min().sort_values()

lower_bone_length_rotting_flesh_hair_length

In [None]:
train['bone_length_rotting_flesh_hair_length_lower'] = train.bone_length_rotting_flesh_hair_length < lower_bone_length_rotting_flesh_hair_length.iloc[1]

In [None]:
upper_bone_length_rotting_flesh_hair_length = train.groupby([
    'type',
]).bone_length_rotting_flesh_hair_length.max().sort_values()

upper_bone_length_rotting_flesh_hair_length

In [None]:
train['bone_length_rotting_flesh_hair_length_upper'] = train.bone_length_rotting_flesh_hair_length > upper_bone_length_rotting_flesh_hair_length.iloc[1]

#### `bone_length_rotting_flesh_has_soul`

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

sns.boxenplot(
    data=train,
    x='type',
    y='bone_length_rotting_flesh_has_soul',
    saturation=0.2
)

sns.swarmplot(
    data=train,
    x='type',
    y='bone_length_rotting_flesh_has_soul',
);

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

for ggg in train.type.unique():

    sns.kdeplot(
        data=train.loc[train.type == ggg, 'bone_length_rotting_flesh_has_soul'],
        cumulative=True,
        cut=0,
        label=ggg,
        ax=ax,
    )

In [None]:
lower_bone_length_rotting_flesh_has_soul = train.groupby([
    'type',
]).bone_length_rotting_flesh_has_soul.min().sort_values()

lower_bone_length_rotting_flesh_has_soul

In [None]:
train['bone_length_rotting_flesh_has_soul_lower'] = train.bone_length_rotting_flesh_has_soul < lower_bone_length_rotting_flesh_has_soul.iloc[1]

In [None]:
upper_bone_length_rotting_flesh_has_soul = train.groupby([
    'type',
]).bone_length_rotting_flesh_has_soul.max().sort_values()

upper_bone_length_rotting_flesh_has_soul

In [None]:
train['bone_length_rotting_flesh_has_soul_upper'] = train.bone_length_rotting_flesh_has_soul > upper_bone_length_rotting_flesh_has_soul.iloc[1]

#### `bone_length_hair_length_has_soul`

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

sns.boxenplot(
    data=train,
    x='type',
    y='bone_length_hair_length_has_soul',
    saturation=0.2
)

sns.swarmplot(
    data=train,
    x='type',
    y='bone_length_hair_length_has_soul',
);

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

for ggg in train.type.unique():

    sns.kdeplot(
        data=train.loc[train.type == ggg, 'bone_length_hair_length_has_soul'],
        cumulative=True,
        cut=0,
        label=ggg,
        ax=ax,
    )

In [None]:
lower_bone_length_hair_length_has_soul = train.groupby([
    'type',
]).bone_length_hair_length_has_soul.min().sort_values()

lower_bone_length_hair_length_has_soul

In [None]:
train['bone_length_hair_length_has_soul_lower'] = train.bone_length_hair_length_has_soul < lower_bone_length_hair_length_has_soul.iloc[1]

In [None]:
upper_bone_length_hair_length_has_soul = train.groupby([
    'type',
]).bone_length_hair_length_has_soul.max().sort_values()

upper_bone_length_hair_length_has_soul

In [None]:
train['bone_length_hair_length_has_soul_upper'] = train.bone_length_hair_length_has_soul > upper_bone_length_hair_length_has_soul.iloc[1]

#### `rotting_flesh_hair_length_has_soul`

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

sns.boxenplot(
    data=train,
    x='type',
    y='rotting_flesh_hair_length_has_soul',
    saturation=0.2
)

sns.swarmplot(
    data=train,
    x='type',
    y='rotting_flesh_hair_length_has_soul',
);

In [None]:
fig, ax = plt.subplots(figsize=(16, 8))

# one cumulative density curve per type, all drawn on the same axes
for ggg in train['type'].unique():
    sns.kdeplot(data=train.loc[train['type'].eq(ggg), 'rotting_flesh_hair_length_has_soul'], cumulative=True, cut=0, label=ggg, ax=ax)

In [None]:
# smallest value observed within each type, ordered from smallest to largest
lower_rotting_flesh_hair_length_has_soul = train.groupby(by=['type'])['rotting_flesh_hair_length_has_soul'].min().sort_values()

lower_rotting_flesh_hair_length_has_soul

In [None]:
# flag rows below the second-smallest of the per-type minima
train['rotting_flesh_hair_length_has_soul_lower'] = train['rotting_flesh_hair_length_has_soul'].lt(lower_rotting_flesh_hair_length_has_soul.iloc[1])

In [None]:
# largest value observed within each type, ordered from smallest to largest
upper_rotting_flesh_hair_length_has_soul = train.groupby(by=['type'])['rotting_flesh_hair_length_has_soul'].max().sort_values()

upper_rotting_flesh_hair_length_has_soul

In [None]:
# flag rows above the second entry of the ascending per-type maxima
train['rotting_flesh_hair_length_has_soul_upper'] = train['rotting_flesh_hair_length_has_soul'].gt(upper_rotting_flesh_hair_length_has_soul.iloc[1])

### 4-way

In [None]:
# 4-way interaction: the existing 3-way product multiplied by has_soul
train['bone_length_rotting_flesh_hair_length_has_soul'] = train['bone_length_rotting_flesh_hair_length'].mul(train['has_soul'])

#### `bone_length_rotting_flesh_hair_length_has_soul`

In [None]:
# distribution by type: boxen plot with the individual observations swarmed on top
fig, ax = plt.subplots(figsize=(16, 8))
sns.boxenplot(x='type', y='bone_length_rotting_flesh_hair_length_has_soul', data=train, saturation=0.2, ax=ax)
sns.swarmplot(x='type', y='bone_length_rotting_flesh_hair_length_has_soul', data=train, ax=ax);

In [None]:
fig, ax = plt.subplots(figsize=(16, 8))

# one cumulative density curve per type, all drawn on the same axes
for ggg in train['type'].unique():
    sns.kdeplot(data=train.loc[train['type'].eq(ggg), 'bone_length_rotting_flesh_hair_length_has_soul'], cumulative=True, cut=0, label=ggg, ax=ax)

In [None]:
# smallest value observed within each type, ordered from smallest to largest
lower_bone_length_rotting_flesh_hair_length_has_soul = train.groupby(by=['type'])['bone_length_rotting_flesh_hair_length_has_soul'].min().sort_values()

lower_bone_length_rotting_flesh_hair_length_has_soul

In [None]:
# flag rows below the second-smallest of the per-type minima
train['bone_length_rotting_flesh_hair_length_has_soul_lower'] = train['bone_length_rotting_flesh_hair_length_has_soul'].lt(lower_bone_length_rotting_flesh_hair_length_has_soul.iloc[1])

In [None]:
# largest value observed within each type, ordered from smallest to largest
upper_bone_length_rotting_flesh_hair_length_has_soul = train.groupby(by=['type'])['bone_length_rotting_flesh_hair_length_has_soul'].max().sort_values()

upper_bone_length_rotting_flesh_hair_length_has_soul

In [None]:
# flag rows above the second entry of the ascending per-type maxima
train['bone_length_rotting_flesh_hair_length_has_soul_upper'] = train['bone_length_rotting_flesh_hair_length_has_soul'].gt(upper_bone_length_rotting_flesh_hair_length_has_soul.iloc[1])

In [None]:
# preview the first rows, including the interaction flags added above
train.head(n=5)

## Continuous Interactions + Color

### 2-way

#### `bone_length_rotting_flesh`

In [None]:
# distribution per color, split by type: boxen plot with the observations swarmed on top
fig, ax = plt.subplots(figsize=(16, 8))
sns.boxenplot(x='color', y='bone_length_rotting_flesh', hue='type', data=train, saturation=0.2, ax=ax)
sns.swarmplot(x='color', y='bone_length_rotting_flesh', hue='type', data=train, dodge=True, ax=ax);

In [None]:
# minimum value for every (color, type) pair
lower_color_bone_length_rotting_flesh = train.groupby(by=['color', 'type'])['bone_length_rotting_flesh'].min()

lower_color_bone_length_rotting_flesh

In [None]:
for color in lower_color_bone_length_rotting_flesh.index.levels[0]:
    # second-smallest of this color's per-type minima
    lcbl = lower_color_bone_length_rotting_flesh.loc[color].sort_values().iloc[1]
    # true only for rows of this color that fall below that cut-off
    train[f'{color}_bone_length_rotting_flesh_lower'] = train['color'].eq(color) & train['bone_length_rotting_flesh'].lt(lcbl)

In [None]:
# maximum value for every (color, type) pair
upper_color_bone_length_rotting_flesh = train.groupby(by=['color', 'type'])['bone_length_rotting_flesh'].max()

upper_color_bone_length_rotting_flesh

In [None]:
for color in upper_color_bone_length_rotting_flesh.index.levels[0]:
    # second entry of this color's ascending per-type maxima
    ucbl = upper_color_bone_length_rotting_flesh.loc[color].sort_values().iloc[1]
    # true only for rows of this color that exceed that cut-off
    train[f'{color}_bone_length_rotting_flesh_upper'] = train['color'].eq(color) & train['bone_length_rotting_flesh'].gt(ucbl)

#### `bone_length_hair_length`

In [None]:
# distribution per color, split by type: boxen plot with the observations swarmed on top
fig, ax = plt.subplots(figsize=(16, 8))
sns.boxenplot(x='color', y='bone_length_hair_length', hue='type', data=train, saturation=0.2, ax=ax)
sns.swarmplot(x='color', y='bone_length_hair_length', hue='type', data=train, dodge=True, ax=ax);

In [None]:
# minimum value for every (color, type) pair
lower_color_bone_length_hair_length = train.groupby(by=['color', 'type'])['bone_length_hair_length'].min()

lower_color_bone_length_hair_length

In [None]:
for color in lower_color_bone_length_hair_length.index.levels[0]:
    # second-smallest of this color's per-type minima
    lcbl = lower_color_bone_length_hair_length.loc[color].sort_values().iloc[1]
    # true only for rows of this color that fall below that cut-off
    train[f'{color}_bone_length_hair_length_lower'] = train['color'].eq(color) & train['bone_length_hair_length'].lt(lcbl)

In [None]:
# maximum value for every (color, type) pair
upper_color_bone_length_hair_length = train.groupby(by=['color', 'type'])['bone_length_hair_length'].max()

upper_color_bone_length_hair_length

In [None]:
for color in upper_color_bone_length_hair_length.index.levels[0]:
    # second entry of this color's ascending per-type maxima
    ucbl = upper_color_bone_length_hair_length.loc[color].sort_values().iloc[1]
    # true only for rows of this color that exceed that cut-off
    train[f'{color}_bone_length_hair_length_upper'] = train['color'].eq(color) & train['bone_length_hair_length'].gt(ucbl)

#### `bone_length_has_soul`

In [None]:
# distribution per color, split by type: boxen plot with the observations swarmed on top
fig, ax = plt.subplots(figsize=(16, 8))
sns.boxenplot(x='color', y='bone_length_has_soul', hue='type', data=train, saturation=0.2, ax=ax)
sns.swarmplot(x='color', y='bone_length_has_soul', hue='type', data=train, dodge=True, ax=ax);

In [None]:
# minimum value for every (color, type) pair
lower_color_bone_length_has_soul = train.groupby(by=['color', 'type'])['bone_length_has_soul'].min()

lower_color_bone_length_has_soul

In [None]:
for color in lower_color_bone_length_has_soul.index.levels[0]:
    # second-smallest of this color's per-type minima
    lcbl = lower_color_bone_length_has_soul.loc[color].sort_values().iloc[1]
    # true only for rows of this color that fall below that cut-off
    train[f'{color}_bone_length_has_soul_lower'] = train['color'].eq(color) & train['bone_length_has_soul'].lt(lcbl)

In [None]:
# maximum value for every (color, type) pair
upper_color_bone_length_has_soul = train.groupby(by=['color', 'type'])['bone_length_has_soul'].max()

upper_color_bone_length_has_soul

In [None]:
for color in upper_color_bone_length_has_soul.index.levels[0]:
    # second entry of this color's ascending per-type maxima
    ucbl = upper_color_bone_length_has_soul.loc[color].sort_values().iloc[1]
    # true only for rows of this color that exceed that cut-off
    train[f'{color}_bone_length_has_soul_upper'] = train['color'].eq(color) & train['bone_length_has_soul'].gt(ucbl)

#### `rotting_flesh_hair_length`

In [None]:
# distribution per color, split by type: boxen plot with the observations swarmed on top
fig, ax = plt.subplots(figsize=(16, 8))
sns.boxenplot(x='color', y='rotting_flesh_hair_length', hue='type', data=train, saturation=0.2, ax=ax)
sns.swarmplot(x='color', y='rotting_flesh_hair_length', hue='type', data=train, dodge=True, ax=ax);

In [None]:
# minimum value for every (color, type) pair
lower_color_rotting_flesh_hair_length = train.groupby(by=['color', 'type'])['rotting_flesh_hair_length'].min()

lower_color_rotting_flesh_hair_length

In [None]:
for color in lower_color_rotting_flesh_hair_length.index.levels[0]:
    # second-smallest of this color's per-type minima
    lcbl = lower_color_rotting_flesh_hair_length.loc[color].sort_values().iloc[1]
    # true only for rows of this color that fall below that cut-off
    train[f'{color}_rotting_flesh_hair_length_lower'] = train['color'].eq(color) & train['rotting_flesh_hair_length'].lt(lcbl)

In [None]:
# maximum value for every (color, type) pair
upper_color_rotting_flesh_hair_length = train.groupby(by=['color', 'type'])['rotting_flesh_hair_length'].max()

upper_color_rotting_flesh_hair_length

In [None]:
for color in upper_color_rotting_flesh_hair_length.index.levels[0]:
    # second entry of this color's ascending per-type maxima
    ucbl = upper_color_rotting_flesh_hair_length.loc[color].sort_values().iloc[1]
    # true only for rows of this color that exceed that cut-off
    train[f'{color}_rotting_flesh_hair_length_upper'] = train['color'].eq(color) & train['rotting_flesh_hair_length'].gt(ucbl)

#### `rotting_flesh_has_soul`

In [None]:
# distribution per color, split by type: boxen plot with the observations swarmed on top
fig, ax = plt.subplots(figsize=(16, 8))
sns.boxenplot(x='color', y='rotting_flesh_has_soul', hue='type', data=train, saturation=0.2, ax=ax)
sns.swarmplot(x='color', y='rotting_flesh_has_soul', hue='type', data=train, dodge=True, ax=ax);

In [None]:
# minimum value for every (color, type) pair
lower_color_rotting_flesh_has_soul = train.groupby(by=['color', 'type'])['rotting_flesh_has_soul'].min()

lower_color_rotting_flesh_has_soul

In [None]:
for color in lower_color_rotting_flesh_has_soul.index.levels[0]:
    # second-smallest of this color's per-type minima
    lcbl = lower_color_rotting_flesh_has_soul.loc[color].sort_values().iloc[1]
    # true only for rows of this color that fall below that cut-off
    train[f'{color}_rotting_flesh_has_soul_lower'] = train['color'].eq(color) & train['rotting_flesh_has_soul'].lt(lcbl)

In [None]:
# maximum value for every (color, type) pair
upper_color_rotting_flesh_has_soul = train.groupby(by=['color', 'type'])['rotting_flesh_has_soul'].max()

upper_color_rotting_flesh_has_soul

In [None]:
for color in upper_color_rotting_flesh_has_soul.index.levels[0]:
    # second entry of this color's ascending per-type maxima
    ucbl = upper_color_rotting_flesh_has_soul.loc[color].sort_values().iloc[1]
    # true only for rows of this color that exceed that cut-off
    train[f'{color}_rotting_flesh_has_soul_upper'] = train['color'].eq(color) & train['rotting_flesh_has_soul'].gt(ucbl)

#### `hair_length_has_soul`

In [None]:
# distribution per color, split by type: boxen plot with the observations swarmed on top
fig, ax = plt.subplots(figsize=(16, 8))
sns.boxenplot(x='color', y='hair_length_has_soul', hue='type', data=train, saturation=0.2, ax=ax)
sns.swarmplot(x='color', y='hair_length_has_soul', hue='type', data=train, dodge=True, ax=ax);

In [None]:
# minimum value for every (color, type) pair
lower_color_hair_length_has_soul = train.groupby(by=['color', 'type'])['hair_length_has_soul'].min()

lower_color_hair_length_has_soul

In [None]:
for color in lower_color_hair_length_has_soul.index.levels[0]:
    # second-smallest of this color's per-type minima
    lcbl = lower_color_hair_length_has_soul.loc[color].sort_values().iloc[1]
    # true only for rows of this color that fall below that cut-off
    train[f'{color}_hair_length_has_soul_lower'] = train['color'].eq(color) & train['hair_length_has_soul'].lt(lcbl)

In [None]:
# maximum value for every (color, type) pair
upper_color_hair_length_has_soul = train.groupby(by=['color', 'type'])['hair_length_has_soul'].max()

upper_color_hair_length_has_soul

In [None]:
for color in upper_color_hair_length_has_soul.index.levels[0]:
    # second entry of this color's ascending per-type maxima
    ucbl = upper_color_hair_length_has_soul.loc[color].sort_values().iloc[1]
    # true only for rows of this color that exceed that cut-off
    train[f'{color}_hair_length_has_soul_upper'] = train['color'].eq(color) & train['hair_length_has_soul'].gt(ucbl)

### 3-way

#### `bone_length_rotting_flesh_hair_length`

In [None]:
# distribution per color, split by type: boxen plot with the observations swarmed on top
fig, ax = plt.subplots(figsize=(16, 8))
sns.boxenplot(x='color', y='bone_length_rotting_flesh_hair_length', hue='type', data=train, saturation=0.2, ax=ax)
sns.swarmplot(x='color', y='bone_length_rotting_flesh_hair_length', hue='type', data=train, dodge=True, ax=ax);

In [None]:
# minimum value for every (color, type) pair
lower_color_bone_length_rotting_flesh_hair_length = train.groupby(by=['color', 'type'])['bone_length_rotting_flesh_hair_length'].min()

lower_color_bone_length_rotting_flesh_hair_length

In [None]:
for color in lower_color_bone_length_rotting_flesh_hair_length.index.levels[0]:
    # second-smallest of this color's per-type minima
    lcbl = lower_color_bone_length_rotting_flesh_hair_length.loc[color].sort_values().iloc[1]
    # true only for rows of this color that fall below that cut-off
    train[f'{color}_bone_length_rotting_flesh_hair_length_lower'] = train['color'].eq(color) & train['bone_length_rotting_flesh_hair_length'].lt(lcbl)

In [None]:
# maximum value for every (color, type) pair
upper_color_bone_length_rotting_flesh_hair_length = train.groupby(by=['color', 'type'])['bone_length_rotting_flesh_hair_length'].max()

upper_color_bone_length_rotting_flesh_hair_length

In [None]:
for color in upper_color_bone_length_rotting_flesh_hair_length.index.levels[0]:
    # second entry of this color's ascending per-type maxima
    ucbl = upper_color_bone_length_rotting_flesh_hair_length.loc[color].sort_values().iloc[1]
    # true only for rows of this color that exceed that cut-off
    train[f'{color}_bone_length_rotting_flesh_hair_length_upper'] = train['color'].eq(color) & train['bone_length_rotting_flesh_hair_length'].gt(ucbl)

#### `bone_length_rotting_flesh_has_soul`

In [None]:
# distribution per color, split by type: boxen plot with the observations swarmed on top
fig, ax = plt.subplots(figsize=(16, 8))
sns.boxenplot(x='color', y='bone_length_rotting_flesh_has_soul', hue='type', data=train, saturation=0.2, ax=ax)
sns.swarmplot(x='color', y='bone_length_rotting_flesh_has_soul', hue='type', data=train, dodge=True, ax=ax);

In [None]:
# minimum value for every (color, type) pair
lower_color_bone_length_rotting_flesh_has_soul = train.groupby(by=['color', 'type'])['bone_length_rotting_flesh_has_soul'].min()

lower_color_bone_length_rotting_flesh_has_soul

In [None]:
for color in lower_color_bone_length_rotting_flesh_has_soul.index.levels[0]:
    # second-smallest of this color's per-type minima
    lcbl = lower_color_bone_length_rotting_flesh_has_soul.loc[color].sort_values().iloc[1]
    # true only for rows of this color that fall below that cut-off
    train[f'{color}_bone_length_rotting_flesh_has_soul_lower'] = train['color'].eq(color) & train['bone_length_rotting_flesh_has_soul'].lt(lcbl)

In [None]:
# maximum value for every (color, type) pair
upper_color_bone_length_rotting_flesh_has_soul = train.groupby(by=['color', 'type'])['bone_length_rotting_flesh_has_soul'].max()

upper_color_bone_length_rotting_flesh_has_soul

In [None]:
for color in upper_color_bone_length_rotting_flesh_has_soul.index.levels[0]:
    # second entry of this color's ascending per-type maxima
    ucbl = upper_color_bone_length_rotting_flesh_has_soul.loc[color].sort_values().iloc[1]
    # true only for rows of this color that exceed that cut-off
    train[f'{color}_bone_length_rotting_flesh_has_soul_upper'] = train['color'].eq(color) & train['bone_length_rotting_flesh_has_soul'].gt(ucbl)

#### `bone_length_hair_length_has_soul`

In [None]:
# distribution per color, split by type: boxen plot with the observations swarmed on top
fig, ax = plt.subplots(figsize=(16, 8))
sns.boxenplot(x='color', y='bone_length_hair_length_has_soul', hue='type', data=train, saturation=0.2, ax=ax)
sns.swarmplot(x='color', y='bone_length_hair_length_has_soul', hue='type', data=train, dodge=True, ax=ax);

In [None]:
# minimum value for every (color, type) pair
lower_color_bone_length_hair_length_has_soul = train.groupby(by=['color', 'type'])['bone_length_hair_length_has_soul'].min()

lower_color_bone_length_hair_length_has_soul

In [None]:
for color in lower_color_bone_length_hair_length_has_soul.index.levels[0]:
    # second-smallest of this color's per-type minima
    lcbl = lower_color_bone_length_hair_length_has_soul.loc[color].sort_values().iloc[1]
    # true only for rows of this color that fall below that cut-off
    train[f'{color}_bone_length_hair_length_has_soul_lower'] = train['color'].eq(color) & train['bone_length_hair_length_has_soul'].lt(lcbl)

In [None]:
# maximum value for every (color, type) pair
upper_color_bone_length_hair_length_has_soul = train.groupby(by=['color', 'type'])['bone_length_hair_length_has_soul'].max()

upper_color_bone_length_hair_length_has_soul

In [None]:
for color in upper_color_bone_length_hair_length_has_soul.index.levels[0]:
    # second entry of this color's ascending per-type maxima
    ucbl = upper_color_bone_length_hair_length_has_soul.loc[color].sort_values().iloc[1]
    # true only for rows of this color that exceed that cut-off
    train[f'{color}_bone_length_hair_length_has_soul_upper'] = train['color'].eq(color) & train['bone_length_hair_length_has_soul'].gt(ucbl)

#### `rotting_flesh_hair_length_has_soul`

In [None]:
# distribution per color, split by type: boxen plot with the observations swarmed on top
fig, ax = plt.subplots(figsize=(16, 8))
sns.boxenplot(x='color', y='rotting_flesh_hair_length_has_soul', hue='type', data=train, saturation=0.2, ax=ax)
sns.swarmplot(x='color', y='rotting_flesh_hair_length_has_soul', hue='type', data=train, dodge=True, ax=ax);

In [None]:
# minimum value for every (color, type) pair
lower_color_rotting_flesh_hair_length_has_soul = train.groupby(by=['color', 'type'])['rotting_flesh_hair_length_has_soul'].min()

lower_color_rotting_flesh_hair_length_has_soul

In [None]:
for color in lower_color_rotting_flesh_hair_length_has_soul.index.levels[0]:
    # second-smallest of this color's per-type minima
    lcbl = lower_color_rotting_flesh_hair_length_has_soul.loc[color].sort_values().iloc[1]
    # true only for rows of this color that fall below that cut-off
    train[f'{color}_rotting_flesh_hair_length_has_soul_lower'] = train['color'].eq(color) & train['rotting_flesh_hair_length_has_soul'].lt(lcbl)

In [None]:
# maximum value for every (color, type) pair
upper_color_rotting_flesh_hair_length_has_soul = train.groupby(by=['color', 'type'])['rotting_flesh_hair_length_has_soul'].max()

upper_color_rotting_flesh_hair_length_has_soul

In [None]:
for color in upper_color_rotting_flesh_hair_length_has_soul.index.levels[0]:
    # second entry of this color's ascending per-type maxima
    ucbl = upper_color_rotting_flesh_hair_length_has_soul.loc[color].sort_values().iloc[1]
    # true only for rows of this color that exceed that cut-off
    train[f'{color}_rotting_flesh_hair_length_has_soul_upper'] = train['color'].eq(color) & train['rotting_flesh_hair_length_has_soul'].gt(ucbl)

### 4-way

#### `bone_length_rotting_flesh_hair_length_has_soul`

In [None]:
# distribution per color, split by type: boxen plot with the observations swarmed on top
fig, ax = plt.subplots(figsize=(16, 8))
sns.boxenplot(x='color', y='bone_length_rotting_flesh_hair_length_has_soul', hue='type', data=train, saturation=0.2, ax=ax)
sns.swarmplot(x='color', y='bone_length_rotting_flesh_hair_length_has_soul', hue='type', data=train, dodge=True, ax=ax);

In [None]:
# minimum value for every (color, type) pair
lower_color_bone_length_rotting_flesh_hair_length_has_soul = train.groupby(by=['color', 'type'])['bone_length_rotting_flesh_hair_length_has_soul'].min()

lower_color_bone_length_rotting_flesh_hair_length_has_soul

In [None]:
for color in lower_color_bone_length_rotting_flesh_hair_length_has_soul.index.levels[0]:
    # second-smallest of this color's per-type minima
    lcbl = lower_color_bone_length_rotting_flesh_hair_length_has_soul.loc[color].sort_values().iloc[1]
    # true only for rows of this color that fall below that cut-off
    train[f'{color}_bone_length_rotting_flesh_hair_length_has_soul_lower'] = train['color'].eq(color) & train['bone_length_rotting_flesh_hair_length_has_soul'].lt(lcbl)

In [None]:
# maximum value for every (color, type) pair
upper_color_bone_length_rotting_flesh_hair_length_has_soul = train.groupby(by=['color', 'type'])['bone_length_rotting_flesh_hair_length_has_soul'].max()

upper_color_bone_length_rotting_flesh_hair_length_has_soul

In [None]:
for color in upper_color_bone_length_rotting_flesh_hair_length_has_soul.index.levels[0]:
    # second entry of this color's ascending per-type maxima
    ucbl = upper_color_bone_length_rotting_flesh_hair_length_has_soul.loc[color].sort_values().iloc[1]
    # true only for rows of this color that exceed that cut-off
    train[f'{color}_bone_length_rotting_flesh_hair_length_has_soul_upper'] = train['color'].eq(color) & train['bone_length_rotting_flesh_hair_length_has_soul'].gt(ucbl)

## Heatmaps for Booleans?

In [None]:
# preview the first rows, including the per-color boolean flags added above
train.head(n=5)

# Deviations From Normality

In [None]:
# Flag each continuous feature relative to a one-standard-deviation band around its mean:
#   -1 -> below (mean - std), 1 -> above (mean + std), 0 -> inside the band.
# The mean and std are computed once per feature and the comparison is vectorised,
# instead of being recomputed inside a per-row lambda for every row.
for feature in ['bone_length', 'rotting_flesh', 'hair_length', 'has_soul']:
    center = train[feature].mean()
    spread = train[feature].std()

    above = train[feature] > (center + spread)
    below = train[feature] < (center - spread)

    # a value cannot be both above and below the band, so the difference is -1, 0 or 1
    train[f'{feature}_std1'] = above.astype('int64') - below.astype('int64')

# Polynomials

In [None]:
# columns that identify/label a row and are kept out of the polynomial expansion
id_columns = ['id', 'type', 'color']

# every remaining column is fed to the expansion
poly_inputs = train.drop(columns=id_columns)

# define poly
poly = PolynomialFeatures(degree=3, interaction_only=False, include_bias=False)

# fit_transform train data
poly_numeric = poly.fit_transform(poly_inputs)

# `get_feature_names` was removed from newer scikit-learn releases in favour of
# `get_feature_names_out`; use whichever this installation provides
if hasattr(poly, 'get_feature_names_out'):
    poly_columns = poly.get_feature_names_out(poly_inputs.columns.tolist())
else:
    poly_columns = poly.get_feature_names(poly_inputs.columns.tolist())

# redefine train with new poly features; the expanded frame reuses train's index so the
# join lines rows up even when that index is not the default 0..n-1 range
train = train.loc[:, id_columns].join(pd.DataFrame(
    data=poly_numeric,
    index=train.index,
    columns=poly_columns,
))

# Scaling

In [None]:
# feature matrix: everything except the identifier, the raw color and the label
X = train.drop(columns=['id', 'color', 'type'])

# training target
y = train['type']

# two alternative scalers fitted on the same features
scaler = MinMaxScaler()
rscaler = RobustScaler()

# min-max scaled copy of X
X_ = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)

# robust-scaled copy of X
X_r = pd.DataFrame(rscaler.fit_transform(X), index=X.index, columns=X.columns)

# Variance Threshold

In [None]:
# (re)create the selector; threshold 0.0 removes only constant columns
vt = VarianceThreshold(threshold=0.0)

# min-max features: fit, keep the surviving columns and their names
X_ = pd.DataFrame(vt.fit_transform(X_), index=X_.index, columns=X_.columns[vt.get_support()])

# robust features: the same selector is refitted, so its support now describes X_r
X_r = pd.DataFrame(vt.fit_transform(X_r), index=X_r.index, columns=X_r.columns[vt.get_support()])

# Percentile Selection
##### Using [Mutual Information](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_classif.html)

In [None]:
def seeded_mutual_info(features, target):
    """Score `features` against `target` with mutual information, using a fixed seed.

    `mutual_info_classif` is randomised; pinning `random_state` keeps the selected
    columns identical between runs, in line with the `random_state=0` used elsewhere
    in this notebook.
    """
    return mutual_info_classif(features, target, random_state=0)


# define selector
selector = SelectPercentile(
    score_func=seeded_mutual_info,
    percentile=10
)

# keep top 10 percent of features
X_ = pd.DataFrame(
    data=selector.fit_transform(X_, y),
    index=X_.index,
    columns=X_.columns[selector.get_support()]
)

# keep top 10 percent of features (the selector is refitted on the robust-scaled data)
X_r = pd.DataFrame(
    data=selector.fit_transform(X_r, y),
    index=X_r.index,
    columns=X_r.columns[selector.get_support()]
)

# Feature Importance

In [None]:
# one row of feature importances per ensemble, fitted on the min-max scaled features
fi = pd.DataFrame.from_dict(
    {name: model.fit(X_, y).feature_importances_ for name, model in ensembles.items()},
    orient='index',
    columns=X_.columns,
)

# order the rows by importance, then order the columns the same way
fi = fi.sort_values(by=fi.columns.tolist(), ascending=False)
fi = fi.sort_values(by=fi.index.tolist(), axis=1, ascending=False)

fi.style.highlight_max(axis=1)

In [None]:
# one row of feature importances per ensemble, fitted on the robust-scaled features
fi = pd.DataFrame.from_dict(
    {name: model.fit(X_r, y).feature_importances_ for name, model in ensembles.items()},
    orient='index',
    columns=X_r.columns,
)

# order the rows by importance, then order the columns the same way
fi = fi.sort_values(by=fi.columns.tolist(), ascending=False)
fi = fi.sort_values(by=fi.index.tolist(), axis=1, ascending=False)

fi.style.highlight_max(axis=1)

# Top n per FI

In [None]:
# top features per fi-algorithm
n = 1

# collect the n most important features of every row of `fi`, whatever the number of
# rows, instead of hard-coding row positions 0..3
top_features = []
for _, importances in fi.iterrows():
    top_features.extend(importances.sort_values(ascending=False).head(n).index.tolist())

# drop duplicates while keeping first-seen order, so the grid layout is the same on
# every run (a plain set() has no stable order for strings)
top_features = list(dict.fromkeys(top_features))

sns.pairplot(
    data=X.join(y),
    hue='type',
    vars=top_features,
);

The most important feature across the ensemble algorithms is `hair_length`.

# Model

In [None]:
# hold out 10% of the robust-scaled features for the final check
train_x, test_x, train_y, test_y = train_test_split(
    X_r,
    y,
    test_size=0.1,
    random_state=0,
)

In [None]:
# 10-fold cross-validation test scores on the training split, keyed by classifier name
results = {}

for k,v in classifiers.items():

    # progress marker: which classifier is being evaluated
    print(k)

    # `cross_validate` is not imported anywhere in the notebook's import cells;
    # `cross_val_score` is, and it returns the per-fold test scores directly
    results[k] = cross_val_score(
        estimator=v,
        X=train_x,
        y=train_y,
        cv=10
    )

In [None]:
# selector 10: summary statistics of the per-fold scores, one column per classifier
pd.DataFrame(results).describe()

# GridSearchCV

## AdaBoostClassifier

In [None]:
# exhaustive 10-fold grid search over AdaBoost's size, learning rate and boosting algorithm
AdaBoostClassifier_g = GridSearchCV(
    AdaBoostClassifier(),
    {
        'n_estimators': np.arange(50, 250, 5),
        'learning_rate': np.arange(1e-2, 1e1, 1e-1),
        'algorithm': ['SAMME', 'SAMME.R'],
        'random_state': [0],
    },
    cv=10,
).fit(train_x, train_y)

print(f"AdaBoostClassifier_g.best_score_: {AdaBoostClassifier_g.best_score_}")

In [None]:
# re-score the tuned AdaBoost model with 10-fold cross-validation
scores = cross_val_score(AdaBoostClassifier_g.best_estimator_, train_x, train_y, cv=10)

# summary statistics followed by the density of the fold scores
print(pd.Series(scores).describe())

pd.Series(scores).plot.kde();

## ExtraTreesClassifier

In [None]:
# 10-fold grid search for ExtraTreesClassifier. The grid only uses parameters this
# estimator accepts: `learning_rate` and `algorithm` belong to AdaBoost and would be
# rejected here.
ExtraTreesClassifier_g = GridSearchCV(
    estimator=ExtraTreesClassifier(),
    param_grid={
        'n_estimators' : np.arange(50, 250, 5),
        'criterion' : ['gini', 'entropy'],
        'max_features' : ['sqrt', 'log2', None],
        'random_state' : [0]
    },
    cv=10
)

# fit and report the ExtraTrees search itself (not the AdaBoost one)
ExtraTreesClassifier_g.fit(train_x, train_y)

print(f"ExtraTreesClassifier_g.best_score_: {ExtraTreesClassifier_g.best_score_}")

## LogisticRegression

In [None]:
# 10-fold grid search over the penalty type and regularisation strength of a
# liblinear logistic regression
LogisticRegression_g = GridSearchCV(
    LogisticRegression(),
    {
        'random_state': [0],
        'penalty': ['l1', 'l2'],
        'C': np.arange(1e-2, 1e1, 1e-1),
        'fit_intercept': [True],
        'max_iter': [200],
        'solver': ['liblinear'],
        'multi_class': ['auto'],
    },
    cv=10,
).fit(train_x, train_y)

print(f"LogisticRegression_g.best_score_: {LogisticRegression_g.best_score_}")

In [None]:
# re-score the tuned logistic regression with 10-fold cross-validation
scores = cross_val_score(LogisticRegression_g.best_estimator_, train_x, train_y, cv=10)

# summary statistics followed by the density of the fold scores
print(pd.Series(scores).describe())

pd.Series(scores).plot.kde();

In [None]:
# per-class precision/recall/f1 of the tuned logistic regression on the held-out split
pd.DataFrame(
    classification_report(
        test_y,
        LogisticRegression_g.best_estimator_.predict(test_x),
        output_dict=True,
    )
)