## Import libraries and data

In [1]:
#Import dataset with key telecom customer data

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    auc,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
    accuracy_score,
    precision_score,
    average_precision_score,
    recall_score,
    f1_score,
    precision_recall_curve,
    cohen_kappa_score,
    confusion_matrix,
    plot_confusion_matrix,
    classification_report
)
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    cross_val_score,
    cross_val_predict
)
from yellowbrick.classifier import (
    ConfusionMatrix,
    ROCAUC
)
from yellowbrick.model_selection import (
    LearningCurve 
)
from imblearn.over_sampling import (
    SMOTE, ADASYN
)
from seaborn import diverging_palette

ModuleNotFoundError: No module named 'imblearn'

In [None]:
data = pd.read_csv('churn.csv')

## Take a Quick Look at the Data Structure

In [None]:
data.head()

In [None]:
data.info()

Note that the `Total_night_charge` variable has missing values. We will have to impute these later.

In [None]:
data.describe()

### Create histograms of numerical variables

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
data.hist(bins=50, figsize=(20,15))
plt.show()

## Create a Test Set

In [None]:
import numpy as np

# to make this notebook's output identical at every run
np.random.seed(42)

In [None]:

# For illustration only. Sklearn has train_test_split()
def split_train_test(data, test_ratio):
    """Randomly partition ``data`` into a training part and a test part.

    Row positions are shuffled with ``np.random.permutation``, so the outcome
    depends on the global NumPy random state.

    Parameters
    ----------
    data : object supporting ``len()`` and ``.iloc`` (e.g. a pandas DataFrame)
        The rows to split.
    test_ratio : float
        Fraction of rows placed in the test part; the resulting row count is
        truncated with ``int()``.

    Returns
    -------
    tuple
        ``(train_rows, test_rows)`` selected from ``data`` by position.
    """
    row_count = len(data)
    order = np.random.permutation(row_count)
    n_test = int(row_count * test_ratio)
    # The first n_test shuffled positions form the test part, the rest the training part
    test_positions, train_positions = order[:n_test], order[n_test:]
    return data.iloc[train_positions], data.iloc[test_positions]

In [None]:
train_set, test_set = split_train_test(data, 0.2)
len(train_set)

In [None]:
len(test_set)

In [None]:
from zlib import crc32

def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id(data, test_ratio, id_column):
    """Split ``data`` into train and test parts based on a per-row identifier.

    Each value in ``data[id_column]`` is passed to ``test_set_check``; rows for
    which it returns True go to the test part, all others to the training part.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the identifier column.
    test_ratio : float
        Forwarded unchanged to ``test_set_check``.
    id_column : str
        Name of the column holding the row identifiers.

    Returns
    -------
    tuple
        ``(train_rows, test_rows)``.
    """
    def belongs_to_test(row_id):
        return test_set_check(row_id, test_ratio)

    test_mask = data[id_column].apply(belongs_to_test)
    train_rows = data.loc[~test_mask]
    test_rows = data.loc[test_mask]
    return train_rows, test_rows

In [None]:
data_with_id = data.reset_index()   # adds an `index` column
train_set, test_set = split_train_test_by_id(data_with_id, 0.2, "index")

In [None]:
test_set.head()

In [None]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
test_set.head()

### Create Histogram of target variable - Churn

In [None]:
## Convert the target variable, Churn, into a numerical binary variable (True -> 1, False -> 0)

churn_mapper = {False: 0, True: 1}

churn_as_int = data["Churn"].replace(churn_mapper)
data["Churn"] = churn_as_int

In [None]:
data["Churn"].value_counts()

In [None]:
data.head()

In [None]:
data["Churn"].hist()

### Exploring the U.S. state the customers come from

In [None]:
data["State"].hist()


from sklearn.model_selection import StratifiedShuffleSplit

# Single stratified 80/20 split that preserves the per-state proportions of `data`.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields *positional* row indices, so the rows are selected with .iloc.
# (.loc only gives the same rows while `data` carries a default 0..n-1 index.)
# n_splits=1 means there is exactly one (train, test) pair to take.
train_index, test_index = next(split.split(data, data["State"]))
strat_train_set = data.iloc[train_index]
strat_test_set = data.iloc[test_index]

strat_test_set["State"].value_counts() / len(strat_test_set)

data["State"].value_counts() / len(data)

def state_proportions(data):
    """Return the share of rows in ``data`` belonging to each value of ``State``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``"State"`` column.

    Returns
    -------
    pandas.Series
        Count of each state divided by the total number of rows in ``data``.
    """
    rows_per_state = data["State"].value_counts()
    total_rows = len(data)
    return rows_per_state / total_rows

# Purely random 80/20 split, used as the baseline in the comparison below
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

# Per-state share of rows in the full data and in each of the two test sets
overall_props = state_proportions(data)
stratified_props = state_proportions(strat_test_set)
random_props = state_proportions(test_set)
compare_props = pd.DataFrame({
    "Overall": overall_props,
    "Stratified": stratified_props,
    "Random": random_props,
}).sort_index()

# Relative deviation (in percent) of each test set's share from the overall share
for error_label, source_column in (("Rand. %error", "Random"), ("Strat. %error", "Stratified")):
    compare_props[error_label] = 100 * compare_props[source_column] / compare_props["Overall"] - 100

compare_props

# Discover and Visualize the Data to Gain Insights

In [None]:
data = strat_train_set.copy()

In [None]:
data.plot(kind="scatter", x="Total_intl_calls", y="Total_night_calls")

In [None]:
import seaborn as sns

# Plot
calls_by_plan_type = sns.catplot(x="Total_day_calls", col="International_plan", col_wrap=4,
                        data=data[data.Total_day_calls.notnull()],
                        kind="count", height=3.5, aspect=.8, 
                        palette='tab20')

## Looking for Correlations

In [None]:
# Correlate numeric/boolean columns only: `data` still holds text columns such as
# "State", and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
corr_matrix = data.select_dtypes(include=["number", "bool"]).corr()

In [None]:
corr_matrix["Churn"].sort_values(ascending=False)

In [None]:
corr_matrix

In [None]:
sns.heatmap(corr_matrix, cmap = 'Blues')

Note that there is a very high correlation between the minutes and charge variables. This is because the charge is a function of the minutes: customers are charged a rate per minute. Therefore, we need to remove one variable from each pair of correlated variables before we use these variables as predictors in a model.

In [None]:
# from pandas.tools.plotting import scatter_matrix # For older versions of Pandas
from pandas.plotting import scatter_matrix

attributes = ["Churn", "Total_eve_calls", "Total_night_calls",
              "Total_intl_minutes", "Total_day_minutes", "Total_intl_calls"]
scatter_matrix(data[attributes], figsize=(12, 8))


## Experimenting with Attribute Combinations

In [None]:
# Row-wise totals across the day, evening, night and international columns.
# `+` propagates NaN, so a row with any missing component gets a NaN total.
data["Total_calls"] = (
    data["Total_day_calls"]
    + data["Total_eve_calls"]
    + data["Total_night_calls"]
    + data["Total_intl_calls"]
)
data["Total_mins"] = (
    data["Total_day_minutes"]
    + data["Total_eve_minutes"]
    + data["Total_night_minutes"]
    + data["Total_intl_minutes"]
)
data["Total_charges"] = (
    data["Total_day_charge"]
    + data["Total_eve_charge"]
    + data["Total_night_charge"]
    + data["Total_intl_charge"]
)





In [None]:
# Recompute the correlations now that the combined total columns exist.
# Restrict to numeric/boolean columns: DataFrame.corr() raises on text columns
# such as "State" in pandas >= 2.0.
corr_matrix = data.select_dtypes(include=["number", "bool"]).corr()
corr_matrix["Churn"].sort_values(ascending=False)

In [None]:
# Drop the three attributes we added by combining existing attributes because we will add them in later in script
# using the custom transformer function:

data = data.drop(columns=['Total_calls', 'Total_mins', 'Total_charges'])



# Prepare the Data for Machine Learning Algorithms

In [None]:
# Separate the target column from the predictors of the stratified training set
data_labels = strat_train_set["Churn"].copy()
data = strat_train_set.drop(columns="Churn")  # predictors only

## Data Cleaning

In [None]:
sample_incomplete_rows = data[data.isnull().any(axis=1)].head()
sample_incomplete_rows

We need to impute the missing values for the Total_night_charge variable. We can do this using the median value from this column.

In [None]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")

Remove the text attribute because median can only be calculated on numerical attributes:

In [None]:
data_num = data.select_dtypes(include=[np.number])

In [None]:
imputer.fit(data_num)

In [None]:
imputer.statistics_

Check that this is the same as manually computing the median of each attribute:

In [None]:
data_num.median().values

Transform the training set:

In [None]:
X = imputer.transform(data_num)

In [None]:
data_tr = pd.DataFrame(X, columns=data_num.columns,
                          index=data.index)

In [None]:
data_tr.loc[sample_incomplete_rows.index.values]

In [None]:
imputer.strategy

In [None]:
data_tr = pd.DataFrame(X, columns=data_num.columns,
                          index=data_num.index)

In [None]:
data_tr.head()

## Handling Text and Categorical Attributes

Now let's preprocess the categorical input features, `State`, `Area_code`, `International_plan`, and `Voice_mail_plan`:

In [None]:
data_cat = data[["State", "Area_code", "International_plan", "Voice_mail_plan"]]
data_cat.head(10)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()
data_cat_encoded = ordinal_encoder.fit_transform(data_cat)
data_cat_encoded[:10]

In [None]:
ordinal_encoder.categories_

In [None]:
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder(sparse=False)
data_cat_1hot = cat_encoder.fit_transform(data_cat)
data_cat_1hot

In [None]:
cat_encoder.categories_

## Custom Transformers


Let's create a custom transformer to add extra attributes:

In [None]:
# Columns that feed the combined "total" attributes: calls, then minutes, then charges
col_names = (
    "Total_day_calls", "Total_eve_calls", "Total_night_calls", "Total_intl_calls",
    "Total_day_minutes", "Total_eve_minutes", "Total_night_minutes", "Total_intl_minutes",
    "Total_day_charge", "Total_eve_charge", "Total_night_charge", "Total_intl_charge",
)

# Positional index of each of those columns within `data`.
# NOTE(review): these positions come from `data.columns`, while the pipelines built
# later pass CombinedAttributesAdder only numeric subsets of the columns
# (`data_num`, `num_attribs`) - confirm the offsets still point at the intended columns.
(
    Total_day_calls_ix, Total_eve_calls_ix, Total_night_calls_ix, Total_intl_calls_ix,
    Total_day_minutes_ix, Total_eve_minutes_ix, Total_night_minutes_ix, Total_intl_minutes_ix,
    Total_day_charge_ix, Total_eve_charge_ix, Total_night_charge_ix, Total_intl_charge_ix,
) = map(data.columns.get_loc, col_names)  # get the column indices

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append row-wise total minutes, total charges and (optionally) total calls.

    Columns are addressed by the module-level ``*_ix`` position constants, so
    ``X`` must be a 2-D array whose column order matches the frame those
    positions were computed from.
    NOTE(review): the positions are derived from ``data.columns``; confirm they
    are valid for every array this transformer is applied to.

    Parameters
    ----------
    add_Total_calls : bool, default True
        When True, a total-calls column is appended ahead of the minutes and
        charges totals.
    """

    def __init__(self, add_Total_calls=True):  # no *args or **kargs
        self.add_Total_calls = add_Total_calls

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    @staticmethod
    def _row_total(X, positions):
        """Add up the columns of ``X`` at ``positions``, left to right."""
        running = X[:, positions[0]]
        for position in positions[1:]:
            running = running + X[:, position]
        return running

    def transform(self, X):
        """Return ``X`` with the total columns appended on the right.

        Column order of the additions: ``Total_calls`` (only when
        ``add_Total_calls`` is set), ``Total_mins``, ``Total_charges``.
        """
        Total_mins = self._row_total(
            X, (Total_day_minutes_ix, Total_eve_minutes_ix,
                Total_night_minutes_ix, Total_intl_minutes_ix))
        Total_charges = self._row_total(
            X, (Total_day_charge_ix, Total_eve_charge_ix,
                Total_night_charge_ix, Total_intl_charge_ix))
        appended = [Total_mins, Total_charges]
        if self.add_Total_calls:
            Total_calls = self._row_total(
                X, (Total_day_calls_ix, Total_eve_calls_ix,
                    Total_night_calls_ix, Total_intl_calls_ix))
            appended.insert(0, Total_calls)
        return np.c_[tuple([X] + appended)]

attr_adder = CombinedAttributesAdder(add_Total_calls=True)
data_extra_attribs = attr_adder.transform(data.values)

In [None]:
len(data_extra_attribs[0])

Also, `data_extra_attribs` is a NumPy array, so we've lost the column names (unfortunately, that's a problem with Scikit-Learn). To recover a `DataFrame`, you can run this:

In [None]:
data_extra_attribs = pd.DataFrame(
    data_extra_attribs,
    columns=list(data.columns)+["Total_calls","Total_mins","Total_charges"],
    index=data.index)
data_extra_attribs.head()

## Transformation Pipelines


Now let's build a pipeline for preprocessing the numerical attributes:

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

data_num_tr = num_pipeline.fit_transform(data_num)

In [None]:
data_num_tr

In [None]:
data

In [None]:
from sklearn.compose import ColumnTransformer

num_attribs = list(data_num)
num_attribs.remove('Area_code')  # Area_Code is categorical although gets number
cat_attribs = ["State", "Area_code", "International_plan", "Voice_mail_plan"]

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

data_prepared = full_pipeline.fit_transform(data)

In [None]:
# Densify the ColumnTransformer output when it is a sparse matrix.
# The guard keeps this cell re-runnable: once `data_prepared` is already a dense
# ndarray it has no .toarray() method and an unconditional call would raise.
if hasattr(data_prepared, "toarray"):
    data_prepared = data_prepared.toarray()
data_prepared

In [None]:
data_prepared.shape

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# Create a class to select numerical or categorical columns 
class OldDataFrameSelector(BaseEstimator, TransformerMixin):
    """Pick a fixed set of DataFrame columns and hand them on as a NumPy array.

    Parameters
    ----------
    attribute_names : list of str
        Column labels to extract in ``transform``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return the values of the configured columns of ``X``."""
        selected = X[self.attribute_names]
        return selected.values

Now let's join all these components into a big pipeline that will preprocess both the numerical and the categorical features:

In [None]:
num_attribs = list(data_num)
num_attribs.remove('Area_code')
cat_attribs = ["State", "Area_code", "International_plan", "Voice_mail_plan"]

old_num_pipeline = Pipeline([
        ('selector', OldDataFrameSelector(num_attribs)),
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

old_cat_pipeline = Pipeline([
        ('selector', OldDataFrameSelector(cat_attribs)),
        ('cat_encoder', OneHotEncoder(sparse=False)),
    ])

In [None]:
from sklearn.pipeline import FeatureUnion

old_full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", old_num_pipeline),
        ("cat_pipeline", old_cat_pipeline),
    ])

In [None]:
old_data_prepared = old_full_pipeline.fit_transform(data)
old_data_prepared

In [None]:
old_data_prepared.shape

The result is the same as with the `ColumnTransformer`:

In [None]:
np.allclose(data_prepared, old_data_prepared)

## Train test split

In [None]:
from sklearn import model_selection

# Hold out 30% of the prepared rows; that part is split into validation and test sets later.
# NOTE(review): `full_pipeline` was fitted on all of `data` before this split, so the
# held-out rows have already influenced the imputer and scaler statistics - confirm intended.
X_train, X_val_test, y_train, y_val_test = train_test_split(
    data_prepared,
    data_labels,
    test_size=0.3,
    random_state=42,
)

In [None]:
print(len(X_train))
print(len(X_val_test))
print(len(y_train))
print(len(y_val_test))

## Test validation split

In [None]:
# Split the held-out rows evenly into a validation half and a test half
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test,
    y_val_test,
    test_size=0.5,
    random_state=42,
)

In [None]:
print(len(X_val))
print(len(X_test))
print(len(y_val))
print(len(y_test))

In [None]:
X_val.shape

In [None]:
# Feature names in the order used below: numeric inputs, then the three totals
# in the order CombinedAttributesAdder appends them, then the one-hot categories.
extra_attribs = ["Total_calls", "Total_mins", "Total_charges"]
cat_encoder = full_pipeline.named_transformers_["cat"]

# One list of category values per categorical column fitted by the encoder
(
    cat_one_hot_attribs_0,
    cat_one_hot_attribs_1,
    cat_one_hot_attribs_2,
    cat_one_hot_attribs_3,
) = (list(categories) for categories in cat_encoder.categories_[:4])

attributes = (
    num_attribs
    + extra_attribs
    + cat_one_hot_attribs_0
    + cat_one_hot_attribs_1
    + cat_one_hot_attribs_2
    + cat_one_hot_attribs_3
)

## Evaluation Metrics

### ROC Curve

In [None]:
def plot_roc_curve(fpr, tpr, roc_auc):
    """Draw a ROC curve together with the chance diagonal and show the figure.

    Parameters
    ----------
    fpr, tpr : array-like
        False-positive and true-positive rates, plotted on x and y respectively.
    roc_auc : float
        Area under the curve, shown in the legend with two decimals.
    """
    auc_label = 'AUC = %0.2f' % roc_auc
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label=auc_label)
    plt.legend(loc='lower right')
    # Dashed red diagonal from (0, 0) to (1, 1) as a reference line
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()

### PR Curve

In [None]:
def plot_pr_curve(precision, recall, average_precision):
    """Draw a filled step plot of precision against recall and show the figure.

    Parameters
    ----------
    precision, recall : array-like
        Precision (y axis) and recall (x axis) values of the curve.
    average_precision : float
        Average-precision score, shown in the title with two decimals.
    """
    shading = dict(color='b', alpha=0.2)
    plt.step(recall, precision, where='post', **shading)
    plt.fill_between(recall, precision, step='post', **shading)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    curve_title = '2-class Precision-Recall curve: AP={0:0.2f}'.format(average_precision)
    plt.title(curve_title)
    plt.show()

### Classification Score

`clf_score` takes a fitted classifier and reports its metrics on either the training set (`train=True`) or the validation set (`train=False`).

In [None]:
def _positive_class_scores(clf, X):
    """Return a continuous score per row for the second class of ``clf``.

    Prefers ``predict_proba`` (second column), then ``decision_function``, and
    falls back to the hard ``predict`` labels when neither is usable.
    """
    if hasattr(clf, "predict_proba"):
        proba = np.asarray(clf.predict_proba(X))
        if proba.ndim == 2 and proba.shape[1] > 1:
            return proba[:, 1]
    if hasattr(clf, "decision_function"):
        return clf.decision_function(X)
    return clf.predict(X)


def clf_score(clf, X_train, y_train, X_val, y_val, train=True):
    """Print (and, for validation, plot) evaluation results for a fitted classifier.

    Parameters
    ----------
    clf : fitted classifier
        Must provide ``predict``; ``predict_proba`` or ``decision_function`` is
        used for the ROC / precision-recall metrics when available.
    X_train, y_train : array-like
        Training features and labels; used when ``train`` is truthy.
    X_val, y_val : array-like
        Validation features and labels; used when ``train`` is falsy.
    train : bool, default True
        True reports training accuracy, classification report, confusion
        matrix and a 10-fold cross-validated accuracy. False reports the
        validation metrics and draws the PR curve, ROC curve and confusion matrix.

    Returns
    -------
    None
    """
    if train:
        # Predict once and reuse the labels for every metric
        y_train_pred = clf.predict(X_train)
        print("Train Result:\n")
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_train, y_train_pred)))
        print("Classification Report: \n {}\n".format(classification_report(y_train, y_train_pred)))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_train, y_train_pred)))

        res = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))

    else:
        y_val_pred = clf.predict(X_val)
        # Threshold-based metrics need continuous scores; hard 0/1 labels collapse
        # the ROC and PR curves to a single operating point.
        y_val_score = _positive_class_scores(clf, X_val)

        print("Validation Result:\n")
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_val, y_val_pred)))

        precision, recall, _ = precision_recall_curve(y_val, y_val_score)
        average_precision = average_precision_score(y_val, y_val_score)
        plot_pr_curve(precision, recall, average_precision)

        fpr, tpr, _ = roc_curve(y_val, y_val_score)
        roc_auc = roc_auc_score(y_val, y_val_score)
        print("roc auc score: {}\n".format(roc_auc))
        plot_roc_curve(fpr, tpr, roc_auc)

        print("Classification Report: \n {}\n".format(classification_report(y_val, y_val_pred)))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_val, y_val_pred)))
        # Pass the true labels: comparing predictions with themselves would always
        # produce a purely diagonal matrix.
        plot_confusion_matrix(clf, X_val, y_val)
        print("End of validation Result\n")

### Classification Metrics

In [None]:
def evaluation_metrics(y_actual, y_pred):
    """Plot PR and ROC curves and print summary metrics for one set of predictions.

    The same ``y_pred`` is passed to the curve functions and to
    ``classification_report`` / ``confusion_matrix``.

    Parameters
    ----------
    y_actual : array-like
        True labels.
    y_pred : array-like
        Predictions to compare against ``y_actual``.

    Returns
    -------
    None
    """
    # Precision-recall curve and its average-precision summary
    pr_precision, pr_recall, _ = precision_recall_curve(y_actual, y_pred)
    avg_precision = average_precision_score(y_actual, y_pred)
    plot_pr_curve(pr_precision, pr_recall, avg_precision)

    # ROC curve and the area under it
    false_pos_rate, true_pos_rate, _ = roc_curve(y_actual, y_pred)
    auc_value = roc_auc_score(y_actual, y_pred)
    print("roc auc score: {}\n".format(auc_value))
    plot_roc_curve(false_pos_rate, true_pos_rate, auc_value)

    # Text summaries
    report_text = classification_report(y_actual, y_pred)
    print("Classification Report: \n {}\n".format(report_text))
    matrix = confusion_matrix(y_actual, y_pred)
    print("Confusion Matrix: \n {}\n".format(matrix))

## Dealing with imbalanced classes

### Visualize support for each class

In [None]:
from yellowbrick.target import ClassBalance

# Instantiate the visualizer
visualizer = ClassBalance(labels=["0", "1"])

visualizer.fit(y_train, y_val)        # Fit the data to the visualizer
visualizer.show()        # Finalize and render the figure

## Balanced class weight - Combining Over Sampling and Under Sampling

### SMOTEENN

In [None]:
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=42)
X_train_smoteen, y_train_smoteen = smote_enn.fit_resample(X_train, y_train)

In [None]:
# Instantiate the visualizer
visualizer = ClassBalance(labels=["0", "1"])

visualizer.fit(y_train_smoteen)        # Fit the data to the visualizer
visualizer.show()        # Finalize and render the figure

# Select and Train a Model

In [None]:
##Importing performance measure metrics
import timeit
from sklearn import metrics

## Dummy Classifier

In [None]:
from sklearn.dummy import DummyClassifier

In [None]:
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train_smoteen, y_train_smoteen)

In [None]:
clf_score(dummy_clf, X_train_smoteen, y_train_smoteen, X_val, y_val, train=False)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
log_clf = LogisticRegression()
log_clf.fit(X_train_smoteen, y_train_smoteen)

In [None]:
clf_score(log_clf, X_train_smoteen, y_train_smoteen, X_val, y_val, train=False)

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train_smoteen, y_train_smoteen)

In [None]:
clf_score(rf_clf, X_train_smoteen, y_train_smoteen, X_val, y_val, train=False)

## Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
gb_clf = GradientBoostingClassifier()
gb_clf.fit(X_train_smoteen, y_train_smoteen)

In [None]:
clf_score(gb_clf, X_train_smoteen, y_train_smoteen, X_val, y_val, train=False)

## LGBM Classifier

In [None]:
from lightgbm import LGBMClassifier

In [None]:
lgbm_clf = LGBMClassifier()
lgbm_clf.fit(X_train_smoteen, y_train_smoteen)

In [None]:
# Evaluate the LightGBM model fitted in the previous cell (not the gradient-boosting one)
clf_score(lgbm_clf, X_train_smoteen, y_train_smoteen, X_val, y_val, train=False)

## XGBoost Classifier

In [None]:
from xgboost import XGBClassifier

In [None]:
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train_smoteen, y_train_smoteen)

In [None]:
clf_score(xgb_clf, X_train_smoteen, y_train_smoteen, X_val, y_val, train=False)

## AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
clf_ada_boost = AdaBoostClassifier(n_estimators=100, random_state=42)
clf_ada_boost.fit(X_train_smoteen, y_train_smoteen)

In [None]:
clf_score(clf_ada_boost, X_train_smoteen, y_train_smoteen, X_val, y_val, train=False)

### Evaluate model on test set

#### Class Prediction Error

In [None]:
from yellowbrick.classifier import ClassPredictionError

classes = ["0", "1"]

visualizer = ClassPredictionError(
    clf_ada_boost, classes=classes, is_fitted=True
)

# Fit the training data to the visualizer
visualizer.fit(X_train, y_train)

# Evaluate the model on the test data
visualizer.score(X_test, y_test)

# Draw visualization
visualizer.show()

#### Classification Report

In [None]:
from yellowbrick.classifier import ClassificationReport

visualizer = ClassificationReport(
    clf_ada_boost, classes=classes, support=True, is_fitted=True
)

visualizer.fit(X_train, y_train)        # Fit the visualizer and the model
visualizer.score(X_test, y_test)        # Evaluate the model on the test data
visualizer.show()                       # Finalize and show the figure

#### Confusion Matrix

In [None]:
# Import the visualizer class that is used below. Importing yellowbrick's
# `confusion_matrix` quick method here would rebind the name `confusion_matrix`,
# which clf_score() and evaluation_metrics() expect to be the sklearn.metrics function.
from yellowbrick.classifier import ConfusionMatrix

# The ConfusionMatrix visualizer takes a model
cm = ConfusionMatrix(clf_ada_boost, classes=[0,1], is_fitted=True)

# Fit fits the passed model. This is unnecessary if you pass the visualizer a pre-fitted model
cm.fit(X_train, y_train)

# To create the ConfusionMatrix, we need some test data. Score runs predict() on the data
# and then creates the confusion_matrix from scikit-learn.
cm.score(X_test, y_test)

# How did we do?
cm.show()

#### PR Curve

In [None]:
from yellowbrick.classifier import PrecisionRecallCurve
# Create the visualizer, fit, score, and show it
viz = PrecisionRecallCurve(clf_ada_boost, is_fitted=True)
viz.fit(X_train, y_train)
viz.score(X_test, y_test)
viz.show()

#### ROC-AUC

In [None]:
visualizer = ROCAUC(
    clf_ada_boost, classes=classes, is_fitted=True
)

visualizer.fit(X_train, y_train)        # Fit the training data to the visualizer
visualizer.score(X_test, y_test)        # Evaluate the model on the test data
visualizer.show()                       # Finalize and show the figure

#### Validation Curve

In [None]:
from yellowbrick.model_selection import ValidationCurve

# Cross-validated F1 (weighted) of AdaBoost for learning_rate values 1..10
viz = ValidationCurve(
    AdaBoostClassifier(n_estimators=100, random_state=42),
    param_name="learning_rate",
    param_range=np.arange(1, 11),
    cv=5,
    scoring="f1_weighted",
    n_jobs=8,  # was misspelled `np_jobs`, which is not the parallelism argument
)

# Fit and show the visualizer
viz.fit(X_train, y_train)
viz.show()  # show() is what the other visualizer cells in this notebook call

#### Learning Curve

In [None]:
from sklearn.model_selection import StratifiedKFold
from yellowbrick.model_selection import LearningCurve


# Create the learning curve visualizer
cv = StratifiedKFold(n_splits=12)
sizes = np.linspace(0.3, 1.0, 10)

# Instantiate the classification model and visualizer

# Learning curve of AdaBoost on the resampled training data, scored with weighted F1
# over the stratified folds and training-set sizes defined above
visualizer = LearningCurve(
    AdaBoostClassifier(n_estimators=100, random_state=42),
    cv=cv,
    scoring='f1_weighted',
    train_sizes=sizes,
    n_jobs=8
)

visualizer.fit(X_train_smoteen, y_train_smoteen)        # Fit the data to the visualizer
visualizer.show()                       # Finalize and render the figure (replaces deprecated poof())

#### Discrimination Threshold

In [None]:
from yellowbrick.classifier import DiscriminationThreshold

# Discrimination-threshold visualizer for the already-fitted AdaBoost model
visualizer = DiscriminationThreshold(clf_ada_boost, is_fitted=True)

visualizer.fit(X_train_smoteen, y_train_smoteen)
visualizer.show()  # replaces deprecated poof(); matches the other visualizer cells

#### Decision Function

In [None]:
from sklearn.metrics import precision_recall_curve

# call decision_function on classifier to get scores (probas_pred)
probas_pred = clf_ada_boost.decision_function(X_test)
# compute precision-recall pairs for different probability thresholds
precisions, recalls, thresholds = precision_recall_curve(y_test, probas_pred)
# precision and recall vs. the decision threshold
plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
plt.xlabel("Threshold")
plt.legend(loc="upper left")
plt.ylim([0, 1])
plt.show()

#### Feature Importances

In [None]:
from yellowbrick.model_selection import FeatureImportances

# Large canvas so that every feature label in `attributes` stays readable
fig = plt.figure(figsize=(22, 26))
viz = FeatureImportances(clf_ada_boost, labels=attributes)
viz.fit(X_train_smoteen, y_train_smoteen)
viz.show()  # replaces deprecated poof(); matches the other visualizer cells

## Multi-Layer Perceptron

A simple ANN architecture with Keras

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
# Size the input layer from the prepared feature matrix instead of a hard-coded
# width, so the network keeps matching the data when the preprocessing changes.
n_features = X_train.shape[1]

# Two hidden ReLU layers followed by a single sigmoid output unit
mlp = Sequential()
mlp.add(Dense(12, activation='relu', input_shape=(n_features,)))
mlp.add(Dense(8, activation= 'relu'))
mlp.add(Dense(1, activation='sigmoid'))

In [None]:
mlp.output_shape
mlp.summary()
mlp.get_config()
mlp.get_weights()

## TPOT

In [None]:
from tpot import TPOTClassifier

In [None]:
# https://epistasislab.github.io/tpot/using/#built-in-tpot-configurations
tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, 
                      random_state=42, config_dict='TPOT light')
tpot.fit(X_train_smoteen, y_train_smoteen)
print(tpot.score(X_val, y_val))
tpot.export('tpot_exported_pipeline.py')

## END