# Titanic

# Capítulo 1 - Introdução

In [None]:
# import autosklearn, catboost, category_encoders, dtreeviz, eli5, fancyimpute, fastai, featuretools, glmnet_py, graphviz, hdbscan, imblearn, janitor, lime, matplotlib, missingno, mlxtend, numpy, pandas, pdpbox, phate, pydotplus, rfpimp, scikitplot, scipy, seaborn, shap, sklearn, statsmodels, tpot, treeinterpreter, umap, xgbfir, xgboost, yellowbrick

# Capítulo 2 - Visão geral do processo de machine learning

In [None]:
# Business understanding
# Data understanding
# Data preparation
# Modeling
# Evaluation
# Deployment

# Capítulo 3 - Descrição da classificação: conjunto de dados do Titanic

### Importações

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import ensemble, preprocessing, tree
from sklearn.metrics import auc, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
from yellowbrick.classifier import ConfusionMatrix, ROCAUC
from yellowbrick.model_selection import LearningCurve
from sklearn import model_selection

### Colete os dados

In [None]:
url = ('https://biostat.app.vumc.org/wiki/pub/Main/DataSets/titanic3.xls')
df = pd.read_excel(url)
orig_df = df

In [None]:
df.head()

### Limpe os dados

In [None]:
df.dtypes

In [None]:
# gera um relatório de perfil. sintetiza os tipos das colunas e permite visualizar os detalhes das estatísticas dos quantis, estatísticas descritivas, histograma, valores comuns e valores extremos.
import pandas_profiling
pandas_profiling.ProfileReport(df)

In [None]:
df.shape

In [None]:
# estatísticas resumidas, contagem de dados não nulos.
# abaixo foi truncado para exibir somente as primeiras colunas
df.describe().iloc[:, :2]

In [None]:
df.isnull().sum()

In [None]:
# fração de valores nulos por linha (primeiras linhas)
df.isnull().mean(axis=1).loc[:10]

In [None]:
mask = df.isnull().any(axis=1)

In [None]:
mask.head()

In [None]:
df[mask].body.head()

In [None]:
df.sex.value_counts(dropna = False)

In [None]:
df.embarked.value_counts(dropna = False)

### Crie os atributos

In [None]:
name = df.name

In [None]:
name.head(3)

In [None]:
# O conjunto de dados Titanic contém atributos que provocam vazamento de informações (leaky features)
# leaky features são variáveis que contêm dados sobre o futuro ou o objetivo.

# a coluna body (número de identificação do corpo) informa que o passageiro não sobreviveu.
# a coluna boat (bote salva vidas) informa que o passageiro sobreviveu. 
# a coluna name não traz nenhuma informação relevante.
df = df.drop(columns = ['name', 'ticket', 'home.dest', 'boat', 'body', 'cabin'])

In [None]:
df.head()

In [None]:
# criar colunas dummy a partir das colunas de string
df = pd.get_dummies(df)

In [None]:
df.columns

In [None]:
# as colunas sex_male e sex_female estão inversamente correlacionadas de forma perfeita
# em geral, removemos qualquer coluna com uma correlação perfeita ou com uma correlação positiva ou negativa bem alta.
# a multicolinearidade pode causar impactos na interpretação da importância dos atributos dos coeficientes em alguns modelos.

df = df.drop(columns = 'sex_male')
df = pd.get_dummies(df, drop_first = True)
df.columns

In [None]:
y = df.survived
X = df.drop(columns = 'survived')

### Separe as amostras

In [None]:
# sempre devemos fazer treinamento e testes em dados distintos. caso contrário, você não saberá realmente quão bem seu modelo poderá ser generalizado para dados que ainda não tenham sido vistos antes.
# Usaremos o scikit-learn para extrair 30% dos dados para testes (usamos random_state=42 para eliminar a aleatoriedade caso venhamos a comparar diferentes modelos)

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
X_train, X_test

In [None]:
y_train, y_test

### Imputação de dados

In [None]:
# a coluna de idade tem valores ausentes. devemos imputar uma idade a partir dos valores numéricos. queremos imputar dados apenas no conjunto de treinamento, e então usar esse imputer para preencher os dados no conjunto de testes. caso contrário, causaremos vazamento de informações (trapaceando ao dar informações futuras para o modelo).

from sklearn.experimental import enable_iterative_imputer
from sklearn import impute

num_cols = ['pclass', 'age', 'sibsp', 'parch', 'fare', 'sex_female']
num_cols

In [None]:
imputer = impute.IterativeImputer()

In [None]:
imputed = imputer.fit_transform(X_train[num_cols])
X_train.loc[:, num_cols] = imputed

In [None]:
imputed = imputer.transform(X_test[num_cols])
X_test.loc[:, num_cols] = imputed

In [None]:
# imputar valores usando mediana

meds = X_train.median()
X_train = X_train.fillna(meds)
X_test = X_test.fillna(meds)

In [None]:
X_train

### Normalize os dados

In [None]:
# traduzir os dados de modo que tenham um valor de média igual a zero e um desvio-padrão igual a um.
# desse modo os modelos não tratarão as variáveis com escalas maiores como mais importantes que as variáveis com menor escala.

cols = 'pclass,age,sibsp,parch,fare,sex_female,embarked_C,embarked_Q,embarked_S'.split(',')
sca = preprocessing.StandardScaler()
cols

In [None]:
# Standardize the training features. The scaler returns a bare ndarray, so the
# frame is rebuilt with the ORIGINAL column labels and row index: keeping the
# index is what lets later index-aligned operations (e.g. assigning `y` as a
# new column) pair each row with its own target instead of a positional one.
X_train = pd.DataFrame(
    sca.fit_transform(X_train),
    columns = X_train.columns,
    index = X_train.index
)
X_train

In [None]:
# Apply the scaler fitted on the training data to the test features. The
# original column labels and row index are kept so the rows of X_test stay
# aligned with y_test in any index-based operation.
X_test = pd.DataFrame(
    sca.transform(X_test),
    columns = X_test.columns,
    index = X_test.index
)
X_test

### Refatore

In [None]:
def tweak_titanic(df):
    """Return a model-ready copy of the raw Titanic frame.

    Drops the free-text / leaky columns and one-hot encodes the remaining
    string columns, dropping the first level of each to avoid perfectly
    collinear dummies. The input frame is not modified.
    """
    unwanted = ['name', 'ticket', 'home.dest', 'boat', 'body', 'cabin']
    trimmed = df.drop(columns = unwanted)
    return pd.get_dummies(trimmed, drop_first = True)

In [None]:
def get_train_test_X_y(df, y_col, size=0.3, std_cols=None):
    """Split `df` into train/test sets, impute numeric gaps and optionally standardize.

    Parameters:
        df: frame containing the features and the target column.
        y_col: name of the target column.
        size: fraction of rows held out for testing.
        std_cols: optional list of columns to standardize (zero mean, unit
            variance); nothing is scaled when it is None or empty.

    Returns:
        (X_train, X_test, y_train, y_test).

    Both the imputer and the scaler are fitted on the training split only and
    then applied to the test split, so no test information leaks into training.
    """
    y = df[y_col]
    X = df.drop(columns = y_col)
    # Go through the `model_selection` module: the bare name `train_test_split`
    # is not imported at this point of the notebook.
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size = size, random_state = 42
    )
    # Explicit copies, so the column assignments below never write into a
    # slice derived from the caller's frame.
    X_train = X_train.copy()
    X_test = X_test.copy()

    num_cols = ['pclass', 'age', 'sibsp', 'parch', 'fare']
    fi = impute.IterativeImputer()
    # Whole-column assignment (instead of .loc[:, cols]) lets integer columns
    # become float without an incompatible-dtype warning.
    X_train[num_cols] = fi.fit_transform(X_train[num_cols])
    X_test[num_cols] = fi.transform(X_test[num_cols])

    if std_cols:
        std = preprocessing.StandardScaler()
        X_train[std_cols] = std.fit_transform(X_train[std_cols])
        X_test[std_cols] = std.transform(X_test[std_cols])
    return X_train, X_test, y_train, y_test

In [None]:
ti_df = tweak_titanic(orig_df)

In [None]:
ti_df.head()

In [None]:
std_cols = 'pclass,age,sibsp,fare'.split(',')

In [None]:
#X_train, X_test, y_train, y_test = get_train_test_X_y(ti_df, 'survived', std_cols = std_cols)

### Modelo de base

In [None]:
from sklearn.dummy import DummyClassifier

In [None]:
# Baseline: a dummy classifier that ignores the features; any real model
# should beat this score.
bm = DummyClassifier()
bm.fit(X_train, y_train)
bm.score(X_test, y_test) # mean accuracy (not precision)

In [None]:
from sklearn import metrics
metrics.precision_score(y_test, bm.predict(X_test))

### Várias famílias

In [None]:
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

from sklearn import model_selection
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost

In [None]:
for model in [DummyClassifier, LogisticRegression, DecisionTreeClassifier, KNeighborsClassifier, GaussianNB, SVC, RandomForestClassifier, xgboost.XGBClassifier]:
    cls = model()
    kfold = model_selection.KFold(n_splits = 10)
    s = model_selection.cross_val_score(cls, X, y, scoring = 'roc_auc', cv = kfold)
    print(f"{model.__name__:22} AUC: "
         f"{s.mean():.3f} STD: {s.std():.2f}")

### Stacking

In [None]:
from mlxtend.classifier import StackingClassifier

clfs = [x() for x in [LogisticRegression, DecisionTreeClassifier, KNeighborsClassifier, GaussianNB, SVC, RandomForestClassifier]]
stack = StackingClassifier(classifiers = clfs, meta_classifier = LogisticRegression())
kfold = model_selection.KFold(n_splits = 10)
s = model_selection.cross_val_score(stack, X, y, scoring = 'roc_auc', cv = kfold)
print(f"{stack.__class__.__name__} "
      f"AUC: {s.mean():.3f} STD: {s.std():.2f}")

### Crie o modelo

In [None]:
rf = ensemble.RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

### Avalie o modelo

In [None]:
rf.score(X_test, y_test)
metrics.precision_score(y_test, rf.predict(X_test))

In [None]:
for col, val in sorted(zip(X_train.columns, rf.feature_importances_), key = lambda x: x[1], reverse = True)[:5]:
    print(f"{col:10}{val:10.3f}")

### Otimize o modelo

In [None]:
# Small grid search over random-forest hyperparameters.
rf4 = ensemble.RandomForestClassifier()
params = {
    # 'auto' was removed from scikit-learn; for classifiers it meant 'sqrt'.
    'max_features': [0.4, 'sqrt'],
    'n_estimators': [15, 200],
    'min_samples_leaf': [1, 0.1],
    'random_state': [42],
}
cv = model_selection.GridSearchCV(rf4, params, n_jobs = -1).fit(X_train, y_train)
print(cv.best_params_)

In [None]:
# Refit a forest with the chosen hyperparameters and report test accuracy.
# 'sqrt' replaces the removed 'auto' option (same meaning for classifiers).
rf5 = ensemble.RandomForestClassifier(
    max_features = 'sqrt',
    min_samples_leaf = 0.1,
    n_estimators = 200,
    random_state = 42
)
rf5.fit(X_train, y_train)
rf5.score(X_test, y_test)

### Matriz de Confusão

In [None]:
from sklearn.metrics import confusion_matrix
y_pred = rf5.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
mapping = {0: 'died', 1: 'survived'}
fig, ax = plt.subplots(figsize = (6, 6))
cm_viz = ConfusionMatrix(rf5, classes = ['died', 'survived'], label_encoder = mapping)
cm_viz.score(X_test, y_test)
cm_viz.poof()
fig.savefig('Desktop\oreilly\mlpr_0304.png', dpi = 300, bbox_inches = 'tight')

### Curva ROC

In [None]:
# ROC curve - used to evaluate classifiers.
# The AUC has to be computed from continuous scores (probability of the
# positive class). Hard 0/1 predictions collapse the curve to a single
# operating point and misstate the AUC.
y_pred = rf5.predict(X_test)
y_score = rf5.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_score)

In [None]:
"""
fig, ax = plt.subplots(figsize = (6, 6))
roc_viz = ROCAUC(rf5)
roc_viz.score(X_test, y_test)
"""

In [None]:
"""
roc_viz.poof()
fig.savefig('Desktop\oreilly\mlpr_0305.png')
"""

### Curva de aprendizado

In [None]:
# usada para nos dizer se temos dados de treinamento suficientes

import numpy as np
fig, ax = plt.subplots(figsize = (6, 4))
cv = StratifiedKFold(12)
sizes = np.linspace(0.3, 1.0, 10)
lc_viz = LearningCurve(rf5, cv = cv, train_sizes = sizes, scoring = 'f1_weighted', n_jobs = 4, ax = ax)
lc_viz.fit(X, y)
lc_viz.poof()

# Capítulo 4 - Dados ausentes

### Analisando dados ausentes

In [None]:
df = orig_df

In [None]:
df.isnull().mean()*100

In [None]:
import missingno as msno

In [None]:
ax = msno.matrix(df.sample(500))
ax.get_figure()

In [None]:
fig, ax = plt.subplots(figsize = (6, 4))
(1 - df.isnull().mean()).abs().plot.bar(ax = ax)
fig

In [None]:
ax = msno.bar(orig_df.sample(500))
ax.get_figure()

In [None]:
ax = msno.heatmap(df, figsize = (6, 6))
ax.get_figure()

In [None]:
ax = msno.dendrogram(df)
ax.get_figure()

### Descartando dados ausentes

In [None]:
# apaga linhas com dados ausentes
df1 = df.dropna()

In [None]:
df1 = df.drop(columns='cabin')

In [None]:
df1 = df.dropna(axis = 1)

### Imputando dados

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
num_cols = df.select_dtypes(include = 'number').columns
im = SimpleImputer() # média
imputed = im.fit_transform(df[num_cols])

### Acrescentando colunas informativas

In [None]:
def add_indicator(col):
    """Build a callable that flags the missing values of column `col`.

    The returned function takes a DataFrame and returns an integer Series
    holding 1 where `col` is missing and 0 elsewhere, so it can be passed
    directly as a value to `DataFrame.assign`.
    """
    return lambda frame: frame[col].isna().astype(int)
df1 = df.assign(cabin_missing = add_indicator('cabin'))

# Capítulo 5 - Fazendo uma limpeza nos dados

### Nomes das colunas

In [None]:
"""
import janitor as jn

Xbad = pd.DataFrame({'A': [1, None, 3], ' sales numbers': [20.0, 30.0, None]})
jn.clean_names(Xbad)

def clean_col(name):
    return(name.strip().lower().replace(' ', '_'))
"""

### Substituindo valores ausentes

In [None]:
"""
jn.coalesce(Xbad, columns = ['A', ' sales numbers '], new_column_name = 'val')
val
"""

In [None]:
""" Xbad.fillna(10) """

In [None]:
""" jn.fill_empty(Xbad, columns = ['A', ' sales numbers '], value = 10) """

# Capítulo 6 - Explorando os dados

### Tamanho dos dados

In [None]:
X.shape

### Estatísticas resumidas

In [None]:
X.describe().iloc[:, [0, -1]]

In [None]:
X.iloc[[1, 4], -3:]

In [None]:
X.loc[[677, 864], 'sex_female':]

### Histograma

In [None]:
fig, ax = plt.subplots(figsize = (6, 4))
X.fare.plot(kind = 'hist', ax = ax)
fig

In [None]:
import seaborn as sns

In [None]:
"""
fig, ax = plt.subplots(figsize = (12, 8))
mask = y_train == 1
ax = sns.distplot(X_train[mask].fare, label = 'survived')
ax = sns.displot(X_train[~mask].fare, label = 'died')
ax.set_xlim(-1.5, 1.5)
ax.legend()
"""

### Gráfico de dispersão

In [None]:
fig, ax = plt.subplots(figsize = (6, 4))
X.plot.scatter(x = 'age', y = 'fare', ax = ax, alpha = 0.3)

In [None]:
X.age.corr(X.fare)

### Gráfico conjunto

In [None]:
from yellowbrick.features import JointPlotVisualizer

In [None]:
fig, ax = plt.subplots(figsize = (6, 6))
jpv = JointPlotVisualizer(feature = 'age', target = 'fare')
jpv.fit(X['age'], X['fare'])
jpv.poof()
fig.savefig('Desktop\oreilly\mlpr_0604.png', dpi = 300)

In [None]:
from seaborn import jointplot

In [None]:
"""
fig, ax = plt.subplots(figsize = (6, 6))
new_df = X.copy()
new_df['target'] = y
p = jointplot('age', 'fare', data = new_df, kind = 'reg')
p.savefig('Desktop\oreilly\mlpr_0605.png', dpi = 300)
"""

### Matriz de pares

In [None]:
from seaborn import pairplot

In [None]:
"""
fig, ax = plt.subplots(figsize = (6, 6))
new_df = X.copy()
new_df['target'] = y
vars = ['pclass', 'age', 'fare']
p = pairplot(new_df, vars = vars, hue = 'target', kind = 'reg')
p.savefig('Desktop\oreilly\mlpr_0606.png', dpi = 300)
"""

### Gráfico de caixas e gráfico violino

In [None]:
from seaborn import boxplot

In [None]:
fig, ax = plt.subplots(figsize = (8, 6))
new_df = X.copy()
new_df['target'] = y
boxplot(x = 'target', y = 'age', data = new_df)
fig.savefig('Desktop\oreilly\mlpr_0607.png', dpi = 300)

In [None]:
from seaborn import violinplot

In [None]:
fig, ax = plt.subplots(figsize = (8, 6))
new_df = X.copy()
new_df['target'] = y
violinplot(x = 'target', y = 'sex_female', data = new_df)
fig.savefig('Desktop\oreilly\mlpr_0608.png', dpi = 300)

### Comparando dois valores ordinais

In [None]:
fig, ax = plt.subplots(figsize = (8, 6))
(X.assign(age_bin = pd.qcut(X.age, q = 10, labels = False), class_bin = pd.cut(X.pclass, bins = 3, labels = False)).groupby(
['age_bin', 'class_bin']).size().unstack().pipe(lambda df: df.div(df.sum(1), axis = 0)).plot.bar(stacked = True, width = 1, ax = ax, cmap = 'viridis').legend(bbox_to_anchor = (1, 1)))
fig.savefig('Desktop\oreilly\mlpr_0609.png', dpi = 300, bbox_inches = 'tight')

### Correlação

In [None]:
from yellowbrick.features import Rank2D

In [None]:
fig, ax = plt.subplots(figsize = (6, 6))
pcv = Rank2D(features = X.columns, algorithm = 'pearson')
pcv.fit(X, y)
pcv.transform(X)
pcv.poof()
fig.savefig('Desktop\oreilly\mlpr_0610.png', dpi = 300, bbox_inches = 'tight')

In [None]:
from seaborn import heatmap
fig, ax = plt.subplots(figsize = (8, 8))
ax = heatmap(X.corr(), fmt = '.2f', annot = True, ax = ax, cmap = 'RdBu_r', vmin = -1, vmax = 1)
fig.savefig('Desktop\oreilly\mlpr_0611.png', dpi = 300, bbox_inches = 'tight')

In [None]:
X.corr().iloc[:, :2]

In [None]:
# colunas com alto grau de correlação não agregam valor e podem prejudicar a interpretação da importância dos atributos e dos coeficientes de regressão.

In [None]:
def correlated_columns(df, threshold = 0.95):
    """List pairs of columns whose absolute Pearson correlation exceeds `threshold`.

    Parameters:
        df: input frame; non-numeric columns are ignored.
        threshold: minimum absolute correlation for a pair to be reported.

    Returns:
        A DataFrame with columns `level_0`, `level_1` (the two column names)
        and `pearson` (their correlation). Each pair is reported once (strict
        lower triangle), and pairs whose `level_0` label also appears
        somewhere in `level_1` are dropped.
    """
    # Only numeric/bool data can be correlated; selecting it up front keeps
    # the function working on frames that also carry string columns.
    corr = df.select_dtypes(include = ['number', 'bool']).corr()
    # Label the triangle with the correlation matrix's OWN labels: they can be
    # a subset of df.columns, and a mismatch would break the constructor.
    lower = pd.DataFrame(
        np.tril(corr, k = -1),
        columns = corr.columns,
        index = corr.index
    )
    pairs = (
        lower.stack()
        .rename('pearson')
        .rename_axis(['level_0', 'level_1'])
    )
    strong = pairs[pairs.abs() > threshold].reset_index()
    return strong.query('level_0 not in level_1')
correlated_columns(X)

In [None]:
# essa parte do código está apenas no capítulo 8
# Per-cabin aggregate features (min/max/mean/sum), merged back onto each row.
# Aggregation is restricted to the numeric columns: 'mean' is undefined for
# string columns and raises on recent pandas versions.
numeric_cols = df.select_dtypes(include = 'number').columns
agg = (
    df.groupby('cabin')[numeric_cols]
    .agg(['min', 'max', 'mean', 'sum'])
    .reset_index()
)
# Flatten the (column, statistic) MultiIndex into names like 'age_min';
# strip('_') keeps the key column as plain 'cabin'.
agg.columns = [
    '_'.join(c).strip('_')
    for c in agg.columns.values
]
agg_df = df.merge(agg, on = 'cabin')
agg_df
###

"""
c_df = correlated_columns(agg_df)
c_df.style.format({'pearson': '{:.2f}'})
"""

### RadViz

In [None]:
from yellowbrick.features import RadViz

In [None]:
fig, ax = plt.subplots(figsize = (6, 6))
rv = RadViz(
    classes = ['died', 'survived'],
    features = X.columns
)
rv.fit(X, y)
_ = rv.transform(X)
rv.poof()
fig.savefig('Desktop\oreilly\mlpr_0612.png', dpi = 300)

In [None]:
from pandas.plotting import radviz

In [None]:
fig, ax = plt.subplots(figsize = (6, 6))
new_df = X.copy()
new_df['target'] = y
radviz(new_df, 'target', ax = ax, colormap = 'PiYG')
fig.savefig('Desktop\oreilly\mlpr_0613.png', dpi = 300)

### Coordenadas paralelas

In [None]:
from yellowbrick.features import ParallelCoordinates

In [None]:
fig, ax = plt.subplots(figsize = (6, 4))
pc = ParallelCoordinates(
    classes = ['died', 'survived'],
    features = X.columns
)
pc.fit(X, y)
pc.transform(X)
ax.set_xticklabels(
    ax.get_xticklabels(), rotation = 45
)
pc.poof()
fig.savefig('Desktop\oreilly\mlpr_0614.png', dpi = 300)

In [None]:
from pandas.plotting import parallel_coordinates

In [None]:
fig, ax = plt.subplots(figsize = (6, 4))
new_df = X.copy()
new_df['target'] = y
parallel_coordinates(
    new_df,
    'target',
    ax = ax,
    colormap = 'viridis',
    alpha = 0.5
)
ax.set_xticklabels(
    ax.get_xticklabels(), rotation = 45
)
fig.savefig('Desktop\oreilly\mlpr_0615.png', dpi = 300)

# Capítulo 7 - Pré-processamento dos dados

In [None]:
X2 = pd.DataFrame(
    {
        'a': range(5),
        'b': [-100, -50, 0, 200, 1000]
    }
)
X2

### Padronize os dados

In [None]:
from sklearn import preprocessing

In [None]:
std = preprocessing.StandardScaler()
std.fit_transform(X2)

In [None]:
std.scale_

In [None]:
std.mean_

In [None]:
std.var_

In [None]:
# fazendo com pandas
X_std = (X2 - X2.mean()) / X2.std()
X_std

In [None]:
X_std.mean()

In [None]:
X_std.std()

In [None]:
""" 
# fazendo com biblioteca fastai 
X3 = X2.copy()
from fastai.tabular import scale_vars
scale_vars(X3, mapper = None)
X3.std()
X3.mean()
"""

### Escale para um intervalo

In [None]:
from sklearn import preprocessing

In [None]:
mms = preprocessing.MinMaxScaler()
mms.fit(X2)
mms.transform(X2)

In [None]:
# pandas
(X2 - X2.min()) / (X2.max() - X2.min())

### Variáveis Dummy

In [None]:
# também conhecido como one-hot encoding ou indicator encoding

X_cat = pd.DataFrame(
    {
        'name': ['George', 'Paul'],
        'inst': ['Bass', 'Guitar']
    }
)
X_cat

In [None]:
pd.get_dummies(X_cat, drop_first = True)

In [None]:
"""
import janitor as jn
X_cat2 = pd.DataFrame(
    {
        'A': [1, None, 3],
        'names': [
            'Fred,George',
            'George',
            'John,Paul'
        ]
    }
)
jn.expand_columns(X_cat2, 'names', sep = ',')
"""

### Codificador de rótulos

In [None]:
from sklearn import preprocessing

lab = preprocessing.LabelEncoder()
""" lab.fit_transform(X_cat) """

In [None]:
""" lab.inverse_transform([1, 1, 0]) """

In [None]:
X_cat.name.astype(
    'category'
).cat.as_ordered().cat.codes + 1

### Codificação de frequência

In [None]:
mapping = X_cat.name.value_counts()
X_cat.name.map(mapping)

### Extraindo categorias a partir de strings

In [None]:
from collections import Counter

In [None]:
# Module-level tally shared by every call to triples().
c = Counter()
def triples(val):
    """Count every substring of `val` that starts at each position and spans up to 3 characters.

    Slices near the end of the string are shorter than three characters and
    are counted as well. Results accumulate in the module-level Counter `c`;
    nothing is returned.
    """
    c.update(val[start : start + 3] for start in range(len(val)))
df.name.apply(triples)

In [None]:
c.most_common(10)

In [None]:
df.name.str.extract(
    '([A-Za-z]+)\.', expand = False
).head()

In [None]:
df.name.str.extract(
    '([A-Za-z]+)\.', expand = False
).value_counts()

### Outras codificações de categoria

In [None]:
import category_encoders as ce

In [None]:
he = ce.HashingEncoder(verbose = 1)
he.fit_transform(X_cat)

In [None]:
size_df = pd.DataFrame(
    {
        'name': ['Fred', 'John', 'Matt'],
        'size': ['small', 'med', 'xxl']
    }
)
ore = ce.OrdinalEncoder(
    mapping = [
        {
            'col': 'size',
            'mapping': {
                'small': 1,
                'med': 2,
                'lg': 3
            }
        }
    ]
)
ore.fit_transform(size_df)

In [None]:
def get_title(df):
    """Extract the honorific from `df.name`.

    Returns a Series holding the first run of letters that is immediately
    followed by a period (e.g. 'Mr', 'Miss'), or NaN where no such run exists.
    """
    # Raw string: '\.' in a normal literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions. The pattern itself
    # is unchanged.
    return df.name.str.extract(r'([A-Za-z]+)\.', expand = False)
te = ce.TargetEncoder(cols = 'Title')
te.fit_transform(
    df.assign(Title = get_title), df.survived
)['Title'].head()

### Engenharia de dados para datas

In [None]:
""" from fastai.tabular.transform import add_datepart """

In [None]:
"""
dates = pd.DataFrame(
    {
        'A': pd.to_datetime(
            ['9/17/2001', 'Jan 1, 2002']
        )
    }
)
add_datepart(dates, 'A')
dates.T
"""

### Adição do atributo col_na

In [None]:
from pandas.api.types import is_numeric_dtype

In [None]:
def fix_missing(df, col, name, na_dict):
    """Fill missing values of a numeric column and record an indicator column.

    Parameters:
        df: frame modified in place.
        col: the Series to inspect (the data of column `name`).
        name: column label; the indicator is stored as `name + '_na'`.
        na_dict: mapping of column name -> fill value. A value already present
            is reused; otherwise the column median is used and stored.

    Returns:
        `na_dict`, updated in place when the column was filled. Non-numeric
        columns, and numeric columns with no missing values and no entry in
        `na_dict`, are left untouched.
    """
    if not is_numeric_dtype(col):
        return na_dict

    missing = pd.isnull(col)
    known = name in na_dict
    if not (missing.sum() or known):
        return na_dict

    df[name + '_na'] = missing
    if known:
        fill_value = na_dict[name]
    else:
        fill_value = col.median()
    df[name] = col.fillna(fill_value)
    na_dict[name] = fill_value
    return na_dict
data = pd.DataFrame({'A': [0, None, 5, 100]})
fix_missing(data, data.A, 'A', {})
data

In [None]:
data = pd.DataFrame({'A': [0, None, 5, 100]})
data['A_na'] = data.A.isnull()
data['A'] = data.A.fillna(data.A.median())
data

### Engenharia de dados manual

In [None]:
# Per-cabin aggregate features (min/max/mean/sum), merged back onto each row.
# Aggregation is restricted to the numeric columns: 'mean' is undefined for
# string columns and raises on recent pandas versions.
numeric_cols = df.select_dtypes(include = 'number').columns
agg = (
    df.groupby('cabin')[numeric_cols]
    .agg(['min', 'max', 'mean', 'sum'])
    .reset_index()
)
# Flatten the (column, statistic) MultiIndex into names like 'age_min';
# strip('_') keeps the key column as plain 'cabin'.
agg.columns = [
    '_'.join(c).strip('_')
    for c in agg.columns.values
]
agg_df = df.merge(agg, on = 'cabin')
agg_df

# Capítulo 8 - Seleção de atributos

### Colunas colineares

In [None]:
# Collect the columns that are almost perfectly correlated with an earlier one.
limit = 0.95
# Correlate numeric/bool columns only: agg_df still carries string columns,
# on which DataFrame.corr() raises in recent pandas versions.
corr = agg_df.select_dtypes(include = ['number', 'bool']).corr()
# Keep the strict upper triangle so each pair is inspected once and the
# diagonal (always 1.0) is ignored.
mask = np.triu(
    np.ones(corr.shape), k = 1
).astype(bool)
corr_no_diag = corr.where(mask)
coll = [
    c
    for c in corr_no_diag.columns
    if (corr_no_diag[c].abs() > limit).any() # the book says threshold here, but the variable is limit
]
coll

In [None]:
"""
rfpimp.plot_dependence_heatmap(
    rfpimp.feature_dependence_matrix(X_train),
    value_fontsize = 12,
    label_fontsize = 14,
    figsize = (8,8), sn
)
fig = plt.gcf()
fig.savefig('Desktop\oreilly\mlpr_0801.png', dpi = 300, bbox_inches = 'tight')
"""

In [None]:
cols_to_remove = ['pclass', 'sibsp', 'parch', 'embarked_Q']

In [None]:
rf3 = RandomForestClassifier(random_state = 42)
rf3.fit(
    X_train[
        [
            c
            for c in X_train.columns
            if c not in cols_to_remove
        ]
    ],
    y_train
)
rf3.score(
    X_test[
        [
            c
            for c in X_train.columns
            if c not in cols_to_remove
        ]
    ],
    y_test,
)

In [None]:
rf4 = RandomForestClassifier(random_state = 42)
rf4.fit(X_train, y_train)
rf4.score(X_test, y_test)

### Regressão Lasso

In [None]:
from sklearn import linear_model
from sklearn.pipeline import make_pipeline

In [None]:
"""
model = make_pipeline(StandardScaler(with_mean=False), LassoLarsCV())
#model = linear_model.LassoLarsCV(cv = 10, max_n_alphas = 10).fit(X_train, y_train)
fig, ax = plt.subplots(figsize = (12, 8))
cm = iter(
    plt.get_cmap('tab20')(
        np.linspace(0, 1, X.shape[1])
    )
)
for i in range(X.shape[1]):
    c = next(cm)
    ax.plot(
        model.alphas_,
        model.coef_path_.T[:, i],
        c = c,
        alpha = 0.8,
        label = X.columns[i]
    )
ax.axvline(
    model.alpha_,
    linestyle = '-',
    c = 'k',
    label = 'alphaCV'
)
plt.ylabel('Regression Coefficients')
ax.legend(X.columns, bbox_to_anchor = (1, 1))
plt.xlabel('alpha')
plt.title(
    'Regression Coefficients Progression for Lasso Paths'
)
fig.savefig('Desktop\oreilly\mlpr_0802.png',
    dpi = 300,
    bbox_inches = 'tight'
)
"""

### Eliminação recursiva de atributos

In [None]:
from yellowbrick.features import RFECV

In [None]:
fig, ax = plt.subplots(figsize = (6, 4))
rfe = RFECV(
    ensemble.RandomForestClassifier(
        n_estimators = 100
    ),
    cv = 5
)
rfe.fit(X, y)
rfe.rfe_estimator_.ranking_

In [None]:
rfe.rfe_estimator_.n_features_

In [None]:
rfe.rfe_estimator_.support_

In [None]:
rfe.poof()
fig.savefig('Desktop\oreilly\mlpr_0803.png', dpi = 300)

In [None]:
from sklearn.feature_selection import RFE

In [None]:
"""
model = ensemble.RandomForestClassifier(
    n_estimators = 100
)
rfe = RFE(model, 4)
rfe.fit(X, y)
agg_X.columns[rfe.support_]
"""

### Informações mútuas

In [None]:
from sklearn import feature_selection

In [None]:
mic = feature_selection.mutual_info_classif(
    X, y
)
fig, ax = plt.subplots(figsize = (10, 8))
(
    pd.DataFrame(
        {'feature': X.columns, 'vimp': mic}
    )
    .set_index('feature')
    .plot.barh(ax = ax)
)
fig.savefig('Desktop\oreilly\mlpr_0804.png')

# Capítulo 9 - Classes desbalanceadas

### Use uma métrica diferente

### Algoritmos baseados em árvores e ensembles

### Modelos de penalização

### Upsampling da minoria

In [None]:
from sklearn.utils import resample

In [None]:
mask = df.survived == 1
surv_df = df[mask]
death_df = df[~mask]
df_upsample = resample(
    surv_df,
    replace = True,
    n_samples = len(death_df),
    random_state = 42
)
df2 = pd.concat([death_df, df_upsample])
df2.survived.value_counts()

In [None]:
from imblearn.over_sampling import RandomOverSampler

In [None]:
ros = RandomOverSampler(random_state = 42)
X_ros, y_ros = ros.fit_resample(X, y)
pd.Series(y_ros).value_counts()

### Gerando dados de minoria

In [None]:
# SMOTE - Synthetic Minority Oversampling Technique -> over_sampling.SMOTE
# ADASYN - Adaptive Synthetic -> over_sampling.ADASYN

### Downsampling da maioria

In [None]:
from sklearn.utils import resample

In [None]:
mask = df.survived == 1
surv_df = df[mask]
death_df = df[~mask]
df_downsample = resample(
    death_df,
    replace = False,
    n_samples = len(surv_df),
    random_state = 42
)
df3 = pd.concat([surv_df, df_downsample])
df3.survived.value_counts()

# Capítulo 10 - Classificação

In [None]:
# fit(X, y[, sample_weight]) - > faz a adequação do modelo
# predict(X) -> faz a predição de classes
# predict_log_proba(X) - > faz a predição do logaritmo das probabilidades
# predict_proba(X) -> faz a predição da probabilidade
# score(X, y[, sample_weight]) -> obtém a acurácia (accuracy)

### Regressão Logística

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lr = LogisticRegression(random_state = 42)
lr.fit(X_train, y_train)

In [None]:
lr.score(X_test, y_test)

In [None]:
lr.predict(X.iloc[[0]])

In [None]:
lr.predict_proba(X.iloc[[0]])

In [None]:
lr.predict_log_proba(X.iloc[[0]])

In [None]:
lr.decision_function(X.iloc[[0]])

In [None]:
lr.intercept_

In [None]:
def inv_logit(p):
    """Map log-odds `p` to a probability with the logistic function.

    Accepts a scalar or array-like and returns the element-wise value of
    exp(p) / (1 + exp(p)).

    It is evaluated through the identity 0.5 * (1 + tanh(p / 2)), which is
    mathematically the same but cannot overflow: the direct formula turns
    into inf / inf = nan for large positive `p`.
    """
    half = np.asarray(p) * 0.5
    return 0.5 * (1 + np.tanh(half))
inv_logit(lr.intercept_)

In [None]:
cols = X.columns
for col, val in sorted(
    zip(cols, lr.coef_[0]),
    key = lambda x: x[1],
    reverse = True
):
    print(f'{col:10}{val:10.3f} {inv_logit(val):10.3f}')

In [None]:
from yellowbrick.model_selection import FeatureImportances

In [None]:
# Feature-importance plot for the logistic regression model.
fig, ax = plt.subplots(figsize = (6 ,4))
fi_viz = FeatureImportances(lr)
fi_viz.fit(X, y)
fi_viz.poof()
# Saved under its own chapter-10 file name: the previous name, mlpr_0804.png,
# is the file the mutual-information figure is written to, so it got
# overwritten. The raw string keeps the backslashes literal without relying
# on invalid escape sequences.
fig.savefig(r'Desktop\oreilly\mlpr_1001.png', dpi = 300)

### Naive Bayes

In [None]:
# GaussianNB, MultinomialNB, BernoulliNB

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
nb = GaussianNB()
nb.fit(X_train, y_train)

In [None]:
nb.score(X_test, y_test)

In [None]:
nb.predict(X.iloc[[0]])

In [None]:
nb.predict_proba(X.iloc[[0]])

In [None]:
nb.predict_log_proba(X.iloc[[0]])

### Máquinas de vetores de suporte

In [None]:
from sklearn.svm import SVC

In [None]:
svc = SVC(random_state = 42, probability = True)
svc.fit(X_train, y_train)

In [None]:
svc.score(X_test, y_test)

In [None]:
svc.predict(X.iloc[[0]])

In [None]:
svc.predict_proba(X.iloc[[0]])

In [None]:
svc.predict_log_proba(X.iloc[[0]])

### K vizinhos mais próximos

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knc = KNeighborsClassifier()
knc.fit(X_train, y_train)

In [None]:
knc.score(X_test, y_test)

In [None]:
knc.predict(X.iloc[[0]])

In [None]:
knc.predict_proba(X.iloc[[0]])

### Árvore de Decisão

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dt = DecisionTreeClassifier(
    random_state = 42, max_depth = 3
)
dt.fit(X_train, y_train)

In [None]:
dt.score(X_test, y_test)

In [None]:
dt.predict(X.iloc[[0]])

In [None]:
dt.predict_proba(X.iloc[[0]])

In [None]:
dt.predict_log_proba(X.iloc[[0]])

In [None]:
import pydotplus
from io import StringIO
from sklearn.tree import export_graphviz

In [None]:
"""
dot_data = StringIO()
tree.export_graphviz(
    dt,
    out_file = dot_data,
    feature_names = X.columns,
    class_names = ['Died', 'Survived'],
    filled = True,
)
g = pydotplus.graph_from_dot_data(
    dot_data.getvalue()
)
g.write_png('Desktop\oreilly\mlpr_1002.png')
"""

In [None]:
from IPython.display import Image

""" Image(g.create_png()) """

In [None]:
"""
viz = dtreeviz.trees.dtreeviz(
    dt,
    X,
    y,
    target_name = 'survived',
    feature_names = X.columns,
    class_names = ['died', 'survived']
)
viz
"""

In [None]:
for col, val in sorted(
    zip(X.columns, dt.feature_importances_),
    key = lambda x: x[1],
    reverse = True
)[:5]:
    print(f'{col:10}{val:10.3f}')

In [None]:
from yellowbrick.model_selection import FeatureImportances

In [None]:
fig, ax = plt.subplots(figsize = (6, 4))
fi_viz = FeatureImportances(dt)
fi_viz.fit(X, y)
fi_viz.poof()
fig.savefig('Desktop\oreilly\mlpr_1004.png', dpi = 300)

### Floresta aleatória

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = RandomForestClassifier(random_state = 42)
rf.fit(X_train, y_train)

In [None]:
rf.score(X_test, y_test)

In [None]:
rf.predict(X.iloc[[0]])

In [None]:
rf.predict_proba(X.iloc[[0]])

In [None]:
rf.predict_log_proba(X.iloc[[0]])

In [None]:
import rfpimp

In [None]:
rf = RandomForestClassifier(random_state = 42)
rf.fit(X_train, y_train)
rfpimp.importances(
    rf, X_test, y_test
).Importance

### XGBoost

In [None]:
import xgboost as xgb

In [None]:
xgb_class = xgb.XGBClassifier(random_state = 42)
xgb_class.fit(
    X_train,
    y_train,
    early_stopping_rounds = 10,
    eval_set = [(X_test, y_test)]
)

In [None]:
xgb_class.score(X_test, y_test)

In [None]:
xgb_class.predict(X.iloc[[0]])

In [None]:
xgb_class.predict_proba(X.iloc[[0]])

In [None]:
for col, val in sorted(
    zip(
        X.columns,
        xgb_class.feature_importances_,
    ),
    key = lambda x: x[1],
    reverse = True
)[:5]:
    print(f'{col:10}{val:10.3f}')

In [None]:
fig, ax = plt.subplots(figsize = (6, 4))
xgb.plot_importance(xgb_class, ax = ax)
fig.savefig('Desktop\oreilly\mlpr_1005.png', dpi = 300)

In [None]:
fig, ax = plt.subplots(figsize = (6, 4))
fi_viz = FeatureImportances(xgb_class)
fi_viz.fit(X, y)
fi_viz.poof()
fig.savefig('Desktop\oreilly\mlpr_1006.png', dpi = 300)

In [None]:
booster = xgb_class.get_booster()
print(booster.get_dump()[0])

In [None]:
"""
fig, ax = plt.subplots(figsize = (6, 4))
xgb.plot_tree(xgb_class, ax = ax, num_trees = 0)
fig.savefig('Desktop\oreilly\mlpr_1006.png', dpi = 300)
"""

In [None]:
import xgbfir

In [None]:
xgbfir.saveXgbFI(
    xgb_class,
    feature_names = X.columns,
    OutputXlsxFile = 'fir.xlsx'
)

In [None]:
pd.read_excel('fir.xlsx').head(3).T
#http://localhost:8888/edit/fir.xlsx

In [None]:
pd.read_excel(
    'fir.xlsx',
    sheet_name = 'Interaction Depth 1'
).head(2)

In [None]:
pd.read_excel(
    'fir.xlsx',
    sheet_name = 'Interaction Depth 2'
).head(1).T

In [None]:
pd.read_excel(
    'fir.xlsx',
    sheet_name = 'Interaction Depth 2'
)[['Interaction', 'Gain']].head()

### Gradient Boosted com LightGBM

In [None]:
""" import lightgbm as lgb """

In [None]:
"""
lbgm_class = lgb.LGBMClassifier(random_state = 42)
lgbm_class.fit(X_train, y_train)
"""

In [None]:
""" lgbm_class.score(X_test, y_test) """

In [None]:
""" lgbm_class.predict(X.iloc[[0]]) """

In [None]:
""" lgbm_class.predict_proba(X.iloc[[0]]) """

In [None]:
"""
for col, val in sorted(
    zip(cols, lgbm_class.feature_importances_),
    key = lambda x: x[1],
    reverse = True
)[:5]:
    print(f'{col:10}{val:10.3f}'')
"""

In [None]:
"""
fig, ax = plt.subplots(figsize = (6, 4))
lgb.plot_importances(lgbm_class, ax = ax)
fig.tight_layout()
fig.savefig('Desktop\oreilly\mlpr_1008.png', dpi = 300)
"""

In [None]:
"""
fig, ax = plt.subplots(figsize = (6, 4))
lgb.plot_tree(lgb_class, tree_index = 0, ax = ax)
fig.savefig('Desktop\oreilly\mlpr_1009.png', dpi = 300)
"""

### TPOT

In [None]:
# demora muito pra rodar

In [None]:
from tpot import TPOTClassifier

In [None]:
tc = TPOTClassifier(generations = 2)
tc.fit(X_train, y_train)
tc.score(X_test, y_test)

In [None]:
tc.predict(X.iloc[[0]])

In [None]:
tc.predict_proba(X.iloc[[0]])

In [None]:
tc.export('tpot_exported_pipeline.py')

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import Normalizer
from tpot.builtins import StackingEstimator

In [None]:
"""
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep = 'COLUMN_SEPARATOR', dtype = np.float64)
features = tpot_data.drop('target', axis = 1).values
training_features, testing_features, training_target, testing_target = train_test_split(features, tpot_data['target'].values, random_state = 42)
"""

In [None]:
"""
exported_pipeline = make_pipeline(
    Normalizer(norm = 'max'),
    StackingEstimator(
        estimator = ExtraTreesClassifier(bootstrap = True,
            criterion = 'gini', max_features = 0.85,
            min_samples_leaf = 2, min_samples_split = 19,
            n_estimators = 100)),
    ExtraTreesClassifier(bootstrap = False,
        criterion = 'entropy', max_features = 0.3,
        min_samples_leaf = 13, min_samples_split = 9,
        n_estimators = 100)
)
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""

# Capítulo 11 - Seleção do modelo

### Curva de validação

In [None]:
from yellowbrick.model_selection import ValidationCurve

In [None]:
fig, ax = plt.subplots(figsize = (6, 4))
vc_viz = ValidationCurve(RandomForestClassifier(n_estimators = 100),
    param_name = 'max_depth',
    param_range = np.arange(1, 11),
    cv = 10,
    n_jobs = -1
)
vc_viz.fit(X, y)
vc_viz.poof()
fig.savefig('Desktop\oreilly\mlpr_1101.png', dpi = 300)

In [None]:
# scoring para classificação
# accuracy, average_precision, f1, f1_micro, f1_macro, f1_weighted, f1_samples, neg_log_loss, precision, recall, roc_auc

# scoring para clustering
# adjusted_mutual_info_score, adjusted_rand_score, completeness_score, fowlkes_mallows_score, homogeneity_score, mutual_info_score, normalized_mutual_info_score, v_measure_score

# scoring para regressão
# explained_variance, neg_mean_absolute_error, neg_mean_squared_error, neg_mean_squared_log_error, neg_median_absolute_error

### Curva de aprendizagem

In [None]:
from yellowbrick.model_selection import LearningCurve

In [None]:
fig, ax = plt.subplots(figsize = (6, 4))
lc3_viz = LearningCurve(
    RandomForestClassifier(n_estimators = 100),
    cv = 10
)
lc3_viz.fit(X, y)
lc3_viz.poof()
fig.savefig('Desktop\oreilly\mlpr_1102.png', dpi = 300)

# Capítulo 12 - Métricas e avaliação de classificação

In [None]:
# Tally the four confusion-matrix cells by hand from the decision tree's
# predictions on the test split (class 1 is the positive class).
y_predict = dt.predict(X_test)
is_positive = y_test == 1
is_negative = y_test == 0
is_hit = y_test == y_predict
is_miss = y_test != y_predict
tp = (is_positive & is_hit).sum()   # true positives  (original note: 123)
tn = (is_negative & is_hit).sum()   # true negatives  (original note: 199)
fp = (is_negative & is_miss).sum()  # false positives (original note: 25)
fn = (is_positive & is_miss).sum()  # false negatives (original note: 46)

In [None]:
tp, tn, fp, fn

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
y_predict = dt.predict(X_test)
pd.DataFrame(
    confusion_matrix(y_test, y_predict),
    columns = [
        'Predict died',
        'Predict Survive'
    ],
    index = ['True Death', 'True Survive']
)

In [None]:
import matplotlib.pyplot as plt
from yellowbrick.classifier import ConfusionMatrix

In [None]:
mapping = {0: 'died', 1: 'survived'}
fig, ax = plt.subplots(figsize = (6, 6))
cm_viz = ConfusionMatrix(
    dt,
    classes = ['died', 'survived'],
    label_encoder = mapping
)
cm_viz.score(X_test, y_test)
cm_viz.poof()
fig.savefig('Desktop\oreilly\mlpr_1202.png', dpi = 300)

### Métricas

### Acurácia

In [None]:
(tp + tn) / (tp + tn + fp + fn)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
y_predict = dt.predict(X_test)
accuracy_score(y_test, y_predict)

### Recall

In [None]:
# Recall = TP / (TP + FN): the share of actual positives the model found.
# The denominator must count false negatives, not true negatives.
tp / (tp + fn)

In [None]:
from sklearn.metrics import recall_score

In [None]:
y_predict = dt.predict(X_test)
recall_score(y_test, y_predict)

### Precisão

In [None]:
tp / (tp + fp)

In [None]:
from sklearn.metrics import precision_score

In [None]:
y_predict = dt.predict(X_test)
precision_score(y_test, y_predict)

### F1

In [None]:
pre = tp / (tp + fp)
rec = tp / (tp + fn)
2 * pre * rec / (pre + rec)

In [None]:
from sklearn.metrics import f1_score

In [None]:
y_predict = dt.predict(X_test)
f1_score(y_test, y_predict)

### Relatório de Classificação

In [None]:
import matplotlib.pyplot as plt
from yellowbrick.classifier import ClassificationReport

In [None]:
fig, ax = plt.subplots(figsize = (6, 3))
cm_viz = ClassificationReport(
    dt,
    classes = ['died', 'survived'],
    label_encoder = mapping
)
cm_viz.score(X_test, y_test)
cm_viz.poof()
fig.savefig('Desktop\oreilly\mlpr_1203.png', dpi = 300)

## ROC

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
y_predict = dt.predict(X_test)
roc_auc_score(y_test, y_predict)

In [None]:
from yellowbrick.classifier import ROCAUC

In [None]:
"""
fig, ax = plt.subplots(figsize = (6, 6))
roc_viz = ROCAUC(dt)
roc_viz.score(X_test, y_test)
"""

In [None]:
"""
roc_viz.poof()
fig.savefig('Desktop\oreilly\mlpr_1204.png', dpi = 300)
"""

### Curva de precisão-recall

In [None]:
from sklearn.metrics import average_precision_score

In [None]:
y_predict = dt.predict(X_test)
average_precision_score(y_test, y_predict)

In [None]:
from yellowbrick.classifier import PrecisionRecallCurve

In [None]:
fig, ax = plt.subplots(figsize = (6, 4))
viz = PrecisionRecallCurve(DecisionTreeClassifier(max_depth = 3))
viz.fit(X_train, y_train)
print(viz.score(X_test, y_test))

In [None]:
viz.poof()
fig.savefig('Desktop\oreilly\mlpr_1205.png', dpi = 300)

### Gráfico de ganhos cumulativos

In [None]:
import scikitplot

In [None]:
fig, ax = plt.subplots(figsize = (6, 6))
y_probas = dt.predict_proba(X_test)
scikitplot.metrics.plot_cumulative_gain(
    y_test, y_probas, ax = ax
)
fig.savefig('Desktop\oreilly\mlpr_1206.png', dpi = 300, bbox_inches = 'tight')

### Gráfico de elevação

In [None]:
fig, ax = plt.subplots(figsize = (6, 6))
y_probas = dt.predict_proba(X_test)
scikitplot.metrics.plot_lift_curve(
    y_test, y_probas, ax = ax
)
fig.savefig('Desktop\oreilly\mlpr_1207.png', dpi = 300, bbox_inches = 'tight')

### Balanceamento das classes

In [None]:
from yellowbrick.classifier import ClassBalance

In [None]:
# Bar chart of the class counts in the test labels.
fig, ax = plt.subplots(figsize = (6, 6))
cb_viz = ClassBalance(labels = ['Died', 'Survived'])
cb_viz.fit(y_test)
cb_viz.poof()
# Raw string: the backslashes are literal path separators, not escapes.
fig.savefig(r'Desktop\oreilly\mlpr_1208.png', dpi = 300, bbox_inches = 'tight')

### Erro de predição de classe

In [None]:
from yellowbrick.classifier import ClassPredictionError

In [None]:
fig, ax = plt.subplots(figsize = (6, 3))
cpe_viz = ClassPredictionError(dt, classes = ['died', 'survived'])
cpe_viz.score(X_test, y_test)
cpe_viz.poof()
fig.savefig('Desktop\oreilly\mlpr_1209.png', dpi = 300)

### Limiar de discriminação

In [None]:
from yellowbrick.classifier import DiscriminationThreshold

In [None]:
fig, ax = plt.subplots(figsize = (6, 5))
dt_viz = DiscriminationThreshold(dt)
dt_viz.fit(X, y)
dt_viz.poof()
fig.savefig('Desktop\oreilly\mlpr_1210.png', dpi = 300)

# Capítulo 13 - Explicando os modelos

In [None]:
dt = DecisionTreeClassifier(random_state = 42, max_depth = 3)
dt.fit(X_train, y_train)

### Coeficientes de regressão

### Importância dos atributos

### LIME

In [None]:
from lime import lime_tabular

In [None]:
explainer = lime_tabular.LimeTabularExplainer(
    X_train.values,
    feature_names = X.columns,
    class_names = ['died', 'survived']
)
exp = explainer.explain_instance(
    X_train.iloc[-1].values, dt.predict_proba
)
exp

In [None]:
fig = exp.as_pyplot_figure()
fig.tight_layout()
fig.savefig('Desktop\oreilly\mlpr_1301.png', dpi = 300)

In [None]:
data = X_train.iloc[-2].values.copy()
dt.predict_proba(
    [data]
) # previsão de que uma mulher sobreviva

In [None]:
data[5] = 1 # muda para o sexo masculino
dt.predict_proba([data])

### Interpretação de árvores

In [None]:
from treeinterpreter import treeinterpreter as ti

In [None]:
instances = X.iloc[:2]
prediction, bias, contribs = ti.predict(
    rf5, instances
)
i = 0
print('Instance', i)
print('Prediction', prediction[i])
print('Bias (trainset mean)', bias[i])
print('Feature contributions:')
for c, feature in zip(contribs[i], instances.columns):
    print(' {} {}'.format(feature, c))

### Gráficos de dependência parcial

In [None]:
rf5 = ensemble.RandomForestClassifier(**{'max_features': 'auto', 'min_samples_leaf': 0.1, 'n_estimators': 200, 'random_state': 42})
rf5.fit(X_train, y_train)

In [None]:
from pdpbox import pdp

In [None]:
feat_name = 'age'
p = pdp.pdp_isolate(rf5, X, X.columns, feat_name)
fig, _ = pdp.pdp_plot(p, feat_name, plot_lines = True)
fig.savefig('Desktop\oreilly\mlpr_1302.png', dpi = 300)

In [None]:
features = ['fare', 'sex_male']
p = pdp.pdp_interact(rf5, X, X.columns, features)
fig, _ = pdp.pdp_interact_plot(p, features)
fig.savefig('Desktop\oreilly\mlpr_1303.png', dpi = 300)

### Modelos Substitutos

In [None]:
from sklearn import svm

In [None]:
sv = svm.SVC()
sv.fit(X_train, y_train)
sur_dt = tree.DecisionTreeClassifier()
sur_dt.fit(X_test, sv.predict(X_test))
for col, val in sorted(zip(X_test.columns, sur_dt.feature_importances_), key = lambda x: x[1], reverse = True)[:7]:
    print(f'{col:10}{val:10.3f}')

### Shapley

In [None]:
rf5.predict_proba(X_test.iloc[[20]])

In [None]:
import shap

In [None]:
# Explain the prediction for test row 20 with SHAP values from the forest.
s = shap.TreeExplainer(rf5)
shap_vals = s.shap_values(X_test)
# Index of the class whose SHAP values are plotted; later cells reuse it.
target_idx = 1
shap.force_plot(s.expected_value[target_idx],
               shap_vals[target_idx][20, :],
               feature_names = X_test.columns)

In [None]:
shap.force_plot(s.expected_value[1],
               shap_vals[1],
               feature_names = X_test.columns)

In [None]:
fig, ax = plt.subplots(figsize = (6, 4))
res = shap.dependence_plot('age', shap_vals[target_idx], X_test, feature_names = X_test.columns, alpha = 0.7)
fig.savefig('Desktop\oreilly\mlpr_1306.png', dpi = 300)

In [None]:
fig, ax = plt.subplots(figsize = (6,4))
shap.summary_plot(shap_vals[0], X_test)
fig.savefig('Desktop\oreilly\mlpr_1307.png', dpi = 300)

# Capítulo 14 - Regressão

In [None]:
import pandas as pd
from sklearn.datasets import load_boston
from sklearn import model_selection, preprocessing

In [None]:
b = load_boston()

In [None]:
bos_X = pd.DataFrame(b.data, columns = b.feature_names)
bos_X

In [None]:
bos_y = b.target
bos_y

In [None]:
bos_X_train, bos_X_test, bos_y_train, bos_y_test = model_selection.train_test_split(bos_X, bos_y, test_size = 0.3, random_state = 42)
bos_sX = preprocessing.StandardScaler().fit_transform(bos_X)
bos_sX_train, bos_sX_test, bos_sy_train, bos_sy_test = model_selection.train_test_split(bos_sX, bos_y, test_size = 0.3, random_state = 42)

### Modelo de base

In [None]:
from sklearn.dummy import DummyRegressor

In [None]:
dr = DummyRegressor()
dr.fit(bos_X_train, bos_y_train)
dr.score(bos_X_test, bos_y_test)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
lr = LinearRegression()
lr.fit(bos_X_train, bos_y_train)

In [None]:
lr.score(bos_X_test, bos_y_test)

In [None]:
lr.coef_

In [None]:
lr2 = LinearRegression()
lr2.fit(bos_sX_train, bos_sy_train)

In [None]:
lr2.score(bos_sX_test, bos_sy_test)

In [None]:
lr2.intercept_

In [None]:
lr2.coef_

In [None]:
from yellowbrick.features import FeatureImportances

In [None]:
fig, ax = plt.subplots(figsize = (6, 4))
fi_viz = FeatureImportances(lr2, labels = bos_X.columns)
fi_viz.fit(bos_sX, bos_y)
fi_viz.poof()
fig.savefig('Desktop\oreilly\mlpr_1401.png', dpi = 300)

### SVMs

In [None]:
from sklearn.svm import SVR

In [None]:
svr = SVR()
svr.fit(bos_sX_train, bos_sy_train)

In [None]:
svr.score(bos_sX_test, bos_sy_test)

### K vizinhos mais próximos

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
knr = KNeighborsRegressor()
knr.fit(bos_sX_train, bos_sy_train)

In [None]:
knr.score(bos_sX_test, bos_sy_test)

### Árvore de decisão

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
dtr = DecisionTreeRegressor(random_state = 42)
dtr.fit(bos_X_train, bos_y_train)

In [None]:
dtr.score(bos_X_test, bos_y_test)

In [None]:
import pydotplus
from io import StringIO
from sklearn.tree import export_graphviz

In [None]:
"""
dot_data = StringIO()
tree.export_graphviz(dtr, out_file = dot_data, feature_names = bos_X.columns, filled = True)
g = pydotplus.graph_from_dot_data(dot_data.getvalue())
g.write_png('Desktop\oreilly\mlpr_1402.png')
"""

In [None]:
"""
# Show the rendered decision-tree PNG inline in the notebook.
from IPython.display import Image
Image(g.create_png())
"""

In [None]:
"""
dot_data = StringIO()
tree.export_graphviz(dtr, max_depth = 2, out_file = dot_data, feature_names = bos_X.columns, filled = True)
g = pydotplus.graph_from_dot_data(dot_data.getvalue())
g.write_png('Desktop\oreilly\mlpr_1403.png')
"""

In [None]:
"""
dtr3 = DecisionTreeRegressor(max_depth = 2)
dtr3.fit(bos_X_train, bos_y_train)
viz = dtreeviz.trees.dtreeviz(dtr3, bos_X, bos_y, target_name = 'price', feature_names = bos_X.columns)
viz
"""

In [None]:
for col, val in sorted(zip(bos_X.columns, dtr.feature_importances_), key = lambda x: x[1], reverse = True)[:5]:
    print(f'{col:10}{val:10.3f}')

### Floresta Aleatória

In [None]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(random_state = 42, n_estimators = 100)
rfr.fit(bos_X_train, bos_y_train)

In [None]:
rfr.score(bos_X_test, bos_y_test)

In [None]:
for col, val in sorted(zip(bos_X.columns, rfr.feature_importances_), key = lambda x: x[1], reverse = True)[:5]:
    print(f'{col:10}{val:10.3f}')

### Regressão XGBoost

In [None]:
xgr = xgb.XGBRegressor(random_state = 42)
xgr.fit(bos_X_train, bos_y_train)

In [None]:
xgr.score(bos_X_test, bos_y_test)

In [None]:
xgr.predict(bos_X.iloc[[0]])

In [None]:
for col, val in sorted(zip(bos_X.columns, xgr.feature_importances_), key = lambda x: x[1], reverse = True)[:5]:
    print(f'{col:10}{val:10.3f}')

In [None]:
fig, ax = plt.subplots(figsize = (6, 4))
xgb.plot_importance(xgr, ax = ax)
fig.savefig('Desktop\oreilly\mlpr_1405.png', dpi = 300)

In [None]:
fig, ax = plt.subplots(figsize = (6, 4))
fi_viz = FeatureImportances(xgr)
fi_viz.fit(bos_X_train, bos_y_train)
fi_viz.poof()
fig.savefig('Desktop\oreilly\mlpr_1406.png', dpi = 300)

In [None]:
booster = xgr.get_booster()
print(booster.get_dump()[0])

In [None]:
"""
fig, ax = plt.subplots(figsize = (6, 4))
xgb.plot_tree(xgr, ax = ax, num_trees = 0)
fig.savefig('Desktop\oreilly\mlpr_1407.png', dpi = 300)
"""

### Regressão LightGBM

In [None]:
""" import lightgbm as lgb """

In [None]:
"""
lgr = lgb.LGBMRegressor(random_state = 42)
lgr.fit(bos_X_train, bos_y_train)
"""

In [None]:
""" lgr.score(bos_X_test, bos_y_test) """

In [None]:
""" lgr.predict(bos_X.iloc[[0]]) """

In [None]:
"""
for col, val in sorted(zip(bos_X.columns, lgr.feature_importances_), key = lambda x: x[1], reverse = True)[:5]:
    print(f'{col:10}{val:10.3f}')
"""

In [None]:
"""
# Plot the LightGBM regressor's feature importances and save the figure.
fig, ax = plt.subplots(figsize = (6, 4))
lgb.plot_importance(lgr, ax = ax)
fig.tight_layout()
fig.savefig('Desktop\oreilly\mlpr_1408.png', dpi = 300)
"""

In [None]:
""" lgb.create_tree_digraph(lgbr) """

# Capítulo 15 - Métricas e avaliação de regressão

In [None]:
rfr = RandomForestRegressor(random_state = 42, n_estimators = 100)
rfr.fit(bos_X_train, bos_y_train)

In [None]:
from sklearn import metrics

In [None]:
rfr.score(bos_X_test, bos_y_test)

In [None]:
""" metrics.r2_score(bos_y_test, bos_y_test_pred) """

In [None]:
""" metrics.explained_variance_score(bos_y_test, bos_y_test_pred) """

In [None]:
""" metrics.mean_absolute_error(bos_y_test, bos_y_test_pred) """

In [None]:
""" metrics.mean_squared_error(bos_y_test, bos_y_test_pred) """

In [None]:
""" metrics.mean_squared_log_error(bos_y_test, bos_y_test_pred) """

In [None]:
from yellowbrick.regressor import ResidualsPlot

In [None]:
fig, ax = plt.subplots(figsize = (6, 4))
rpv = ResidualsPlot(rfr)
rpv.fit(bos_X_train, bos_y_train)
rpv.score(bos_X_test, bos_y_test)
rpv.poof()
fig.savefig('Desktop\oreilly\mlpr_1501.png', dpi = 300)

### Heterocedasticidade

In [None]:
import statsmodels.stats.api as sms

In [None]:
# In the book `resids` is only defined in the next cell even though it is
# already needed here, so it is computed up front.
resids = bos_y_test - rfr.predict(bos_X_test)

# Breusch-Pagan test on the random-forest residuals.
hb = sms.het_breuschpagan(resids, bos_X_test)
labels = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
# Pair each returned statistic with its label before printing.
for name, num in zip(labels, hb):
    print(f'{name}: {num:.2}')

### Resíduos com distribuição normal

In [None]:
fig, ax = plt.subplots(figsize = (6, 4))
resids = bos_y_test - rfr.predict(bos_X_test)
pd.Series(resids, name = 'residuals').plot.hist(bins = 20, ax = ax, title = 'Residual Histogram')
fig.savefig('Desktop\oreilly\mlpr_1502.png', dpi = 300)

In [None]:
from scipy import stats

In [None]:
fig, ax = plt.subplots(figsize = (6, 4))
_ = stats.probplot(resids, plot = ax)
fig.savefig('Desktop\oreilly\mlpr_1503.png', dpi = 300)

In [None]:
stats.kstest(resids, cdf = 'norm')

In [None]:
from yellowbrick.regressor import PredictionError

In [None]:
fig, ax = plt.subplots(figsize = (6, 6))
pev = PredictionError(rfr)
pev.fit(bos_X_train, bos_y_train)
pev.score(bos_X_test, bos_y_test)
pev.poof()
fig.savefig('Desktop\oreilly\mlpr_1504.png', dpi = 300)

# Capítulo 16 - Explicando os modelos de regressão

In [None]:
import xgboost as xgb

In [None]:
# Bind the regressor to `xgr`. Assigning it to `xgb` would shadow the
# xgboost module and leave `xgr` pointing at whatever an earlier cell
# created.
xgr = xgb.XGBRegressor(random_state = 42, base_score = 0.5)
xgr.fit(bos_X_train, bos_y_train)

### Shapley

In [None]:
sample_idx = 5
xgr.predict(bos_X.iloc[[sample_idx]])

In [None]:
""" import shap """

In [None]:
""" 
shap.initjs()
exp = shap.TreeExplainer(xgr)
vals = exp.shap_values(bos_X)
"""

In [None]:
""" shap.force_plot(exp.expected_value, vals[sample_idx], bos_X.iloc[sample_idx]) """

In [None]:
""" shap.force_plot(exp.expected_value, vals, bos_X) """

In [None]:
"""
fig, ax = plt.subplots(figsize = (6, 4))
shap.dependence_plot('LSTAT', vals, bos_X)
fig.savefig('Desktop\oreilly\mlpr_1603.png', dpi = 300)
"""

In [None]:
"""
fig, ax = plt.subplots(figsize = (6, 4))
shap.dependence_plot('DIS', vals, bos_X, interaction_index = 'RM')
fig.savefig('Desktop\oreilly\mlpr_1604.png', dpi = 300)
"""

### Redução de dimensionalidade

In [None]:
ti_df = tweak_titanic(orig_df)
std_cols = 'pclass,age,sibsp,fare'.split(',')
X_train, X_test, y_train, y_test = get_train_test_X_y(ti_df, 'survived', std_cols = std_cols)
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

In [None]:
X, y

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
pca = PCA(random_state = 42)
X_pca = pca.fit_transform(StandardScaler().fit_transform(X))
pca.explained_variance_ratio_

In [None]:
pca.components_[0]

In [None]:
fig, ax = plt.subplots(figsize = (6, 4))
ax.plot(pca.explained_variance_ratio_)
ax.set(xlabel = 'Component', ylabel = 'Percent of Explained variance', title = 'Scree Plot', ylim = (0, 1))
fig.savefig('Desktop\oreilly\mlpr_1701.png', dpi = 300)

In [None]:
fig, ax = plt.subplots(figsize = (6, 4))
ax.plot(np.cumsum(pca.explained_variance_ratio_))
ax.set(xlabel = 'Component', ylabel = 'Percent of Explained variance', title = 'Cumulative Variance', ylim = (0, 1))
fig.savefig('Desktop\oreilly\mlpr_1702.png', dpi = 300)

In [None]:
fig, ax = plt.subplots(figsize = (6, 4))
plt.imshow(pca.components_.T, cmap = 'Spectral', vmin = -1, vmax = 1)
plt.yticks(range(len(X.columns)), X.columns)
plt.xticks(range(8), range(1, 9))
plt.xlabel('Principal Component')
plt.ylabel('Contribution')
plt.title('Contribution of Features to Components')
plt.colorbar()
fig.savefig('Desktop\oreilly\mlpr_1703.png', dpi = 300)

In [None]:
fig, ax = plt.subplots(figsize = (8, 4))
pd.DataFrame(pca.components_, columns = X.columns).plot(kind = 'bar', ax = ax).legend(bbox_to_anchor = (1, 1))
fig.savefig('Desktop\oreilly\mlpr_1704.png', dpi = 300)

In [None]:
# Gather every feature whose absolute loading exceeds `min_val` on any of
# the first `num_components` principal components.
comps = pd.DataFrame(pca.components_, columns = X.columns)
min_val = 0.5
num_components = 2
pca_cols = set()
for i in range(num_components):
    loadings = comps.iloc[i]
    parts = loadings[loadings.abs() > min_val]
    pca_cols |= set(parts.index)
pca_cols

In [None]:
from yellowbrick.features.pca import PCADecomposition

In [None]:
fig, ax = plt.subplots(figsize = (6, 4))
colors = ['rg'[j] for j in y]
pca_viz = PCADecomposition(color = colors)
pca_viz.fit_transform(X, y)
pca_viz.poof()
fig.savefig('Desktop\oreilly\mlpr_1705.png', dpi = 300)

In [None]:
fig, ax = plt.subplots(figsize = (6, 4))
pca_df = pd.DataFrame(X_pca, columns = [f'PC{i+1}' for i in range(X_pca.shape[1])])
pca_df['status'] = [('deceased', 'survived')[i] for i in y]
evr = pca.explained_variance_ratio_
ax.set_aspect(evr[1] / evr[0])
sns.scatterplot(x = 'PC1', y = 'PC2', hue = 'status', data = pca_df, alpha = 0.5, ax = ax)
fig.savefig('Desktop\oreilly\mlpr_1706.png', dpi = 300)

In [None]:
fig, ax = plt.subplots(figsize = (6, 4))
pca_df = pd.DataFrame(X_pca, columns = [f'PC{i+1}' for i in range(X_pca.shape[1])])
pca_df['status'] = [('deceased', 'survived')[i] for i in y]
evr = pca.explained_variance_ratio_
x_idx = 0 # x_pc
y_idx = 1 # y_pc
ax.set_aspect(evr[y_idx] / evr[x_idx])
x_col = pca_df.columns[x_idx]
y_col = pca_df.columns[y_idx]
sns.scatterplot(x = x_col, y = y_col, hue = 'status', data = pca_df, alpha = 0.5, ax = ax)
scale = 8
comps = pd.DataFrame(pca.components_, columns = X.columns)
for idx, s in comps.T.iterrows():
    plt.arrow(0, 0, s[x_idx] * scale, s[y_idx] * scale, color = 'k')
    plt.text(s[x_idx] * scale, s[y_idx] * scale, idx, weight = 'bold')
fig.savefig('Desktop\oreilly\mlpr_1707.png', dpi = 300)

In [None]:
fig, ax = plt.subplots(figsize = (6, 4))
pca_df = pd.DataFrame(X_pca, columns = [f'PC{i+1}' for i in range(X_pca.shape[1])])
pca_df['status'] = [('deceased', 'survived')[i] for i in y]
evr = pca.explained_variance_ratio_
ax.set_aspect(evr[3] / evr[0])
sns.scatterplot(x = 'PC1', y = 'PC4', hue = 'status', data = pca_df, alpha = 0.5, ax = ax)
fig.savefig('Desktop\oreilly\mlpr_1708.png', dpi = 300, bbox_inches = 'tight')

In [None]:
from bokeh.io import output_notebook
from bokeh import models, palettes, transform
from bokeh.plotting import figure, show

In [None]:
def bokeh_scatter(x, y, data, hue = None, label_cols = None, size = None, legend = None, alpha = 0.5):
    """Show an interactive Bokeh scatter plot in the notebook and return it.

    Parameters
    ----------
    x, y : str
        Names of the columns in `data` plotted on the x and y axes.
    data : DataFrame
        Source table; it is wrapped in a ``ColumnDataSource``.
    hue : str, optional
        Column whose values are mapped linearly onto the viridis palette to
        colour the markers.
    label_cols : iterable of str, optional
        Columns listed in the hover tooltip. Defaults to all of
        ``data.columns``.
    size : optional
        Marker size forwarded to ``fig.circle``.
    legend : optional
        Value forwarded to ``fig.circle`` as its ``legend`` argument.
    alpha : float
        Marker opacity.

    Returns
    -------
    The Bokeh figure that was shown.
    """
    output_notebook()
    # Only forward the optional glyph arguments the caller actually supplied.
    circle_kwargs = {}
    if legend:
        circle_kwargs['legend'] = legend
    if size:
        circle_kwargs['size'] = size
    if hue:
        color_seq = data[hue]
        mapper = models.LinearColorMapper(palette = palettes.viridis(256), low = min(color_seq), high = max(color_seq))
        circle_kwargs['fill_color'] = transform.transform(hue, mapper)
    ds = models.ColumnDataSource(data)
    # Identity check: `in None` would raise TypeError.
    if label_cols is None:
        label_cols = data.columns
    # One "(name, @name)" tooltip entry per column, sorted by column name.
    tool_tips = sorted([(col, '@{}'.format(col)) for col in label_cols], key = lambda tup: tup[0])
    hover = models.HoverTool(tooltips = tool_tips)
    fig = figure(tools = [hover, 'pan', 'zoom_in', 'zoom_out', 'reset'], toolbar_location = 'below')
    fig.circle(x, y, source = ds, alpha = alpha, **circle_kwargs)
    show(fig)
    return fig
res = bokeh_scatter('PC1', 'PC2', data = pca_df.assign(surv = y.reset_index(drop = True)), hue = 'surv', size = 10, legend = 'surv')

In [None]:
from yellowbrick.features.pca import PCADecomposition

In [None]:
colors = ['rg'[j] for j in y]
pca3_viz = PCADecomposition(proj_dim = 3, color = colors)
pca3_viz.fit_transform(X, y)
pca3_viz.finalize()
fig = plt.gcf()
plt.tight_layout()
fig.savefig('Desktop\oreilly\mlpr_1710.png', dpi = 300, bbox_inches = 'tight')

In [None]:
import scprep

In [None]:
scprep.plot.rotate_scatter3d(X_pca[:, :3], c = y, cmap = 'Spectral', figsize = (8, 6), label_prefix = 'Principal Component')

In [None]:
from mpl_toolkits.mplot3d import Axes3D

In [None]:
fig = plt.figure(figsize = (6, 4))
ax = fig.add_subplot(111, projection = '3d')
ax.scatter(xs = X_pca[:, 0], ys = X_pca[:, 1], zs = X_pca[:, 2], c = y, cmap = 'viridis')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')

### UMAP

In [None]:
import umap

In [None]:
u = umap.UMAP(random_state = 42)
X_umap = u.fit_transform(StandardScaler().fit_transform(X))
X_umap.shape

In [None]:
# Scatter the two UMAP dimensions, coloured by the target.
fig, ax = plt.subplots(figsize = (8, 4))
# Wrap the embedding in a DataFrame first; `.plot` is called on the frame.
pd.DataFrame(X_umap).plot(kind = 'scatter', x = 0, y = 1, ax = ax, c = y, alpha = 0.2, cmap = 'Spectral')
# Raw string: the backslashes are literal path separators, not escapes.
fig.savefig(r'Desktop\oreilly\mlpr_1713.png', dpi = 300)

In [None]:
# Compare UMAP embeddings for several `n_neighbors` values on a 2x2 grid.
X_std = StandardScaler().fit_transform(X)
fig, axes = plt.subplots(2, 2, figsize = (6, 4))
axes = axes.reshape(4)
for i, n in enumerate([2, 5, 10, 50]):
    ax = axes[i]
    u = umap.UMAP(random_state = 42, n_neighbors = n)
    X_umap = u.fit_transform(X_std)
    pd.DataFrame(X_umap).plot(kind = 'scatter', x = 0, y = 1, ax = ax, c = y, cmap = 'Spectral', alpha = 0.5)
    ax.set_title(f'nn = {n}')
# Lay out and save once, after every panel has been drawn.
plt.tight_layout()
# Raw string: the backslashes are literal path separators, not escapes.
fig.savefig(r'Desktop\oreilly\mlpr_1714.png', dpi = 300)

In [None]:
fig, axes = plt.subplots(2, 2, figsize = (6, 4))
axes = axes.reshape(4)
for i, n in enumerate([0, 0.33, 0.66, 0.99]):
    ax = axes[i]
    u = umap.UMAP(random_state = 42, min_dist = n)
    X_umap = u.fit_transform(X_std)
    pd.DataFrame(X_umap).plot(kind = 'scatter', x = 0, y = 1, ax = ax, c = y, cmap = 'Spectral', alpha = 0.5)
    ax.set_title(f'min_dist = {n}')
plt.tight_layout()
fig.savefig('Desktop\oreilly\mlpr_1715.png', dpi = 300)

### t-SNE

In [None]:
from sklearn.manifold import TSNE

In [None]:
X_std = StandardScaler().fit_transform(X)
ts = TSNE()
X_tsne = ts.fit_transform(X_std)

In [None]:
fig, ax = plt.subplots(figsize = (6, 4))
colors = ['rg'[j] for j in y]
scat = ax.scatter(X_tsne[:, 0], X_tsne[:, 1], c = colors, alpha = 0.5)
ax.set_xlabel('Embedding 1')
ax.set_ylabel('Embedding 2')
fig.savefig('Desktop\oreilly\mlpr_1716.png', dpi = 300)

In [None]:
# Compare t-SNE embeddings for several perplexity values on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize = (6, 4))
axes = axes.reshape(4)
for i, n in enumerate((2, 30, 50, 100)):
    ax = axes[i]
    t = TSNE(random_state = 42, perplexity = n)
    X_tsne = t.fit_transform(X)
    pd.DataFrame(X_tsne).plot(kind = 'scatter', x = 0, y = 1, ax = ax, c = y, cmap = 'Spectral', alpha = 0.5)
    ax.set_title(f'perplexity = {n}')
# Lay out and save once, after every panel has been drawn, instead of
# rewriting the file on each iteration.
plt.tight_layout()
# Raw string: the backslashes are literal path separators, not escapes.
fig.savefig(r'Desktop\oreilly\mlpr_1717.png', dpi = 300)

### PHATE

In [None]:
import phate

In [None]:
p = phate.PHATE(random_state = 42)
X_phate = p.fit_transform(X)
X_phate.shape

In [None]:
fig, ax = plt.subplots(figsize = (6, 4))
phate.plot.scatter2d(p, c = y, ax = ax, alpha = 0.5)
fig.savefig('Desktop\oreilly\mlpr_1718.png', dpi = 300)

In [None]:
fig, axes = plt.subplots(2, 2, figsize = (6, 4))
axes = axes.reshape(4)
p = phate.PHATE(random_state = 42, n_jobs = -1)
for i, n in enumerate((2, 5, 20, 100)):
    ax = axes[i]
    p.set_params(knn = n)
    X_phate = p.fit_transform(X)
    pd.DataFrame(X_phate).plot(kind = 'scatter', x = 0, y = 1, ax = ax, c = y, cmap = 'Spectral', alpha = 0.5)
    ax.set_title(f'knn = {n}')
plt.tight_layout()
fig.savefig('Desktop\oreilly\mlpr_1719.png', dpi = 300)

# Capítulo 18 - Clustering

In [None]:
from sklearn.cluster import KMeans

In [None]:
X_std = preprocessing.StandardScaler().fit_transform(X)
km = KMeans(2, random_state = 42)
km.fit(X_std)

In [None]:
# Predict on the same standardized matrix the KMeans model was fitted on.
X_km = km.predict(X_std)
X_km

In [None]:
inertias = []
sizes = range(2, 12)
for k in sizes:
    k2 = KMeans(random_state = 42, n_clusters = k)
    k2.fit(X)
    inertias.append(k2.inertia_)
fig, ax = plt.subplots(figsize = (6, 4))
pd.Series(inertias, index = sizes).plot(ax = ax)
ax.set_xlabel('K')
ax.set_ylabel('Inertia')
fig.savefig('Desktop\oreilly\mlpr_1801.png', dpi = 300)

In [None]:
from sklearn import metrics

In [None]:
# Sweep k from 2 to 11 and record four cluster-quality measures per k.
inertias = []
sils = []
chs = []
dbs = []
sizes = range(2, 12)
for k in sizes:
    k2 = KMeans(random_state = 42, n_clusters = k)
    k2.fit(X_std)
    inertias.append(k2.inertia_)
    # Score the labels against X_std, the data they were derived from.
    sils.append(metrics.silhouette_score(X_std, k2.labels_))
    chs.append(metrics.calinski_harabasz_score(X_std, k2.labels_))
    dbs.append(metrics.davies_bouldin_score(X_std, k2.labels_))
fig, ax = plt.subplots(figsize = (6, 4))
(pd.DataFrame({'inertia': inertias, 'silhouette': sils, 'calinski': chs, 'davis': dbs, 'k': sizes}).set_index('k').plot(ax = ax, subplots = True, layout = (2, 2)))
# Raw string: the backslashes are literal path separators, not escapes.
fig.savefig(r'Desktop\oreilly\mlpr_1802.png', dpi = 300)

In [None]:
from yellowbrick.cluster.silhouette import SilhouetteVisualizer

fig, axes = plt.subplots(2, 2, figsize = (12, 8))
axes = axes.reshape(4)
for i, k in enumerate(range(2, 6)):
    ax = axes[i]
    sil = SilhouetteVisualizer(KMeans(n_clusters = k, random_state = 42), ax = ax)
    sil.fit(X_std)
    sil.finalize()
    ax.set_xlim(-0.2, 0.8)
plt.tight_layout()
fig.savefig('Desktop\oreilly\mlpr_1803.png', dpi = 300)

### Clustering (hierárquico) aglomerativo

In [None]:
from scipy.cluster import hierarchy

In [None]:
fig, ax = plt.subplots(figsize = (6, 4))
dend = hierarchy.dendrogram(hierarchy.linkage(X_std, method = 'ward'))
fig.savefig('Desktop\oreilly\mlpr_1804.png', dpi = 300)

In [None]:
from scipy.cluster import hierarchy

In [None]:
fig, ax = plt.subplots(figsize = (6, 4))
dend = hierarchy.dendrogram(hierarchy.linkage(X_std, method = 'ward'), truncate_mode = 'lastp', p = 20, show_contracted = True)
fig.savefig('Desktop\oreilly\mlpr_1805.png', dpi = 300)

In [None]:
from sklearn.cluster import AgglomerativeClustering

In [None]:
ag = AgglomerativeClustering(n_clusters = 4, affinity = 'euclidean', linkage = 'ward')
ag.fit(X)

In [None]:
km = KMeans(n_clusters = 2)
km.fit(X_std)
labels = km.predict(X_std)
(X.assign(cluster = labels, survived = y).groupby('cluster').agg(['mean', 'var']).T)

In [None]:
fig, ax = plt.subplots(figsize = (6, 4))
(X.assign(cluster = labels, survived = y).groupby('cluster').mean().T.plot.bar(ax = ax))
fig.savefig('Desktop\oreilly\mlpr_1806.png', dpi = 300)

In [None]:
# Plot the first two principal components, coloured by KMeans cluster.
fig, ax = plt.subplots(figsize = (6, 4))
# Pass x and y by keyword, matching the other scatterplot calls in this file.
sns.scatterplot(x = 'PC1', y = 'PC2', data = X.assign(PC1 = X_pca[:, 0], PC2 = X_pca[:, 1], cluster = labels), hue = 'cluster', alpha = 0.5, ax = ax)
# Raw string: the backslashes are literal path separators, not escapes.
fig.savefig(r'Desktop\oreilly\mlpr_1807.png', dpi = 300, bbox_inches = 'tight')

In [None]:
# Summary statistics of `age` within each KMeans cluster.
(X.assign(cluster = labels).groupby('cluster').age.describe().T)

In [None]:
dt = tree.DecisionTreeClassifier()
dt.fit(X, labels)
for col, val in sorted(zip(X.columns, dt.feature_importances_), key = lambda col_val: col_val[1], reverse = True):
    print(f'{col:10}{val:10.3f}')

In [None]:
# Export the top two levels of the tree fitted on the cluster labels to PNG.
dot_data = StringIO()
tree.export_graphviz(dt, out_file = dot_data, feature_names = X.columns, class_names = ['0', '1'], max_depth = 2, filled = True)
g = pydotplus.graph_from_dot_data(dot_data.getvalue())
# write_png takes the output path (plus an optional Graphviz program); it has
# no `dpi` keyword, so passing one raises TypeError.
# Raw string: the backslashes are literal path separators, not escapes.
g.write_png(r'Desktop\oreilly\mlpr_1808.png')

# Capítulo 19 - Pipelines

### Pipeline de Classificação

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

In [None]:
def tweak_titanic(df):
    """Return a copy of `df` without the unused columns, one-hot encoded.

    Drops the `name`, `ticket`, `home.dest`, `boat`, `body` and `cabin`
    columns, then dummy-encodes the remaining categorical columns with the
    first level of each dropped.
    """
    unused = ['name', 'ticket', 'home.dest', 'boat', 'body', 'cabin']
    trimmed = df.drop(columns = unused)
    return pd.get_dummies(trimmed, drop_first = True)

In [None]:
class TitanicTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that cleans the raw Titanic frame and removes the target.

    The transformer is stateless: `fit` learns nothing and `transform`
    applies `tweak_titanic` and then drops the `survived` column.
    """

    def transform(self, X):
        """Return the tweaked feature frame without the `survived` column."""
        X = tweak_titanic(X)
        X = X.drop(columns = 'survived')
        return X

    def fit(self, X, y = None):
        """Do nothing and return `self`.

        `y` is optional so the step also works when it is fitted without a
        target, e.g. through `fit_transform(X)`.
        """
        return self
pipe = Pipeline([('titan', TitanicTransformer()), ('impute', impute.IterativeImputer()), ('std', preprocessing.StandardScaler()), ('rf', RandomForestClassifier())])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(orig_df, orig_df.survived, test_size = 0.3, random_state = 42)
pipe.fit(X_train2, y_train2)
pipe.score(X_test2, y_test2)

In [None]:
params = {'rf__max_features': [0.4, 'auto'], 'rf__n_estimators': [15, 200]}
grid = model_selection.GridSearchCV(pipe, cv = 3, param_grid = params)
grid.fit(orig_df, orig_df.survived)

In [None]:
grid.best_params_

In [None]:
pipe.set_params(**grid.best_params_)
pipe.fit(X_train2, y_train2)
pipe.score(X_test2, y_test2)

In [None]:
metrics.roc_auc_score(y_test2, pipe.predict(X_test2))

### Pipeline de regressão

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
reg_pipe = Pipeline([('std', preprocessing.StandardScaler()), ('lr', LinearRegression())])
reg_pipe.fit(bos_X_train, bos_y_train)
reg_pipe.score(bos_X_test, bos_y_test)

In [None]:
reg_pipe.named_steps['lr'].intercept_

In [None]:
reg_pipe.named_steps['lr'].coef_

In [None]:
from sklearn import metrics

In [None]:
metrics.mean_squared_error(bos_y_test, reg_pipe.predict(bos_X_test))

### Pipeline de PCA

In [None]:
pca_pipe = Pipeline([('std', preprocessing.StandardScaler()), ('pca', PCA())])
X_pca = pca_pipe.fit_transform(X)

In [None]:
pca_pipe.named_steps['pca'].explained_variance_ratio_

In [None]:
pca_pipe.named_steps['pca'].components_[0]

# FIM!