In [43]:
#%pip install Pandas-Profiling
#%pip install ipywidgets
#%pip install xlrd
#%pip install pyjanitor
#%pip install configparser
#%pip install sklearn
#%pip install yellowbrick
#%pip install matplotlib

In [44]:
import pandas_profiling
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import (ensemble, model_selection, preprocessing, tree)
from sklearn.metrics import (auc, confusion_matrix, roc_auc_score, roc_curve)
from sklearn.model_selection import (train_test_split, StratifiedKFold)
from sklearn.experimental import (enable_iterative_imputer)
from sklearn import impute
from sklearn import metrics
from sklearn.dummy import DummyClassifier
from yellowbrick.classifier import (ConfusionMatrix, ROCAUC)
from yellowbrick.model_selection import (LearningCurve)
import janitor as jn

In [45]:
# URL que contém o dado. Página 25.
url = ("https://biostat.app.vumc.org/wiki/pub/Main/DataSets/titanic3.xls")

In [46]:
# Leitura do arquivo. Página 25.
df = pd.read_excel(url)
orig_df = df

In [47]:
df

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,,328.0,
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,,,


In [48]:
 # Página 26.
df.dtypes

pclass         int64
survived       int64
name          object
sex           object
age          float64
sibsp          int64
parch          int64
ticket        object
fare         float64
cabin         object
embarked      object
boat          object
body         float64
home.dest     object
dtype: object

In [49]:
# Gera um relatório sobre o DataFrame.
#pandas_profiling.ProfileReport(df)

In [50]:
# Número de linhas e colunas. Página 27.
df.shape

(1309, 14)

In [51]:
# Página 29.
df.describe().iloc[:, :2]

Unnamed: 0,pclass,survived
count,1309.0,1309.0
mean,2.294882,0.381971
std,0.837836,0.486055
min,1.0,0.0
25%,2.0,0.0
50%,3.0,0.0
75%,3.0,1.0
max,3.0,1.0


In [52]:
# Número de dados ausentes. Página 29.
df.isnull().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [53]:
# Porcentagem. Dica na página 30.
# Número de dados ausentes.
df.isnull().mean()

pclass       0.000000
survived     0.000000
name         0.000000
sex          0.000000
age          0.200917
sibsp        0.000000
parch        0.000000
ticket       0.000000
fare         0.000764
cabin        0.774637
embarked     0.001528
boat         0.628724
body         0.907563
home.dest    0.430863
dtype: float64

In [54]:
# Contadores dos atributos ausentes em cada amostra. Página 30.
df.isnull().sum(axis=1).loc[:10]

0     1
1     1
2     2
3     1
4     2
5     1
6     1
7     2
8     1
9     2
10    1
dtype: int64

In [55]:
# Informa se a linha contém dados ausentes ou não. Página 31.
mask = df.isnull().any(axis=1)

In [56]:
# Página 31.
mask.head()

0    True
1    True
2    True
3    True
4    True
dtype: bool

In [57]:
# Página 31.
df[mask].body.head()

0      NaN
1      NaN
2      NaN
3    135.0
4      NaN
Name: body, dtype: float64

In [58]:
# Contagem na coluna "age". Página 32.
df.sex.value_counts(dropna=False)

male      843
female    466
Name: sex, dtype: int64

In [59]:
# Página 32.
df.embarked.value_counts(dropna=False)

S      914
C      270
Q      123
NaN      2
Name: embarked, dtype: int64

In [60]:
# Página 33.
name = df.name

In [61]:
# Página 33.
name.head(3)

0     Allen, Miss. Elisabeth Walton
1    Allison, Master. Hudson Trevor
2      Allison, Miss. Helen Loraine
Name: name, dtype: object

In [62]:
# Página 33.
df = df.drop(columns=["name", "ticket", "home.dest", "boat", "body", "cabin"])

In [63]:
# Criação de coluna dummy. Página 33.
df = pd.get_dummies(df)

In [64]:
# Página 33.
df.columns

Index(['pclass', 'survived', 'age', 'sibsp', 'parch', 'fare', 'sex_female',
       'sex_male', 'embarked_C', 'embarked_Q', 'embarked_S'],
      dtype='object')

In [65]:
# Página 34.
df = df.drop(columns="sex_male")

In [66]:
# Página 34.
df = pd.get_dummies(df, drop_first=True)

In [67]:
# Página 34.
df.columns

Index(['pclass', 'survived', 'age', 'sibsp', 'parch', 'fare', 'sex_female',
       'embarked_C', 'embarked_Q', 'embarked_S'],
      dtype='object')

In [68]:
# Página 34.
y = df.survived
X = df.drop(columns="survived")

In [69]:
# Página 34.
X, y = jn.get_features_targets(df, target_columns="survived")



In [70]:
# Página 35.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.3, random_state=42)

In [71]:
X.columns

Index(['pclass', 'age', 'sibsp', 'parch', 'fare', 'sex_female', 'embarked_C',
       'embarked_Q', 'embarked_S'],
      dtype='object')

In [72]:
# Página 35.
num_cols = ["pclass", "age", "sibsp", "parch", "fare", "sex_female"]

In [73]:
# Página 36.
imputer = impute.IterativeImputer()
imputed = imputer.fit_transform(X_train[num_cols])
X_train.loc[:, num_cols] = imputed
imputed = imputer.transform(X_test[num_cols])
X_test.loc[:, num_cols] = imputed

In [74]:
# Página 36.
cols = "pclass,age,sibsp,fare".split(",")
cols

['pclass', 'age', 'sibsp', 'fare']

In [75]:
# Página 36.
sca = preprocessing.StandardScaler()
sca

StandardScaler()

In [76]:
# Página 37.
X_train = sca.fit_transform(X_train)
X_train = pd.DataFrame(X_train)
X_test = sca.fit_transform(X_test)
X_test = pd.DataFrame(X_test)

In [77]:
# Página 37.
def tweak_titanic(df):
    df = df.drop(
        columns=[
            "name",
            "ticket",
            "home.dest",
            "boat",
            "body",
            "cabin",
            ]
    ).pipe(pd.get_dummies, drop_first=True)
    return df

In [78]:
# Página 37-38.
def get_train_test_X_y(df, y_col, size=0.3, std_cols=None):
    y = df[y_col]
    X = df.drop(columns=y_col)
    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=size, random_state=42)
    cols = X.columns
    num_cols = [
        "pclass",
        "age",
        "sibsp",
        "parch",
        "fare",
    ]
    fi = impute.IterativeImputer()
    fitted = fi.fit_transform(X_train[num_cols])
    X_train = X_train.assign(**{c:fitted[:,i] for i, c in enumerate(num_cols)})
    test_fit = fi.transform(X_test[num_cols])
    X_test = X_test.assign(**{c:test_fit[:,i] for i, c in enumerate(num_cols)})
    if std_cols:
        std = preprocessing.StandardScaler()
        fitted = std.fit_transform(X_train[std_cols])
        X_train = X_train.assign(**{c:fitted[:,i] for i, c in enumerate(std_cols)})
        test_fit = std.transform(X_test[std_cols])
        X_test = X_test.assign(**{c:test_fit[:,i] for i, c in enumerate(std_cols)})

    return X_train, X_test, y_train, y_test

In [79]:
# Página 38.
ti_df = tweak_titanic(orig_df)
std_cols = "pclass,age,sibsp,fare".split(",")
X_train, X_test, y_train, y_test = get_train_test_X_y(ti_df, "survived", std_cols=std_cols)

In [80]:
# Página 39.
# Correct result: 0.48346055979643765
# My result: 0.5699745547073791
bm = DummyClassifier()
bm.fit(X_train, y_train)
bm.score(X_test, y_test)

0.5699745547073791

In [81]:
# Correct result: 0.4896551724137931
# My result: 0.0
metrics.precision_score(y_test, bm.predict(X_test))

  _warn_prf(average, modifier, msg_start, len(result))


0.0