In [256]:
#%pip3 install Pandas-Profiling
#%pip install xlrd
#%pip uninstall sklearn
#%pip uninstall scikit-learn
#%pip install sklearn
#%pip uninstall scipy
#%pip3 install scipy

In [257]:
import pandas_profiling
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import (ensemble, preprocessing, tree)
from sklearn.metrics import (auc, confusion_matrix, roc_auc_score, roc_curve)
from sklearn.model_selection import (train_test_split, StratifiedKFold)
from sklearn.experimental import (enable_iterative_imputer)
from sklearn import impute


In [258]:
# URL que contém o dado.
url = ("https://biostat.app.vumc.org/wiki/pub/Main/DataSets/titanic3.xls")

In [259]:
# Leitura do arquivo.
df = pd.read_excel(url)
orig_df = df

In [260]:
df.dtypes

pclass         int64
survived       int64
name          object
sex           object
age          float64
sibsp          int64
parch          int64
ticket        object
fare         float64
cabin         object
embarked      object
boat          object
body         float64
home.dest     object
dtype: object

In [261]:
#pandas_profiling.ProfileReport(df)

In [262]:
# Número de linhas e colunas.
df.shape

(1309, 14)

In [263]:
df.describe().iloc[:, :2]

Unnamed: 0,pclass,survived
count,1309.0,1309.0
mean,2.294882,0.381971
std,0.837836,0.486055
min,1.0,0.0
25%,2.0,0.0
50%,3.0,0.0
75%,3.0,1.0
max,3.0,1.0


In [264]:
# Número de dados ausentes.
df.isnull().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [265]:
# Porcentagem.
# Número de dados ausentes.
df.isnull().mean()

pclass       0.000000
survived     0.000000
name         0.000000
sex          0.000000
age          0.200917
sibsp        0.000000
parch        0.000000
ticket       0.000000
fare         0.000764
cabin        0.774637
embarked     0.001528
boat         0.628724
body         0.907563
home.dest    0.430863
dtype: float64

In [266]:
# Contadores dos atributos ausentes em cada amostra.
df.isnull().sum(axis=1).loc[:10]

0     1
1     1
2     2
3     1
4     2
5     1
6     1
7     2
8     1
9     2
10    1
dtype: int64

In [267]:
# Informa se a linha contém dados ausentes ou não.
mask = df.isnull().any(axis=1)

In [268]:
mask.head()

0    True
1    True
2    True
3    True
4    True
dtype: bool

In [269]:
df[mask].body.head()

0      NaN
1      NaN
2      NaN
3    135.0
4      NaN
Name: body, dtype: float64

In [270]:
# Contagem na coluna "age".
df.sex.value_counts(dropna=False)

male      843
female    466
Name: sex, dtype: int64

In [271]:
df.embarked.value_counts(dropna=False)

S      914
C      270
Q      123
NaN      2
Name: embarked, dtype: int64

In [272]:
name = df.name

In [273]:
name.head(3)

0     Allen, Miss. Elisabeth Walton
1    Allison, Master. Hudson Trevor
2      Allison, Miss. Helen Loraine
Name: name, dtype: object

In [274]:
df = df.drop(columns=["name", "ticket", "home.dest", "boat", "body", "cabin"])

In [275]:
# Criação de coluna dummy.
df = pd.get_dummies(df)

In [276]:
df.columns

Index(['pclass', 'survived', 'age', 'sibsp', 'parch', 'fare', 'sex_female',
       'sex_male', 'embarked_C', 'embarked_Q', 'embarked_S'],
      dtype='object')

In [277]:
df = df.drop(columns="sex_male")

In [278]:
df = pd.get_dummies(df, drop_first=True)

In [279]:
df.columns

Index(['pclass', 'survived', 'age', 'sibsp', 'parch', 'fare', 'sex_female',
       'embarked_C', 'embarked_Q', 'embarked_S'],
      dtype='object')

In [280]:
y = df.survived
X = df.drop(columns="survived")

In [281]:
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.3, random_state=42)

In [282]:
num_cols = ["pclass", "age", "sibsp", "parch", "fare", "sex_female"]

In [283]:
imputer = impute.IterativeImputer()
imputed = imputer.fit_transform(X_train[num_cols])
X_train.loc[:, num_cols] = imputed
imputed = imputer.transform(X_test[num_cols])
X_test.loc[:, num_cols] = imputed

In [284]:
cols = "pclass,age,sibsp,fare".split(",")
cols

['pclass', 'age', 'sibsp', 'fare']

In [285]:
sca = preprocessing.StandardScaler()
sca

StandardScaler()

In [286]:
X_train = sca.fit_transform(X_train)
X_train = pd.DataFrame(X_train)
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.825248,-0.128878,-0.498616,-0.432553,-0.473599,-0.739795,-0.506474,-0.330089,0.660104
1,0.825248,-0.205639,-0.498616,-0.432553,-0.488120,-0.739795,-0.506474,-0.330089,0.660104
2,-0.363317,-0.751431,-0.498616,-0.432553,-0.145224,1.351725,-0.506474,-0.330089,0.660104
3,0.825248,-2.198733,6.897852,1.805972,0.679618,1.351725,-0.506474,-0.330089,0.660104
4,0.825248,-0.049698,-0.498616,-0.432553,-0.490408,1.351725,-0.506474,-0.330089,0.660104
...,...,...,...,...,...,...,...,...,...
911,0.825248,-0.282703,-0.498616,-0.432553,-0.493169,1.351725,-0.506474,3.029485,-1.514914
912,0.825248,-0.829401,-0.498616,-0.432553,-0.490408,1.351725,-0.506474,-0.330089,0.660104
913,0.825248,-0.010713,-0.498616,-0.432553,-0.332731,-0.739795,-0.506474,-0.330089,0.660104
914,0.825248,-0.205639,-0.498616,-0.432553,-0.487567,1.351725,-0.506474,-0.330089,0.660104


In [287]:
# Página 37.
def tweak_titanic(df):
    df = df.drop(
        columns=[
            "name",
            "ticket",
            "home.dest",
            "boat",
            "body",
            "cabin",
            ]
    ).pipe(pd.get_dummies, drop_first=True)
    return df

In [288]:
# Página 37-38.
def get_train_test_X_y(df, y_col, size=0.3, std_cols=None):
    y = df[y_col]
    X = df.drop(columns=y_col)
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=size, random_state=42)
    cols = X.columns
    num_cols = ["pclass", "age", "sibsp", "parch", "fare"]
    fi = impute.IterativeImputer()
    X_train.loc[:, num_cols] = fi.fit_transform(X_train[num_cols])
    X_test.loc[:, num_cols] = fi.transform(X_test[num_cols])

    if std_cols:
        std = preprocessing.StandardScaler()
        X_train.loc[:, std_cols] = std.fit_transform(X_train[std_cols])
        X_test.loc[:, std_cols] = std.transform(X_test[std_cols])

    return X_train, X_test, y_train, y_test


In [289]:
# Página 38.
ti_df = tweak_titanic(orig_df)
std_cols = "pclass,age,sibsp,fare".split(",")
X_train, X_test, y_train, y_test = get_train_test_X_y(ti_df, "survived", std_cols=std_cols)

In [290]:
# Página 39.
from sklearn.dummy import DummyClassifier

In [291]:
# Página 39.
bm = DummyClassifier()
bm.fit(X_train, y_train)
bm.score(X_test, y_test)  # Precisão.

0.5699745547073791

In [292]:
# Página 39.
from sklearn import metrics

In [293]:
metrics.precision_score(y_test, bm.predict(X_test))

  _warn_prf(average, modifier, msg_start, len(result))


0.0