# Imports

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from pandas_profiling import ProfileReport

from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

from feature_engine.imputation import MeanMedianImputer
from category_encoders.ordinal import OrdinalEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

# Data Load

In [2]:
# Read the raw train and test CSV files from the sibling datasets directory.
train = pd.read_csv('../datasets/train.csv')
test = pd.read_csv('../datasets/test.csv')

# Row-wise concatenation of both sets, renumbered with a fresh integer index.
train_test = pd.concat((train, test), ignore_index=True)

# (label, frame) pairs so both sets can be iterated over together.
dfs = list(zip(('Train Set', 'Test Set'), (train, test)))

# EDA

In [3]:
# Build a pandas-profiling report object for the training frame.
profile = ProfileReport(train)

In [4]:
# Render the profiling report as interactive notebook widgets.
profile.to_widgets()

Summarize dataset:   0%|          | 0/25 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

# Train, Valid

In [5]:
# Seed shared by the hold-out splits so they are repeatable.
random_state = 42

# Features are every column except PassengerId and the Survived target.
X = train.drop(columns=['PassengerId', 'Survived'])
y = train['Survived']

# 80/20 hold-out split, stratified on the target.
xtr, xval, ytr, yval = train_test_split(
    X,
    y,
    train_size=0.8,
    stratify=y,
    random_state=random_state,
)

# Baseline 1

In [6]:
# Baseline 1: always predict the most frequent class seen in the training
# split, then score that constant prediction on the validation split.
# The strategy is spelled out because the DummyClassifier default changed
# across scikit-learn releases (older versions defaulted to the random
# 'stratified' strategy, which made this unseeded baseline irreproducible).
dummy = DummyClassifier(strategy='most_frequent')

dummy.fit(xtr, ytr)

yp = dummy.predict(xval)

acc_score = accuracy_score(yval, yp)

print(f'O modelo Dummy teve acurácia de: {acc_score*100:.2f}%')

O modelo Dummy teve acurácia de: 61.45%


# Baseline 2

In [7]:
# Baseline 2: predict survival (1) for every row whose Sex is 'female'
# and 0 for everything else.
# NOTE(review): this is scored on the full `train` frame, whereas the dummy
# baseline is scored on the hold-out split (xval/yval) — confirm that the
# two numbers are meant to be compared.
yp = train['Sex'].eq('female').astype(int).tolist()

acc_score = accuracy_score(y_true=train['Survived'], y_pred=yp)

print(f'O modelo "Mulheres Sobrevivem" teve acurácia de: {acc_score*100:.2f}%')

O modelo "Mulheres Sobrevivem" teve acurácia de: 78.68%


# Data Wrangling

## Missing Values

In [8]:
# Rebuild the feature matrix from the raw training frame, this time also
# leaving out the Name, Ticket, Fare and Cabin columns.
X = train.drop(
    columns=['PassengerId', 'Survived', 'Name', 'Ticket', 'Fare', 'Cabin'])
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
0,3,male,22.0,1,0,S
1,1,female,38.0,1,0,C
2,3,female,26.0,0,0,S
3,1,female,35.0,1,0,S
4,3,male,35.0,0,0,S
...,...,...,...,...,...,...
886,2,male,27.0,0,0,S
887,1,female,19.0,0,0,S
888,3,female,,1,2,S
889,1,male,26.0,0,0,C


In [9]:
# Partition the feature columns by dtype: object-typed columns are treated
# as categorical, every other column as numeric.
cat_cols = [name for name, kind in X.dtypes.items() if kind == 'O']
num_cols = [name for name, kind in X.dtypes.items() if kind != 'O']

In [10]:
# Missing-value count per categorical column, before imputation.
X.loc[:, cat_cols].isna().sum()

Sex         0
Embarked    2
dtype: int64

In [11]:
# Fill the missing Embarked entries with 'S'.
# The result is assigned back rather than calling fillna(inplace=True) on the
# X['Embarked'] selection: an in-place method on a selected column is chained
# assignment, which pandas warns about and which leaves X untouched once
# Copy-on-Write is enabled.
X['Embarked'] = X['Embarked'].fillna('S')

In [12]:
# Re-check the categorical columns: every count should now be zero.
X.loc[:, cat_cols].isna().sum()

Sex         0
Embarked    0
dtype: int64

In [13]:
# Missing-value count per numeric column, before imputation.
X.loc[:, num_cols].isna().sum()

Pclass      0
Age       177
SibSp       0
Parch       0
dtype: int64

In [14]:
# Impute missing ages with the mean age rounded to a whole number.
# The result is assigned back rather than calling fillna(inplace=True) on the
# X['Age'] selection: an in-place method on a selected column is chained
# assignment, which pandas warns about and which leaves X untouched once
# Copy-on-Write is enabled.
# NOTE(review): the mean is taken over all rows of X, i.e. before the
# train/validation split that follows — confirm this leakage is acceptable.
X['Age'] = X['Age'].fillna(np.round(X['Age'].mean(), 0))

In [15]:
# Re-check the numeric columns: every count should now be zero.
X.loc[:, num_cols].isna().sum()

Pclass    0
Age       0
SibSp     0
Parch     0
dtype: int64

## Encoding

In [16]:
# Replace each categorical column with integer codes.
# The encoder is fitted on the complete feature frame and kept in
# `ordinal_enc` so the same mapping can be reused later.
ordinal_enc = OrdinalEncoder(cols=cat_cols)
ordinal_enc.fit(X)

X = ordinal_enc.transform(X)
X

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
0,3,1,22.0,1,0,1
1,1,2,38.0,1,0,2
2,3,2,26.0,0,0,1
3,1,2,35.0,1,0,1
4,3,1,35.0,0,0,1
...,...,...,...,...,...,...
886,2,1,27.0,0,0,1
887,1,2,19.0,0,0,1
888,3,2,30.0,1,2,1
889,1,1,26.0,0,0,2


# Novo Train, Valid Split

In [17]:
# Redo the 80/20 stratified hold-out split, now on the imputed and encoded
# feature frame, using the same seed as the first split.
xtr, xval, ytr, yval = train_test_split(
    X,
    y,
    train_size=0.8,
    stratify=y,
    random_state=random_state,
)

# ML

## Log Reg

In [43]:
# Logistic regression with the library's default hyper-parameters:
# fit on the training split, score on the validation split.
log = LogisticRegression()
log.fit(X=xtr, y=ytr)

yp = log.predict(xval)
acc_score = accuracy_score(y_true=yval, y_pred=yp)

print(f'A Regressão Logística teve acurácia de: {acc_score*100:.2f}%')

A Regressão Logística teve acurácia de: 81.01%


### Cross Validation

In [45]:
# 5-fold cross-validated accuracy of the same estimator over the full
# (already imputed and encoded) feature frame.
cv = cross_validate(estimator=log, X=X, y=y, scoring='accuracy', cv=5)

# Average the per-fold accuracies into one summary figure.
acc_score = np.mean(cv['test_score'])

print(f'A Regressão Logística teve acurácia média de {acc_score*100:.2f}% na validação cruzada.')

A Regressão Logística teve acurácia média de 79.35% na validação cruzada.


### Tunando o Modelo

In [None]:
from sko