In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [21]:
train = pd.read_csv('./input/titanic/train.csv')
test = pd.read_csv('./input/titanic/test.csv')
submission = pd.read_csv('./input/titanic/gender_submission.csv')
train.shape, test.shape, submission.shape

((891, 12), (418, 11), (418, 2))

In [22]:
display(train.info())
display(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


None

In [23]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [61]:
from sklearn.base import BaseEstimator, TransformerMixin
class myDataTrans(BaseEstimator, TransformerMixin):
    def __init__(self, age=True, farecut=True, cabinc=True, familysz=True, agecat=True, dummies=True):
        self.age, self.farecut, self.cabinc = age, farecut, cabinc
        self.familysz, self.agecat, self.dummies = familysz, agecat, dummies
        self.dummy_cols, self.drop_cols = ['Sex','Embarked'], ['PassengerId','Ticket']
        
    def fit(self, X, y=None):
        if self.age: self.AgeName_li, self.meanAge_li = self.fill_Age(X)
        if self.farecut: 
            self.fare_bin, self.fare_bin_label = self.mk_col_FareCut(X)
            X['FareCut'] = pd.cut(X['Fare'], bins=self.fare_bin, right=False, labels=self.fare_bin_label)
        if self.cabinc: self.Pclass_li, self.Cmode_li = self.fill_Cabin_C(X)
        return self
    
    def transform(self, X):
        if self.age:
            X['AgeName'] = X['Name'].str.extract('([A-Za-z]+)W.')
            for i in range(len(self.AgeName_li)):
                X.loc[(X['Age'].isnull())&(X['AgeName']==self.AgeName_li), 'Age'] = self.meanAge_li[i]
            self.drop_cols.extend(['AgeName','Name'])
        if self.farecut:
            X['FareCut'] = pd.cut(X['Fare'], bins=self.fare_bin, right=False, labels=self.fare_bin_label)
            X['FareCut'] = X['FareCut'].astype('float')
            self.drop_cols.append('Fare')
        if self.cabinc:
            X['Cabin_C'] = X['Cabin'].str[:1]
            for Pclass, mode_v in zip(self.Pclass_li, self.Cmode_li):
                X.loc[(X['Cabin_C'].isnull())&(X['Pclass']==Pclass), 'Cabin_C'] = mode_v
            self.dummy_cols.append('Cabin_C')
            self.drop_cols.append('Cabin')
        if self.familysz: 
            X['FamilySz'] = pd.cut(X['SibSp']+X['Parch']+1, bins=[0,1,2,4,12], 
                                   right=False, labels=['Single','SmallF','MedF','LargeF'])
            self.dummy_cols.append('FamilySz')
            self.drop_cols.extend(['SibSp','Parch'])
        if self.agecat:
            X['AgeCat'] = pd.cut(X['Age'], [-np.inf,18,25,35,60,np.inf], 
                                 right=False, labels=['Children', 'Youth', 'YoungAdult', 'MiddleAged', 'Senior'])
            self.dummy_cols.append('AgeCat')
            self.drop_cols.append('Age')
        if self.dummies:
            self.dummy_cols = list(set(self.dummy_cols))
            X[self.dummy_cols] = X[self.dummy_cols].astype('str')
        X.drop(columns=self.drop_cols, inplace=True)
        return X
    
    def get_feature_names_out(self, feature_names_in):
        return feature_names_in
        
    def fill_Age(self, X): # Age
        last_li = ['Capt','Col','Countess','Don','Dr','Jonkheer','Lady',
                   'Dona','Major','Mlle','Mme','Ms','Rev','Sir']
        replace_li = ['Mr','Other','Mrs','Mr','Mr','Other','Mrs','Miss',
                      'Mr','Miss','Miss','Miss','Other','Mr']
        X['AgeName'] = X['Name'].str.extract('([A-Za-z]+)W.')
        fill_df = X.groupby('AgeName')['Age'].mean().to_frame().astype(int)
        return list(fill_df.index), list(fill_df['Age'])    
    
    def mk_col_FareCut(self, X, n=10):
        Fmax = int(X['Fare'].max())+1
        Fmin = int(X['Fare'].min())
        Fminmax = (Fmax-Fmin)//n - (Fmax-Fmin)%n
        bins = list(range(Fmin,Fmax+Fminmax,Fminmax))
        bins_label = [n+Fminmax/2 for n in bins[:-1]]
        return bins, bins_label       
    
    def fill_Cabin_C(self, X):
        X['Cabin_C'] = X['Cabin'].str[:1]
        X.loc[(X['Pclass']==1)&((X['FareCut']==72.0)|(X['FareCut']==504.0)),'Cabin_C'] = 'B'
        X.loc[(X['Pclass']==2)&(X['FareCut']==72.0),'Cabin_C'] = 'C'
        fill_df = X.groupby('Pclass')['Cabin_C'].agg(**{'Cmode':lambda x:x.mode()})
        return list(fill_df.index), list(fill_df['Cmode'])

In [62]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.pipeline import Pipeline
from sklearn import set_config
set_config(transform_output="pandas")

ct_impute = ColumnTransformer([
    ('impute_num', SimpleImputer(strategy='median'), 
         make_column_selector(dtype_include=float)),
    ('impute_cat', SimpleImputer(strategy='most_frequent'), 
         make_column_selector(dtype_include=object))], remainder='passthrough')
ct_onehot = ColumnTransformer([
    ('onehot', OneHotEncoder(drop='if_binary', sparse_output=False), 
         make_column_selector(dtype_include=object))], remainder='passthrough')

mypipe = Pipeline([('myTrans', myDataTrans(age=True, farecut=True, cabinc=True, 
                                        familysz=True, agecat=True, dummies=True)),
                   ('simpleimpute', ct_impute),
                   ('onehot', ct_onehot)])

In [74]:
X_train = mypipe.fit_transform(train.drop('Survived', axis=1))
target = train['Survived'].copy()
X_test = mypipe.transform(test.copy())
display(X_train.head(3))
display(X_test.head(3))

Unnamed: 0,onehot__impute_cat__Sex_male,onehot__impute_cat__Embarked_C,onehot__impute_cat__Embarked_Q,onehot__impute_cat__Embarked_S,onehot__impute_cat__Embarked_nan,onehot__impute_cat__Cabin_C_A,onehot__impute_cat__Cabin_C_B,onehot__impute_cat__Cabin_C_C,onehot__impute_cat__Cabin_C_D,onehot__impute_cat__Cabin_C_E,...,onehot__impute_cat__FamilySz_MedF,onehot__impute_cat__FamilySz_SmallF,onehot__impute_cat__AgeCat_Children,onehot__impute_cat__AgeCat_MiddleAged,onehot__impute_cat__AgeCat_Senior,onehot__impute_cat__AgeCat_YoungAdult,onehot__impute_cat__AgeCat_Youth,onehot__impute_cat__AgeCat_nan,remainder__impute_num__FareCut,remainder__remainder__Pclass
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,24.0,3
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,72.0,1
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,24.0,3


Unnamed: 0,onehot__impute_cat__Sex_male,onehot__impute_cat__Embarked_C,onehot__impute_cat__Embarked_Q,onehot__impute_cat__Embarked_S,onehot__impute_cat__Embarked_nan,onehot__impute_cat__Cabin_C_A,onehot__impute_cat__Cabin_C_B,onehot__impute_cat__Cabin_C_C,onehot__impute_cat__Cabin_C_D,onehot__impute_cat__Cabin_C_E,...,onehot__impute_cat__FamilySz_MedF,onehot__impute_cat__FamilySz_SmallF,onehot__impute_cat__AgeCat_Children,onehot__impute_cat__AgeCat_MiddleAged,onehot__impute_cat__AgeCat_Senior,onehot__impute_cat__AgeCat_YoungAdult,onehot__impute_cat__AgeCat_Youth,onehot__impute_cat__AgeCat_nan,remainder__impute_num__FareCut,remainder__remainder__Pclass
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,24.0,3
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,24.0,3
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,24.0,2


In [97]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
train_df, valid_df, train_target, valid_target = train_test_split(X_train, target)
print(train_df.shape, valid_df.shape, train_target.shape, valid_target.shape)

# tf.convert_to_tensor(train_df)
train_set = tf.data.Dataset.from_tensor_slices((train_df, train_target))
valid_set = tf.data.Dataset.from_tensor_slices((valid_df, valid_target))
# test_set = tf.data.Dataset.from_tensor_slices((X_test))

batch_s = 16
train_batches = train_set.shuffle(1000).batch(batch_s)
valid_batches = valid_set.shuffle(1000).batch(batch_s)
# test_batches = test_dataset.shuffle(1000).batch(batch_s)

(668, 24) (223, 24) (668,) (223,)


In [144]:
# simple model
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(12, activation='relu'),
    tf.keras.layers.Dense(1, activation='softmax')])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

training = model.fit(X_train, target, epochs=100, batch_size=32, validation_split=0.2, verbose=0)
val_acc = np.mean(training.history['val_accuracy'])
print("\n%s: %.2f%%" % ('val_accuracy', val_acc*100))
# model.fit(train_batches, epochs=5, validation_data=valid_batches)


val_accuracy: 35.75%


In [145]:
from tensorflow.keras import Model
class MyModel(Model):
    def __init__(self):
        super(MyModel, self).__init__()
        self.hidden_layer1 = Dense(8, 24, activation='relu')
        self.hidden_layer2 = Dense(8, 24, activation='relu')
        self.output_layer = Dense(1, activation='softmax')

    def call(self, x):
        x = self.hidden_layer1(x)
        x = Dropout(0.2)
        x = self.hidden_layer2(x)
        x = self.output_layer(x)
        return x

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam()

In [146]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

valid_loss = tf.keras.metrics.Mean(name='valid_loss')
valid_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='valid_accuracy')

In [147]:
@tf.function
def train_step(images, labels):
    with tf.GradientTape() as tape:
        predictions = model(images, training=True)
        loss = loss_object(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    train_loss(loss)
    train_accuracy(labels, predictions)
def valid_step(images, labels):
    predictions = model(images, training=False)
    t_loss = loss_object(labels, predictions)

    test_loss(t_loss)
    test_accuracy(labels, predictions)

In [149]:
EPOCHS = 5

for epoch in range(EPOCHS):
    # Reset the metrics at the start of the next epoch
    train_loss.reset_states()
    train_accuracy.reset_states()
    valid_loss.reset_states()
    valid_accuracy.reset_states()

    for images, labels in train_batches:
        train_step(images, labels)

    for valid_images, valid_labels in valid_batches:
        valid_step(test_images, test_labels)

    print(
    f'Epoch {epoch + 1}, '
    f'Loss: {train_loss.result()}, '
    f'Accuracy: {train_accuracy.result() * 100}, '
    f'Valid Loss: {valid_loss.result()}, '
    f'Valid Accuracy: {valid_accuracy.result() * 100}'
  )

ValueError: in user code:

    File "C:\Users\Playdata\AppData\Local\Temp\ipykernel_11240\375055457.py", line 5, in train_step  *
        loss = loss_object(labels, predictions)
    File "C:\Users\Playdata\anaconda3\lib\site-packages\keras\losses.py", line 142, in __call__  **
        losses = call_fn(y_true, y_pred)
    File "C:\Users\Playdata\anaconda3\lib\site-packages\keras\losses.py", line 268, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "C:\Users\Playdata\anaconda3\lib\site-packages\keras\losses.py", line 2078, in sparse_categorical_crossentropy
        return backend.sparse_categorical_crossentropy(
    File "C:\Users\Playdata\anaconda3\lib\site-packages\keras\backend.py", line 5660, in sparse_categorical_crossentropy
        res = tf.nn.sparse_softmax_cross_entropy_with_logits(

    ValueError: `labels.shape` must equal `logits.shape` except for the last dimension. Received: labels.shape=(16,) and logits.shape=(1, 16)


In [151]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader 
from torch.utils.data import random_split
from torchvision import datasets 
from torchvision import transforms 

In [153]:
# data_set
len(train_set), len(valid_set)

(668, 223)

In [154]:
# batch data
batch_s = 16
trainloader = DataLoader(train_set, batch_size=batch_s, shuffle=True)
validloader = DataLoader(valid_set, batch_size=batch_s, shuffle=True)
len(trainloader), len(validloader)

(42, 14)

In [160]:
# create model
class FMnist_DNN(nn.Module):
    def __init__(self):
        super().__init__()
        # linear layer, fully connected layer, affine layer, dense layer : np.dot(x, w) + b
        self.hidden_linear1 = nn.Linear(24,24) 
        self.batch_norm1 = nn.BatchNorm1d(24)
        self.hidden_linear2 = nn.Linear(24,12)
        self.batch_norm2 = nn.BatchNorm1d(12)
        self.hidden_linear3 = nn.Linear(12,6)
        self.batch_norm3 = nn.BatchNorm1d(6)
        self.ouput_linear = nn.Linear(6,1)

        # 가중치 초기화 (he초기화)
        nn.init.kaiming_normal_(self.hidden_linear1.weight, mode='fan_in', nonlinearity='relu')
        nn.init.kaiming_normal_(self.hidden_linear2.weight, mode='fan_in', nonlinearity='relu')
        nn.init.kaiming_normal_(self.hidden_linear3.weight, mode='fan_in', nonlinearity='relu')
        nn.init.kaiming_normal_(self.ouput_linear.weight, mode='fan_in', nonlinearity='relu')

    def forward(self, x):
        x = self.hidden_linear1(x)
        x = self.batch_norm1(x)
        x = F.relu(x)
        x = F.dropout(x, 0.3) # dropout
        x = self.hidden_linear2(x)
        x = self.batch_norm2(x) 
        x = F.relu(x)  
        x = F.dropout(x, 0.2)
        x = self.hidden_linear3(x)
        x = self.batch_norm3(x)
        x = F.relu(x)
        x = self.ouput_linear(x)    
        return x

In [161]:
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()

In [162]:
def train_loop(model, loss_fn, trainloader, validloader, optimizer, scheduler, epochs=17, patience=5):
    n = 3; min_loss = 2000; trigger = 0
    # print('-'*((epochs*n-15)//2),' now loading ','-'*((epochs*n-15)//2))
    for epoch in range(epochs):
        model.train() # 훈련모드
        train_loss = 0
        for imgs, labels in trainloader:
            imgs.resize_(imgs.shape[0], 784) # (100,1,28,28) -> (100,784)
            loss = loss_fn(model(imgs), labels) 
            optimizer.zero_grad() 
            loss.backward() 
            optimizer.step() 
            train_loss += loss.item()

        # 검증
        valid_loss, valid_accuracy = 0, 0
        model.eval() # 평가 모드
        with torch.no_grad():
            for imgs, labels in validloader:
                imgs.resize_(imgs.shape[0], 784)
                logit = model(imgs)
                _, preds = torch.max(logit, 1) # preds = logit.max(dim=1)[1]
                valid_accuracy += int((preds==labels).sum())
                valid_loss += loss_fn(logit, labels).item()

        # print('-'*n, end='')
        print('Epoch : {}/{}.......'.format(epoch+1, epochs),            
                  'Train Loss : {:.3f}'.format(train_loss/len(trainloader)), 
                  'Valid Loss : {:.3f}'.format(valid_loss/len(validloader)), 
                  'Valid Accuracy : {:.3f}'.format(valid_accuracy/(batch_s*len(validloader))))
        writer.add_scalar('Train Loss', train_loss/len(trainloader), epoch+1)
        writer.add_scalar('Valid Loss', valid_loss/len(validloader), epoch+1)
        writer.add_scalars('TrainLoss and ValidLoss', {'Train':train_loss/len(trainloader),
                                                      'Valid':valid_loss/len(validloader)}, epoch+1)
        writer.add_scalar('Valid Accuracy', valid_accuracy/(batch_s*len(validloader)), epoch+1)

        if valid_loss/len(validloader) > min_loss: # early stop
            trigger += 1
            if trigger > patience:
                print('\nEarly Stopping!! epoch/epochs: {}/{}'.format(epoch, epochs))
                writer.flush()
                break
        else:
            trigger = 0
            min_loss = valid_loss/len(validloader) # min_loss 갱신
            torch.save(model.state_dict(), 'best_checkpoint.pth') # valid loss가 커지기 전의 모델 저장

        scheduler.step(valid_loss) # learning rate scheduler
    writer.flush()
    return

In [163]:
lr = 0.001; epochs = 30; patience = 7
model = FMnist_DNN()
loss_fn = nn.CrossEntropyLoss() # loss function
optimizer = optim.Adam(model.parameters(), lr=lr) # optimizer
# 모니터링하고 있는 값(예:valid_loss)의 최소값(min) 또는 최대값(max) patience 기간동안 줄어들지 않을 때(OnPlateau) lr에 factor(0.1)를 곱해주는 전략
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)

train_loop(model, loss_fn, trainloader, validloader, optimizer, scheduler, epochs, patience)

TypeError: '_TensorSliceDataset' object is not subscriptable

In [None]:
%load_ext tensorboard
%tensorboard --logdir=runs

In [None]:
writer.close()