<a href="https://colab.research.google.com/github/jungeun919/Pytorch_study/blob/main/Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Dataset & Library Loading

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
df_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Titanic/titanic/train.csv')
df_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Titanic/titanic/test.csv')
df_sub = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Titanic/titanic/gender_submission1.csv')

In [3]:
# Make Dataset

df_train.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True) # inplace의 경우 drop한 후의 데이터프레임으로 기존 데이터프레임을 대체하겠다는 뜻
df_test.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

sex = pd.get_dummies(df_train['Sex'], drop_first=True) # 가변수의 첫번째 변수를 자동으로 삭제를 해주며, 가변수 함정을 피할 수 있게 해줌
embark = pd.get_dummies(df_train['Embarked'], drop_first=True)
df_train = pd.concat([df_train, sex, embark], axis=1) # DataFrame 결합

df_train.drop(['Sex', 'Embarked'], axis=1, inplace=True)

sex = pd.get_dummies(df_test['Sex'], drop_first=True)
embark = pd.get_dummies(df_test['Embarked'], drop_first=True)
df_test = pd.concat([df_test, sex, embark], axis=1)

df_test.drop(['Sex', 'Embarked'], axis=1, inplace=True)

df_train.fillna(df_train.mean(), inplace=True) # 결측값 평균값으로 채우기
df_test.fillna(df_test.mean(), inplace=True)

Scaler1 = StandardScaler() # 평균을 제거하고 데이터를 단위 분산으로 조정
Scaler2 = StandardScaler() # 그러나 이상치가 있다면 평균과 표준편차에 영향을 미쳐 변환된 데이터의 확산은 매우 달라짐

train_columns = df_train.columns
test_columns = df_test.columns

df_train = pd.DataFrame(Scaler1.fit_transform(df_train))
df_test = pd.DataFrame(Scaler2.fit_transform(df_test))

df_train.columns = train_columns
df_test.columns = test_columns


# iloc: 행번호로 선택하는 방법
# loc: label이나 조건표현으로 선택하는 방법
feature = df_train.iloc[:, 2:].columns.tolist() # tolist(): 같은 위치에 있는 데이터끼리 묶어준다 ex) 'value'=[1, 2, 3], 'test=['a', 'b', 'c']
target = df_train.loc[:, 'Survived'].name

X_train = df_train.iloc[:, 2:].values
y_train = df_train.loc[:, 'Survived'].values

In [4]:
# Pytorch

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.autograd import Variable # autograd: 자동 미분화(역전파)

In [5]:
# Logistic Regression Model

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(8, 512)
        self.fc2 = nn.Linear(512, 512)
        self.fc3 = nn.Linear(512, 2)
        self.dropout = nn.Dropout(0.2)

    def forward(self,x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

model = Net()
print(model)

Net(
  (fc1): Linear(in_features=8, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=512, bias=True)
  (fc3): Linear(in_features=512, out_features=2, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [6]:
# Optimizer (확률적 경사하강법 SGD)
# Loss Function

optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

In [7]:
# Training

batch_size = 64
n_epochs = 500
batch_no = len(X_train) // batch_size

train_loss = 0
train_loss_min = np.Inf
for epoch in range(n_epochs):
    for i in range(batch_no):
        start = i * batch_size
        end = start + batch_size
        x_var = Variable(torch.FloatTensor(X_train[start:end]))
        y_var = Variable(torch.LongTensor(y_train[start:end]))

        optimizer.zero_grad()
        output = model(x_var)
        loss = criterion(output, y_var)
        loss.backward()
        optimizer.step()

        values, labels = torch.max(output, 1)
        num_right = np.sum(labels.data.numpy() == y_train[start:end])
        train_loss += loss.item() * batch_size

    train_loss = train_loss / len(X_train)
    if train_loss <= train_loss_min:
        print("Validation loss decreased ({:6f} ===> {:6f}). Saving  the model..."\
              .format(train_loss_min, train_loss))
        torch.save(model.state_dict(), "model.pt")
        train_loss_min = train_loss

    if epoch % 200 == 0:
        print('')
        print("Epoch: {} \tTrain Loss: {} \tTrain Accuracy: {}"\
              .format(epoch+1, train_loss, num_right / len(y_train[start:end])))
        print('Training Ended!')

Validation loss decreased (   inf ===> 0.666620). Saving  the model...

Epoch: 1 	Train Loss: 0.6666198370833991 	Train Accuracy: 0.0
Training Ended!
Validation loss decreased (0.666620 ===> 0.664644). Saving  the model...
Validation loss decreased (0.664644 ===> 0.664220). Saving  the model...
Validation loss decreased (0.664220 ===> 0.658981). Saving  the model...
Validation loss decreased (0.658981 ===> 0.655179). Saving  the model...
Validation loss decreased (0.655179 ===> 0.654461). Saving  the model...
Validation loss decreased (0.654461 ===> 0.654140). Saving  the model...
Validation loss decreased (0.654140 ===> 0.654075). Saving  the model...
Validation loss decreased (0.654075 ===> 0.649203). Saving  the model...
Validation loss decreased (0.649203 ===> 0.646196). Saving  the model...
Validation loss decreased (0.646196 ===> 0.646019). Saving  the model...
Validation loss decreased (0.646019 ===> 0.645368). Saving  the model...
Validation loss decreased (0.645368 ===> 0.6439

In [8]:
# Prediction

X_test = df_test.iloc[:, 1:].values
X_test_var = Variable(torch.FloatTensor(X_test), requires_grad=False) # requires_grad=True: autograd가 텐서의 추적 기록에 남지 않게 함

with torch.no_grad():
    test_result = model(X_test_var)
values, labels = torch.max(test_result, 1)
survived = labels.data.numpy()

In [9]:
# Submission

submission = pd.DataFrame({'PassengerId': df_sub['PassengerId'], 'Survived': survived})
submission.to_csv('/content/drive/MyDrive/Colab Notebooks/Titanic/titanic/submission_predict.csv', index=False)