In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset , DataLoader
from torch.optim import Adam
from torchvision import models
from torch.autograd import Variable
from torchvision import transforms

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
from sys import path
from zipfile import ZipFile

from tqdm import tqdm
from PIL import Image
import torch.nn.functional as F

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(DEVICE)

cpu


In [6]:
train_df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/train.csv')
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [14]:
# 데이터 가공하기
tuned_train_df = train_df.drop(labels=['Name', 'Cabin', 'Ticket'], axis=1) # cabin은 Nan 값이 너무 많아 삭제
tuned_test_df = test_df.drop(labels=['Name', 'Cabin', 'Ticket'], axis=1)

sex = pd.get_dummies(tuned_train_df['Sex'])
embark = pd.get_dummies(tuned_train_df['Embarked'])

tuned_train_df = pd.concat([tuned_train_df,sex,embark],axis=1)
tuned_train_df = tuned_train_df.drop(['Sex','Embarked'],axis=1)
tuned_train_df

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,S
0,1,0,3,22.0,1,0,7.2500,0,1,0,0,1
1,2,1,1,38.0,1,0,71.2833,1,0,1,0,0
2,3,1,3,26.0,0,0,7.9250,1,0,0,0,1
3,4,1,1,35.0,1,0,53.1000,1,0,0,0,1
4,5,0,3,35.0,0,0,8.0500,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,27.0,0,0,13.0000,0,1,0,0,1
887,888,1,1,19.0,0,0,30.0000,1,0,0,0,1
888,889,0,3,,1,2,23.4500,1,0,0,0,1
889,890,1,1,26.0,0,0,30.0000,0,1,1,0,0


In [15]:
sex = pd.get_dummies(tuned_test_df['Sex'])
embark = pd.get_dummies(tuned_test_df['Embarked'])

tuned_test_df = pd.concat([tuned_test_df,sex,embark],axis=1)
tuned_test_df = tuned_test_df.drop(['Sex','Embarked'],axis=1)
tuned_test_df

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,S
0,1,0,3,22.0,1,0,7.2500,0,1,0,0,1
1,2,1,1,38.0,1,0,71.2833,1,0,1,0,0
2,3,1,3,26.0,0,0,7.9250,1,0,0,0,1
3,4,1,1,35.0,1,0,53.1000,1,0,0,0,1
4,5,0,3,35.0,0,0,8.0500,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,27.0,0,0,13.0000,0,1,0,0,1
887,888,1,1,19.0,0,0,30.0000,1,0,0,0,1
888,889,0,3,,1,2,23.4500,1,0,0,0,1
889,890,1,1,26.0,0,0,30.0000,0,1,1,0,0


In [19]:
# 결측치
tuned_train_df = tuned_train_df.dropna()
tuned_test_df = tuned_test_df.dropna()

In [25]:
from sklearn.preprocessing import StandardScaler

train_scaler = StandardScaler()
test_scaler = StandardScaler()

train_columns = tuned_train_df.columns
test_columns  = tuned_test_df.columns
print(train_columns)
print(test_columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
       'female', 'male', 'C', 'Q', 'S'],
      dtype='object')
Index(['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
       'female', 'male', 'C', 'Q', 'S'],
      dtype='object')


In [27]:
train = pd.DataFrame(train_scaler.fit_transform(tuned_train_df)) # 정규화 하여 dataframe화
test  = pd.DataFrame(test_scaler.fit_transform(tuned_test_df))

train.columns = train_columns
test.columns = test_columns


In [47]:
features = train.columns[2:].to_list() # column 중 passengerID와 survived를 제외한 항목
target = train.loc[:, 'Survived'].name # 예측하고자 하는 값은 survived 하나

'Survived'

In [49]:
x_train = train.iloc[:,2:].values
y_train = train.loc[:, 'Survived'].values
len(x_train)

714