### [예제 2_1] Kaggle Titanic Data 로부터 training data / test data 생성 예제
#### test data 의 input data 는 titanic_test.csv 에서 가져오고, test data 정답은 titanic_gender_submission.csv 에서 가져옴
#### Missing Data 를 평균, 중간값 등으로 모두 채워 넣는다. 즉 dropna() 를 사용하지 않음

In [1]:
import numpy as np
import pandas as pd

### pd.read_csv() 실행

In [2]:
# Read the three Kaggle Titanic CSV files into pandas DataFrames, in order:
# the training rows, the test inputs, and the submission file that is used
# as the source of the test labels.
train_df, test_df, test_sub_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './titanic_train.csv',
        './titanic_test.csv',
        './titanic_gender_submission.csv',
    )
)

### DataFrame  확인

In [3]:
# Preview the first rows of the training DataFrame.
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Preview the first rows of the test DataFrame (it has no Survived column).
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# Preview the first rows of the submission DataFrame (PassengerId, Survived).
test_sub_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


### 2_1 [1]  각 데이터프레임의 Missing Data 및 Embarked 종류 확인

In [6]:
# Count the missing (NaN) values in each column of the training DataFrame.
train_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
# Count the missing (NaN) values in each column of the test DataFrame.
test_df.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [8]:
# Count the missing (NaN) values in each column of the submission DataFrame.
test_sub_df.isnull().sum()

PassengerId    0
Survived       0
dtype: int64

In [9]:
# Count how often each Embarked value occurs in the training DataFrame.
train_df['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [10]:
# Median of the training Age column, later used to fill its missing values.
train_df['Age'].median()

28.0

### 2_1 [2] Missing Data 처리
#### train_df['Age'] 는 median() 값으로 대체, train_df['Embarked'] 는 가장 빈도수가 많은 'S' 로 대체 
#### 그 외의 column 은 딥러닝 학습에 중요하지 않기 때문에 NaN 값 변경하지 않음

In [12]:
# Fill the missing values of the training Age and Embarked columns.
#
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on train_df['Age']: that chained in-place call works on
# an intermediate Series, emits a FutureWarning on pandas 2.x and no longer
# updates train_df once Copy-on-Write is enabled.
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())

# 'S' is the most frequent Embarked value in the training set.
train_df['Embarked'] = train_df['Embarked'].fillna('S')

# Re-check the per-column NaN counts after imputation.
train_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [13]:
# Show the two statistics used to fill the test set: the Age median and the
# Fare mean, one per line.
print(test_df['Age'].median(), test_df['Fare'].mean(), sep='\n')

27.0
35.6271884892086


### test_df['Age'] 열의 NaN 또한 median() 대체, test_df['Fare'] 는 mean() 대체

In [14]:
# Fill the missing values of the test Age column with its median and of the
# test Fare column with its mean.
#
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on test_df['Age']: that chained in-place call works on
# an intermediate Series, emits a FutureWarning on pandas 2.x and no longer
# updates test_df once Copy-on-Write is enabled.
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].median())
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].mean())

# Re-check the per-column NaN counts after imputation.
test_df.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64

### 2_1 [3] 딥러닝 학습을 위한 matrix 생성

In [15]:
# Convert each DataFrame to a NumPy array so that the following cells can
# address columns by position.
train_csv_data = train_df.values
test_csv_data = test_df.values
test_csv_sub = test_sub_df.values

# Report the type and shape of every array.
for csv_matrix in (train_csv_data, test_csv_data, test_csv_sub):
    print(type(csv_matrix), csv_matrix.shape)

<class 'numpy.ndarray'> (891, 12)
<class 'numpy.ndarray'> (418, 11)
<class 'numpy.ndarray'> (418, 2)


### Data Conversion (문자 => 숫자)

In [16]:
# Encode the Sex column in place: 'male' -> 1, anything else -> 0.
# Sex is column 4 in train_csv_data and column 3 in test_csv_data (the test
# file has no Survived column, so its later columns sit one position lower).
for csv_matrix, sex_col in ((train_csv_data, 4), (test_csv_data, 3)):

    # Iterating a 2-D ndarray yields row views, so assigning to row[sex_col]
    # writes into csv_matrix itself.
    for row in csv_matrix:

        if row[sex_col] == 'male':

            row[sex_col] = 1

        # Leave an existing 1 alone so that re-running this cell does not
        # turn already-encoded males into 0.
        elif row[sex_col] != 1:

            row[sex_col] = 0

In [17]:
# Encode the Embarked column in place: missing -> 0, S -> 1, C -> 2, Q -> 3.
# Embarked is column 11 in train_csv_data and column 10 in test_csv_data.
embarked_codes = {'S': 1, 'C': 2, 'Q': 3}

for csv_matrix, embarked_col in ((train_csv_data, 11), (test_csv_data, 10)):

    # Iterating a 2-D ndarray yields row views, so assigning to
    # row[embarked_col] writes into csv_matrix itself.
    for row in csv_matrix:

        port = row[embarked_col]

        if port in embarked_codes:

            row[embarked_col] = embarked_codes[port]

        elif pd.isna(port):

            # The mapping documents missing -> 0; without this branch a
            # missing value would stay NaN.
            row[embarked_col] = 0

        # Any other value (for example a code written by an earlier run of
        # this cell) is left unchanged.

### training data 생성에 필요한 칼럼 재 확인

In [18]:
# Show the training DataFrame again to check the column order before selecting
# columns by position. The DataFrame still holds the original strings; only
# the NumPy copy was encoded.
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [19]:
# Build the training feature matrix and the training label column as float32
# arrays.
# NOTE(review): six columns are selected here, but the recorded output of this
# cell reports shape (891, 7), and the encoded Embarked column (index 11) is
# not among them -- confirm which feature set is intended.
training_input_data = train_csv_data[:, [
    2,  # Pclass
    4,  # Sex
    5,  # Age
    6,  # SibSp
    7,  # Parch
    9,  # Fare
]].astype('float32')

# Indexing with a one-element list keeps the second axis, so the labels form
# an (n, 1) column.
training_target_data = train_csv_data[:, [1]].astype('float32')  # Survived

print(
    'training_input_data.shape = ', training_input_data.shape,
    ', training_target_data.shape = ', training_target_data.shape,
)

training_input_data.shape =  (891, 7) , training_target_data.shape =  (891, 1)


### test data 생성에 필요한 칼럼 재 확인

In [20]:
# Show the test DataFrame again to check the column order before selecting
# columns by position.
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [21]:
# Show the submission DataFrame again; its last column supplies the test labels.
test_sub_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [22]:
# Build the test feature matrix and the test labels as float32 arrays.
# NOTE(review): six columns are selected here, but the recorded output of this
# cell reports shape (418, 7), and the encoded Embarked column (index 10) is
# not among them -- confirm which feature set is intended.
test_input_data = test_csv_data[:, [
    1,  # Pclass
    3,  # Sex
    4,  # Age
    5,  # SibSp
    6,  # Parch
    8,  # Fare
]].astype('float32')

# The labels come from test_csv_sub (the submission file), not from
# test_csv_data. Taking the last column with a scalar index yields a 1-D array.
# NOTE(review): the training labels are built as a 2-D (n, 1) column --
# confirm the 1-D shape here is what the consumer of test_target_data expects.
test_target_data = test_csv_sub[:, -1].astype('float32')

print(
    "test_input_data.shape = ", test_input_data.shape,
    ", test_target_data.shape = ", test_target_data.shape,
)

test_input_data.shape =  (418, 7) , test_target_data.shape =  (418,)
