In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

## 티켓번호 최빈값으로 채우기

In [3]:
train_mode = train.copy()
test_mode = test.copy()

In [4]:
# 두자리 수 이상의 숫자만 추출해 TicketNumber에 넣기
train_mode['TicketNumber'] = train_mode['Ticket'].str\
                                .extract('(\d{2,})', expand = True)
train_mode['TicketNumber'] = train_mode['TicketNumber'].apply(pd.to_numeric)
test_mode['TicketNumber'] = test_mode['Ticket'].str\
                                .extract('(\d{2,})', expand = True)
test_mode['TicketNumber'] = test_mode['TicketNumber'].apply(pd.to_numeric)

In [5]:
# 누락된 데이터 채워주는 함수
def data_fillna(dataset):
    dataset['TicketNumber']\
        .fillna(dataset['TicketNumber'].mode()[0], inplace=True)

In [6]:
data_fillna(train_mode)
data_fillna(test_mode)

In [9]:
train_mode['TicketNumber'].isnull().sum()

0

## sklearn의 imputer 사용해 Age 채우기

In [12]:
train_imputer = train.copy()
test_imputer = test.copy()

In [13]:
from sklearn.preprocessing import Imputer

# sklearn의 imputer는 default로 mean값을 채워준다
def nan_padding(data, columns):
    for column in columns:
        imputer = Imputer()
        data[column] = imputer.fit_transform(\
                            data[column].values.reshape(-1, 1))
    return data

train_imputer = nan_padding(train_imputer, ['Age'])
test_imputer = nan_padding(test_imputer, ['Age'])

In [14]:
train_imputer.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

### 다른 컬럼 정보를 참고하여 결측치 채우기 
(Name -Age)

In [15]:
train_cp = train.copy()
test_cp = test.copy()

In [16]:
train['Name'].isnull().sum()

0

In [18]:
# 이름에서 호칭을 부르는 말 떼어오기
train_cp['Title'] = train_cp['Name'].str.split(\
                            ', ', expand=True)[1].str.split(\
                            '.', expand=True)[0]
test_cp['Title'] = test_cp['Name'].str.split(\
                            ', ', expand=True)[1].str.split(\
                            '.', expand=True)[0]

In [19]:
train_cp['Title'].value_counts()

Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Col               2
Major             2
Mlle              2
Don               1
Jonkheer          1
Sir               1
Ms                1
Mme               1
Capt              1
Lady              1
the Countess      1
Name: Title, dtype: int64

In [25]:
train_cp.query("Title in ( 'Lady', 'Ms', 'Mme', 'the Countess')").head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
369,370,1,1,"Aubart, Mme. Leontine Pauline",female,24.0,0,0,PC 17477,69.3,B35,C,Mme
443,444,1,2,"Reynaldo, Ms. Encarnacion",female,28.0,0,0,230434,13.0,,S,Ms
556,557,1,1,"Duff Gordon, Lady. (Lucille Christiana Sutherl...",female,48.0,1,0,11755,39.6,A16,C,Lady


In [29]:
train_cp.loc[train_cp['Title'].isin(['Lady', 'Ms', 'Mme', 'the Countess', 'Mlle']), 'Title'] = 'Ms'

In [37]:
train_cp.loc[train_cp['Title'].isin(['Jonkheer', 'Sir', 'Capt', 'Col', 'Don', 'Major', 'Rev']), 'Title'] = 'Mr'

In [40]:
train_cp['Title'].value_counts()

Mr        531
Miss      182
Mrs       125
Master     40
Dr          7
Ms          6
Name: Title, dtype: int64

In [41]:
train_cp['Age'].fillna(train_cp.groupby('Title')['Age']\
                       .transform('median'), inplace=True)
test_cp['Age'].fillna(test_cp.groupby('Title')['Age']\
                       .transform('median'), inplace=True)

## 다른 칼럼 정보를 학습시켜 결측치 예측하기

In [68]:
train_rf = train.copy()
test_rf = test.copy()

In [69]:
# family size 칼럼 만들기
train_rf['FamilySize'] = train_rf['SibSp'] + train_rf['Parch']+1
test_rf['FamilySize'] = test_rf['SibSp'] + test_rf['Parch']+1

In [70]:
# family size 카테고리화 하기 (single, small, large)

bins = [0, 1, 5, 100]
group_names = ['single', 'short', 'long']
train_rf['FsizeD'] = pd.cut(train_rf['FamilySize'], bins, labels=group_names)
test_rf['FsizeD'] = pd.cut(test_rf['FamilySize'], bins, labels=group_names)

print(train_rf['FsizeD'].value_counts())

single    537
short     307
long       47
Name: FsizeD, dtype: int64


In [81]:
train_rf = train_rf[['Age', 'Sex', 'Fare', 'Embarked', 'FsizeD']]
# test_rf = test_rf[['Age', 'Sex', 'Fare', 'Embarked', 'FsizeD']]

train_rf.head(3)

Unnamed: 0,Age,Sex,Fare,Embarked,FsizeD
0,22.0,male,7.25,S,short
1,38.0,female,71.2833,C,short
2,26.0,female,7.925,S,single


In [88]:
from sklearn.ensemble import RandomForestRegressor

def fill_missing_age(df):
    # age값을 기준으로 학습/테스트 세트 분리
    train = df.loc[(df.Age.notnull())] # age 값이 있는 row
    test = df.loc[(df.Age.isnull())] # age 값이 없는 row
    
    # label은 age
    y = train['Age']
    
    # 피쳐
    train_X = train.values[:, 1::]
    test_X = test.values[:, 1::]
    
    print(train_X.shape)
    print(test_X.shape)
    print(len(y))
    
    # 모델 학습
    rtr = RandomForestRegressor(n_estimators=2000, n_jobs=-1)
    rtr.fit(train_X, y)
    
    # 학습한 데이터를 바탕으로 결측치 예측
    predictedAges = rtr.predict(test_X)
    
    # 예측값을 Age가 null인 데이터에 채우기
    df.loc[(df.Age.isnull()), 'Age'] = predictedAges
    
    return df

In [89]:
train_rf = fill_missing_age(train_rf)

(714, 4)
(177, 4)
714


ValueError: could not convert string to float: 'single'