In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the training and test splits from disk.
train, test = (pd.read_csv(path) for path in ('../data/train.csv', '../data/test.csv'))

# Stack both splits, train rows first. Row labels are not renumbered here, so
# the index labels of the test rows repeat labels used by the train rows.
data_df = pd.concat(objs=[train, test])

In [None]:
# Keep both splits in one list so the same preprocessing step can be applied to
# each of them in a loop (the list holds references, not copies).
train_test_data = [train, test]

---
### Title
---

In [None]:
# Extract the honorific title (e.g. "Mr", "Mrs") from the Name column of both
# splits: the pattern captures the run of ASCII letters directly before a period.
# - The class is spelled [A-Za-z]; the range A-z would also match the
#   punctuation characters [ \ ] ^ _ ` that sit between 'Z' and 'a'.
# - A raw string is used so that "\." reaches the regex engine unchanged
#   instead of being an invalid string escape.
for dataset in train_test_data:
    dataset['Title'] = dataset['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

In [None]:
# Integer codes for the titles of the training split: the first four titles get
# their own code, every other listed title shares code 4.
title_mapping = {'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3}
other_titles = [
    'Dr', 'Rev', 'Mlle', 'Major', 'Col', 'Countess', 'Capt',
    'Ms', 'Sir', 'Lady', 'Mme', 'Don', 'Jonkheer',
]
title_mapping.update(dict.fromkeys(other_titles, 4))

# Titles that are not keys of the mapping become NaN.
train['Title'] = train['Title'].map(title_mapping)
train.head()

In [None]:
# Integer codes for the titles of the test split: the first four titles get
# their own code, every other listed title shares code 4.
title_mapping = {'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3}
other_titles = ['Ms', 'Col', 'Rev', 'Dr', 'Dona']
title_mapping.update(dict.fromkeys(other_titles, 4))

# Titles that are not keys of the mapping become NaN.
test['Title'] = test['Title'].map(title_mapping)
test.head()

---
### 성별
---

In [None]:
# Numeric code for each value of the Sex column.
sex_mapping = dict(male=0, female=1)

In [None]:
# Replace the Sex strings with their numeric codes in both splits.
for frame in train_test_data:
    encoded_sex = frame['Sex'].map(sex_mapping)
    frame['Sex'] = encoded_sex

---
### groupsize
---

In [None]:
# group_size: number of passengers in the same split that share a ticket number.
# One value_counts() lookup per split replaces the original loop, which filtered
# the whole frame twice for every unique ticket (quadratic in the row count).
# The column is stored as float, as the loop-built column was, so the fractional
# bin codes written into it later need no dtype change.
train['group_size'] = train['Ticket'].map(train['Ticket'].value_counts()).astype(float)
test['group_size'] = test['Ticket'].map(test['Ticket'].value_counts()).astype(float)

In [None]:
# Turn the ticket fare into a per-passenger fare by dividing it by the number
# of passengers sharing the ticket. Running this cell twice divides twice.
for frame in (train, test):
    frame['Fare'] = frame['Fare'].div(frame['group_size'])

In [None]:
# Show the distribution of group sizes for the train and test splits.
tuple(frame['group_size'].value_counts() for frame in (train, test))

In [None]:
train_test_data = [train, test]

# Replace the raw group size by a bin code:
#   1 -> 0, 2 -> 0.4, 3 or 4 -> 0.8, more than 4 -> 1.2
# All conditions are evaluated on the raw sizes, so the codes cannot be re-binned.
for frame in train_test_data:
    sizes = frame['group_size']
    frame['group_size'] = (
        sizes
        .mask(sizes == 1, 0)
        .mask(sizes == 2, 0.4)
        .mask(sizes.isin([3, 4]), 0.8)
        .mask(sizes > 4, 1.2)
    )

---
## age
---

In [None]:
# Null_Age flags whether Age was recorded: 1.0 where Age is present and 0.0
# where it is missing (note that the flag is 0 for the missing rows).
for frame in (train, test):
    frame['Null_Age'] = frame['Age'].notnull().astype(float)
 

In [None]:
# Fill missing ages with the mean age of the passengers that share the same
# Title code, computed separately inside each split.
# The filled Series is assigned back to the column. The previous form,
# frame['Age'].fillna(..., inplace=True), modifies a temporary selection: pandas
# deprecates it, and under Copy-on-Write it leaves the frame unchanged.

# train data set
train['Age'] = train['Age'].fillna(train.groupby('Title')['Age'].transform('mean'))

# test data set
test['Age'] = test['Age'].fillna(test.groupby('Title')['Age'].transform('mean'))

In [None]:
# Replace Age by the number of its age band (right-closed intervals):
#   <=17 -> 0, (17, 24] -> 1, (24, 34] -> 2, (34, 44] -> 3, (44, 60] -> 4, >60 -> 5
# Missing ages stay missing.
age_edges = [-np.inf, 17, 24, 34, 44, 60, np.inf]
for frame in train_test_data:
    frame['Age'] = pd.cut(frame['Age'], bins=age_edges, labels=False).astype(float)

train.head()

In [None]:
# Count passengers per age band, separately for survivors and non-survivors.
survived = train.loc[train['Survived'] == 1, 'Age'].value_counts()
dead = train.loc[train['Survived'] == 0, 'Age'].value_counts()

---
### Embarked
---

In [None]:
# Count the ports of embarkation within each passenger class.
Pclass1, Pclass2, Pclass3 = (
    train.loc[train['Pclass'] == pclass, 'Embarked'].value_counts()
    for pclass in (1, 2, 3)
)

In [None]:
# Stack the three per-class counts into one frame and label its rows.
class_labels = ['1st class', '2nd class', '3rd class']
df = pd.DataFrame([Pclass1, Pclass2, Pclass3]).set_axis(class_labels, axis=0)

In [None]:
# Missing ports of embarkation are replaced by 'S' in both splits.
for frame in train_test_data:
    port = frame['Embarked']
    frame['Embarked'] = port.where(port.notna(), 'S')

In [None]:
# The classifiers need numeric input, so each port letter gets an integer code.
embarked_mapping = dict(S=0, C=1, Q=2)

# Apply the codes to both splits.
for frame in train_test_data:
    frame['Embarked'] = frame['Embarked'].map(embarked_mapping)

---
### Fare
---

In [None]:
# Fill missing fares with the median fare of the passenger's class, computed
# separately inside each split.
# The filled Series is assigned back to the column. The previous form,
# frame["Fare"].fillna(..., inplace=True), modifies a temporary selection: pandas
# deprecates it, and under Copy-on-Write it leaves the frame unchanged.

# train data set
train["Fare"] = train["Fare"].fillna(train.groupby('Pclass')['Fare'].transform('median'))

# test data set
test["Fare"] = test["Fare"].fillna(test.groupby('Pclass')['Fare'].transform('median'))

In [None]:
train_test_data = [train, test]

# Replace Fare by the code of its fare band (right-closed intervals):
#   <=7 -> 0, (7, 8.8] -> 0.4, (8.8, 17] -> 0.8, (17, 30] -> 1.2,
#   (30, 100] -> 1.6, >100 -> 2
fare_edges = [-np.inf, 7, 8.8, 17, 30, 100, np.inf]
fare_codes = [0, 0.4, 0.8, 1.2, 1.6, 2]
for frame in train_test_data:
    frame['Fare'] = pd.cut(frame['Fare'], bins=fare_edges, labels=fare_codes).astype(float)

---
### Cabin
---

In [None]:
# Null_Cabin flags whether Cabin was recorded: 1.0 where Cabin is present and
# 0.0 where it is missing (note that the flag is 0 for the missing rows).
for frame in (train, test):
    frame['Null_Cabin'] = frame['Cabin'].notnull().astype(float)

In [None]:
# Keep only the first character of each cabin value (the letter in front of
# the cabin number) in both splits.
for frame in train_test_data:
    frame['Cabin'] = frame['Cabin'].str.slice(stop=1)

In [None]:
# Count the cabin letters within each passenger class.
Pclass1, Pclass2, Pclass3 = (
    train.loc[train['Pclass'] == pclass, 'Cabin'].value_counts()
    for pclass in (1, 2, 3)
)

# One row per class, one column per cabin letter.
class_labels = ['1st class', '2nd class', '3rd class']
df = pd.DataFrame([Pclass1, Pclass2, Pclass3]).set_axis(class_labels, axis=0)

df.plot(kind='bar', stacked=True, figsize=(10, 5))

In [None]:
# Encode the cabin letters numerically for the classifiers.
# The codes are spaced 0.4 apart so that their range stays comparable to the
# other features; features with very different ranges can make distant values
# count more than intended.
cabin_letters = 'ABCDEFGT'
cabin_codes = (0, 0.4, 0.8, 1.2, 1.6, 2, 2.4, 2.8)
cabin_mapping = dict(zip(cabin_letters, cabin_codes))

for frame in train_test_data:
    frame['Cabin'] = frame['Cabin'].map(cabin_mapping)

In [None]:
# Missing cabin codes are filled with the median cabin code of the passenger's
# class, computed separately inside each split (the original note says that
# missing cabins are closely related to the passenger class).
# The filled Series is assigned back to the column. The previous form,
# frame['Cabin'].fillna(..., inplace=True), modifies a temporary selection: pandas
# deprecates it, and under Copy-on-Write it leaves the frame unchanged.
train['Cabin'] = train['Cabin'].fillna(
    train.groupby('Pclass')['Cabin'].transform('median')
)
test['Cabin'] = test['Cabin'].fillna(
    test.groupby('Pclass')['Cabin'].transform('median')
)
# Remaining missing values per column, for both splits.
train.isnull().sum(), test.isnull().sum()
# train.tail(10)

---
### Family Size
---

In [None]:
# FamilySize = siblings/spouses + parents/children + the passenger themself.
# The +1 makes a passenger travelling alone (SibSp == Parch == 0) count as 1.
for frame in (train, test):
    frame['FamilySize'] = frame['SibSp'].add(frame['Parch']).add(1)

---
### Family_Survival
---

In [None]:
# Family_Survival: for each passenger, what is known about the survival of the
# OTHER passengers that share both the same last name and the same fare.
#   1   -> at least one other member of the group survived
#   0   -> none is known to have survived and at least one is known to have died
#   0.5 -> default, kept for single-member groups and groups without known outcome
data_df['Lastname'] = data_df['Name'].apply(lambda x: str.split(x, ",")[0])

default_survival_value = 0.5
data_df['Family_Survival'] = default_survival_value

for group, group_df in data_df[['Survived','Name', 'Lastname', 'Fare', 'Ticket', 'PassengerId',
                           'SibSp', 'Parch', 'Age', 'Cabin']].groupby(['Lastname', 'Fare']):
    if (len(group_df) != 1):
        for ind, row in group_df.iterrows():
            passID = row['PassengerId']
            # Exclude the current passenger by PassengerId, not by index label:
            # data_df is a concat of train and test whose index labels repeat, so
            # drop(ind) could also remove another passenger carrying the same label.
            others = group_df.loc[group_df['PassengerId'] != passID, 'Survived']
            smax = others.max()
            smin = others.min()
            if (smax == 1.0):
                data_df.loc[data_df['PassengerId'] == passID, 'Family_Survival'] = 1
            elif (smin==0.0):
                data_df.loc[data_df['PassengerId'] == passID, 'Family_Survival'] = 0

# Number of passengers that received a non-default value. This used to sit
# inside the loop, where it was recomputed for every group and never shown.
data_df.loc[data_df['Family_Survival']!=0.5].shape[0]

In [None]:
# Second pass: passengers whose Family_Survival is still 0 or 0.5 are re-checked
# against the other passengers travelling on the same ticket.
for _, group_df in data_df.groupby('Ticket'):
    if (len(group_df) != 1):
        for ind, row in group_df.iterrows():
            if (row['Family_Survival'] == 0) | (row['Family_Survival']== 0.5):
                passID = row['PassengerId']
                # Exclude the current passenger by PassengerId, not by index label:
                # data_df is a concat of train and test whose index labels repeat, so
                # drop(ind) could also remove another passenger carrying the same label.
                others = group_df.loc[group_df['PassengerId'] != passID, 'Survived']
                smax = others.max()
                smin = others.min()
                if (smax == 1.0):
                    data_df.loc[data_df['PassengerId'] == passID, 'Family_Survival'] = 1
                elif (smin==0.0):
                    data_df.loc[data_df['PassengerId'] == passID, 'Family_Survival'] = 0

# data_df holds the train rows first, followed by the test rows, so the feature is
# split by position. The split point comes from len(train) instead of a
# hard-coded 891, and values are copied positionally so the assignment does not
# depend on index alignment.
n_train = len(train)
train['Family_Survival'] = data_df['Family_Survival'].iloc[:n_train].to_numpy()
test['Family_Survival'] = data_df['Family_Survival'].iloc[n_train:].to_numpy()

In [None]:
# Encode FamilySize (1..11) on the same 0.4-spaced scale as the other features.
# Sizes that are not keys of the mapping become NaN.
family_mapping = {
    1: 0, 2: 0.4, 3: 0.8, 4: 1.2, 5: 1.6, 6: 2,
    7: 2.4, 8: 2.8, 9: 3.2, 10: 3.6, 11: 4,
}
for frame in (train, test):
    frame['FamilySize'] = frame['FamilySize'].map(family_mapping)

---
### 컬럼 정리
---

In [None]:
# Remove the raw columns that are no longer needed: Ticket, SibSp, Parch, Name.
# NOTE(review): the original comment listed PassengerId instead of Name, but
# PassengerId is not in the list, so it stays in both frames and becomes part of
# whatever feature matrix is built from them later. Confirm this is intended.
features_drop = ['Ticket', 'SibSp', 'Parch', 'Name']

train = train.drop(columns=features_drop)
test = test.drop(columns=features_drop)

---
### 정규화
---

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Feature columns that are rescaled to the [0, 1] range.
columns = [
    'Pclass', 'Sex', 'Age', 'Fare', 'Cabin', 'Embarked',
    'Title', 'Null_Age', 'group_size', 'Null_Cabin',
    'FamilySize', 'Family_Survival',
]

# Fit the min-max scaler on the training features and transform them.
scaler = MinMaxScaler()
scaler.fit(train[columns])
scaled = scaler.transform(train[columns])

# transform() returns an ndarray, so wrap it in a DataFrame again.
df_train = pd.DataFrame(scaled, columns=columns)

# Sanity check: after scaling the minimum should be 0 and the maximum 1.
print('최솟값')
print(df_train.min())
print('\n최댓값')
print(df_train.max())

In [None]:
columns = ['Pclass', 'Sex', 'Age', 'Fare', 'Cabin',
       'Embarked', 'Title', 'Null_Age', 'group_size', 'Null_Cabin',
       'FamilySize','Family_Survival']
# Scale the test features with the scaler that was fitted on the training
# features in the previous cell, so both splits share the same per-column
# min/max. Fitting a second scaler on the test split maps the same raw value to
# different scaled values in train and test whenever their ranges differ.
scaled = scaler.transform(test[columns])

# transform() returns an ndarray, so wrap it in a DataFrame again.
df_test = pd.DataFrame(data=scaled, columns=columns)

# With the training min/max the test extremes are not guaranteed to be exactly
# 0 and 1; printing them shows how far the test ranges deviate.
print('최솟값')
print(df_test.min())
print('\n최댓값')
print(df_test.max())

In [None]:
# Replace the feature columns by their scaled values.
# Whole-column assignment is used instead of .loc[:, columns] = ...: the .loc
# form writes the float values into the existing columns in place, and several
# of them hold integers (e.g. Pclass), which pandas deprecates as an
# incompatible-dtype assignment. Values are copied positionally; df_train and
# df_test were built row by row from train and test, so the order matches.
train[columns] = df_train.to_numpy()
test[columns] = df_test.to_numpy()

---
### train survived 분리
---

In [None]:
# Separate the label column from the remaining training columns.
target = train['Survived']
train_data = train.drop(columns='Survived')

(train_data.shape, target.shape)

---
### modeling
---

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import numpy as np
import xgboost as xgb 
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# 10-fold shuffled cross-validation of a k-nearest-neighbours classifier.
scoring = 'accuracy'  # evaluation metric
k_fold = KFold(n_splits=10, shuffle=True, random_state=42)
clf = KNeighborsClassifier(n_neighbors=13)  # KNN model object
score = cross_val_score(
    estimator=clf, X=train_data, y=target,
    scoring=scoring, cv=k_fold, n_jobs=-1,
)
print(score.mean())

In [None]:
# 10-fold shuffled cross-validation of a decision tree classifier.
k_fold = KFold(n_splits=10, shuffle=True, random_state=42)
# Decision tree model object. It is seeded because an unseeded tree can pick
# different splits between runs, which made the printed score non-reproducible.
clf = DecisionTreeClassifier(random_state=42)
scoring = 'accuracy'  # evaluation metric
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=-1, scoring=scoring)
print(score.mean())

---
### random_forest fitting 코드
---

In [None]:
# NOTE(review): disabled experiment, kept for reference. As written, the loop
# variable `r` is never passed to the classifier, so every iteration trains the
# same model. Each model is scored on its test-set predictions against the
# labels read from '../답/submission (1).csv', i.e. selection uses test labels.
# listr = []
# for r in range(2,300,1):
#     clf = RandomForestClassifier(
#                               max_depth=4
#                               ,n_estimators=69
#                              ,min_samples_leaf=32
#                              , random_state=993
#                              ,n_jobs=-1
#                                  )
#     clf.fit(train_data, target) # fit the model
#     pred = clf.predict(test) # predict on the test data
#     submission1 = pd.read_csv('../답/submission (1).csv')
#     del submission1['PassengerId']
#     from sklearn.metrics import accuracy_score
#     accuracy = accuracy_score(pred, submission1)
#     print(r,accuracy)
#     listr.append(accuracy)
# print(listr.index(max(listr)),max(listr))

---
## 하이퍼파라미터찾기 베이지안
---

In [None]:
# submission1 = pd.read_csv('../답/submission (1).csv')
# del submission1['PassengerId']

In [None]:
# from hyperopt import hp , STATUS_OK

# rf_search_space = {'max_depth': hp.quniform('max_depth', 1,30,1), 
#                     'random_state': hp.quniform('random_state', 1,1000,1),
#                     'n_estimators': hp.quniform('n_estimators', 1,1000,1),
#                     'min_samples_leaf': hp.quniform('min_samples_leaf', 1,30,1)}

In [None]:
# def objective_func(search_space):
#     rf_clf = RandomForestClassifier(n_estimators=int(search_space['n_estimators'])
#                             , max_depth=int(search_space['max_depth'])
#                             ,random_state=int(search_space['random_state'])
#                             ,min_samples_leaf=int(search_space['min_samples_leaf'])
#                            )
#     rf_clf.fit(train_data , target)
#     pred = rf_clf.predict(test)
#     submission1 = pd.read_csv('../답/submission (1).csv')
#     del submission1['PassengerId']
#     from sklearn.metrics import accuracy_score
#     accuracy = accuracy_score(pred, submission1)
#     return {'loss' : -accuracy, 'status' : STATUS_OK}

In [None]:
# from hyperopt import fmin, tpe, Trials

# trials = Trials()

# # Call fmin(): repeat for max_evals iterations and return the input that minimises the objective function.
# best = fmin(fn=objective_func,
#             space=rf_search_space,
#             algo=tpe.suggest,
#             max_evals=3000, # maximum number of iterations
#             trials=trials)
#             # rstate=np.random.default_rng(seed=))
# print(best)

---
### randomForest
---

In [None]:
# Train the random forest with fixed hyperparameters and predict the test split.
rf_params = dict(
    max_depth=4,
    n_estimators=69,
    min_samples_leaf=32,
    random_state=993,
)
clf = RandomForestClassifier(**rf_params)
clf.fit(train_data, target)  # fit on the training features
pred = clf.predict(test)     # predictions for the test split

---
### 컬럼 관련도
---

In [None]:
# Importance of every feature according to the fitted model.
importances = clf.feature_importances_
print("Feature importances:\n{0}".format(np.round(importances, 3)))

# Pair each importance with the name of its column.
for name, value in zip(train_data.columns, importances):
    print('{0} : {1:.3f}'.format(name, value))

---
### random_forest 파일저장
---

In [None]:
# NOTE(review): disabled cell. The original inline comment says PassengerId was
# removed earlier and is read back here; the code reads it from `test` directly.
# submission = pd.DataFrame(
#     {
#         "PassengerId":test["PassengerId"], # PassengerId was removed earlier, so it is read back here
#         "Survived": pred
#     }
# )
# submission.to_csv('../result/titanic83.7.csv', index=False)

---
### 정확도 평가 캐글에 올리지 않아도 됨
---

In [None]:
from sklearn.metrics import accuracy_score

# Compare the predictions with the labels stored in the answer file; only the
# columns other than PassengerId are kept for the comparison.
submission1 = pd.read_csv('../답/submission (1).csv').drop(columns='PassengerId')
accuracy = accuracy_score(pred, submission1)
accuracy

---
### Xgboost
---

In [None]:
# xgboost  = XGBClassifier()
# xgboost.fit(train_data, target)
# scoring = 'accuracy'
# score = cross_val_score(xgboost, train_data, target, cv=k_fold, n_jobs=-1, scoring=scoring)
# Y_pred = xgboost.predict(test)
# print(score.mean())

# submission = pd.DataFrame({
#         "PassengerId": test["PassengerId"],
#         "Survived": Y_pred
#     })
# submission.to_csv('../result/titanic-3.csv', index=False)

In [None]:
# xgboost  = XGBClassifier()
# xgboost.fit(train_data, target)
# Y_pred = xgboost.predict(test)
# submission1 = pd.read_csv('./답/submission (1).csv')
# del submission1['PassengerId']
# from sklearn.metrics import accuracy_score
# accuracy = accuracy_score(Y_pred, submission1)
# print(accuracy)