In [1]:
import pandas as pd
import numpy as np

<br>

## 1. Preparing dataset (2번부터 실습 진행)

In [2]:
data_df = pd.read_csv('titanic.csv')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


## Data info

- **PassengerId** : Unique ID of passenger
- **Survived** : 0 = No, 1 = Yes
- **pclass** : Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)
- **sibsp** : # of siblings & spouses aboard the Titanic
- **parch** : # of parents / children aboard the Titanic
- **ticket** : Ticket number
- **cabin** : Cabin number
- **embarked** : Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)

In [3]:
y_data = data_df[['Survived']]
y_data.head(3)

Unnamed: 0,Survived
0,0
1,1
2,1


In [4]:
del data_df['Survived']

x_data = data_df.copy()

x_data.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


<br>

## 2. Feature engineering & Feature selection

#### 시도해볼 수 있는 전략들

- 불필요한 열이나 예측에 방해가 되는 열은 아예 지우기 (ex. PassengerId)
- 결측치 채우기 
- Text로 되어있는 Category(Factor)는 숫자로 바꿔주기 (ex. Male/Female -> 0/1)
- 실수 범위를 구간 범위로 바꿔주기 
- 필요한 경우 기존 열을 바탕으로 새로운 열을 계산해 추가하기

In [34]:
x_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
PassengerId    891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 76.6+ KB


### 2-1.Name


In [5]:
# 이름 분류

data_df['name_group']=''
for x in range(len(data_df['Name'])):
    data_df['name_group'][x]=data_df['Name'][x].split(',')[1].split('.')[0]

In [36]:
data_df['name_group'].value_counts()

 Mr              517
 Miss            182
 Mrs             125
 Master           40
 Dr                7
 Rev               6
 Mlle              2
 Col               2
 Major             2
 Jonkheer          1
 Don               1
 Sir               1
 Capt              1
 the Countess      1
 Ms                1
 Mme               1
 Lady              1
Name: name_group, dtype: int64

In [37]:
# data_df['name_group'] = data_df['name_group'].replace(['Lady','Countess','Capt','Col',
#                                                        'Don','Dr','Major','Rev',
#                                                        'Sir', 'Jonkheer'], 'Rare')

# data_df['name_group'] = data_df['name_group'].replace('Mlle', 'Miss')
# data_df['name_group'] = data_df['name_group'].replace('Ms', 'Miss')
# data_df['name_group'] = data_df['name_group'].replace('Mme', 'Miss')

### 2-2. Sex

In [6]:
#Sex
data_df['Sex']=data_df['Sex'].replace(['male', 'female'], [0, 1])

### 2-3. Age

In [7]:
age_group_mean = data_df.groupby(['name_group'])['Age'].mean().to_dict()

# iterate through the rows with missing age data and substitue w/ avg
for passenger in data_df[data_df['Age'].isnull()].index:
    title = data_df[data_df['Age'].isnull()]['name_group'][passenger]
    data_df['Age'][passenger] = age_group_mean[title]

In [8]:
data_df['Age'] = pd.qcut(data_df['Age'], 10)

In [9]:
data_df['Age'] = data_df['Age'].astype('category').cat.codes

In [10]:
data_df[data_df['Age'].isnull()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,name_group


### 2-4. SibSp&Parch --> Companion

In [11]:
count_dict = dict(data_df['Ticket'].value_counts())

In [12]:
#동행자 수를 열 값으로 갖는 새로운 'companion' 열 생성
data_df['companion'] = data_df['Ticket'].apply(lambda  x : count_dict[x])

In [13]:
#sibsp, parch 둘다 0 이고 티켓 넘버 같은 것도 없을 경우 일행 없는 것으로 간주하고 0, 나머지는 1(일행있음)
data_df['companion'] = [0 if data_df['SibSp'][x] == 0 and data_df['Parch'][x] == 0 and count_dict[data_df['Ticket'][x]] == 1 else 1 for x in data_df.index ]

### 2-5. Fare

In [14]:
data_df['FareBin'] = pd.qcut(data_df['Fare'], 5)

In [15]:
data_df['Fare'] = data_df['FareBin'].astype('category').cat.codes

### 2-6. Embarked

In [55]:
data_df[data_df['Embarked'].isnull()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,name_group,companion,FareBin
61,62,1,"Icard, Miss. Amelie",1,7,0,0,113572,4,B28,,Miss,1,"(39.688, 512.329]"
829,830,1,"Stone, Mrs. George Nelson (Martha Evelyn)",1,9,0,0,113572,4,B28,,Mrs,1,"(39.688, 512.329]"


In [16]:
# create df based on class
class1st = data_df[data_df['Pclass'] == 1]

# find average fare of class 1 passengers
class1st_C = class1st[class1st['Embarked'] == 'C']['Fare'].mean()
class1st_S = class1st[class1st['Embarked'] == 'S']['Fare'].mean()
class1st_Q = class1st[class1st['Embarked'] == 'Q']['Fare'].mean()

In [17]:
# fill in missing data for both passengers (61 and 829) with Southampton
data_df['Embarked'][61] = 'S'
data_df['Embarked'][829] = 'S'

In [18]:
data_df['Embarked']=data_df['Embarked'].replace(['S', 'C', 'Q'], [0, 1, 2])

In [19]:
data_df[data_df['Embarked'].isnull()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,name_group,companion,FareBin


### 2-7.불필요한 열 삭제

In [21]:
data_df= data_df.drop(['PassengerId','Name','Cabin','SibSp','Parch','Ticket','FareBin','name_group'], axis=1)
data_df.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,companion
0,3,0,2,0,0,1
1,1,1,7,4,1,1
2,3,1,3,1,0,0
3,1,1,7,4,0,1
4,3,0,7,1,0,0


---------------------------------------------------------------------------------------------------------------

<br>

## 3. Train - Test split (비율을 7:3 으로 유지해주시고, seed는 0을 적용해주세요)

In [22]:
from sklearn.model_selection import train_test_split

In [30]:
X_train,X_test,y_train,y_test = train_test_split(data_df,y_data,test_size=0.3, random_state=0)

<br>

## 4. Create model instance variable (동시에 여러 모델을 다른 이름으로 만들 수 있습니다.)

In [24]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [33]:
import warnings 
warnings.filterwarnings(action='ignore')

### K-fold

In [26]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

### 4-1. kNN

In [34]:
clf_kNN=KNeighborsClassifier(n_neighbors = 13)
scoring = 'accuracy'
score = cross_val_score(clf_kNN, data_df, y_data, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

[0.75555556 0.78651685 0.79775281 0.74157303 0.76404494 0.79775281
 0.76404494 0.76404494 0.73033708 0.79775281]


In [38]:
round(np.mean(score)*100,2)

78.45

### 4-2. Decision Tree

In [36]:
clf_DT = DecisionTreeClassifier()
scoring = 'accuracy'
score = cross_val_score(clf_DT, data_df, y_data, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

[0.76666667 0.80898876 0.76404494 0.76404494 0.84269663 0.82022472
 0.78651685 0.75280899 0.76404494 0.7752809 ]


In [37]:
round(np.mean(score)*100,2)

78.45

### 4-3. Random Foreset

In [39]:
clf_RF = RandomForestClassifier(n_estimators=13)
scoring = 'accuracy'
score = cross_val_score(clf_RF, data_df, y_data, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

[0.75555556 0.84269663 0.76404494 0.78651685 0.84269663 0.82022472
 0.79775281 0.79775281 0.7752809  0.76404494]


In [40]:
round(np.mean(score)*100, 2)

79.47

### 4-4. Naive Bayes

In [41]:
clf_NB = GaussianNB()
scoring = 'accuracy'
score = cross_val_score(clf_NB, data_df, y_data, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

[0.76666667 0.78651685 0.71910112 0.70786517 0.74157303 0.79775281
 0.7752809  0.73033708 0.80898876 0.78651685]


In [42]:
round(np.mean(score)*100, 2)

76.21

### 4-5. SVM

In [43]:
clf_SVC = SVC()
scoring = 'accuracy'
score = cross_val_score(clf_SVC, data_df, y_data, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

[0.78888889 0.82022472 0.79775281 0.78651685 0.84269663 0.83146067
 0.83146067 0.82022472 0.79775281 0.82022472]


In [44]:
round(np.mean(score)*100, 2)

81.37

<br>

## 5. Train the model

In [45]:
clf_kNN.fit(data_df, y_data)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=13, p=2,
           weights='uniform')

In [46]:
clf_DT.fit(data_df, y_data)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [47]:
clf_RF.fit(data_df, y_data)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=13, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [48]:
clf_NB.fit(data_df, y_data)

GaussianNB(priors=None, var_smoothing=1e-09)

In [49]:
clf_SVC.fit(data_df, y_data)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

<br>

## 6. Predict on test data & Check the result with metrics (모델 간 비교가 가능합니다.)

In [56]:
from sklearn.metrics import accuracy_score

In [57]:
predict_kNN = clf_kNN.predict(data_df)
accuracy_kNN = accuracy_score(y_data,predict_kNN)
accuracy_kNN

0.8148148148148148

In [58]:
predict_DT = clf_DT.predict(data_df)
accuracy_DT = accuracy_score(y_data,predict_DT)
accuracy_DT

0.8843995510662177

In [59]:
predict_RF = clf_RF.predict(data_df)
accuracy_RF = accuracy_score(y_data,predict_RF)
accuracy_RF

0.8832772166105499

In [60]:
predict_NB = clf_NB.predict(data_df)
accuracy_NB = accuracy_score(y_data,predict_NB)
accuracy_NB

0.7710437710437711

In [61]:
predict_SVC = clf_SVC.predict(data_df)
accuracy_SVC = accuracy_score(y_data,predict_SVC)
accuracy_SVC

0.8361391694725028