# 분류 - 타이타닉 생존자

---

In [None]:
# Visual Python: Data Analysis > Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

### Read CSV file

In [None]:
# Visual Python: Data Analysis > File
df = pd.read_csv('./data/titanic.csv')
df

In [None]:
# Visual Python: Data Analysis > Data Info
df.info()

In [None]:
# Visual Python: Data Analysis > Data Info
df.describe()

### 컬럼 삭제: PassengerId, Name, Ticket, Cabin

In [None]:
# Visual Python: Data Analysis > Frame
df.drop(['PassengerId','Name','Ticket','Cabin'], axis=1, inplace=True)
df

### 상관계수

In [None]:
# Visual Python: Data Analysis > Data Info
df.corr(numeric_only=True)

#### heatmap

In [None]:
# Visual Python: Data Analysis > Data Info
_corr = df.corr(numeric_only=True)
_corr.style.background_gradient(cmap='coolwarm')

### Factor 별 생존율 분석

#### Pclass vs Survived

In [None]:
# Visual Python: Visualization > Seaborn
sns.countplot(data=df, x='Pclass', hue='Survived')
plt.show()

#### Gender vs Survived

In [None]:
# Visual Python: Visualization > Seaborn
sns.countplot(data=df, x='Gender', hue='Survived')
plt.show()

### 결측치 확인

In [None]:
# Visual Python: Data Analysis > Data Info
pd.DataFrame({'Null Count': df.isnull().sum(), 'Non-Null Count': df.notnull().sum()})

### Embarked 결측치 채우기

In [None]:
# Visual Python: Visualization > Seaborn
sns.countplot(data=df, x='Embarked')
plt.show()

In [None]:
# Visual Python: Data Analysis > Data Info
df['Embarked'].value_counts()

In [None]:
# Visual Python: Data Analysis > Frame
df['Embarked'] = df['Embarked'].fillna('S')
df

In [None]:
# Visual Python: Data Analysis > Data Info
pd.DataFrame({'Null Count': df.isnull().sum(), 'Non-Null Count': df.notnull().sum()})

### Age 결측치 채우기

In [None]:
# Visual Python: Visualization > Seaborn
sns.histplot(data=df, x='Age', bins=40)
plt.show()

In [None]:
# Visual Python: Data Analysis > Data Info
df[['Age']].mean(numeric_only=True)

#### 성별에 따른 나이 평균

In [None]:
# Mean age of the rows where Gender == 'male'
# Visual Python: Data Analysis > Instance
df.loc[(df['Gender'] == 'male'), 'Age'].mean()

In [None]:
# Mean age of the rows where Gender == 'female'
# Visual Python: Data Analysis > Instance
df.loc[(df['Gender'] == 'female'), 'Age'].mean()

#### 선실 등급('Pclass')별 나이 평균값

In [None]:
# Visual Python: Data Analysis > Data Info
df['Pclass'].value_counts()

In [None]:
# Mean age of the rows where Pclass == 1
# Visual Python: Data Analysis > Instance
df.loc[(df['Pclass'] == 1), 'Age'].mean()

In [None]:
# Mean age of the rows where Pclass == 2
# Visual Python: Data Analysis > Instance
df.loc[(df['Pclass'] == 2), 'Age'].mean()

In [None]:
# Mean age of the rows where Pclass == 3
# Visual Python: Data Analysis > Instance
df.loc[(df['Pclass'] == 3), 'Age'].mean()

#### 성별(Gender), 선실 등급(Pclass)별 나이 평균값

In [None]:
# Visual Python: Data Analysis > Instance
mean_m_1 = df.loc[(df['Gender'] == 'male')&(df['Pclass'] == 1), 'Age'].mean()
mean_m_1

In [None]:
# Visual Python: Data Analysis > Instance
mean_m_2 = df.loc[(df['Gender'] == 'male')&(df['Pclass'] == 2), 'Age'].mean()
mean_m_2

In [None]:
# Visual Python: Data Analysis > Instance
mean_m_3 = df.loc[(df['Gender'] == 'male')&(df['Pclass'] == 3), 'Age'].mean()
mean_m_3

In [None]:
# Visual Python: Data Analysis > Instance
mean_f_1 = df.loc[(df['Gender'] == 'female')&(df['Pclass'] == 1), 'Age'].mean()
mean_f_1

In [None]:
# Visual Python: Data Analysis > Instance
mean_f_2 = df.loc[(df['Gender'] == 'female')&(df['Pclass'] == 2), 'Age'].mean()
mean_f_2

In [None]:
# Visual Python: Data Analysis > Instance
mean_f_3 = df.loc[(df['Gender'] == 'female')&(df['Pclass'] == 3), 'Age'].mean()
mean_f_3

#### Age 결측치 채우기

In [None]:
# Visual Python: Data Analysis > Frame
# Fill each missing Age with the mean age of the row's (Gender, Pclass) group.
group_means = {
    ('male', 1): mean_m_1,
    ('male', 2): mean_m_2,
    ('male', 3): mean_m_3,
    ('female', 1): mean_f_1,
    ('female', 2): mean_f_2,
    ('female', 3): mean_f_3,
}
# The six groups are disjoint, so the missing-Age mask can be taken once up front.
age_missing = df['Age'].isnull()
for (gender, pclass), group_mean in group_means.items():
    in_group = (df['Gender'] == gender) & (df['Pclass'] == pclass)
    df.loc[age_missing & in_group, 'Age'] = group_mean
df

In [None]:
# Visual Python: Data Analysis > Data Info
pd.DataFrame({'Null Count': df.isnull().sum(), 'Non-Null Count': df.notnull().sum()})

#### 컬럼 추가: Fsize = SibSp + Parch + 1
#### 컬럼 삭제: SibSp, Parch

In [None]:
# Visual Python: Data Analysis > Frame
# Fsize = SibSp + Parch + 1.
# NOTE(review): the extra 1 presumably counts the passenger themself — confirm.
df['Fsize'] = df['SibSp'] + df['Parch'] + 1
# SibSp and Parch are removed in place now that they are combined into Fsize.
df.drop(['SibSp','Parch'], axis=1, inplace=True)
df

---

## 머신러닝: 생존자 예측

### Encoding: 범주형 변수

In [None]:
# Visual Python: Data Analysis > Frame
df = pd.get_dummies(data=df, columns=['Gender','Embarked'])
df

### 생존자 예측

In [None]:
# Visual Python: Machine Learning > Pipeline
# Train a random forest on the encoded features and report how well it
# predicts Survived on a held-out test split.

# [1] Data Split
from sklearn.model_selection import train_test_split

# A fixed seed makes the split (and every metric below) reproducible across
# runs; stratify keeps the Survived class ratio the same in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    df[['Pclass', 'Age', 'Fare', 'Fsize', 'Gender_female', 'Gender_male',
        'Embarked_C', 'Embarked_Q', 'Embarked_S']],
    df['Survived'],
    random_state=42,
    stratify=df['Survived'],
)

# [2] Classifier
from sklearn.ensemble import RandomForestClassifier

# Seed the forest as well: its bootstrap and feature sampling are random.
model = RandomForestClassifier(random_state=42)

# [3] Fit
model.fit(X_train, y_train)

# [4] Predict
pred = model.predict(X_test)

# [5] Evaluation
from sklearn import metrics
from IPython.display import display, Markdown
# Confusion Matrix
display(Markdown('### Confusion Matrix'))
# Name both axes so the table does not show the default 'col_0' header.
display(pd.crosstab(y_test, pred, rownames=['Actual'], colnames=['Predicted'], margins=True))
# Classification report
print(metrics.classification_report(y_test, pred))

In [None]:
# Visual Python: Machine Learning > Model Info
# Plot the ROC curve of the fitted classifier on the test split.
from sklearn import metrics

# roc_curve returns (false positive rate, true positive rate, thresholds).
# Column 1 of predict_proba is the probability of the second class in
# model.classes_ (Survived == 1 for this 0/1 target).
fpr, tpr, thresholds = metrics.roc_curve(y_test, model.predict_proba(X_test)[:, 1])
plt.plot(fpr, tpr, label='ROC Curve')
# x is the false positive rate and y the true positive rate; the previous
# 'Sensitivity' / 'Specificity' labels did not match the plotted data.
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
# Without legend() the label passed to plot() is never displayed.
plt.legend()
plt.show()

In [None]:
# Visual Python: Machine Learning > Model Info
def vp_create_feature_importances(model, X_train=None, sort=False):
    """Build a table of the model's feature importances.

    Args:
        model: Object exposing a ``feature_importances_`` sequence.
        X_train: Optional training data. When it is a DataFrame, its column
            labels are used as row labels; otherwise rows are labelled
            ``X0``, ``X1``, ...
        sort: When true, order rows by importance, largest first.

    Returns:
        DataFrame with ``Feature_importance`` and ``Percentage``
        (importance * 100) columns, both rounded to 2 decimals.
    """
    importances = model.feature_importances_
    if isinstance(X_train, pd.DataFrame):
        row_labels = X_train.columns
    else:
        row_labels = [f'X{pos}' for pos in range(len(importances))]

    table = pd.DataFrame(importances, index=row_labels, columns=['Feature_importance'])
    table['Percentage'] = table['Feature_importance'] * 100
    if sort:
        table = table.sort_values(by='Feature_importance', ascending=False)

    return table.round(2)
def vp_plot_feature_importances(model, X_train=None, sort=False, top_count=0):
    """Draw a horizontal bar chart of feature-importance percentages.

    Args:
        model: Passed through to ``vp_create_feature_importances``.
        X_train: Passed through to ``vp_create_feature_importances``.
        sort: When true, bars are plotted in ascending percentage order.
        top_count: When positive and ``sort`` is true, only the
            ``top_count`` largest percentages are plotted. Ignored when
            ``sort`` is false.
    """
    percentages = vp_create_feature_importances(model, X_train, sort)['Percentage']

    if sort:
        # Re-sort ascending for plotting; tail() then keeps the largest values.
        percentages = percentages.sort_values()
        if top_count > 0:
            percentages = percentages.tail(top_count)

    percentages.plot(kind='barh')
    plt.xlabel('Feature importance Percentage')
    plt.ylabel('Features')

    plt.show()

In [None]:
# Visual Python: Machine Learning > Model Info
vp_plot_feature_importances(model, X_train, sort=True)

---

In [None]:
# End of file