### 全華， 用機器學習掌握人工智慧， 徐聖訓
### https://drive.google.com/drive/folders/1gfItaJzpfBsRqdXKs-oiTVg3O4lzBXQY?usp=sharing

In [None]:
# Data-handling and plotting libraries used by the rest of the notebook.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline
# Select the 'DFKai-sb' font for sans-serif text (presumably so CJK text
# renders in figures; the font must be installed locally -- TODO confirm).
plt.rcParams['font.sans-serif'] = ['DFKai-sb'] 
# Draw minus signs with the ASCII hyphen rather than the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False
# IPython magic: request 'retina' (high-resolution) inline figures.
%config InlineBackend.figure_format = 'retina'
import warnings
# NOTE(review): this silences every warning, including library deprecation
# warnings that would otherwise flag outdated API usage further down.
warnings.filterwarnings('ignore')

# Load the dataset from the working directory and preview its first row.
df = pd.read_csv('titanic_train.csv')
df.head(1)

In [None]:
# Print the DataFrame summary (columns, non-null counts, dtypes).
df.info()

In [None]:
# Class balance of the target: absolute counts next to relative frequencies.
survived = df['Survived']
survived_counts = survived.value_counts()
survived_ratio = survived.value_counts(normalize=True)
pd.concat([survived_counts, survived_ratio], axis=1, keys=['個數', '百分比'])

In [None]:
# Remove the columns that are not used as features, then preview the result.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
df = df.drop(columns=unused_columns)
df.head()

In [None]:
# Number of missing values per column (isna is the same check as isnull).
df.isna().sum()

In [None]:
# Pairwise plots of the columns of df, coloured by the Survived label.
# `height` is the current name of the old `size` keyword (renamed in
# seaborn 0.9 and no longer accepted by recent releases), and `bw_method`
# replaces the deprecated `bw` bandwidth option of the diagonal KDE plots.
# The trailing semicolon suppresses the textual repr of the returned grid.
sns.pairplot(data=df, hue='Survived',
             height=2, diag_kws={'bw_method': 0.1});

In [None]:
# Count survivors / non-survivors within each sex and pivot the Survived
# level into columns so every sex gets one bar per outcome.
survival_by_sex = df.groupby('Sex')['Survived'].value_counts().unstack(1)
survival_by_sex.plot(kind='bar', figsize=(5, 3));

In [None]:
# Same comparison drawn with seaborn: one bar per (Sex, Survived) pair,
# with the female group drawn first.
sex_order = ['female', 'male']
sns.countplot(data=df, x='Sex', hue='Survived', order=sex_order);

In [None]:
# Overlaid, semi-transparent Age histograms, one per Survived group.
hist_options = dict(kind='hist', alpha=0.6, bins=30, legend=True)
df.groupby('Survived')['Age'].plot(**hist_options);

In [None]:
# Target column, followed by the feature columns grouped by the kind of
# preprocessing they receive (numeric vs. categorical).
y_col = 'Survived'
X_col_num = ['Age', 'SibSp', 'Parch', 'Fare']
X_col_cat = ['Pclass', 'Sex', 'Embarked']
X_cols = [*X_col_num, *X_col_cat]

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Numeric pipeline: fill missing values with the column median, then
# standardize each column.
median_imputer = SimpleImputer(strategy='median')
num_pl = make_pipeline(median_imputer, StandardScaler())

# Sanity-check the numeric pipeline on the numeric columns (first 3 rows).
print(f'數值型資料的欄位有：{X_col_num}')
num_preview = num_pl.fit_transform(df[X_col_num])
num_preview[:3]

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Dense one-hot encoder. scikit-learn 1.2 renamed the `sparse` keyword to
# `sparse_output` and 1.4 dropped the old name, so try the current keyword
# first and fall back to the legacy one on older installations.
try:
    dense_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    dense_encoder = OneHotEncoder(sparse=False)

# Categorical pipeline: fill missing values with the most frequent category,
# then one-hot encode into a dense array.
cat_pl = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    dense_encoder
)
# Sanity-check the categorical pipeline (first 3 encoded rows).
cat_pl.fit_transform(df[X_col_cat])[:3]

In [None]:
# Fetch the fitted encoder step and ask it for the generated column names.
oh = cat_pl.named_steps['onehotencoder']
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back only when the
# installed version predates the new method.
if hasattr(oh, 'get_feature_names_out'):
    oh_cols = oh.get_feature_names_out(X_col_cat)
else:
    oh_cols = oh.get_feature_names(X_col_cat)
oh_cols

In [None]:
# Show the encoded categorical features as a labelled table (first rows).
encoded_cat = cat_pl.fit_transform(df[X_col_cat])
pd.DataFrame(encoded_cat, columns=oh_cols).head()

In [None]:
from sklearn.compose import ColumnTransformer

# Route numeric columns through num_pl and categorical columns through
# cat_pl; the outputs are concatenated in this order.
column_routes = [
    ('num_pl', num_pl, X_col_num),
    ('cat_pl', cat_pl, X_col_cat),
]
data_pl = ColumnTransformer(column_routes)

# Preview the first fully transformed row, rounded to 2 decimals.
transformed = data_pl.fit_transform(df[X_cols])
transformed[:1].round(2)

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix and target vector.
X, y = df[X_cols], df[y_col]
# Hold out 33% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.33,
    random_state=42,
)

In [None]:
from sklearn.svm import SVC

# Full model: preprocessing followed by a support vector classifier with
# default settings. The last line displays the pipeline structure.
svc_classifier = SVC()
model_pl_svc = make_pipeline(data_pl, svc_classifier)
model_pl_svc

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Train on the training split and predict the held-out rows.
model_pl_svc.fit(X_train, y_train)
y_pred = model_pl_svc.predict(X_test)
# Use the built-in round(): it accepts both NumPy scalars and plain Python
# floats, whereas `.round(2)` only exists on NumPy scalars and fails when
# accuracy_score returns a plain float (which newer scikit-learn releases
# appear to do -- confirm against the installed version).
print('正確率：', round(accuracy_score(y_test, y_pred), 2))
print('混亂矩陣')
print(confusion_matrix(y_test, y_pred))
print('綜合報告')
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.linear_model import LogisticRegression

# Same preprocessing, but with logistic regression as the classifier.
model_pl_lr = make_pipeline(data_pl, LogisticRegression())
model_pl_lr.fit(X_train, y_train)
y_pred = model_pl_lr.predict(X_test)
# Built-in round() works for NumPy scalars and plain Python floats alike;
# `.round(2)` is only available on NumPy scalars.
print('正確率：', round(accuracy_score(y_test, y_pred), 2))
print('混亂矩陣')
print(confusion_matrix(y_test, y_pred))
print('綜合報告')
print(classification_report(y_test, y_pred))

In [None]:
# Variant: send Pclass through the numeric pipeline (impute + scale)
# instead of one-hot encoding it, and re-evaluate the SVC.
data_pl = ColumnTransformer([
    ('num_pl', num_pl, ['Age', 'SibSp', 'Parch', 'Fare', 'Pclass']),
    ('cat_pl', cat_pl, ['Sex', 'Embarked'])
])
model_pl_svc = make_pipeline(data_pl, SVC())
model_pl_svc.fit(X_train, y_train)
y_pred = model_pl_svc.predict(X_test)
# Built-in round() works for NumPy scalars and plain Python floats alike;
# `.round(2)` is only available on NumPy scalars.
print('正確率：', round(accuracy_score(y_test, y_pred), 2))
print('混亂矩陣')
print(confusion_matrix(y_test, y_pred))

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Restore the original column routing (Pclass treated as categorical).
data_pl = ColumnTransformer([
    ('num_pl', num_pl, X_col_num),
    ('cat_pl', cat_pl, X_col_cat)
])
# Keep only the 3 transformed features with the highest ANOVA F-score
# before fitting the SVC. `k` must be passed by keyword: scikit-learn made
# it keyword-only (positional use was deprecated in 0.23, an error since 1.0).
model_pl_svc = make_pipeline(data_pl,
                             SelectKBest(f_classif, k=3),
                             SVC())
model_pl_svc.fit(X_train, y_train)
y_pred = model_pl_svc.predict(X_test)
# Built-in round() works for NumPy scalars and plain Python floats alike;
# `.round(2)` is only available on NumPy scalars.
print('正確率：', round(accuracy_score(y_test, y_pred), 2))
print('混亂矩陣')
print(confusion_matrix(y_test, y_pred))

In [None]:
# Grab the fitted selector step from the pipeline.
selector = model_pl_svc.named_steps['selectkbest']
# All transformed column names: numeric columns first, then the one-hot
# encoded columns, matching the ColumnTransformer output order.
cols = X_col_num + oh_cols.tolist()
# Convert the names to an array so the boolean support mask can pick out
# the columns the selector kept.
kept_mask = selector.get_support()
np.array(cols)[kept_mask]

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

# Column groups: scaled numerics, discretized counts, one-hot categoricals.
X_col_num = ['Fare', 'Age']
X_col_bin = ['SibSp', 'Parch']
X_col_cat = ['Pclass', 'Sex', 'Embarked']

# Per-group preprocessing pipelines.
num_pl = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler()
)
bin_pl = make_pipeline(
    SimpleImputer(strategy='mean'),
    KBinsDiscretizer(n_bins=5, encode='ordinal'),
)
cat_pl = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='missing'),
    # The encoder is fitted on the training split only. The imputer above
    # turns missing values into a 'missing' category, which may be absent
    # from the training rows; handle_unknown='ignore' encodes such unseen
    # categories as all zeros instead of raising during predict().
    OneHotEncoder(handle_unknown='ignore')
)

# Combined preprocessing step.
data_pl = ColumnTransformer([
    ('num', num_pl, X_col_num),
    ('bin', bin_pl, X_col_bin),
    ('cat', cat_pl, X_col_cat)
])

# Fit the full model and evaluate it on the held-out split.
model_pl = make_pipeline(data_pl, SVC())
model_pl.fit(X_train, y_train)
y_pred = model_pl.predict(X_test)
print(confusion_matrix(y_test, y_pred))
# Built-in round() works for NumPy scalars and plain Python floats alike;
# `.round(2)` is only available on NumPy scalars.
print('整體正確率:', round(accuracy_score(y_test, y_pred), 2))