# 1. 모델 특징 및 사용 방법

In [54]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

# 분류
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# 성능 평가
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score

# 화면 옵션 설정
pd.options.display.max_rows = 500     # 출력할 max row를 지정
pd.options.display.max_columns = 20   # 출력할 max columns를 지정
pd.set_option('display.float_format', '{:.4f}'.format)

In [55]:
def modelTrain(model, df, y):
    # 1) X, Y 데이터 분리
    Y = df[y]
    X = df.drop(columns=y, axis=1)
    
    # 2) 학습, 검증 데이터로 분리
    X_train, X_test, y_train, y_test = train_test_split(X, Y,
                                                          test_size=0.3,
                                                          stratify=Y, # 분류 문제일 때
                                                          random_state=42)
    
    # 3) 분리된 데이터의 shape 출력
    print([x.shape for x in [X_train, X_test, y_train, y_test]])
    
    # 4) 모델 선택 및 학습
    model.fit(X_train, y_train)
    
    # 5) 성능 평가
    print('train accuracy:', model.score(X_train, y_train))
    print('test accuracy:', model.score(X_test, y_test))
    
    return model

In [56]:
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [58]:
df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [59]:
# 결측치 제거
df.dropna(axis=1, thresh=500, inplace=True)

# 결측치 대체
df['age'].fillna(df.groupby('class')['age'].transform('mean'), inplace=True)
df['embarked'].fillna(method='ffill', inplace=True)
df['embark_town'].fillna(method='ffill', inplace=True)

In [7]:
df.isnull().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64

In [8]:
df.dtypes

survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
embark_town      object
alive            object
alone              bool
dtype: object

In [9]:
df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'embark_town', 'alive',
       'alone'],
      dtype='object')

In [10]:
df['sex'] = df['sex'].astype('category').cat.codes
df['embarked'] = df['embarked'].astype('category').cat.codes
df['class'] = df['class'].astype('category').cat.codes
df['who'] = df['who'].astype('category').cat.codes
df['adult_male'] = df['adult_male'].astype('category').cat.codes
df['embark_town'] = df['embark_town'].astype('category').cat.codes
df['alive'] = df['alive'].astype('category').cat.codes
df['alone'] = df['alone'].astype('category').cat.codes

In [11]:
df.dtypes

survived         int64
pclass           int64
sex               int8
age            float64
sibsp            int64
parch            int64
fare           float64
embarked          int8
class             int8
who               int8
adult_male        int8
embark_town       int8
alive             int8
alone             int8
dtype: object

In [12]:
Y = df['survived']
X = df.drop(columns='survived', axis=1)

scaledX = StandardScaler().fit_transform(X)
scaledX = pd.DataFrame(scaledX, columns=X.columns)
df2 = pd.concat([scaledX, Y], axis=1)

## (1) LogisticRegression

- 독립 변수의 선형 결합을 이용하여 사건의 발생 가능성을 예측하는데 사용되는 통계 기법
- 반복하면서 기울기 값을 갱신, 기울기 미분값이 0이 되는 지점을 찾음
```python
LogisticRegression(penalty='l2', *, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)
```
- `max_iter`: 반복 횟수
- `tol`: 허용 오차(반복을 중단하는 조건으로 사용됨)
- `penalty`: penalty 종류
- `C`: penalty 세기

In [13]:
# help(LogisticRegression())

In [14]:
# 스케일링 이전
lr = LogisticRegression(max_iter=1000)
modelTrain(lr, df, 'survived')

[(623, 13), (268, 13), (623,), (268,)]
train accuracy: 1.0
test accuracy: 1.0


In [15]:
# 스케일링 이후
modelTrain(lr, df2, 'survived')

[(623, 13), (268, 13), (623,), (268,)]
train accuracy: 1.0
test accuracy: 1.0


## 2) KNeighborsClassifier

- k 개의 근접 이웃을 확인하여 클래스를 선택함
- `n_neighbors`를 변경하여 성능을 개선할 수 있음
- `n_neighbors`: 확인할 근접 이웃 개수

```python
KNeighborsClassifier(n_neighbors=5, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None)
```

In [16]:
# help(KNeighborsClassifier())

In [17]:
# 스케일링 이전
knn = KNeighborsClassifier()
modelTrain(knn, df, 'survived')

[(623, 13), (268, 13), (623,), (268,)]
train accuracy: 0.8443017656500803
test accuracy: 0.7201492537313433


In [18]:
# 스케일링 이후
modelTrain(knn, df2, 'survived')

[(623, 13), (268, 13), (623,), (268,)]
train accuracy: 0.985553772070626
test accuracy: 0.9850746268656716


## 3) DecisionTreeClassifier

- overfitting 경향이 있음
- max_depth를 줄여 overfitting을 해결할 수 있음
```python
DecisionTreeClassifier(*, criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0)
```

In [19]:
# help(DecisionTreeClassifier())

In [20]:
# 스케일링 이전
dt = DecisionTreeClassifier(max_depth=10)
modelTrain(dt, df, 'survived')

[(623, 13), (268, 13), (623,), (268,)]
train accuracy: 1.0
test accuracy: 1.0


In [21]:
# 스케일링 이후
modelTrain(dt, df2, 'survived')

[(623, 13), (268, 13), (623,), (268,)]
train accuracy: 1.0
test accuracy: 1.0


## 4) RandomForestClassifier

- `n_estimators`의 개수를 늘리거나, `max_depth`의 숫자를 조절하는 방법으로 성능 조절 가능
```python
RandomForestClassifier(n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)
```

In [22]:
# help(RandomForestClassifier())

In [23]:
# 스케일링 이전
rf = RandomForestClassifier()
modelTrain(rf, df, 'survived')

[(623, 13), (268, 13), (623,), (268,)]
train accuracy: 1.0
test accuracy: 1.0


In [24]:
# 스케일링 이후
modelTrain(rf, df2, 'survived')

[(623, 13), (268, 13), (623,), (268,)]
train accuracy: 1.0
test accuracy: 1.0


## 5) XGBClassifier

```python
XGBClassifier(*, objective: Union[str, Callable[[numpy.ndarray, numpy.ndarray], Tuple[numpy.ndarray, numpy.ndarray]], NoneType] = 'binary:logistic', use_label_encoder: Optional[bool] = None, **kwargs: Any) -> None
```

In [25]:
# help(XGBClassifier())

In [26]:
# 스케일링 이전
xgb = XGBClassifier()
modelTrain(xgb, df, 'survived')

[(623, 13), (268, 13), (623,), (268,)]
train accuracy: 1.0
test accuracy: 1.0


In [27]:
# 스케일링 이후
modelTrain(xgb, df2, 'survived')

[(623, 13), (268, 13), (623,), (268,)]
train accuracy: 1.0
test accuracy: 1.0


## 6) CatBoostClassifier

- **L2 규제**는 L1 규제와 비슷한 양상을 보이나, **규제가 강해져도 과소 적합이 심해지지 않는다**는 특성을 가지고 있음
- 규제를 통해서 기존 학습에 큰 영향을 끼칠 수 있는 데이터를 지양할 수 있음
- L2의 경우에는 가충치의 값을 이용하기 때문에 **이상치나 노이즈에 있는 데이터에 대한 학습을 진행할 때 좋음**
- **L1 규제** 경우에는 가중치의 크기에 상관없이 상수값을 빼는데 이는 중요한 가중치만을 취하기 때문에 **sparse feature에 대한 모델을 구성하는데 적합함**

In [28]:
# help(CatBoostClassifier())

In [29]:
# 스케일링 이전
cat = CatBoostClassifier(iterations=100)
modelTrain(cat, df, 'survived')

[(623, 13), (268, 13), (623,), (268,)]
Learning rate set to 0.069531
0:	learn: 0.6384808	total: 147ms	remaining: 14.6s
1:	learn: 0.5850242	total: 149ms	remaining: 7.3s
2:	learn: 0.5392936	total: 150ms	remaining: 4.86s
3:	learn: 0.4973411	total: 151ms	remaining: 3.63s
4:	learn: 0.4557481	total: 152ms	remaining: 2.88s
5:	learn: 0.4268693	total: 153ms	remaining: 2.4s
6:	learn: 0.3981397	total: 154ms	remaining: 2.05s
7:	learn: 0.3712550	total: 155ms	remaining: 1.79s
8:	learn: 0.3498387	total: 157ms	remaining: 1.59s
9:	learn: 0.3238504	total: 158ms	remaining: 1.42s
10:	learn: 0.3039245	total: 159ms	remaining: 1.29s
11:	learn: 0.2850301	total: 161ms	remaining: 1.18s
12:	learn: 0.2668929	total: 162ms	remaining: 1.09s
13:	learn: 0.2501948	total: 164ms	remaining: 1.01s
14:	learn: 0.2366875	total: 165ms	remaining: 935ms
15:	learn: 0.2227616	total: 167ms	remaining: 875ms
16:	learn: 0.2070625	total: 168ms	remaining: 818ms
17:	learn: 0.1924859	total: 168ms	remaining: 766ms
18:	learn: 0.1825139	tota

<catboost.core.CatBoostClassifier at 0x18c97bd44f0>

In [30]:
# 스케일링 이후
modelTrain(cat, df2, 'survived')

[(623, 13), (268, 13), (623,), (268,)]
Learning rate set to 0.069531
0:	learn: 0.6384808	total: 1.91ms	remaining: 189ms
1:	learn: 0.5850242	total: 3.33ms	remaining: 163ms
2:	learn: 0.5392936	total: 4.38ms	remaining: 142ms
3:	learn: 0.4973411	total: 5.25ms	remaining: 126ms
4:	learn: 0.4557481	total: 5.72ms	remaining: 109ms
5:	learn: 0.4268693	total: 7.05ms	remaining: 110ms
6:	learn: 0.3981397	total: 8.59ms	remaining: 114ms
7:	learn: 0.3712550	total: 9.57ms	remaining: 110ms
8:	learn: 0.3498387	total: 10.9ms	remaining: 110ms
9:	learn: 0.3238504	total: 11.6ms	remaining: 104ms
10:	learn: 0.3039245	total: 13ms	remaining: 105ms
11:	learn: 0.2850301	total: 14.7ms	remaining: 108ms
12:	learn: 0.2668929	total: 16.5ms	remaining: 111ms
13:	learn: 0.2501948	total: 18.3ms	remaining: 112ms
14:	learn: 0.2366875	total: 19.9ms	remaining: 113ms
15:	learn: 0.2227616	total: 21.3ms	remaining: 112ms
16:	learn: 0.2070625	total: 21.8ms	remaining: 107ms
17:	learn: 0.1924859	total: 22.3ms	remaining: 101ms
18:	lea

<catboost.core.CatBoostClassifier at 0x18c97bd44f0>

## 7) LGBMClassifier

In [31]:
# 스케일링 이전
lgbm = LGBMClassifier()
modelTrain(lgbm, df, 'survived')

[(623, 13), (268, 13), (623,), (268,)]
train accuracy: 1.0
test accuracy: 1.0


In [32]:
# 스케일링 이후
modelTrain(lgbm, df2, 'survived')

[(623, 13), (268, 13), (623,), (268,)]
train accuracy: 1.0
test accuracy: 1.0


# 2. 성능 평가

## 1) 이항 분류

```python
confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, normalize=None)
```

In [33]:
# help(confusion_matrix)

In [34]:
label = ['생존', '사망']

# XGBClassifier
# 스케일링 이전
print(f'accuracy: {xgb.score(X, Y)}')
y_pred = xgb.predict(X)
cm = confusion_matrix(Y, y_pred)
result = pd.DataFrame(cm, columns=label, index=label)
print(result)
print()

# 스케일링 이후
print(f'accuracy: {xgb.score(scaledX, Y)}')
y_pred = xgb.predict(scaledX)
cm = confusion_matrix(Y, y_pred)
result = pd.DataFrame(cm, columns=label, index=label)
print(result)

accuracy: 1.0
     생존   사망
생존  549    0
사망    0  342

accuracy: 1.0
     생존   사망
생존  549    0
사망    0  342


In [35]:
# KNN
# 스케일링 이전
print(f'accuracy: {knn.score(X, Y)}')
y_pred = knn.predict(X)
cm = confusion_matrix(Y, y_pred)
result = pd.DataFrame(cm, columns=label, index=label)
print(result)
print()

# 스케일링 이후
print(f'accuracy: {knn.score(scaledX, Y)}')
y_pred = knn.predict(scaledX)
cm = confusion_matrix(Y, y_pred)
result = pd.DataFrame(cm, columns=label, index=label)
print(result)

accuracy: 0.6374859708193041
     생존   사망
생존  319  230
사망   93  249

accuracy: 0.9854096520763187
     생존   사망
생존  541    8
사망    5  337


## 2) 다항 분류

- `accuracy_score(y_true, y_pred)`(TP/TN) / (TP+TN+FP+FN)
- `precision_score(y_true, y_pred)`: TP / (TP + FP)
- `recall_score(y_true, y_pred)`: TP / (TP + FN)
- `f1_score(y_true, y_pred)`: 2 * (Precison * Recall) / (Precision + Recall)
- precision_score, recall_score, f1_score의 `average parameter`에 `None`, `micro`, `macro`, `weighted`를 사용해야 함(class가 binary가 아닌 경우 필수)
  - `average=None`: 예측한 값 대비 실제 값
  - `average='micro'`: 전체 데이터 개수 중에서 맞춘 값
  - `average='macro'`: `average=None`일 때의 모든 값을 더해 class의 개수로 나눈 것
  - `average='weighted'`: `average=None`일 때의 모든 값에 각 class의 데이터 개수를 곱하여 구하고 전체 데이터 개수로 나눈 것

In [36]:
# 다항 분류를 위해 새롭게 iris dataset을 이용
df = sns.load_dataset('iris')
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [38]:
df['species'].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [39]:
df['species'] = df['species'].astype('category').cat.codes

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    int8   
dtypes: float64(4), int8(1)
memory usage: 5.0 KB


In [41]:
df.isnull().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [42]:
Y = df['species']
X = df.drop(columns='species', axis=1)

In [43]:
modelTrain(lr, df, 'species')

[(105, 4), (45, 4), (105,), (45,)]
train accuracy: 0.9714285714285714
test accuracy: 0.9333333333333333


In [44]:
modelTrain(knn, df, 'species')

[(105, 4), (45, 4), (105,), (45,)]
train accuracy: 0.9714285714285714
test accuracy: 0.9777777777777777


In [45]:
modelTrain(rf, df, 'species')

[(105, 4), (45, 4), (105,), (45,)]
train accuracy: 1.0
test accuracy: 0.9111111111111111


In [46]:
modelTrain(dt, df, 'species')

[(105, 4), (45, 4), (105,), (45,)]
train accuracy: 1.0
test accuracy: 0.9777777777777777


In [47]:
modelTrain(xgb, df, 'species')

[(105, 4), (45, 4), (105,), (45,)]
train accuracy: 1.0
test accuracy: 0.9333333333333333


In [48]:
modelTrain(cat, df, 'species')

[(105, 4), (45, 4), (105,), (45,)]
Learning rate set to 0.493633
0:	learn: 0.6522915	total: 1.07ms	remaining: 106ms
1:	learn: 0.4849216	total: 1.95ms	remaining: 95.8ms
2:	learn: 0.3557662	total: 2.93ms	remaining: 94.8ms
3:	learn: 0.2742744	total: 3.98ms	remaining: 95.6ms
4:	learn: 0.2176451	total: 4.77ms	remaining: 90.7ms
5:	learn: 0.1819968	total: 5.92ms	remaining: 92.7ms
6:	learn: 0.1580970	total: 6.65ms	remaining: 88.3ms
7:	learn: 0.1300213	total: 7.4ms	remaining: 85.1ms
8:	learn: 0.1147230	total: 8.58ms	remaining: 86.8ms
9:	learn: 0.1006030	total: 9.61ms	remaining: 86.5ms
10:	learn: 0.0937223	total: 10.4ms	remaining: 84.2ms
11:	learn: 0.0843322	total: 12.1ms	remaining: 88.6ms
12:	learn: 0.0762246	total: 12.9ms	remaining: 86.2ms
13:	learn: 0.0690354	total: 13.7ms	remaining: 84.3ms
14:	learn: 0.0628545	total: 14.9ms	remaining: 84.3ms
15:	learn: 0.0593997	total: 15.8ms	remaining: 82.9ms
16:	learn: 0.0555182	total: 16.6ms	remaining: 80.9ms
17:	learn: 0.0523483	total: 17.4ms	remaining: 

<catboost.core.CatBoostClassifier at 0x18c97bd44f0>

In [49]:
modelTrain(lgbm, df, 'species')

[(105, 4), (45, 4), (105,), (45,)]
train accuracy: 1.0
test accuracy: 0.8666666666666667


In [50]:
label = ['setosa', 'versicolor', 'virginica']
print(f'accuracy: {xgb.score(X, Y)}')
print()

y_pred = xgb.predict(X)
cm = confusion_matrix(Y, y_pred)
result = pd.DataFrame(cm, columns=label, index=label)
print(result)

accuracy: 0.98

            setosa  versicolor  virginica
setosa          50           0          0
versicolor       0          49          1
virginica        0           2         48


In [51]:
creport = classification_report(Y, y_pred)
print(creport)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       0.96      0.98      0.97        50
           2       0.98      0.96      0.97        50

    accuracy                           0.98       150
   macro avg       0.98      0.98      0.98       150
weighted avg       0.98      0.98      0.98       150



In [52]:
print('accuracy  : %.4f' % accuracy_score(Y, y_pred))
print(precision_score(Y, y_pred, average=None))  
print('precision (micro)   : %.4f' % precision_score(Y, y_pred, average='micro'))
print('precision (macro)   : %.4f' % precision_score(Y, y_pred, average='macro'))
print('precision (weighted): %.4f' % precision_score(Y, y_pred, average='weighted'))

accuracy  : 0.9800
[1.         0.96078431 0.97959184]
precision (micro)   : 0.9800
precision (macro)   : 0.9801
precision (weighted): 0.9801


# 3. 예측값 저장

In [53]:
submission = pd.DataFrame({'ID': X.index, 'survived': y_pred})
submission.to_csv('submission.csv', index=False)

# 참고

- [[Pandas] 22. Transform을 이용하여 그룹별 통계값으로 결측치 대체하기](https://zephyrus1111.tistory.com/155)