<a href="https://colab.research.google.com/github/jooeun921/Big-Data-Analyst/blob/main/Part03_Section_03_scikit_learn_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

### Section 03 학습 : scikit-learn을 활용한 분류 모델 적합

### 이진분류 지표
Confusion-matrix, 평가지표(Accuracy, Precision, Recall, F1 score), ROC curve 등이 있음.

#### confusion-matrix , 평가지표

In [None]:
from sklearn.datasets import load_breast_cancer
# Load scikit-learn's bundled breast-cancer dataset (binary target).
data = load_breast_cancer()
X, y = data.data, data.target

# Put the features and the label in one frame so they can be split together later.
df = pd.DataFrame(X, columns = data.feature_names).assign(target = y)

print(df.head())

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns = 'target'),
    df['target'],
    test_size = 0.3,
    random_state = 42,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# A generous iteration cap lets the solver converge on the unscaled features.
model = LogisticRegression(random_state = 0, max_iter = 10000)
model.fit(X = X_train, y = y_train)

In [None]:
# Class-membership probabilities: one column per class, one row per test sample.
y_prob_org = model.predict_proba(X_test)
print(pd.DataFrame(np.round(y_prob_org[:4], 3)))

In [None]:
# Hard class predictions for the held-out rows.
y_pred = model.predict(X_test)
print(pd.DataFrame({'pred': y_pred}).head())

In [None]:
# Rebuild the predictions by hand: label 1 whenever the positive-class probability reaches 0.5.
y_pred_ths = np.where(model.predict_proba(X_test)[:, 1] >= 0.5, 1, 0)

In [None]:
# Check that the manual 0.5-threshold labels match what model.predict() returned.
print("값이 같은지 확인 : ", np.array_equal(y_pred, y_pred_ths))

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)
isp = ConfusionMatrixDisplay(cm)
isp.plot(cmap = "Blues")
plt.show()

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 plus the averaged rows, as plain text.
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the positive class (label 1) with scikit-learn's default binary averaging.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print('\n'.join([
    f'Accuracy : {accuracy:.2f}',
    f'Precision : {precision:.2f}',
    f'Recall : {recall:.2f}',
    f'F1 : {f1:.2f}',
]))

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Same metrics with the positive class spelled out explicitly.
# All three use binary averaging: pos_label only takes effect when average='binary'
# (the default), so mixing in average='macro' for precision alone would silently
# ignore pos_label and make the three numbers incomparable.
precision2 = precision_score(y_test, y_pred, pos_label = 1)
recall2 = recall_score(y_test, y_pred, pos_label = 1)
f12 = f1_score(y_test, y_pred, pos_label = 1)

print(f'Precision : {precision2:.2f}')
print(f'Recall : {recall2:.2f}')
print(f'F1 : {f12:.2f}')

In [None]:
from sklearn.metrics import confusion_matrix

# Re-label the test set with a lenient (0.1) and a strict (0.9) probability cutoff.
y_pred_ths1 = np.where(model.predict_proba(X_test)[:, 1] > 0.1, 1, 0)
y_pred_ths2 = np.where(model.predict_proba(X_test)[:, 1] > 0.9, 1, 0)

# One confusion matrix per cutoff.
cm1, cm2 = (confusion_matrix(y_test, preds) for preds in (y_pred_ths1, y_pred_ths2))

print("임계값이 0.1일 때 : \n", cm1)
print("임계값이 0.9일 때 : \n", cm2)


#### ROC curve

In [None]:
from sklearn.metrics import roc_auc_score

# AUC is computed from the positive-class probabilities (second column), not from hard labels.
y_prob = model.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_true = y_test, y_score = y_prob)

print(f"AUC score : {auc_score: f}")

In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Switch to the three-class iris dataset for the multiclass examples.
iris = datasets.load_iris()
X, y = iris.data, iris.target

In [None]:
from sklearn.linear_model import LogisticRegression
# 70/30 split of the iris arrays with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.3,
    random_state = 1,
)

In [None]:
# Rebind `model` to a default-configured logistic regression fitted on the iris training split.
model = LogisticRegression()
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix

# Predict the iris test rows and tabulate true (rows) against predicted (columns) classes.
y_pred = model.predict(X_test)
conf_matrix = confusion_matrix(y_true = y_test, y_pred = y_pred)

print(conf_matrix)

In [None]:
from sklearn.metrics import classification_report

# Text report with per-class precision / recall / F1 for the iris predictions.
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Macro averaging: compute each metric per class, then take the unweighted mean.
accuracy = accuracy_score(y_true = y_test, y_pred = y_pred)
precision_macro, recall_macro, f1_macro = (
    metric(y_test, y_pred, average = 'macro')
    for metric in (precision_score, recall_score, f1_score)
)

print('\n'.join([
    f'Accuracy : {accuracy:.2f}',
    f'Precision : {precision_macro:.2f}',
    f'Recall : {recall_macro:.2f}',
    f'F1 : {f1_macro:.2f}',
]))

In [None]:
from sklearn.metrics import roc_auc_score

# Multiclass AUC needs the full probability matrix (one column per class).
y_prob = model.predict_proba(X_test)
auc = roc_auc_score(
    y_true = y_test,
    y_score = y_prob,
    average = 'macro',
    multi_class = 'ovr',
)
print(f"AUC Score (One-vs-Rest, Macro Average) : {auc:.4f}")

### 분류 알고리즘

In [None]:
# Download the pre-split training and test CSVs.
train, test = map(pd.read_csv, (
    'https://raw.githubusercontent.com/YoungjinBD/data/main/s13_train.csv',
    'https://raw.githubusercontent.com/YoungjinBD/data/main/s13_test.csv',
))
print(train.head())

In [None]:
# Separate the 'diagnosis' label from the feature columns in both splits.
train_X, train_y = train.drop(columns = ['diagnosis']), train['diagnosis']
test_X, test_y = test.drop(columns = ['diagnosis']), test['diagnosis']
print(train_y.head(3))

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Baseline k-nearest-neighbours classifier with scikit-learn's default settings.
# Ending the cell on fit() displays the fitted estimator itself.
model = KNeighborsClassifier()
model.fit(train_X, train_y)

In [None]:
from sklearn.metrics import classification_report

# Evaluate the baseline k-NN model on the held-out file.
y_pred = model.predict(test_X)
print(classification_report(y_true = test_y, y_pred = y_pred))

In [None]:
from sklearn.metrics import f1_score
# The labels are strings, so the positive class ('A') has to be named explicitly.
f1 = f1_score(y_true = test_y, y_pred = y_pred, pos_label = 'A')
print(f'Test set F1 score: {f1:.2f}')

In [None]:
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV

# Candidate neighbourhood sizes to compare with 3-fold cross-validation.
param_grid = {'n_neighbors': [3, 5, 7, 9, 11]}

# The built-in 'f1' scorer assumes the positive class is labelled 1. The labels here
# are the strings 'A'/'B', so that scorer fails on every fold and all scores become NaN.
# Build a scorer that names the positive class explicitly instead.
f1_scorer = make_scorer(f1_score, pos_label = 'A')

grid_search = GridSearchCV(model,
  param_grid,
  cv = 3,
  scoring = f1_scorer)
grid_search.fit(train_X, train_y)

In [None]:
# Report the selected parameters, their cross-validation score, and the full per-candidate results table.
print(f'Best parameters found: {grid_search.best_params_}')
print(f'Best cross-validation F1 score: {grid_search.best_score_:.2f}')
print(pd.DataFrame(grid_search.cv_results_))

In [None]:
# Explicit numeric encoding of the string labels: 'A' -> 1 (positive), 'B' -> 0.
train_y2, test_y2 = (
    target.map({'A': 1, 'B': 0})
    for target in (train_y, test_y)
)

In [None]:
from sklearn.preprocessing import LabelEncoder
labels = ['A', 'B']

# Label encoding: fit the encoder on the labels, then convert them to integer codes.
labelencoder = LabelEncoder()
encoded_labels = labelencoder.fit(labels).transform(labels)

print(
    f'Original labels: {labels}',
    f'Encoded labels: {encoded_labels}',
    f'Classes: {labelencoder.classes_}',
    sep = '\n',
)

In [None]:
# Replace `train` / `test` with the Wisconsin dataset used by the remaining model sections.
train, test = map(pd.read_csv, (
    'https://raw.githubusercontent.com/YoungjinBD/data/main/wisconsin_train.csv',
    'https://raw.githubusercontent.com/YoungjinBD/data/main/wisconsin_test.csv',
))

In [None]:
# Features are every column except 'diagnosis'; 'diagnosis' is the label.
train_X, train_y = train.drop(columns = ['diagnosis']), train['diagnosis']
test_X, test_y = test.drop(columns = ['diagnosis']), test['diagnosis']

In [None]:
from sklearn.preprocessing import LabelEncoder
# Learn the label -> integer mapping from the training labels only, then apply it to both splits.
labelencoder = LabelEncoder().fit(train_y)
train_y, test_y = labelencoder.transform(train_y), labelencoder.transform(test_y)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV

# Numeric feature columns: standardise, then keep enough principal components
# to explain 80% of the variance.
num_columns = list(train_X.select_dtypes('number').columns)
num_preprocess = Pipeline([
    ("standardscaler", StandardScaler()),
    ("pca", PCA(n_components = 0.8, svd_solver = 'full')),
])

# Shared preprocessing step reused by every model pipeline below.
preprocess = ColumnTransformer(transformers = [("num", num_preprocess, num_columns)])

#### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Preprocessing followed by a k-nearest-neighbours classifier.
full_pipe = Pipeline(steps = [
    ("preprocess", preprocess),
    ("classifier", KNeighborsClassifier()),
])

In [None]:
# List the tunable hyperparameters (and their defaults) of the k-NN classifier.
KNeighborsClassifier().get_params()

In [None]:
# Try n_neighbors = 5..9; the 'classifier__' prefix routes the parameter to that pipeline step.
knn_param = {'classifier__n_neighbors': np.arange(5, 10)}

# 3-fold cross-validation, selecting on macro-averaged F1.
knn_search = GridSearchCV(full_pipe, knn_param, scoring = 'f1_macro', cv = 3)
knn_search.fit(train_X, train_y)

In [None]:
# Best n_neighbors found by the grid search and its mean cross-validated macro-F1.
print('Best 파라미터 조합:', knn_search.best_params_)
print('교차검증 f1 스코어:', knn_search.best_score_)

In [None]:
from sklearn.metrics import f1_score
# Predict with the refitted best pipeline and score it with the same metric
# (macro-averaged F1) that the grid search optimised, so that the CV and test
# numbers are directly comparable.
knn_pred = knn_search.predict(test_X)
print('테스트 f1-score:', f1_score(test_y, knn_pred, average = 'macro'))

#### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Preprocessing followed by a decision tree.
full_pipe = Pipeline(steps = [
    ("preprocess", preprocess),
    ("classifier", DecisionTreeClassifier()),
])

In [None]:
# List the tunable hyperparameters (and their defaults) of the decision tree.
DecisionTreeClassifier( ).get_params( )

In [None]:
# Grid over the cost-complexity pruning strength, starting at 0.01 in steps of 0.05.
decisiontree_param = {'classifier__ccp_alpha': np.arange(0.01, 0.3, 0.05)}

# 5-fold cross-validation, selecting on ROC AUC.
decisiontree_search = GridSearchCV(
    full_pipe,
    decisiontree_param,
    scoring = 'roc_auc',
    cv = 5,
)
decisiontree_search.fit(train_X, train_y)

In [None]:
# Best ccp_alpha found by the grid search and its mean cross-validated ROC AUC.
print('Best 파라미터 조합:', decisiontree_search.best_params_)
print('교차검증 AUC:', decisiontree_search.best_score_)

In [None]:
from sklearn.metrics import roc_auc_score
# Test-set AUC from the positive-class probabilities of the best pipeline.
y_prob = decisiontree_search.predict_proba(test_X)[:, 1]
auc_score = roc_auc_score(y_true = test_y, y_score = y_prob)
print(f"AUC score: {auc_score:f}")

#### Ensemble

##### Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier
# Preprocessing followed by a bagging ensemble with default settings.
full_pipe = Pipeline(steps = [
    ("preprocess", preprocess),
    ("classifier", BaggingClassifier()),
])

In [None]:
# List the tunable hyperparameters (and their defaults) of the bagging classifier.
BaggingClassifier( ).get_params( )

In [None]:
# Ensemble sizes 10, 30, 50, 70, 90.
Bagging_param = {'classifier__n_estimators': np.arange(10, 100, 20)}

# 5-fold cross-validation, selecting on macro-averaged F1.
Bagging_search = GridSearchCV(full_pipe, Bagging_param, scoring = 'f1_macro', cv = 5)
Bagging_search.fit(train_X, train_y)

In [None]:
# Best n_estimators found by the grid search and its mean cross-validated macro-F1.
print('Best 파라미터 조합:', Bagging_search.best_params_)
print('교차검증 f1 score:', Bagging_search.best_score_)

In [None]:
from sklearn.metrics import f1_score
# Score the tuned bagging pipeline with the same metric (macro-averaged F1) that the
# grid search optimised, so that the CV and test numbers are directly comparable.
bag_pred = Bagging_search.predict(test_X)
print('테스트 f1 score:', f1_score(test_y, bag_pred, average = 'macro'))

##### RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Preprocessing followed by a random forest with default settings.
full_pipe = Pipeline(steps = [
    ("preprocess", preprocess),
    ("classifier", RandomForestClassifier()),
])

In [None]:
# List the tunable hyperparameters (and their defaults) of the random forest.
RandomForestClassifier().get_params()

In [None]:
# Forest sizes 100, 200, 300, 400.
RandomForest_param = {'classifier__n_estimators': np.arange(100, 500, 100)}

# 3-fold cross-validation, selecting on accuracy.
RandomForest_search = GridSearchCV(
    full_pipe,
    RandomForest_param,
    scoring = 'accuracy',
    cv = 3,
)
RandomForest_search.fit(train_X, train_y)

In [None]:
# Best n_estimators found by the grid search and its mean cross-validated accuracy.
print('Best 파라미터 조합:', RandomForest_search.best_params_)
print('교차검증 accuracy score:', RandomForest_search.best_score_)

In [None]:
from sklearn.metrics import accuracy_score
# Predict with the tuned random-forest pipeline. The earlier code called
# Bagging_search here (a copy-paste slip), which reported the bagging model's accuracy.
rf_pred = RandomForest_search.predict(test_X)
print('테스트 accuracy score :', accuracy_score(test_y, rf_pred))

##### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Preprocessing followed by gradient boosting with default settings.
full_pipe = Pipeline(steps = [
    ("preprocess", preprocess),
    ("classifier", GradientBoostingClassifier()),
])

In [None]:
# List the tunable hyperparameters (and their defaults) of the gradient-boosting classifier.
GradientBoostingClassifier().get_params()

In [None]:
# Learning rates from 0.1 upward in steps of 0.05, stopping before 0.3.
GradientBoosting_param = {'classifier__learning_rate': np.arange(0.1, 0.3, 0.05)}

# 5-fold cross-validation, selecting on macro-averaged F1.
GradientBoosting_search = GridSearchCV(
    full_pipe,
    GradientBoosting_param,
    scoring = 'f1_macro',
    cv = 5,
)
GradientBoosting_search.fit(train_X, train_y)

In [None]:
# Best learning_rate found by the grid search and its mean cross-validated macro-F1.
print('Best 파라미터 조합:', GradientBoosting_search.best_params_)
print('교차검증 f1 score:', GradientBoosting_search.best_score_)

In [None]:
from sklearn.metrics import f1_score
# Predict with the tuned gradient-boosting pipeline. The earlier code called
# Bagging_search here (a copy-paste slip), which reported the bagging model's score.
# Macro-averaged F1 matches the metric the grid search optimised.
gb_pred = GradientBoosting_search.predict(test_X)
print('테스트 f1 score:', f1_score(test_y, gb_pred, average = 'macro'))

#### SVM(Support Vector Machine)

In [None]:
from sklearn.svm import SVC
# Preprocessing followed by an SVM; probability=True enables predict_proba for the AUC below.
# NOTE(review): the step is named "regressor" although it holds a classifier. The parameter
# grid addresses it as 'regressor__C', so any rename has to change both places together.
full_pipe = Pipeline(steps = [
    ("preprocess", preprocess),
    ("regressor", SVC(probability = True)),
])

In [None]:
# List the tunable hyperparameters of the SVM classifier as configured here.
SVC(probability = True).get_params( )

In [None]:
# Regularisation strengths C = 1, 21, 41, 61, 81 for the pipeline step named "regressor".
SVC_param = {'regressor__C': np.arange(1, 100, 20)}

# 3-fold cross-validation, selecting on ROC AUC.
SVC_search = GridSearchCV(full_pipe, SVC_param, scoring = 'roc_auc', cv = 3)
SVC_search.fit(train_X, train_y)

In [None]:
# Best C found by the grid search and its mean cross-validated ROC AUC.
print('Best 파라미터 조합:', SVC_search.best_params_)
print('교차검증 AUC score:', SVC_search.best_score_)

In [None]:
from sklearn.metrics import roc_auc_score
# Test-set AUC from the positive-class probabilities of the best SVM pipeline.
y_prob = SVC_search.predict_proba(test_X)[:, 1]
auc_score = roc_auc_score(y_true = test_y, y_score = y_prob)
print(f"AUC score: {auc_score:f}")

### 모범답안 작성 예시

In [None]:
from sklearn.datasets import load_breast_cancer
# Reload the breast-cancer dataset for the worked model answer.
data = load_breast_cancer()
X, y = data.data, data.target

# Features and label together in one frame.
df = pd.DataFrame(X, columns = data.feature_names).assign(target = y)

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns = 'target'),
    df['target'],
    test_size = 0.2,
    random_state = 0,
)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer

# Standardise every feature column; set_output keeps the result as a DataFrame.
columns = X_train.columns
transformer = make_column_transformer((StandardScaler(), columns))
transformer.set_output(transform = 'pandas')

# Fit the scaler on the training rows only, then apply the same scaling to the test rows.
X_train_pre = transformer.fit_transform(X_train)
X_test_pre = transformer.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seeded random forest for reproducible results.
# NOTE(review): this fits on the raw X_train; the scaled X_train_pre built in the
# previous cell is not used here.
randf_classifier = RandomForestClassifier(random_state = 0)
randf_classifier.fit(X = X_train, y = y_train)

In [None]:
# Predict the held-out rows and write them out as a single 'pred' column.
y_pred = randf_classifier.predict(X_test)

pd.DataFrame(y_pred, columns = ['pred']).to_csv("result.csv", index = False)

In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC ranks samples by score, so feed it the positive-class probability.
# Hard 0/1 predictions collapse the ROC curve to a single operating point
# and generally understate the AUC.
y_prob = randf_classifier.predict_proba(X_test)[:, 1]
score = roc_auc_score(y_test, y_prob)
print(score)

### Section 03 연습문제 : scikit-learn을 활용한 분류 모델 적합

다음 학습용 데이터(mroz_train.csv)는 미국 기혼 여성의 노동시장 참여 여부, 자녀 수, 교육 수준, 예상 시급 및 가구 소득 등을 조사한 자료이다.

| 변수명 | 설명 |
|:--|:--|
| lfp | 노동시장 참여 여부(범주형: no - 참여하지 않음, yes - 참여함, 예측 대상 변수) |
| k5 | 5세 이하 자녀 수 |
| k618 | 6세 ~ 18세 자녀 수 |
| age | 여성의 나이 (단위: 세) |
| wc | 아내의 대학교 졸업 여부(범주형: no - 미졸업, yes - 졸업) |
| hc | 남편의 대학교 졸업 여부(범주형: no - 미졸업, yes - 졸업) |
| lwg | 여성의 예상 시급 로그값 |
| inc | 가구 소득 (가구 총소득에서 아내 소득을 제외한 금액) |

```
import pandas as pd
import numpy as np
train = pd.read_csv("https://raw.githubusercontent.com/YoungjinBD/data/main/mroz_train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/YoungjinBD/data/main/mroz_test.csv")
print(train.head())
```

학습용 데이터를 활용하여 노동 시장 참여 여부(lfp)를 예측하는 모델을 개발하고, 이 중 가장 우수한 모델을 평가용 데이터(mroz_test.csv)에 적용하여 노동시장 참여 여부(lfp)를 예측하시오.

※ 예측 결과는 f1-macro 평가지표에 따라 평가함.

제출형식
- csv 파일명 : result.csv (파일명에 디렉토리, 폴더 지정 불가)
- 예측 칼럼명 : pred
- 제출 칼럼 개수 : pred 칼럼 1개
- 평가용 데이터 개수와 예측 결과 데이터 개수 일치

In [None]:
import pandas as pd
import numpy as np
# Download the exercise's training and evaluation CSVs.
train, test = map(pd.read_csv, (
    "https://raw.githubusercontent.com/YoungjinBD/data/main/mroz_train.csv",
    "https://raw.githubusercontent.com/YoungjinBD/data/main/mroz_test.csv",
))

In [None]:
# Column dtypes and non-null counts, to see which columns are numeric and which are categorical.
train.info()

In [None]:
# Peek at the first three rows.
train.head(3)

In [None]:
# 'lfp' (labour-force participation) is the label; every other column is a feature.
train_X, train_y = train.drop(columns = 'lfp'), train['lfp']
test_X, test_y = test.drop(columns = 'lfp'), test['lfp']

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Split the feature columns by dtype so each group gets its own preprocessing.
num_column = train_X.select_dtypes('number').columns
cat_column = train_X.select_dtypes('object').columns

# Numeric columns are standardised; categorical columns are one-hot encoded,
# with categories unseen during fit ignored at transform time.
scaler_spec = (StandardScaler(), num_column)
onehot_spec = (OneHotEncoder(handle_unknown = 'ignore', sparse_output = False), cat_column)
transformer = make_column_transformer(scaler_spec, onehot_spec)
transformer.set_output(transform = 'pandas')

# Fit on the training rows only, then reuse the fitted transformer on the test rows.
train_X_pre = transformer.fit_transform(train_X)
test_X_pre = transformer.transform(test_X)

In [None]:
# Encode the target explicitly: 'no' -> 0, 'yes' -> 1.
# Read from the untouched raw frames rather than from train_y / test_y themselves, so
# re-running this cell is idempotent. Mapping already-encoded 0/1 values through this
# dict would turn every label into NaN.
train_y = train['lfp'].map({'no' : 0, 'yes' : 1})
test_y = test['lfp'].map({'no' : 0, 'yes' : 1})

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seeded random forest trained on the preprocessed features.
rf_classifier = RandomForestClassifier(random_state = 0)
rf_classifier.fit(X = train_X_pre, y = train_y)

In [None]:
from sklearn.metrics import f1_score

# Score with macro-averaged F1, the metric named in the task statement.
y_pred = rf_classifier.predict(test_X_pre)
print(f1_score(y_true = test_y, y_pred = y_pred, average = 'macro'))

# Save the numeric predictions as a single 'pred' column.
pd.DataFrame(y_pred, columns = ['pred']).to_csv('result.csv', index = False)

In [None]:
# Convert the numeric predictions back to the original 'no' / 'yes' categories.
# predict() returns an ndarray, which has no .map, so wrap it in a Series first.
# Predicting afresh here (instead of remapping y_pred in place) keeps the cell
# idempotent: re-running it never maps already-decoded strings to NaN.
label = {0 : 'no', 1 : 'yes'}
y_pred = pd.Series(rf_classifier.predict(test_X_pre)).map(label)
print(y_pred)

In [None]:
# Write the submission file: a single 'pred' column, no index column.
pd.DataFrame({'pred' : y_pred}).to_csv('result.csv', index = False)

### 분류 모델 관련
y(종속변수)가 범주형이라면, map을 활용해서 직접 인코딩을 해준다. (encoder를 사용하면 어떤 범주가 어떤 숫자로 매핑될지 직접 지정할 수 없어서 결과에 문제가 생길 수 있음)   
그리고 예측 결과를 원래 범주형 형태로 되돌려야 한다면 역시 map을 사용한다. 단, predict 결과는 ndarray 타입이기 때문에, `pd.Series(pred)`로 타입 변환을 해야 map 함수를 사용할 수 있음.

> **그리고 오타 주의❗❗❗❗❗**



###