# Machine Learning Project
- 작성자: 박인애 🍀
- 작성날짜: 2024.06.01
- 목적: binary classification
- 목표: F1-Score가 높게 나오는 전처리 기법 + 모델 조합 찾기 🌟

In [None]:
# Encode the target column of every dataset in place: "no" -> 0, "yes" -> 1.
# Values other than these two labels are left untouched by `replace`.
for df in encoded_df_group:
  df["y"] = df["y"].replace(to_replace={"no": 0, "yes": 1})

## Train-Valid Split
- processed data set 이용

In [None]:
from sklearn.model_selection import train_test_split

# Split settings applied identically to every dataset in `encoded_df_group`.
random_state = 100
shuffle = True
test_size_ratio = 0.25

# processed_train_df_group[i] / processed_valid_df_group[i] hold the train and
# validation parts of encoded_df_group[i].
processed_train_df_group = []
processed_valid_df_group = []

for df in encoded_df_group:
  processed_train_df, processed_valid_df = train_test_split(
      df,
      test_size=test_size_ratio,
      random_state=random_state,
      shuffle=shuffle,
  )
  # Fix: the train group must receive the TRAIN part. Appending the validation
  # frame here made every model fit and evaluate on the same rows, which leaks
  # the validation data and inflates the scores of high-capacity models.
  processed_train_df_group.append(processed_train_df)
  processed_valid_df_group.append(processed_valid_df)

# 📌 머신러닝을 이용한 이진 분류 - supervised learning 🌟

#### Lab4
- Logistic Regression
- Polynomial Regression
- Decision Tree

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, cross_val_score

In [None]:
max_iter = 10000

# One seed for every stochastic component defined in this cell. It is set before
# the models are built so that the cell does not depend on an earlier cell.
random_state = 100

# All candidate models keyed by display name so they can be trained in one loop.
# The "saga" solver and the decision tree both draw random numbers while
# fitting, so each one is seeded to keep the reported scores reproducible.
models = {
  "Baseline": LogisticRegression(solver="saga", max_iter=max_iter, penalty=None,
                                 random_state=random_state),
  "L2": LogisticRegression(solver="saga", max_iter=max_iter, penalty="l2", C=1.0,
                           random_state=random_state),
  "L1": LogisticRegression(solver="saga", max_iter=max_iter, penalty="l1", C=1.0,
                           random_state=random_state),
  # Degree-2 polynomial feature expansion followed by an unpenalised logistic regression.
  "Polynomial": Pipeline([("poly_features", PolynomialFeatures(degree=2)),
                          ("softmax_reg", LogisticRegression(solver="saga", max_iter=max_iter, penalty=None,
                                                             random_state=random_state))]),
  "DecisionTree": DecisionTreeClassifier(criterion="gini", max_depth=5, min_samples_split=2,
                                         min_impurity_decrease=0.0, random_state=random_state)
}


## Model evaluation
# n_splits = number of folds k (default: 5)
kf = KFold(n_splits=5, shuffle=True, random_state=random_state)

# 머신러닝으로 분류한 k개의 samples에 대해서 score를 확인하기
# cross validation 방식으로 train set을 가지고 모델을 평가하기
# scoring은 5가지 방식
'''
1) accuracy: Accuracy (default)
2) roc_auc: Area under the receiver operating characteristic (ROC) curve
3) f1: F1 score
4) precision: Precision
5) recall: Recall
'''

'\nscores = cross_val_score(model, X_train, y_train, cv=kf, scoring="roc_auc")\n\n# k개 samples의 score를 각각 배열(list)로 확인\nprint("Scores from each iteration:", scores)\n# k개 samples의 score를 평균내기\nprint("Average score:", scores.mean())\n'

In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [None]:
# Show how many DataFrames `encoded_df_group` contains.
print(len(encoded_df_group))

104


- 일단 sample 하나에 대해 테스트
   - 잘 돌아감 확인 완료

In [None]:
# Smoke test on one dataset (index 5) before sweeping over all of them.
# Train split: 'y' is the target, every other column is a feature.
df = processed_train_df_group[5]
y_train = df['y'].values
X_train = df.drop(columns='y').values

# Validation split of the same dataset index.
y_valid = processed_valid_df_group[5]['y'].values
X_valid = processed_valid_df_group[5].drop(columns='y').values

# Fit each model in turn and print its validation metrics.
for name in models:
  model = models[name]
  model.fit(X_train, y_train)

  print(f'Model: {name}')

  # Hard class labels feed accuracy/F1; the class-1 probability feeds ROC AUC.
  y_cls = model.predict(X_valid)
  y_prob = model.predict_proba(X_valid)

  # Three evaluation metrics
  print("Accuracy:", accuracy_score(y_valid, y_cls))
  print("F1:", f1_score(y_valid, y_cls))
  print("ROC AUC:", roc_auc_score(y_valid, y_prob[:, 1]))
  print()

In [None]:
# Sweep over the preprocessed datasets and print each model's validation F1.
# NOTE(review): the range starts at 6 rather than 0 - presumably a resume point
# after an earlier partial run; confirm before re-running the whole sweep.
for i in range(6, len(encoded_df_group)):
  # Train split: 'y' is the target, every other column is a feature.
  df = processed_train_df_group[i]
  y_train = df['y'].values
  X_train = df.drop(columns='y').values

  # Validation split of the same dataset index.
  y_valid = processed_valid_df_group[i]['y'].values
  X_valid = processed_valid_df_group[i].drop(columns='y').values

  # Refit every model on this dataset and compare them by F1.
  for name in models:
    model = models[name]
    model.fit(X_train, y_train)

    y_cls = model.predict(X_valid)
    y_prob = model.predict_proba(X_valid)

    print(f'Model: {name}, {i}-th data, F1: {f1_score(y_valid, y_cls)}')
    # Additional metrics, currently disabled:
    #   print("Accuracy:", accuracy_score(y_valid, y_cls))
    #   print("F1:", f1_score(y_valid, y_cls))
    #   print("ROC AUC:", roc_auc_score(y_valid, y_prob[:, 1]))
    #   print()

Model: Baseline, 0-th data, F1: 0.32676518883415434
Model: L2, 0-th data, F1: 0.32676518883415434
Model: L1, 0-th data, F1: 0.32676518883415434
Model: Polynomial, 0-th data, F1: 0.32418952618453867
Model: DecisionTree, 0-th data, F1: 0.4319880418535127
Model: Baseline, 1-th data, F1: 0.32676518883415434
Model: L2, 1-th data, F1: 0.32676518883415434
Model: L1, 1-th data, F1: 0.32676518883415434
Model: Polynomial, 1-th data, F1: 0.32418952618453867
Model: DecisionTree, 1-th data, F1: 0.4319880418535127
Model: Baseline, 2-th data, F1: 0.32676518883415434
Model: L2, 2-th data, F1: 0.32676518883415434
Model: L1, 2-th data, F1: 0.32676518883415434
Model: Polynomial, 2-th data, F1: 0.32418952618453867
Model: DecisionTree, 2-th data, F1: 0.4319880418535127
Model: Baseline, 3-th data, F1: 0.32676518883415434
Model: L2, 3-th data, F1: 0.32676518883415434
Model: L1, 3-th data, F1: 0.32676518883415434
Model: Polynomial, 3-th data, F1: 0.32418952618453867
Model: DecisionTree, 3-th data, F1: 0.43198

In [None]:
# Same sweep as above, restricted to dataset indices 90 and up.
for i in range(90, len(encoded_df_group)):
  # Train split: features are all columns except the target 'y'.
  df = processed_train_df_group[i]
  y_train = df['y'].values
  X_train = df.drop(columns='y').values

  # Validation split of the same dataset index.
  y_valid = processed_valid_df_group[i]['y'].values
  X_valid = processed_valid_df_group[i].drop(columns='y').values

  for name in models:
    model = models[name]
    model.fit(X_train, y_train)

    y_cls = model.predict(X_valid)
    y_prob = model.predict_proba(X_valid)

    print(f'Model: {name}, {i}-th data, F1: {f1_score(y_valid, y_cls)}')
    # Additional metrics, currently disabled:
    #   print("Accuracy:", accuracy_score(y_valid, y_cls))
    #   print("F1:", f1_score(y_valid, y_cls))
    #   print("ROC AUC:", roc_auc_score(y_valid, y_prob[:, 1]))
    #   print()

Model: Baseline, 90-th data, F1: 0.0
Model: L2, 90-th data, F1: 0.0
Model: L1, 90-th data, F1: 0.0
Model: Polynomial, 90-th data, F1: 0.0
Model: DecisionTree, 90-th data, F1: 0.3082077051926298
Model: Baseline, 91-th data, F1: 0.0
Model: L2, 91-th data, F1: 0.0
Model: L1, 91-th data, F1: 0.0
Model: Polynomial, 91-th data, F1: 0.0
Model: DecisionTree, 91-th data, F1: 0.3082077051926298
Model: Baseline, 92-th data, F1: 0.0
Model: L2, 92-th data, F1: 0.0
Model: L1, 92-th data, F1: 0.0
Model: Polynomial, 92-th data, F1: 0.0
Model: DecisionTree, 92-th data, F1: 0.3082077051926298
Model: Baseline, 93-th data, F1: 0.0
Model: L2, 93-th data, F1: 0.0
Model: L1, 93-th data, F1: 0.0
Model: Polynomial, 93-th data, F1: 0.0
Model: DecisionTree, 93-th data, F1: 0.3082077051926298
Model: Baseline, 94-th data, F1: 0.0
Model: L2, 94-th data, F1: 0.0
Model: L1, 94-th data, F1: 0.0
Model: Polynomial, 94-th data, F1: 0.0
Model: DecisionTree, 94-th data, F1: 0.3082077051926298
Model: Baseline, 95-th data, F

```
[결과]
Model: Baseline, 30-th data, F1: 0.32676518883415434
Model: L2, 30-th data, F1: 0.32676518883415434
Model: L1, 30-th data, F1: 0.32676518883415434
```

```
[결과]
90th - 103th
Model: Baseline, 90-th data, F1: 0.0
Model: L2, 90-th data, F1: 0.0
Model: L1, 90-th data, F1: 0.0
Model: Polynomial, 90-th data, F1: 0.0
Model: DecisionTree, 90-th data, F1: 0.3082077051926298
Model: Baseline, 91-th data, F1: 0.0
Model: L2, 91-th data, F1: 0.0
Model: L1, 91-th data, F1: 0.0
Model: Polynomial, 91-th data, F1: 0.0
Model: DecisionTree, 91-th data, F1: 0.3082077051926298
Model: Baseline, 92-th data, F1: 0.0
Model: L2, 92-th data, F1: 0.0
Model: L1, 92-th data, F1: 0.0
Model: Polynomial, 92-th data, F1: 0.0
Model: DecisionTree, 92-th data, F1: 0.3082077051926298
Model: Baseline, 93-th data, F1: 0.0
Model: L2, 93-th data, F1: 0.0
Model: L1, 93-th data, F1: 0.0
Model: Polynomial, 93-th data, F1: 0.0
Model: DecisionTree, 93-th data, F1: 0.3082077051926298
Model: Baseline, 94-th data, F1: 0.0
Model: L2, 94-th data, F1: 0.0
Model: L1, 94-th data, F1: 0.0
Model: Polynomial, 94-th data, F1: 0.0
Model: DecisionTree, 94-th data, F1: 0.3082077051926298
Model: Baseline, 95-th data, F1: 0.0
Model: L2, 95-th data, F1: 0.0
Model: L1, 95-th data, F1: 0.0
Model: Polynomial, 95-th data, F1: 0.0
Model: DecisionTree, 95-th data, F1: 0.3082077051926298
Model: Baseline, 96-th data, F1: 0.047619047619047616
Model: L2, 96-th data, F1: 0.047619047619047616
Model: L1, 96-th data, F1: 0.047619047619047616
Model: Polynomial, 96-th data, F1: 0.0
Model: DecisionTree, 96-th data, F1: 0.6785714285714285
Model: Baseline, 97-th data, F1: 0.047619047619047616
Model: L2, 97-th data, F1: 0.047619047619047616
Model: L1, 97-th data, F1: 0.047619047619047616
Model: Polynomial, 97-th data, F1: 0.0
Model: DecisionTree, 97-th data, F1: 0.6545454545454547
Model: Baseline, 98-th data, F1: 0.047619047619047616
Model: L2, 98-th data, F1: 0.047619047619047616
Model: L1, 98-th data, F1: 0.047619047619047616
Model: Polynomial, 98-th data, F1: 0.0
Model: DecisionTree, 98-th data, F1: 0.6785714285714285
Model: Baseline, 99-th data, F1: 0.047619047619047616
Model: L2, 99-th data, F1: 0.047619047619047616
Model: L1, 99-th data, F1: 0.047619047619047616
Model: Polynomial, 99-th data, F1: 0.0
Model: DecisionTree, 99-th data, F1: 0.6785714285714285
Model: Baseline, 100-th data, F1: 0.047619047619047616
Model: L2, 100-th data, F1: 0.047619047619047616
Model: L1, 100-th data, F1: 0.047619047619047616
Model: Polynomial, 100-th data, F1: 0.0
Model: DecisionTree, 100-th data, F1: 0.6785714285714285
Model: Baseline, 101-th data, F1: 0.047619047619047616
Model: L2, 101-th data, F1: 0.047619047619047616
Model: L1, 101-th data, F1: 0.047619047619047616
Model: Polynomial, 101-th data, F1: 0.0
Model: DecisionTree, 101-th data, F1: 0.6545454545454547
Model: Baseline, 102-th data, F1: 0.047619047619047616
Model: L2, 102-th data, F1: 0.047619047619047616
Model: L1, 102-th data, F1: 0.047619047619047616
Model: Polynomial, 102-th data, F1: 0.0
Model: DecisionTree, 102-th data, F1: 0.6785714285714285
Model: Baseline, 103-th data, F1: 0.047619047619047616
Model: L2, 103-th data, F1: 0.047619047619047616
Model: L1, 103-th data, F1: 0.047619047619047616
Model: Polynomial, 103-th data, F1: 0.0
Model: DecisionTree, 103-th data, F1: 0.6545454545454547
```

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with two hidden layers (50 and 30 units).
# `learning_rate='invscaling'` and `power_t=0.5` were removed: scikit-learn only
# uses them with solver='sgd', so under 'adam' they were silently ignored and
# made the configuration misleading. Removing them does not change the fitted
# model. To actually get an inverse-scaling schedule, switch to solver='sgd'
# and restore both arguments.
model = MLPClassifier(hidden_layer_sizes=(50, 30),
                      max_iter=300,
                      activation='relu',
                      solver='adam',
                      batch_size=200,
                      learning_rate_init=0.01,
                      warm_start=True,  # a later fit() call continues from the current weights
                      random_state=100,
                      verbose=True)  # print the loss after every iteration


# Train split of dataset 24: 'y' is the target, every other column is a feature.
df = processed_train_df_group[24]
X_train = df.drop('y', axis=1, inplace=False).values
y_train = df['y'].values

# Validation split of the same dataset index.
X_valid = processed_valid_df_group[24].drop('y', axis=1, inplace=False).values
y_valid = processed_valid_df_group[24]['y'].values


# Fit the model, then predict class probabilities and hard labels on the validation split.
model.fit(X_train, y_train)
y_prob = model.predict_proba(X_valid)
y_cls = model.predict(X_valid)

Iteration 1, loss = 6.77884429
Iteration 2, loss = 3.77146987
Iteration 3, loss = 3.63350087
Iteration 4, loss = 3.58931429
Iteration 5, loss = 3.98281644
Iteration 6, loss = 3.38896534
Iteration 7, loss = 2.37042087
Iteration 8, loss = 3.13216584
Iteration 9, loss = 2.38938907
Iteration 10, loss = 2.38910141
Iteration 11, loss = 1.74714135
Iteration 12, loss = 1.52783845
Iteration 13, loss = 2.49791048
Iteration 14, loss = 1.51129688
Iteration 15, loss = 1.24739805
Iteration 16, loss = 1.04036795
Iteration 17, loss = 0.66481931
Iteration 18, loss = 0.81678280
Iteration 19, loss = 0.79415071
Iteration 20, loss = 0.62368814
Iteration 21, loss = 0.57667125
Iteration 22, loss = 0.46404389
Iteration 23, loss = 0.38727159
Iteration 24, loss = 0.46836977
Iteration 25, loss = 0.53286497
Iteration 26, loss = 0.41824085
Iteration 27, loss = 0.51313425
Iteration 28, loss = 0.31095144
Iteration 29, loss = 0.37891513
Iteration 30, loss = 0.41891456
Iteration 31, loss = 0.37814555
Iteration 32, los

#### Lab5. Ensemble

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.model_selection import KFold, GridSearchCV


from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, f1_score

In [None]:
# Grid-search candidates are ranked by negative mean squared error
# (any other scorer could be used instead).
scoring = "neg_mean_squared_error"
# 5 shuffled folds, seeded with the notebook-wide random_state.
kf = KFold(n_splits=5, shuffle=True, random_state=random_state)

# Rebind `models` to an empty dict.
models = {}

- 학습 시작

In [None]:
def _fit_and_report(name, i, model, param_grid, X_train, y_train, X_valid, y_valid, n_jobs=None):
  """Grid-search one regressor, threshold its predictions and print the F1.

  Args:
    name: Display name used in the printed line.
    i: Dataset index; only used in the printed line.
    model: Unfitted regressor to tune.
    param_grid: Parameter grid handed to GridSearchCV.
    X_train, y_train: Training features and target.
    X_valid, y_valid: Validation features and target.
    n_jobs: Forwarded to GridSearchCV (None keeps its default).

  Returns:
    The F1 score of the thresholded predictions on the validation split.
  """
  # Local import so the helper works even when numpy was not imported as `np`
  # by an earlier cell.
  import numpy as np

  # `kf` and `scoring` are the notebook-level cross-validation settings.
  grid_search = GridSearchCV(model, param_grid, cv=kf, scoring=scoring, refit=True, n_jobs=n_jobs)
  grid_search.fit(X_train, y_train)
  y_pred = grid_search.predict(X_valid)
  # The regressors output a continuous value; binarise it at 0.5.
  y_cls = np.where(y_pred > 0.5, 1, 0)
  score = f1_score(y_valid, y_cls)
  print(f'name: {name}, {i}-th, F1-Score: {score}')
  return score


def run_models(i):
  """Tune and evaluate every ensemble regressor on the i-th dataset.

  Each model is tuned with GridSearchCV on the train split, refit with the best
  parameters, and scored by F1 on the validation split after thresholding its
  predictions at 0.5. One line per model is printed, in a fixed order.

  Args:
    i: Index into processed_train_df_group / processed_valid_df_group.

  Returns:
    Dict mapping the model display name to its validation F1 score.
  """
  # Train split: 'y' is the target, every other column is a feature.
  train_df = processed_train_df_group[i]
  X_train = train_df.drop('y', axis=1, inplace=False).values
  y_train = train_df['y'].values

  # Validation split of the same dataset index.
  valid_df = processed_valid_df_group[i]
  X_valid = valid_df.drop('y', axis=1, inplace=False).values
  y_valid = valid_df['y'].values

  # (display name, estimator, parameter grid, GridSearchCV n_jobs)
  # XGBoost and LightGBM already parallelise inside the estimator (n_jobs=-1),
  # so their grid search keeps GridSearchCV's default n_jobs.
  candidates = [
      ("Decision Tree",
       DecisionTreeRegressor(random_state=random_state),
       {
           "max_depth": [5, 10, 20],
           "min_samples_split": [2, 10, 20],
           "ccp_alpha": [0.0, 0.01],
       },
       -1),
      ("Bagging",
       BaggingRegressor(estimator=DecisionTreeRegressor(),
                        bootstrap=True,
                        n_jobs=-1,
                        random_state=random_state),
       {
           "n_estimators": [25, 50]
       },
       -1),
      ("Random Forest",
       RandomForestRegressor(max_depth=None,
                             min_samples_split=2,
                             bootstrap=True,
                             n_jobs=-1,
                             random_state=random_state),
       {
           "n_estimators": [25, 50],
           "max_features": [0.5, "sqrt", "log2", None],
       },
       -1),
      ("AdaBoost",
       AdaBoostRegressor(loss="linear",
                         random_state=random_state),
       {
           "n_estimators": [25, 50],
           "estimator": [DecisionTreeRegressor(max_depth=3), DecisionTreeRegressor(max_depth=6)],
           "learning_rate": [0.1, 1.0],
       },
       -1),
      ("Gradient Boosting",
       GradientBoostingRegressor(loss="squared_error",
                                 subsample=1.0,
                                 random_state=random_state),
       {
           "n_estimators": [25, 50],
           "max_depth": [3, 6],
           # NOTE(review): a learning rate of 0.0 presumably leaves the model
           # at its initial constant prediction, so half of this grid is
           # likely wasted - confirm whether 0.01 was intended.
           "learning_rate": [0.0, 0.1],
       },
       -1),
      ("XGBoost",
       XGBRegressor(subsample=1.0,
                    learning_rate=0.1,
                    max_depth=6,
                    n_jobs=-1,
                    random_state=random_state),
       {
           "n_estimators": [25, 50],
           "reg_alpha": [0, 0.1],
           "reg_lambda": [0, 0.1],
       },
       None),
      ("LightGBM",
       LGBMRegressor(learning_rate=0.1,
                     data_sample_strategy="goss",
                     top_rate=0.2,
                     other_rate=0.1,
                     force_col_wise=True,
                     verbosity=0,
                     n_jobs=-1,
                     random_state=random_state),
       {
           "n_estimators": [25, 50],
           "reg_alpha": [0, 0.1],
           "reg_lambda": [0, 0.1],
           "enable_bundle": [True, False]
       },
       None),
  ]

  scores = {}
  for name, model, param_grid, n_jobs in candidates:
    scores[name] = _fit_and_report(name, i, model, param_grid,
                                   X_train, y_train, X_valid, y_valid,
                                   n_jobs=n_jobs)
  return scores


In [None]:
# Run the ensemble evaluation for dataset indices 30 and up.
# NOTE(review): the start index 30 is presumably a resume point - confirm.
for i in range(30, len(processed_train_df_group)):
  run_models(i)
'''
[결과]
name: Decision Tree, 30-th, F1-Score: 0.4498918529199712
name: Bagging, 30-th, F1-Score: 0.996201844818231
name: Random Forest, 30-th, F1-Score: 0.9956568946796959
name: AdaBoost, 30-th, F1-Score: 0.3462132921174652
name: Gradient Boosting, 30-th, F1-Score: 0.42289348171701113
name: XGBoost, 30-th, F1-Score: 0.5093167701863354
name: LightGBM, 30-th, F1-Score: 0.4604651162790698
name: Decision Tree, 31-th, F1-Score: 0.4498918529199712
name: Bagging, 31-th, F1-Score: 0.996201844818231
name: Random Forest, 31-th, F1-Score: 0.9956568946796959
name: AdaBoost, 31-th, F1-Score: 0.3462132921174652
name: Gradient Boosting, 31-th, F1-Score: 0.42289348171701113
name: XGBoost, 31-th, F1-Score: 0.5093167701863354
name: LightGBM, 31-th, F1-Score: 0.4604651162790698
name: Decision Tree, 32-th, F1-Score: 0.44992743105950656
name: Bagging, 32-th, F1-Score: 0.9967462039045553
'''

name: Decision Tree, 30-th, F1-Score: 0.4498918529199712
name: Bagging, 30-th, F1-Score: 0.996201844818231
name: Random Forest, 30-th, F1-Score: 0.9956568946796959
name: AdaBoost, 30-th, F1-Score: 0.3462132921174652
name: Gradient Boosting, 30-th, F1-Score: 0.42289348171701113
name: XGBoost, 30-th, F1-Score: 0.5093167701863354
name: LightGBM, 30-th, F1-Score: 0.4604651162790698
name: Decision Tree, 31-th, F1-Score: 0.4498918529199712
name: Bagging, 31-th, F1-Score: 0.996201844818231
name: Random Forest, 31-th, F1-Score: 0.9956568946796959
name: AdaBoost, 31-th, F1-Score: 0.3462132921174652
name: Gradient Boosting, 31-th, F1-Score: 0.42289348171701113
name: XGBoost, 31-th, F1-Score: 0.5093167701863354
name: LightGBM, 31-th, F1-Score: 0.4604651162790698
name: Decision Tree, 32-th, F1-Score: 0.44992743105950656
name: Bagging, 32-th, F1-Score: 0.9967462039045553


KeyboardInterrupt: 