드라이브 마운트

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [1]:
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

import numpy as np

import pandas as pd
# pandas display configuration for this notebook.

# Show every column when a DataFrame is printed.
pd.set_option('display.max_columns', None)

# Keep the pandas default for row truncation. The notebook used to set
# 'display.max_rows' to None and then reset it right away; only the reset had
# any lasting effect, so the redundant call was dropped.
pd.reset_option('display.max_rows')

# Print floating point values with four decimal places.
pd.options.display.float_format = '{:.4f}'.format

import platform
import seaborn as sns

import matplotlib.pyplot as plt

# Choose a Korean-capable matplotlib font from the host operating system.
# Any system other than macOS or Windows gets no override (font_name is None).
_FONT_BY_SYSTEM = {
    'Darwin': 'AppleGothic',    # macOS
    'Windows': 'NanumGothic',   # Windows
}
font_name = _FONT_BY_SYSTEM.get(platform.system())

# The macOS branch announces its choice on stdout.
if font_name == 'AppleGothic':
    print('apple gothic')

# Apply the font only when one was selected.
if font_name is not None:
    plt.rcParams['font.family'] = font_name

# Render the minus sign as a plain hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

## 데이터 로드

In [2]:
def _read_split(csv_path):
    """Read one dataset CSV and index it by the '거래소코드' column.

    The '거래소코드' column is read with dtype 'object' before it becomes
    the index.
    """
    frame = pd.read_csv(csv_path, dtype={'거래소코드': 'object'})
    return frame.set_index('거래소코드')


# Feature matrices.
X_train = _read_split('/content/dataset_final/X_train.csv')
X_test = _read_split('/content/dataset_final/X_test.csv')

# Target frames.
y_train = _read_split('/content/dataset_final/y_train.csv')
y_test = _read_split('/content/dataset_final/y_test.csv')

# Displayed by the notebook as the cell result.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1187, 10), (198, 10), (1187, 1), (198, 1))

In [3]:
X_train.head(2)

Unnamed: 0_level_0,순운전자본대비총자본,비유동장기적합률,자기자본순이익률,당좌비율,총자본정상영업이익률,자본금회전율,총자산회전율,배당금지급(-),미처분이익잉여금(결손금),영업손익
거래소코드,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
40,-0.1204,-0.2665,0.1511,-0.3273,-0.1546,-0.6327,0.0008,-0.2343,-0.1,0.0551
50,-0.86,0.3045,0.4107,-0.3775,0.7126,0.5597,-1.1884,-0.2343,-0.4147,0.3254


## 파라미터 튜닝

In [4]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3


In [5]:
from modules import modeling as ml

from sklearn.model_selection import GridSearchCV as grid

In [6]:
# Containers filled in by later cells:
#   res_base / res_best - referenced only by the commented-out result cells
#   res_grid            - fitted grid-search objects, keyed by model name
res_base, res_best, res_grid = {}, {}, {}

### (1) LR

In [7]:
from sklearn.linear_model import LogisticRegression

In [8]:
# Hyper-parameter search for logistic regression (F1 score, 5-fold CV).
model = ml.get_model_base(model_name='lr')

# Inverse regularization strength: a larger C means weaker regularization.
c_values = [0.001, 0.01, 0.1, 1, 10, 100]

# The grid is a list of sub-grids so that only penalty/solver pairs accepted
# by LogisticRegression are fitted. The previous full cross product contained
# unsupported pairs (for example 'l1' with 'lbfgs') and 'elasticnet' without
# an l1_ratio; those fits fail and, because warnings are silenced at the top
# of the notebook, the failures went unnoticed while still costing time.
#
# Solver notes kept from the original cell:
#   - liblinear suits small datasets
#   - sag / saga are fast on large datasets
#   - newton-cholesky suits n_samples >> n_features, but is memory hungry
#     when there are many features
lr_param_grid = [
    # L2 penalty: supported by every solver.
    {
        'penalty': ['l2'],
        'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
        'C': c_values,
    },
    # L1 penalty: liblinear and saga only.
    {
        'penalty': ['l1'],
        'solver': ['liblinear', 'saga'],
        'C': c_values,
    },
    # Elastic net: saga only, and it needs an explicit l1_ratio.
    # NOTE(review): 0.5 is a neutral starting value - widen if elastic net
    # turns out to be competitive.
    {
        'penalty': ['elasticnet'],
        'solver': ['saga'],
        'l1_ratio': [0.5],
        'C': c_values,
    },
    # No penalty: C has no effect, so it is not searched here.
    # liblinear does not support an unpenalized fit.
    {
        'penalty': [None],
        'solver': ['lbfgs', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
    },
]

grid_lg = grid(
    estimator=model,
    param_grid=lr_param_grid,
    scoring='f1',
    cv=5,
    # Refit on the whole training set with the best parameters.
    refit=True,
)

# y_train is a single-column DataFrame; pass it as a 1-D array, which is what
# the estimator expects (avoids the column-vector conversion warning).
grid_lg.fit(X_train, y_train.values.ravel())
res_grid['lr'] = grid_lg

best_params = grid_lg.best_params_
print("Best Parameters:", best_params)

best_estimator = grid_lg.best_estimator_
print("\nBest Estimator:", best_estimator)

Best Parameters: {'C': 0.001, 'penalty': 'l2', 'solver': 'liblinear'}

Best Estimator: LogisticRegression(C=0.001, random_state=42, solver='liblinear')


### (2) DT

In [9]:
from sklearn.tree import DecisionTreeClassifier

In [10]:
# Hyper-parameter search for the decision tree (F1 score, 5-fold CV).
model = ml.get_model_base(model_name='dt')

dt_search_space = dict(
    # Split-quality criterion.
    # NOTE(review): scikit-learn documents 'entropy' and 'log_loss' as the
    # same criterion - confirm whether both are needed.
    criterion=['gini', 'entropy', 'log_loss'],
    # Maximum tree depth.
    max_depth=[2, 3, 4, 5, 7, 9, 11],
    # Minimum number of samples a node needs before it may be split.
    min_samples_split=[2, 3, 5, 7, 9],
    # Minimum number of samples required in a leaf.
    min_samples_leaf=[1, 3, 5],
)

grid_DT = grid(model, dt_search_space, scoring='f1', cv=5)

# fit() returns the fitted search object itself, so it can be stored directly.
res_grid['dt'] = grid_DT.fit(X_train, y_train)

best_params = grid_DT.best_params_
best_estimator = grid_DT.best_estimator_

print("Best Parameters:", best_params)
print("\nBest Estimator:", best_estimator)

Best Parameters: {'criterion': 'entropy', 'max_depth': 11, 'min_samples_leaf': 1, 'min_samples_split': 5}

Best Estimator: DecisionTreeClassifier(criterion='entropy', max_depth=11, min_samples_split=5,
                       random_state=42)


### (3) SVM

In [11]:
from sklearn.svm import SVC

In [12]:
# Hyper-parameter search for the SVM (F1 score, 5-fold CV).
model = ml.get_model_base(model_name='svm')

svm_search_space = dict(
    # Only the RBF and sigmoid kernels are searched; 'linear' and 'poly'
    # were deliberately left out of this notebook.
    kernel=['rbf', 'sigmoid'],
    # Regularization parameter: a larger C means weaker regularization.
    C=[0.001, 0.01, 0.1, 1, 10],
)

grid_SVC = grid(model, svm_search_space, scoring='f1', cv=5)

# fit() returns the fitted search object itself, so it can be stored directly.
res_grid['svm'] = grid_SVC.fit(X_train, y_train)

best_params = grid_SVC.best_params_
best_estimator = grid_SVC.best_estimator_

print("Best Parameters:", best_params)
print("\nBest Estimator:", best_estimator)

Best Parameters: {'C': 10, 'kernel': 'sigmoid'}

Best Estimator: SVC(C=10, kernel='sigmoid', probability=True, random_state=42)


### (4) RF

In [13]:
from sklearn.ensemble import RandomForestClassifier

In [14]:
# Hyper-parameter search for the random forest (F1 score, 5-fold CV).
#
# The grid has 2 * 7 * 5 * 3 * 5 * 3 = 3150 candidates, i.e. 15750 fits with
# cv=5. Run sequentially this took long enough that the recorded run was
# interrupted by hand, so the candidates are now evaluated in parallel on all
# available cores (n_jobs=-1).
model = ml.get_model_base(model_name='rf')

grid_RFC = grid(
    estimator=model,
    param_grid={
        # Split-quality criterion: Gini impurity or entropy.
        'criterion': ['gini', 'entropy'],

        # Maximum tree depth.
        'max_depth': [2, 3, 4, 5, 7, 9, 11],

        # Minimum number of samples a node needs before it may be split.
        'min_samples_split': [2, 3, 5, 7, 9],

        # Minimum number of samples required in a leaf.
        'min_samples_leaf': [1, 3, 5],

        # Number of trees in the ensemble.
        'n_estimators': [10, 50, 100, 200, 300],

        # A node is split only if the split lowers impurity by at least
        # this amount.
        'min_impurity_decrease': [0.001, 0.01, 0.02],
    },
    scoring='f1',
    cv=5,
    n_jobs=-1,
)

# y_train is a single-column DataFrame; pass it as a 1-D array, which is what
# the estimator expects (avoids the column-vector conversion warning).
grid_RFC.fit(X_train, y_train.values.ravel())
res_grid['rf'] = grid_RFC

best_params = grid_RFC.best_params_
print("Best Parameters:", best_params)

best_estimator = grid_RFC.best_estimator_
print("\nBest Estimator:", best_estimator)

KeyboardInterrupt: 

### (5) XGB

In [None]:
from xgboost import XGBClassifier

In [None]:
# Hyper-parameter search for XGBoost (F1 score, 5-fold CV).
# Presumably ml.get_model_base('xgb') returns an XGBClassifier - verify in
# modules/modeling.
model = ml.get_model_base(model_name='xgb')

grid_XGB = grid(
    estimator=model,
    param_grid={
        # Step size shrinkage applied to each boosting round.
        'learning_rate': [0.001, 0.01, 0.1],

        # Maximum tree depth.
        'max_depth': [2, 3, 4, 5, 7, 9, 11],

        # Minimum sum of instance weights required in a child; nodes below
        # this value are not split further.
        'min_child_weight': [2, 3, 5, 7, 9],

        # Number of boosting rounds (trees).
        'n_estimators': [10, 100, 200, 300],

        # Minimum loss reduction required to split a node. This replaces
        # 'min_impurity_decrease', which is a scikit-learn tree parameter
        # that XGBoost does not use: it tripled the number of fits without
        # changing any model. 'gamma' is the XGBoost parameter with the
        # intended "stop splitting when the gain is too small" meaning.
        # NOTE(review): the values are carried over unchanged - revisit the
        # range, since gamma is on the scale of the training loss.
        'gamma': [0.001, 0.01, 0.02],
    },
    scoring='f1', cv=5
)

grid_XGB.fit(X_train, y_train)
res_grid['xgb'] = grid_XGB

best_params = grid_XGB.best_params_
print("Best Parameters:", best_params)

best_estimator = grid_XGB.best_estimator_
print("\nBest Estimator:", best_estimator)

### (6) LGBM

In [None]:
from lightgbm import LGBMClassifier

In [None]:
# Hyper-parameter search for LightGBM (F1 score, 5-fold CV).
# NOTE: the original author flagged this cell as running out of memory.
model = ml.get_model_base(model_name='lgbm')

lgbm_search_space = dict(
    # Step size shrinkage applied to each boosting round.
    learning_rate=[0.001, 0.01, 0.1],
    # Maximum tree depth.
    max_depth=[2, 3, 5, 7, 9, 11],
    # Number of trees in the ensemble.
    n_estimators=[10, 50, 100, 200, 300],
    # L1 regularization strength.
    reg_alpha=[0.001, 0.01, 0.1, 1, 10, 100],
    # L2 regularization strength.
    reg_lambda=[0.001, 0.01, 0.1, 1, 10, 100],
)

grid_LGBM = grid(model, lgbm_search_space, scoring='f1', cv=5)

# fit() returns the fitted search object itself, so it can be stored directly.
res_grid['lgbm'] = grid_LGBM.fit(X_train, y_train)

best_params = grid_LGBM.best_params_
best_estimator = grid_LGBM.best_estimator_

print("Best Parameters:", best_params)
print("\nBest Estimator:", best_estimator)

### (7) catboost

In [None]:
from catboost import CatBoostClassifier

In [None]:
# Hyper-parameter search for CatBoost (F1 score, 5-fold CV).
# Long-running: the original note estimated roughly two hours.
model = ml.get_model_base(model_name='cat')

grid_cat = grid(
    estimator=model,
    param_grid={
        # Step size shrinkage applied to each boosting round.
        'learning_rate': [0.001, 0.01, 0.1],

        # Maximum tree depth.
        'depth': [2, 3, 5, 7, 9, 11],

        # Number of trees.
        'iterations': [10, 50, 100, 200, 300],

        # L2 regularization strength on leaf values.
        'l2_leaf_reg': [0.001, 0.01, 0.1, 1, 10, 100],

        # Training objective. 'AUC' was removed: CatBoost offers it as an
        # evaluation metric only, not as a classifier objective, so every
        # candidate using it could only fail - a third of the grid spent on
        # fits that never produced a score.
        'loss_function': ['Logloss', 'CrossEntropy'],
    },
    scoring='f1', cv=5
)

grid_cat.fit(X_train, y_train)
res_grid['cat'] = grid_cat

best_params = grid_cat.best_params_
print("Best Parameters:", best_params)

best_estimator = grid_cat.best_estimator_
print("\nBest Estimator:", best_estimator)

## 튜닝 결과 확인

In [None]:
# res_base.keys()

In [None]:
# for key in res_base.keys():
#     print(key)
#     print('base precision', res_base[key]['results']['precision'])
#     print('best precision', res_best[key]['results']['precision'])

In [None]:
# res_grid_cv = {}
# for key in res_grid.keys():
#     g = res_grid[key]
#     res_grid_cv[key] = pd.DataFrame(g.cv_results_)