In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Assuming the data preparation is similar to previous examples
df = pd.read_csv("하이닉스 power (3).csv", thousands=',', encoding='cp949')
df['next_day_return'] = (df['종가'].shift(-1) - df['종가']) / df['종가'] * 100
df['target'] = df['next_day_return'].apply(lambda x: 1 if x > 0.25 else 0)
df.dropna(inplace=True)

X = df.drop(['날짜', 'target', 'next_day_return'], axis=1)
y = df['target']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

# Decision Tree Classifier Hyperparameters Grid
param_grid_dt = {
    'max_depth': [None, 2, 4, 6, 8, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

# Initialize GridSearchCV
grid_search_dt = GridSearchCV(DecisionTreeClassifier(random_state=1), param_grid_dt, refit=True, verbose=2, cv=5, n_jobs=-1)

# Fit the model
grid_search_dt.fit(X_train, y_train)

# Best parameters and best model
print("Best parameters:", grid_search_dt.best_params_)
best_dt_model = grid_search_dt.best_estimator_

# Predictions and Evaluation
y_pred_dt = best_dt_model.predict(X_test)

accuracy_dt = accuracy_score(y_test, y_pred_dt)
precision_dt = precision_score(y_test, y_pred_dt, average='macro')
recall_dt = recall_score(y_test, y_pred_dt, average='macro')
f1_dt = f1_score(y_test, y_pred_dt, average='macro')

print("Accuracy:", accuracy_dt)
print("Precision:", precision_dt)
print("Recall:", recall_dt)
print("F1 Score:", f1_dt)


In [1]:
from sklearn.model_selection import ParameterGrid
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
import pandas as pd
from tqdm.auto import tqdm
from joblib import Parallel, delayed
from sklearn.metrics import accuracy_score

# 데이터 로드 및 전처리
df = pd.read_csv("하이닉스 power (3).csv", thousands=',', encoding='cp949')
df['next_day_return'] = (df['종가'].shift(-1) - df['종가']) / df['종가'] * 100
df['target'] = df['next_day_return'].apply(lambda x: 1 if x > 0.25 else 0)
df.dropna(inplace=True)

X = df.drop(['날짜', 'target', 'next_day_return'], axis=1)
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

# 결정 트리 모델을 위한 하이퍼파라미터 그리드 정의
param_grid = {
    'max_depth': [4, 6, 8, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 사용자 정의 함수로 모델 학습 및 평가
def evaluate_model(params):
    model = DecisionTreeClassifier(**params, random_state=1)
    model.fit(X_train, y_train)
    scores = cross_val_score(model, X_train, y_train, cv=5)
    return scores.mean()

# 모든 하이퍼파라미터 조합에 대해 평가
results = Parallel(n_jobs=-1, verbose=10)(
    delayed(evaluate_model)(params) for params in tqdm(list(ParameterGrid(param_grid)))
)

# 최적의 하이퍼파라미터 조합 찾기
best_score_idx = np.argmax(results)
best_params = list(ParameterGrid(param_grid))[best_score_idx]

print(f"Best parameters: {best_params}")
print(f"Best score: {results[best_score_idx]}")

# 최적의 모델로 테스트 데이터 예측
best_model = DecisionTreeClassifier(**best_params, random_state=1)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

# 예측 결과에 대한 평가 지표 출력
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


  0%|          | 0/36 [00:00<?, ?it/s]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   25.5s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   29.9s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   36.6s
[Parallel(n_jobs=-1)]: Done  33 out of  36 | elapsed:   48.4s remaining:    4.3s
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:   51.8s finished


NameError: name 'np' is not defined