In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
from scipy.stats import ttest_ind
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from tqdm.auto import tqdm

# 데이터 로드 및 전처리 (위의 예제와 동일한 방식으로)
df = pd.read_csv("하이닉스 power (3).csv", thousands=',', encoding='cp949')
df['next_day_return'] = (df['종가'].shift(-1) - df['종가']) / df['종가'] * 100
df['target'] = df['next_day_return'].apply(lambda x: 1 if x > 0.25 else 0)
df.dropna(inplace=True)

X = df.drop(['날짜', 'target', 'next_day_return'], axis=1)
y = df['target']
# Random Forest for feature importance
rf = RandomForestRegressor()
rf.fit(X, y)

features_rf_sorted = sorted(zip(X.columns, rf.feature_importances_), key=lambda x: x[1], reverse=True)
features_rf_df = pd.DataFrame(features_rf_sorted, columns=['Feature', 'RF Importance'])

# 상관관계가 가장 높은 상위 8개의 피처를 선정
top_7_features = [feature for feature, importance in features_rf_sorted[:7]]
# X와 y 정의 (df2와 이전 코드에서의 처리 과정을 바탕으로)dp
X_top7 = X[top_7_features]
y = df['target']  # y 값을 0과 1로 조정

# 데이터를 학습 세트와 테스트 세트로 분할
X_train, X_test, y_train, y_test = train_test_split(X_top7, y, test_size=0.26, random_state=7)

param_grid = {
    'C': [0.1, 1, 10, 100],  # 규제의 강도
    'gamma': ['scale', 'auto', 0.01, 0.1, 1, 10],  # 커널 계수
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid']  # 사용할 커널
}


# GridSearchCV 객체 초기화 및 학습
grid_search = GridSearchCV(SVC(), param_grid, refit=True, verbose=5, n_jobs=1)
grid_search.fit(X_train, y_train)

# 최적의 하이퍼파라미터와 모델 출력
print(f"Best parameters: {grid_search.best_params_}")
best_model = grid_search.best_estimator_

# 최적의 모델로 테스트 데이터 예측
y_pred = best_model.predict(X_test)

# 예측 결과에 대한 평가 지표 출력
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END .....C=1, gamma=scale, kernel=poly;, score=0.558 total time=   0.8s
[CV 2/5] END .....C=1, gamma=scale, kernel=poly;, score=0.552 total time=   0.8s
[CV 3/5] END .....C=1, gamma=scale, kernel=poly;, score=0.571 total time=   0.7s
[CV 4/5] END .....C=1, gamma=scale, kernel=poly;, score=0.570 total time=   0.9s
[CV 5/5] END .....C=1, gamma=scale, kernel=poly;, score=0.562 total time=   2.8s
