In [6]:
from pykrx import stock
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# SK하이닉스 종목코드
ticker = "377740"

# 시작 날짜와 종료 날짜 설정
start_date = "2014-03-20"
end_date = "2024-03-20"
# 주어진 기간 동안의 일별 거래량 정보 가져오기
df = stock.get_market_ohlcv_by_date(fromdate=start_date, todate=end_date, ticker=ticker)


def calculate_macd(df, short_window=12, long_window=26, signal_window=9):
    """MACD 및 MACD 신호 계산"""
    df['EMA_short'] = df['종가'].ewm(span=short_window, adjust=False).mean()
    df['EMA_long'] = df['종가'].ewm(span=long_window, adjust=False).mean()
    df['MACD'] = df['EMA_short'] - df['EMA_long']
    df['MACD_Signal'] = df['MACD'].ewm(span=signal_window, adjust=False).mean()
    return df

def calculate_rsi(df, window=14, signal_window=9):
    """RSI 및 RSI 신호 계산"""
    delta = df['종가'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()

    rs = gain / loss
    df['RSI'] = 100 - (100 / (1 + rs))
    
    # RSI 신호선 추가
    df['RSI_Signal'] = df['RSI'].ewm(span=signal_window, adjust=False).mean()
    return df

# MACD 및 MACD 신호 계산
df = calculate_macd(df)

df = calculate_rsi(df)

# '종가' 컬럼이 포함된 DataFrame을 가정합니다. 예를 들어, df라고 합시다.
# df = pd.read_csv('path_to_your_data.csv') # 데이터 파일을 로드하는 예시

# 지정된 기간에 대한 SMA 계산
periods = [5, 20, 60, 120, 240]
for period in periods:
    df[f'SMA_{period}'] = df['종가'].rolling(window=period).mean()

# 지정된 기간에 대한 EMA 계산
for period in periods:
    df[f'EMA_{period}'] = df['종가'].ewm(span=period, adjust=False).mean()

for i in range(len(periods)):
    for j in range(i + 1, len(periods)):
        smaller_period = periods[i]
        larger_period = periods[j]
        df[f'SMA_{smaller_period}_minus_SMA_{larger_period}'] = df[f'SMA_{smaller_period}'] - df[f'SMA_{larger_period}']

# EMA 간의 차이 계산
for i in range(len(periods)):
    for j in range(i + 1, len(periods)):
        smaller_period = periods[i]
        larger_period = periods[j]
        df[f'EMA_{smaller_period}_minus_EMA_{larger_period}'] = df[f'EMA_{smaller_period}'] - df[f'EMA_{larger_period}']

for column in ['시가', '고가', '저가','거래량','MACD','MACD_Signal','RSI','RSI_Signal']:
    df[f'{column}_등락률'] = df[column].pct_change() * 100

# SMA 및 EMA의 전 거래일 대비 등락률 계산 및 DataFrame에 추가
periods = [5, 20, 60, 120, 240]

# SMA 등락률 계산 및 추가
for period in periods:
    df[f'SMA_{period}_등락률'] = df[f'SMA_{period}'].pct_change() * 100

# EMA 등락률 계산 및 추가
for period in periods:
    df[f'EMA_{period}_등락률'] = df[f'EMA_{period}'].pct_change() * 100

# 'MACD'가 0 이상일 때 1, 아니면 0을 할당
df['MACD_Binary'] = (df['MACD'] >= 0).astype(int)

# 'MACD - MACD_Signal'이 0 이상일 때 1, 아니면 0을 할당
df['MACD_minus_Signal_Binary'] = ((df['MACD'] - df['MACD_Signal']) >= 0).astype(int)

# 'RSI - RSI_Signal'이 0 이상일 때 1, 아니면 0을 할당
df['RSI_minus_Signal_Binary'] = ((df['RSI'] - df['RSI_Signal']) >= 0).astype(int)

# 시가 대비 종가 등락률 컬럼 추가
df['시가_대비_종가_등락률'] = ((df['종가'] - df['시가']) / df['시가']) * 100
df['시가_대비_저가_등락률'] = ((df['저가'] - df['시가']) / df['시가']) * 100
df['시가_대비_고가_등락률'] = ((df['고가'] - df['시가']) / df['시가']) * 100
df['저가_대비_종가_등락률'] = ((df['종가'] - df['저가']) / df['저가']) * 100
df['저가_대비_고가_등락률'] = ((df['고가'] - df['저가']) / df['저가']) * 100
df['고가_대비_종가_등락률'] = ((df['종가'] - df['고가']) / df['고가']) * 100
# 종가 - SMA [5, 20, 60, 120, 240] 값의 차이 컬럼 추가
for period in [5, 20, 60, 120, 240]:
    df[f'종가_minus_SMA_{period}'] = df['종가'] - df[f'SMA_{period}']

# 종가 - EMA [5, 20, 60, 120, 240] 값의 차이 컬럼 추가
for period in [5, 20, 60, 120, 240]:
    df[f'종가_minus_EMA_{period}'] = df['종가'] - df[f'EMA_{period}']


#csv_file_path = "C:/apps/h1/이스트소프트_data.csv"  # 저장할 파일 경로 및 이름 설정
#df.to_csv(csv_file_path, encoding='utf-8-sig')

#print(f"Data saved to {csv_file_path}")

# 등락률을 기준으로 다음 날 등락률 계산 후 target 생성
df['next_day_return'] = df['등락률'].shift(-1)
df['target'] = df['next_day_return'].apply(lambda x: 1 if x > 0.25 else -1)
df.dropna(inplace=True)  # 마지막 행 삭제

first_column_name = df.columns[0]

df1= df.iloc[:-50, ]
X = df1.drop(['next_day_return','target'], axis=1)
y = df1['target']

# Calculating correlations for Forward Selection
correlations = X.corrwith(y).abs().sort_values(ascending=False).reset_index()
correlations.columns = ['Feature', 'Correlation']

# Performing T-tests
# t_tests = {column: ttest_ind(X[column][y == 1], X[column][y == -1], nan_policy='omit') for column in X.columns}
# t_tests_sorted = sorted(t_tests.items(), key=lambda x: x[1].pvalue)
# t_tests_df = pd.DataFrame(t_tests_sorted, columns=['Feature', 'T-test'])
# t_tests_df['T-test'] = t_tests_df['T-test'].apply(lambda x: x.pvalue)  # Only keep p-value for simplicity

# # Lasso and Ridge regression
# lasso = LassoCV().fit(X, y)
# ridge = RidgeCV().fit(X, y)

# lasso_importance = np.abs(lasso.coef_)
# ridge_importance = np.abs(ridge.coef_)

# features_lasso_sorted = sorted(zip(X.columns, lasso_importance), key=lambda x: x[1], reverse=True)
# features_ridge_sorted = sorted(zip(X.columns, ridge_importance), key=lambda x: x[1], reverse=True)

# features_lasso_df = pd.DataFrame(features_lasso_sorted, columns=['Feature', 'Lasso Importance'])
# features_ridge_df = pd.DataFrame(features_ridge_sorted, columns=['Feature', 'Ridge Importance'])

# # Random Forest for feature importance
# rf = RandomForestRegressor()
# rf.fit(X, y)

# features_rf_sorted = sorted(zip(X.columns, rf.feature_importances_), key=lambda x: x[1], reverse=True)
# features_rf_df = pd.DataFrame(features_rf_sorted, columns=['Feature', 'RF Importance'])

# # Combining all the data into a single dataframe without merging by feature
# final_df = pd.concat([correlations, t_tests_df.drop('Feature', axis=1), features_lasso_df.drop('Feature', axis=1), 
#                       features_ridge_df.drop('Feature', axis=1), features_rf_df.drop('Feature', axis=1)], axis=1)

# 상관관계가 가장 높은 상위 8개의 피처를 선정
top_8_features = correlations.sort_values(by='Correlation', ascending=False).head(8)['Feature']

# X와 y 정의 (df2와 이전 코드에서의 처리 과정을 바탕으로)
X_top8 = X[top_8_features]
y = df1['target'].map({-1: 0, 1: 1})  # y 값을 0과 1로 조정

# 데이터를 학습 세트와 테스트 세트로 분할
X_train, X_test, y_train, y_test = train_test_split(X_top8, y, test_size=0.3, random_state=42)

# 모델 초기화
models = {
    'SVC': SVC(),
    'LogisticRegression': LogisticRegression(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'RandomForestClassifier': RandomForestClassifier()
}

# 점수를 저장할 딕셔너리
scores = {
    'Model': [],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1_Score': []
}

# 각 모델에 대해 학습 및 평가
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    scores['Model'].append(model_name)
    scores['Accuracy'].append(accuracy_score(y_test, y_pred))
    scores['Precision'].append(precision_score(y_test, y_pred, average='macro'))
    scores['Recall'].append(recall_score(y_test, y_pred, average='macro'))
    scores['F1_Score'].append(f1_score(y_test, y_pred, average='macro'))

# 점수 딕셔너리를 데이터프레임으로 변환
scores_df = pd.DataFrame(scores)
scores_df

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Model,Accuracy,Precision,Recall,F1_Score
0,SVC,0.2,0.1,0.5,0.166667
1,LogisticRegression,0.6,0.666667,0.75,0.583333
2,DecisionTreeClassifier,0.6,0.666667,0.75,0.583333
3,RandomForestClassifier,0.6,0.666667,0.75,0.583333


In [1]:
from pykrx import stock
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

df = pd.read_csv("하이닉스 임시.csv", thousands=',', encoding = 'cp949')

X = df.drop(['날짜','target'], axis=1) 
y = df['target']
# Calculating correlations for Forward Selection
correlations = X.corrwith(y).abs().sort_values(ascending=False).reset_index()
correlations.columns = ['Feature', 'Correlation']

#Performing T-tests
t_tests = {column: ttest_ind(X[column][y == 1], X[column][y == -1], nan_policy='omit') for column in X.columns}
t_tests_sorted = sorted(t_tests.items(), key=lambda x: x[1].pvalue)
t_tests_df = pd.DataFrame(t_tests_sorted, columns=['Feature', 'T-test'])
t_tests_df['T-test'] = t_tests_df['T-test'].apply(lambda x: x.pvalue)  # Only keep p-value for simplicity

# Lasso and Ridge regression
lasso = LassoCV().fit(X, y)
ridge = RidgeCV().fit(X, y)

lasso_importance = np.abs(lasso.coef_)
ridge_importance = np.abs(ridge.coef_)

features_lasso_sorted = sorted(zip(X.columns, lasso_importance), key=lambda x: x[1], reverse=True)
features_ridge_sorted = sorted(zip(X.columns, ridge_importance), key=lambda x: x[1], reverse=True)

features_lasso_df = pd.DataFrame(features_lasso_sorted, columns=['Feature', 'Lasso Importance'])
features_ridge_df = pd.DataFrame(features_ridge_sorted, columns=['Feature', 'Ridge Importance'])

# Random Forest for feature importance
rf = RandomForestRegressor()
rf.fit(X, y)

features_rf_sorted = sorted(zip(X.columns, rf.feature_importances_), key=lambda x: x[1], reverse=True)
features_rf_df = pd.DataFrame(features_rf_sorted, columns=['Feature', 'RF Importance'])

# Combining all the data into a single dataframe without merging by feature
final_df = pd.concat([correlations, t_tests_df.drop('Feature', axis=1), features_lasso_df.drop('Feature', axis=1), 
                      features_ridge_df.drop('Feature', axis=1), features_rf_df.drop('Feature', axis=1)], axis=1)

# 상관관계가 가장 높은 상위 8개의 피처를 선정
top_8_features = correlations.sort_values(by='Correlation', ascending=False).head(8)['Feature']

# X와 y 정의 (df2와 이전 코드에서의 처리 과정을 바탕으로)
X_top8 = X[top_8_features]
y = df['target']  # y 값을 0과 1로 조정

# 데이터를 학습 세트와 테스트 세트로 분할
X_train, X_test, y_train, y_test = train_test_split(X_top8, y, test_size=0.3, random_state=42)

# 모델 초기화
models = {
    'SVC': SVC(),
    'LogisticRegression': LogisticRegression(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'RandomForestClassifier': RandomForestClassifier()
}

# 점수를 저장할 딕셔너리
scores = {
    'Model': [],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1_Score': []
}

# 각 모델에 대해 학습 및 평가
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    scores['Model'].append(model_name)
    scores['Accuracy'].append(accuracy_score(y_test, y_pred))
    scores['Precision'].append(precision_score(y_test, y_pred, average='macro'))
    scores['Recall'].append(recall_score(y_test, y_pred, average='macro'))
    scores['F1_Score'].append(f1_score(y_test, y_pred, average='macro'))

# 점수 딕셔너리를 데이터프레임으로 변환
scores_df = pd.DataFrame(scores)
scores_df

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Model,Accuracy,Precision,Recall,F1_Score
0,SVC,1.0,1.0,1.0,1.0
1,LogisticRegression,0.986486,0.988636,0.983871,0.986056
2,DecisionTreeClassifier,1.0,1.0,1.0,1.0
3,RandomForestClassifier,1.0,1.0,1.0,1.0


In [9]:
final_df

Unnamed: 0,Feature,Correlation,T-test,Lasso Importance,Ridge Importance,RF Importance
0,종가_등락률,0.368306,2.553672e-09,3.334257e-07,3.240105e-01,0.119221
1,[일]개인_수량,0.316797,3.879484e-07,2.439243e-08,2.826308e-01,0.062286
2,RSI_등락률,0.291649,3.278096e-06,0.000000e+00,2.569317e-01,0.056271
3,고가_등락률,0.284503,5.799595e-06,0.000000e+00,2.535630e-01,0.046150
4,[일]외국인_수량,0.280868,7.706422e-06,0.000000e+00,2.193102e-01,0.034086
...,...,...,...,...,...,...
84,SMA_240_등락률,0.006264,9.221377e-01,0.000000e+00,5.017791e-07,0.000988
85,SMA_20_등락률,0.003501,9.564297e-01,0.000000e+00,4.686008e-07,0.000975
86,종가_minus_EMA_240,0.002824,9.648542e-01,0.000000e+00,2.470988e-07,0.000146
87,종가_minus_SMA_240,0.001820,9.773410e-01,0.000000e+00,8.344068e-08,0.000000


In [11]:
X_top8

Unnamed: 0,종가_등락률,[일]개인_수량,RSI_등락률,고가_등락률,[일]외국인_수량,저가_등락률,시가_대비_종가_등락률,시가_대비_고가_등락률
0,-0.119474,-14243,2.212389,0.000000,-253159,-0.120482,-0.594530,1.189061
1,3.947368,-727842,27.961672,2.585194,389364,1.085645,1.995305,2.464789
2,1.841197,-592656,10.094892,1.374570,-204360,1.551313,3.629977,3.629977
3,-1.355932,315529,-13.981234,2.485876,-650758,1.880141,-2.020202,1.795735
4,-2.061856,426142,-3.968254,-3.197354,-372493,-2.306805,-2.285714,0.342857
...,...,...,...,...,...,...,...,...
241,-1.039120,557001,-11.845730,-3.241014,-418025,-0.555556,-1.039120,0.366748
242,-0.432366,170734,-9.752604,-0.791717,-113095,-1.551831,1.002506,2.067669
243,1.923077,-120655,5.370008,0.982198,-575769,1.513241,1.923077,2.047146
244,-2.495435,700714,8.333333,-1.641337,-212573,-2.111801,-0.927644,0.061843


In [3]:
from pykrx import stock
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

df = pd.read_csv("df1.csv", thousands=',')

X = df.drop(['날짜','target', '[일]프로그램순매수(종목)'], axis=1) 
y = df['target']
# Calculating correlations for Forward Selection
correlations = X.corrwith(y).abs().sort_values(ascending=False).reset_index()
correlations.columns = ['Feature', 'Correlation']

#Performing T-tests
t_tests = {column: ttest_ind(X[column][y == 1], X[column][y == -1], nan_policy='omit') for column in X.columns}
t_tests_sorted = sorted(t_tests.items(), key=lambda x: x[1].pvalue)
t_tests_df = pd.DataFrame(t_tests_sorted, columns=['Feature', 'T-test'])
t_tests_df['T-test'] = t_tests_df['T-test'].apply(lambda x: x.pvalue)  # Only keep p-value for simplicity

# Lasso and Ridge regression
lasso = LassoCV().fit(X, y)
ridge = RidgeCV().fit(X, y)

lasso_importance = np.abs(lasso.coef_)
ridge_importance = np.abs(ridge.coef_)

features_lasso_sorted = sorted(zip(X.columns, lasso_importance), key=lambda x: x[1], reverse=True)
features_ridge_sorted = sorted(zip(X.columns, ridge_importance), key=lambda x: x[1], reverse=True)

features_lasso_df = pd.DataFrame(features_lasso_sorted, columns=['Feature', 'Lasso Importance'])
features_ridge_df = pd.DataFrame(features_ridge_sorted, columns=['Feature', 'Ridge Importance'])

# Random Forest for feature importance
rf = RandomForestRegressor()
rf.fit(X, y)

features_rf_sorted = sorted(zip(X.columns, rf.feature_importances_), key=lambda x: x[1], reverse=True)
features_rf_df = pd.DataFrame(features_rf_sorted, columns=['Feature', 'RF Importance'])

# Combining all the data into a single dataframe without merging by feature
final_df = pd.concat([correlations, t_tests_df.drop('Feature', axis=1), features_lasso_df.drop('Feature', axis=1), 
                      features_ridge_df.drop('Feature', axis=1), features_rf_df.drop('Feature', axis=1)], axis=1)

# 상관관계가 가장 높은 상위 8개의 피처를 선정
top_8_features = correlations.sort_values(by='Correlation', ascending=False).head(8)['Feature']

# X와 y 정의 (df2와 이전 코드에서의 처리 과정을 바탕으로)
X_top8 = X[top_8_features]
y = df['target']  # y 값을 0과 1로 조정

# 데이터를 학습 세트와 테스트 세트로 분할
X_train, X_test, y_train, y_test = train_test_split(X_top8, y, test_size=0.3, random_state=42)

# 모델 초기화
models = {
    'SVC': SVC(),
    'LogisticRegression': LogisticRegression(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'RandomForestClassifier': RandomForestClassifier()
}

# 점수를 저장할 딕셔너리
scores = {
    'Model': [],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1_Score': []
}

# 각 모델에 대해 학습 및 평가
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    scores['Model'].append(model_name)
    scores['Accuracy'].append(accuracy_score(y_test, y_pred))
    scores['Precision'].append(precision_score(y_test, y_pred, average='macro'))
    scores['Recall'].append(recall_score(y_test, y_pred, average='macro'))
    scores['F1_Score'].append(f1_score(y_test, y_pred, average='macro'))

# 점수 딕셔너리를 데이터프레임으로 변환
scores_df = pd.DataFrame(scores)
scores_df

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Model,Accuracy,Precision,Recall,F1_Score
0,SVC,0.605263,0.654882,0.628852,0.59517
1,LogisticRegression,0.644737,0.684314,0.664566,0.639684
2,DecisionTreeClassifier,0.526316,0.543155,0.540616,0.523345
3,RandomForestClassifier,0.578947,0.594203,0.591036,0.577778


In [4]:
top_8_features

0       [일]외국인_수량
1          종가_등락률
2        [일]개인_수량
3    시가_대비_종가_등락률
4         RSI_등락률
5    시가_대비_고가_등락률
6    저가_대비_종가_등락률
7    시가_대비_저가_등락률
Name: Feature, dtype: object