In [19]:
from pykrx import stock
import pandas as pd
import numpy as np
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Ticker code of the stock to analyse.
# NOTE(review): the original comment labelled this as the SK hynix code, while a
# commented-out CSV path later in this script names a different company --
# confirm that "399720" is the intended ticker.
ticker = "399720"
# Start and end dates of the history to download.
start_date, end_date = "2014-03-20", "2024-03-20"
# Download the daily price/volume table for the ticker over that period.
df = stock.get_market_ohlcv_by_date(
    ticker=ticker,
    fromdate=start_date,
    todate=end_date,
)


def calculate_macd(df, short_window=12, long_window=26, signal_window=9):
    """Add MACD columns computed from the '종가' column to ``df``.

    The frame is modified in place and also returned. Four columns are
    written, in this order: 'EMA_short', 'EMA_long', 'MACD' (short EMA minus
    long EMA) and 'MACD_Signal' (EMA of 'MACD').

    Args:
        df: DataFrame with a numeric '종가' column.
        short_window: Span of the short EMA.
        long_window: Span of the long EMA.
        signal_window: Span of the EMA applied to the MACD line.

    Returns:
        The same DataFrame object that was passed in.
    """
    def _ema(series, span):
        # Recursive (adjust=False) exponential moving average.
        return series.ewm(span=span, adjust=False).mean()

    close = df['종가']
    df['EMA_short'] = _ema(close, short_window)
    df['EMA_long'] = _ema(close, long_window)
    df['MACD'] = df['EMA_short'] - df['EMA_long']
    df['MACD_Signal'] = _ema(df['MACD'], signal_window)
    return df

def calculate_rsi(df, window=14, signal_window=9):
    """Add RSI columns computed from the '종가' column to ``df``.

    Average gain and average loss are plain rolling means over ``window``
    rows. The frame is modified in place and also returned; the columns
    'RSI' and 'RSI_Signal' (EMA of 'RSI') are written.

    Args:
        df: DataFrame with a numeric '종가' column.
        window: Number of rows in each rolling average.
        signal_window: Span of the EMA applied to the RSI line.

    Returns:
        The same DataFrame object that was passed in.
    """
    change = df['종가'].diff()

    # Keep only the rises / only the falls; every other entry (including the
    # leading NaN produced by diff) becomes 0 so it still counts in the window.
    rises = change.where(change > 0, 0)
    falls = -change.where(change < 0, 0)

    average_gain = rises.rolling(window=window).mean()
    average_loss = falls.rolling(window=window).mean()

    relative_strength = average_gain / average_loss
    df['RSI'] = 100 - (100 / (1 + relative_strength))

    # Signal line: recursive EMA of the RSI itself.
    df['RSI_Signal'] = df['RSI'].ewm(span=signal_window, adjust=False).mean()
    return df

# Attach the MACD columns, then the RSI columns (each helper returns the frame).
df = calculate_rsi(calculate_macd(df))

# From here on df is assumed to contain a '종가' (close) column.
# Example of loading such a frame from disk instead:
# df = pd.read_csv('path_to_your_data.csv')

# Window lengths used for every moving average below.
periods = [5, 20, 60, 120, 240]

# Simple moving averages of the close, one column per window.
for period in periods:
    df[f'SMA_{period}'] = df['종가'].rolling(window=period).mean()

# Exponential moving averages of the close, one column per window.
for period in periods:
    df[f'EMA_{period}'] = df['종가'].ewm(span=period, adjust=False).mean()

# Every (shorter, longer) window pair, listed in the order a nested index
# loop over `periods` would visit them.
period_pairs = [
    (shorter, longer)
    for position, shorter in enumerate(periods)
    for longer in periods[position + 1:]
]

# Spread between each pair of SMAs (shorter window minus longer window).
for smaller_period, larger_period in period_pairs:
    sma_spread = df[f'SMA_{smaller_period}'] - df[f'SMA_{larger_period}']
    df[f'SMA_{smaller_period}_minus_SMA_{larger_period}'] = sma_spread

# Spread between each pair of EMAs (shorter window minus longer window).
for smaller_period, larger_period in period_pairs:
    ema_spread = df[f'EMA_{smaller_period}'] - df[f'EMA_{larger_period}']
    df[f'EMA_{smaller_period}_minus_EMA_{larger_period}'] = ema_spread

# Window lengths whose SMA_/EMA_ columns get a change-rate column below.
periods = [5, 20, 60, 120, 240]

# Row-over-row percentage change of the raw price/volume columns and of the
# MACD / RSI indicator columns.
base_columns = ['시가', '고가', '저가', '거래량', 'MACD', 'MACD_Signal', 'RSI', 'RSI_Signal']
for column in base_columns:
    df[f'{column}_등락률'] = df[column].pct_change() * 100

# Row-over-row percentage change of every moving average: all SMA columns
# first, then all EMA columns.
for prefix in ('SMA', 'EMA'):
    for period in periods:
        average_name = f'{prefix}_{period}'
        df[f'{average_name}_등락률'] = df[average_name].pct_change() * 100

# 1 where MACD is at or above zero, otherwise 0.
df['MACD_Binary'] = df['MACD'].ge(0).astype(int)

# 1 where MACD is at or above its signal line, otherwise 0.
macd_spread = df['MACD'] - df['MACD_Signal']
df['MACD_minus_Signal_Binary'] = macd_spread.ge(0).astype(int)

# 1 where RSI is at or above its signal line, otherwise 0.
# (A NaN spread compares as False and therefore maps to 0.)
rsi_spread = df['RSI'] - df['RSI_Signal']
df['RSI_minus_Signal_Binary'] = rsi_spread.ge(0).astype(int)

# Same-row percentage moves between two price columns: (other - base) / base * 100.
# Each tuple is (base column, other column); the order fixes the column order.
price_pairs = [
    ('시가', '종가'),
    ('시가', '저가'),
    ('시가', '고가'),
    ('저가', '종가'),
    ('저가', '고가'),
    ('고가', '종가'),
]
for base_col, other_col in price_pairs:
    df[f'{base_col}_대비_{other_col}_등락률'] = ((df[other_col] - df[base_col]) / df[base_col]) * 100

# Distance of the close from each moving average: all SMA windows first,
# then all EMA windows.
for kind in ('SMA', 'EMA'):
    for period in [5, 20, 60, 120, 240]:
        df[f'종가_minus_{kind}_{period}'] = df['종가'] - df[f'{kind}_{period}']


#csv_file_path = "C:/apps/h1/이스트소프트_data.csv"  # 저장할 파일 경로 및 이름 설정
#df.to_csv(csv_file_path, encoding='utf-8-sig')

#print(f"Data saved to {csv_file_path}")

# Label every row with the NEXT row's '등락률' value (shift(-1) pulls it up by one).
df['next_day_return'] = df['등락률'].shift(-1)
# +1 when the next row's return is above 0.25 (same unit as '등락률'), otherwise -1.
# The final row has no next-day return (NaN) and is labelled -1 here, but it is
# removed by the dropna below.
df['target'] = df['next_day_return'].apply(lambda x: 1 if x > 0.25 else -1)

# Ratio features built earlier in this script (pct_change columns and the
# price-vs-price percentages) divide by another value, so a zero denominator
# leaves +/-inf in the frame. dropna() does not remove inf, and the
# scikit-learn estimators fitted later reject non-finite input, so turn inf
# into NaN first and let dropna discard those rows as well.
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Drop every row that still has a missing value. This removes more than the
# last row: all leading rows where a rolling window is not yet full are
# dropped too.
df.dropna(inplace=True)

first_column_name = df.columns[0]

# Leave the last 50 rows out of the data used for feature ranking.
df1 = df.iloc[:-50, :]
# Features: everything except the label and the value the label is derived from.
X = df1.drop(['next_day_return', 'target'], axis=1)
y = df1['target']


#X = df2.drop(['날짜', 'next_day_return','target'], axis=1)
#y = df2['target']

# Step 5: rank features by absolute correlation with the label.
correlations = X.corrwith(y).abs().sort_values(ascending=False)

# Step 6: two-sample t-test per feature between the +1 and -1 label groups,
# ranked by ascending p-value.
t_tests = {
    column: ttest_ind(X[column][y == 1], X[column][y == -1], nan_policy='omit')
    for column in X.columns
}
t_tests_sorted = sorted(t_tests.items(), key=lambda x: x[1].pvalue)

# Step 7: Lasso and Ridge coefficients as embedded importances.
# The features live on very different scales (raw prices and volume next to
# 0/1 flags and percentages). Coefficient magnitudes are only comparable when
# every feature has the same scale, and unscaled inputs also make the Lasso
# coordinate-descent solver harder to converge, so standardise each column to
# zero mean / unit variance first. Constant columns keep a divisor of 1 to
# avoid dividing by zero.
feature_scale = X.std(ddof=0).replace(0, 1)
X_scaled = (X - X.mean()) / feature_scale

lasso = LassoCV().fit(X_scaled, y)
ridge = RidgeCV().fit(X_scaled, y)

lasso_importance = np.abs(lasso.coef_)
ridge_importance = np.abs(ridge.coef_)

features_lasso_sorted = sorted(zip(X.columns, lasso_importance), key=lambda x: x[1], reverse=True)
features_ridge_sorted = sorted(zip(X.columns, ridge_importance), key=lambda x: x[1], reverse=True)

# Step 8: random-forest feature importances, fitted on the unscaled features.
# NOTE(review): this fits a regressor on the +1/-1 labels, and no random_state
# is passed, so the ranking can differ from run to run.
rf = RandomForestRegressor()
rf.fit(X, y)
features_rf_sorted = sorted(zip(X.columns, rf.feature_importances_), key=lambda x: x[1], reverse=True)
# print(correlations)
# print(t_tests_sorted)
# print(features_lasso_sorted)
# print(features_ridge_sorted)
# print(features_rf_sorted)



  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


In [20]:
features_rf_sorted 

[('EMA_60_minus_EMA_120', 0.04536625032037243),
 ('SMA_60', 0.04423570687148038),
 ('SMA_60_minus_SMA_120', 0.03419362684361768),
 ('MACD_등락률', 0.02964588072249679),
 ('시가_등락률', 0.029419266775733206),
 ('RSI_Signal_등락률', 0.02926335681151429),
 ('고가_대비_종가_등락률', 0.027903637075195692),
 ('거래량_등락률', 0.027389519672163472),
 ('저가_대비_종가_등락률', 0.023724277980613894),
 ('SMA_20_minus_SMA_120', 0.023537086349080596),
 ('저가_등락률', 0.023528944376115482),
 ('등락률', 0.02350948669523943),
 ('MACD_Signal_등락률', 0.022124030586367156),
 ('RSI_등락률', 0.022011293214606164),
 ('RSI', 0.02199255876542748),
 ('SMA_5_등락률', 0.021475679730725748),
 ('시가_대비_저가_등락률', 0.021337567665116616),
 ('저가_대비_고가_등락률', 0.020154867103079555),
 ('SMA_60_등락률', 0.01960979422215315),
 ('거래량', 0.018987868356075837),
 ('고가_등락률', 0.018751743767473513),
 ('시가_대비_고가_등락률', 0.018613544166865437),
 ('SMA_60_minus_SMA_240', 0.016962262583483736),
 ('SMA_20_minus_SMA_60', 0.01642816830942531),
 ('시가', 0.016306974324110206),
 ('SMA_240_등락률', 0.0

In [15]:
df1

Unnamed: 0_level_0,시가,고가,저가,종가,거래량,등락률,EMA_short,EMA_long,MACD,MACD_Signal,...,종가_minus_SMA_60,종가_minus_SMA_120,종가_minus_SMA_240,종가_minus_EMA_5,종가_minus_EMA_20,종가_minus_EMA_60,종가_minus_EMA_120,종가_minus_EMA_240,next_day_return,target
날짜,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-03-16,19075,21100,19025,21025,221251,11.390728,18529.085772,17988.221425,540.864347,241.490869,...,4664.583333,8892.291667,10973.958333,1651.065935,2829.374496,4794.217962,7317.166318,9469.822352,-2.021403,-1
2015-03-17,21450,21450,20300,20600,80670,-2.021403,18847.687961,18181.686505,666.001456,326.392986,...,4071.250000,8358.541667,10502.500000,817.377290,2175.386449,4225.964914,6778.246213,8969.761585,0.970874,1
2015-03-18,20600,21700,20100,20800,79764,0.970874,19148.043659,18375.635653,772.408007,415.595990,...,4091.666667,8448.125000,10655.625000,678.251526,2149.159168,4280.851310,6862.903301,9093.663979,1.802885,1
2015-03-19,20800,21700,19525,21175,139834,1.802885,19459.883096,18582.995975,876.887122,507.854217,...,4284.583333,8709.583333,10981.354167,702.167684,2283.763057,4503.200447,7118.268535,9390.085855,-0.354191,-1
2015-03-20,20775,22350,20500,21100,120126,-0.354191,19712.208774,18769.440717,942.768057,594.836985,...,4021.250000,8520.625000,10857.187500,418.111790,1998.404670,4283.013547,6926.850873,9237.782238,5.687204,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-26,14200,14850,14080,14600,204373,2.816901,13995.569931,13927.901601,67.668330,-26.739942,...,617.166667,-108.500000,1109.708333,406.769060,669.149775,459.852238,418.356770,981.240689,0.821918,1
2023-12-27,14480,14950,14400,14720,162134,0.821918,14107.020711,13986.575557,120.445154,2.697077,...,762.166667,5.500000,1204.708333,351.179373,713.992653,560.840689,529.458311,1092.101762,0.407609,1
2023-12-28,14900,14910,14500,14780,88378,0.407609,14210.555986,14045.347738,165.208248,35.199311,...,854.166667,57.000000,1239.875000,274.119582,700.279067,600.485257,579.715199,1142.540752,5.615697,1
2024-01-02,15030,15800,14910,15610,343657,5.615697,14425.855065,14161.247905,264.607160,81.080881,...,1693.833333,871.000000,2040.166667,736.079722,1384.538204,1383.584101,1386.414121,1956.171119,-0.640615,-1
