In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from pykrx import stock
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures
ticker_list = stock.get_market_ticker_list(market="KOSPI")

#ticker_list=[]
# 빈 리스트
results_list = []

# 각 종목에 대해 반복
for ticker in tqdm(ticker_list, desc="Processing"):
    try:
        ticker_name = stock.get_market_ticker_name(ticker)
        start_date = "2020-03-20"
        end_date = "2024-03-20"
        # 주어진 기간 동안의 일별 거래량 정보 가져오기
        df = stock.get_market_ohlcv_by_date(fromdate=start_date, todate=end_date, ticker=ticker)


        def calculate_macd(df, short_window=12, long_window=26, signal_window=9):
            """MACD 및 MACD 신호 계산"""
            df['EMA_short'] = df['종가'].ewm(span=short_window, adjust=False).mean()
            df['EMA_long'] = df['종가'].ewm(span=long_window, adjust=False).mean()
            df['MACD'] = df['EMA_short'] - df['EMA_long']
            df['MACD_Signal'] = df['MACD'].ewm(span=signal_window, adjust=False).mean()
            return df

        def calculate_rsi(df, window=14, signal_window=9):
            """RSI 및 RSI 신호 계산"""
            delta = df['종가'].diff()
            gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
            loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()

            rs = gain / loss
            df['RSI'] = 100 - (100 / (1 + rs))
            
            # RSI 신호선 추가
            df['RSI_Signal'] = df['RSI'].ewm(span=signal_window, adjust=False).mean()
            return df

        # MACD 및 MACD 신호 계산
        df = calculate_macd(df)

        df = calculate_rsi(df)


        # 지정된 기간에 대한 SMA 계산
        periods = [5, 20, 60, 120, 240]
        for period in periods:
            df[f'SMA_{period}'] = df['종가'].rolling(window=period).mean()

        # 지정된 기간에 대한 EMA 계산
        for period in periods:
            df[f'EMA_{period}'] = df['종가'].ewm(span=period, adjust=False).mean()

        for i in range(len(periods)):
            for j in range(i + 1, len(periods)):
                smaller_period = periods[i]
                larger_period = periods[j]
                df[f'SMA_{smaller_period}_minus_SMA_{larger_period}'] = df[f'SMA_{smaller_period}'] - df[f'SMA_{larger_period}']

        # EMA 간의 차이 계산
        for i in range(len(periods)):
            for j in range(i + 1, len(periods)):
                smaller_period = periods[i]
                larger_period = periods[j]
                df[f'EMA_{smaller_period}_minus_EMA_{larger_period}'] = df[f'EMA_{smaller_period}'] - df[f'EMA_{larger_period}']

        for column in ['시가', '고가', '저가','거래량','MACD','MACD_Signal','RSI','RSI_Signal']:
            df[f'{column}_등락률'] = df[column].pct_change() * 100

        # SMA 및 EMA의 전 거래일 대비 등락률 계산 및 DataFrame에 추가
        periods = [5, 20, 60, 120, 240]

        # SMA 등락률 계산 및 추가
        for period in periods:
            df[f'SMA_{period}_등락률'] = df[f'SMA_{period}'].pct_change() * 100

        # EMA 등락률 계산 및 추가
        for period in periods:
            df[f'EMA_{period}_등락률'] = df[f'EMA_{period}'].pct_change() * 100

        # 'MACD'가 0 이상일 때 1, 아니면 0을 할당
        df['MACD_Binary'] = (df['MACD'] >= 0).astype(int)

        # 'MACD - MACD_Signal'이 0 이상일 때 1, 아니면 0을 할당
        df['MACD_minus_Signal_Binary'] = ((df['MACD'] - df['MACD_Signal']) >= 0).astype(int)

        # 'RSI - RSI_Signal'이 0 이상일 때 1, 아니면 0을 할당
        df['RSI_minus_Signal_Binary'] = ((df['RSI'] - df['RSI_Signal']) >= 0).astype(int)

        # 시가 대비 종가 등락률 컬럼 추가
        df['시가_대비_종가_등락률'] = ((df['종가'] - df['시가']) / df['시가']) * 100
        df['시가_대비_저가_등락률'] = ((df['저가'] - df['시가']) / df['시가']) * 100
        df['시가_대비_고가_등락률'] = ((df['고가'] - df['시가']) / df['시가']) * 100
        df['저가_대비_종가_등락률'] = ((df['종가'] - df['저가']) / df['저가']) * 100
        df['저가_대비_고가_등락률'] = ((df['고가'] - df['저가']) / df['저가']) * 100
        df['고가_대비_종가_등락률'] = ((df['종가'] - df['고가']) / df['고가']) * 100
        # 종가 - SMA [5, 20, 60, 120, 240] 값의 차이 컬럼 추가
        for period in [5, 20, 60, 120, 240]:
            df[f'종가_minus_SMA_{period}'] = df['종가'] - df[f'SMA_{period}']

        # 종가 - EMA [5, 20, 60, 120, 240] 값의 차이 컬럼 추가
        for period in [5, 20, 60, 120, 240]:
            df[f'종가_minus_EMA_{period}'] = df['종가'] - df[f'EMA_{period}']


        # 등락률을 기준으로 다음 날 등락률 계산 후 target 생성
        df['next_day_return'] = df['등락률'].shift(-1)
        df['target'] = df['next_day_return'].apply(lambda x: 1 if x > 0.25 else -1)
        df.dropna(inplace=True)  # 마지막 행 삭제

        first_column_name = df.columns[0]

        
        X = df.drop(['next_day_return','target'], axis=1)
        y = df['target']

        # Calculating correlations for Forward Selection
        correlations = X.corrwith(y).abs().sort_values(ascending=False).reset_index()
        correlations.columns = ['Feature', 'Correlation']

        top_8_features = correlations.sort_values(by='Correlation', ascending=False).head(8)['Feature']

        # X와 y 정의 (df2와 이전 코드에서의 처리 과정을 바탕으로)
        X_top8 = X[top_8_features]
        y = df['target'] # y 값을 0과 1로 조정

        # 데이터를 학습 세트와 테스트 세트로 분할
        X_train, X_test, y_train, y_test = train_test_split(X_top8, y, test_size=0.25, random_state=42)

        # 선형회귀 모델 학습 및 평가
        linear_reg = LinearRegression()
        linear_reg.fit(X_train, y_train)
        y_pred_linear = linear_reg.predict(X_test)
        r2_linear = r2_score(y_test, y_pred_linear)

        # 다항회귀 모델 학습 및 평가
        poly_features = PolynomialFeatures(degree=2, include_bias=False)
        X_poly_train = poly_features.fit_transform(X_train)
        X_poly_test = poly_features.transform(X_test)
        poly_reg = LinearRegression()
        poly_reg.fit(X_poly_train, y_train)
        y_pred_poly = poly_reg.predict(X_poly_test)
        r2_poly = r2_score(y_test, y_pred_poly)

        # 라쏘 회귀 모델 학습 및 평가
        lasso_reg = Lasso(alpha=0.1)
        lasso_reg.fit(X_train, y_train)
        y_pred_lasso = lasso_reg.predict(X_test)
        r2_lasso = r2_score(y_test, y_pred_lasso)

        # 릿지 회귀 모델 학습 및 평가
        ridge_reg = Ridge(alpha=1.0)
        ridge_reg.fit(X_train, y_train)
        y_pred_ridge = ridge_reg.predict(X_test)
        r2_ridge = r2_score(y_test, y_pred_ridge)

        # 결과 저장
        results_list.append({
            'Ticker': ticker,
            'Name': ticker_name,
            'r2_linear': r2_linear,
            'r2_poly': r2_poly,
            'r2_lasso': r2_lasso,
            'r2_ridge': r2_ridge
        })
        pass
    except Exception as e:
        print(f"Error processing {ticker}: {e}")
        continue

# 결과를 데이터프레임으로 변환하고 precision_svm 값에 따라 내림차순 정렬
results_df = pd.DataFrame(results_list).sort_values(by='r2_linear', ascending=False)



Processing:   4%|▎         | 34/953 [00:15<04:44,  3.23it/s] 

Error processing 017860: With n_samples=0, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.


Processing:  14%|█▎        | 129/953 [00:42<03:13,  4.26it/s]

Error processing 456040: With n_samples=0, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.


Processing:  18%|█▊        | 172/953 [00:56<03:29,  3.72it/s]

Error processing 465770: With n_samples=0, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.


Processing:  25%|██▌       | 240/953 [01:19<03:20,  3.55it/s]

Error processing 092790: With n_samples=0, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.


Processing:  33%|███▎      | 312/953 [01:44<02:12,  4.86it/s]

Error processing 460850: With n_samples=0, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.
Error processing 460860: With n_samples=0, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.


Processing:  36%|███▌      | 339/953 [01:53<02:31,  4.04it/s]

Error processing 111380: With n_samples=0, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.


Processing:  36%|███▋      | 346/953 [01:55<02:39,  3.81it/s]

Error processing 454910: With n_samples=0, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.


Processing:  45%|████▌     | 432/953 [02:24<02:18,  3.76it/s]

Error processing 448730: With n_samples=0, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.


Processing:  62%|██████▏   | 589/953 [03:14<01:14,  4.89it/s]

Error processing 278470: With n_samples=0, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.
Error processing 450080: With n_samples=0, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.


Processing:  68%|██████▊   | 647/953 [03:31<01:31,  3.35it/s]

Error processing 457190: With n_samples=0, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.


Processing:  73%|███████▎  | 695/953 [03:46<01:12,  3.54it/s]

Error processing 462520: With n_samples=0, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.


Processing:  74%|███████▎  | 702/953 [03:48<01:19,  3.15it/s]

Error processing 109070: With n_samples=0, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.


Processing:  92%|█████████▏| 881/953 [04:45<00:20,  3.48it/s]

Error processing 452260: With n_samples=0, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.
Error processing 45226K: With n_samples=0, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.


Processing:  93%|█████████▎| 883/953 [04:45<00:17,  4.11it/s]

Error processing 451800: With n_samples=1, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.


Processing:  94%|█████████▍| 900/953 [04:50<00:13,  3.89it/s]

Error processing 453340: With n_samples=0, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.


Processing: 100%|██████████| 953/953 [05:06<00:00,  3.11it/s]


In [3]:
import pandas as pd

# 전체 행을 출력하도록 설정
pd.set_option('display.max_rows', None)

# 전체 열을 출력하도록 설정
pd.set_option('display.max_columns', None)

results_df

Unnamed: 0,Ticker,Name,r2_linear,r2_poly,r2_lasso,r2_ridge
397,377740,바이오노트,0.248553,-27.13093,0.263833,0.273069
369,357430,마스턴프리미어리츠,0.145425,-0.06915491,0.06909,0.138399
730,45014K,코오롱모빌리티그룹우,0.127887,-2074.89,0.137928,0.127916
628,003465,유화증권우,0.095164,-0.08076875,0.075466,0.091379
703,000650,천일고속,0.088697,-0.09883059,0.049345,0.079111
387,357250,미래에셋맵스리츠,0.07988,-0.0581958,0.074597,0.079886
60,035000,HS애드,0.070591,0.04325922,0.038844,0.06787
820,168490,한국패러랠,0.067262,-0.09360075,0.050611,0.048282
452,002810,삼영무역,0.06444,-0.2070739,0.001028,0.063954
516,016590,신대양제지,0.061781,-0.06927948,0.058941,0.061781
