In [5]:
from pykrx import stock
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

df = pd.read_csv("삼성전자.csv", thousands=',', encoding='cp949')

df['next_day_return'] = (df['종가'].shift(+1) - df['종가'])/ df['종가'] *100
df['target'] = df['next_day_return'].apply(lambda x: 1 if x > 0.25 else -1)

 
df1= df.iloc[1:248, :]
X = df1.drop(['날짜','target', 'next_day_return', 'Unnamed: 12'], axis=1) 
y = df1['target']
# Calculating correlations for Forward Selection
correlations = X.corrwith(y).abs().sort_values(ascending=False).reset_index()
correlations.columns = ['Feature', 'Correlation']

#Performing T-tests
t_tests = {column: ttest_ind(X[column][y == 1], X[column][y == -1], nan_policy='omit') for column in X.columns}
t_tests_sorted = sorted(t_tests.items(), key=lambda x: x[1].pvalue)
t_tests_df = pd.DataFrame(t_tests_sorted, columns=['Feature', 'T-test'])
t_tests_df['T-test'] = t_tests_df['T-test'].apply(lambda x: x.pvalue)  # Only keep p-value for simplicity

# # Lasso and Ridge regression
# lasso = LassoCV().fit(X, y)
# ridge = RidgeCV().fit(X, y)

# lasso_importance = np.abs(lasso.coef_)
# ridge_importance = np.abs(ridge.coef_)

# features_lasso_sorted = sorted(zip(X.columns, lasso_importance), key=lambda x: x[1], reverse=True)
# features_ridge_sorted = sorted(zip(X.columns, ridge_importance), key=lambda x: x[1], reverse=True)

# features_lasso_df = pd.DataFrame(features_lasso_sorted, columns=['Feature', 'Lasso Importance'])
# features_ridge_df = pd.DataFrame(features_ridge_sorted, columns=['Feature', 'Ridge Importance'])

# Random Forest for feature importance
rf = RandomForestRegressor()
rf.fit(X, y)

features_rf_sorted = sorted(zip(X.columns, rf.feature_importances_), key=lambda x: x[1], reverse=True)
features_rf_df = pd.DataFrame(features_rf_sorted, columns=['Feature', 'RF Importance'])

# # Combining all the data into a single dataframe without merging by feature
# final_df = pd.concat([correlations, t_tests_df.drop('Feature', axis=1), features_lasso_df.drop('Feature', axis=1), 
#                       features_ridge_df.drop('Feature', axis=1), features_rf_df.drop('Feature', axis=1)], axis=1)

X = X[['프로그램순매수(종목)']]
y = df1['target']  # y 값을 0과 1로 조정
# 데이터를 학습 세트와 테스트 세트로 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# 모델 초기화
models = {
    'SVC': SVC(),
    'LogisticRegression': LogisticRegression(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'RandomForestClassifier': RandomForestClassifier()
}

# 점수를 저장할 딕셔너리
scores = {
    'Model': [],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1_Score': []
}

# 각 모델에 대해 학습 및 평가
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    scores['Model'].append(model_name)
    scores['Accuracy'].append(accuracy_score(y_test, y_pred))
    scores['Precision'].append(precision_score(y_test, y_pred, average='macro'))
    scores['Recall'].append(recall_score(y_test, y_pred, average='macro'))
    scores['F1_Score'].append(f1_score(y_test, y_pred, average='macro'))

# 점수 딕셔너리를 데이터프레임으로 변환
scores_df = pd.DataFrame(scores)
scores_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1_Score
0,SVC,0.516129,0.587571,0.516129,0.392157
1,LogisticRegression,0.516129,0.587571,0.516129,0.392157
2,DecisionTreeClassifier,0.370968,0.367521,0.370968,0.36685
3,RandomForestClassifier,0.370968,0.367521,0.370968,0.36685


In [6]:
features_rf_df

Unnamed: 0,Feature,RF Importance
0,프로그램순매수(종목),0.168886
1,누적,0.109237
2,"MA_종가,단순,20",0.105553
3,"MA_종가,단순,5",0.103619
4,저가,0.088658
5,"MA_종가,단순,240",0.084969
6,종가,0.081214
7,"MA_종가,단순,60",0.070345
8,"MA_종가,단순,120",0.067805
9,시가,0.066022


In [7]:
correlations

Unnamed: 0,Feature,Correlation
0,누적,0.135552
1,저가,0.122573
2,시가,0.11937
3,종가,0.110871
4,"MA_종가,단순,5",0.106638
5,고가,0.101911
6,"MA_종가,단순,20",0.094505
7,"MA_종가,단순,60",0.079049
8,프로그램순매수(종목),0.075242
9,"MA_종가,단순,120",0.038853


In [8]:
from pykrx import stock
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

df = pd.read_csv("삼성전자.csv", thousands=',', encoding='cp949')

df['next_day_return'] = (df['종가'].shift(+1) - df['종가'])/ df['종가'] *100
df['target'] = df['next_day_return'].apply(lambda x: 1 if x > 0.25 else -1)

 
df1= df.iloc[1:248, :]
X = df1.drop(['날짜','target', 'next_day_return', 'Unnamed: 12'], axis=1) 
y = df1['target']
# Calculating correlations for Forward Selection
correlations = X.corrwith(y).abs().sort_values(ascending=False).reset_index()
correlations.columns = ['Feature', 'Correlation']

#Performing T-tests
t_tests = {column: ttest_ind(X[column][y == 1], X[column][y == -1], nan_policy='omit') for column in X.columns}
t_tests_sorted = sorted(t_tests.items(), key=lambda x: x[1].pvalue)
t_tests_df = pd.DataFrame(t_tests_sorted, columns=['Feature', 'T-test'])
t_tests_df['T-test'] = t_tests_df['T-test'].apply(lambda x: x.pvalue)  # Only keep p-value for simplicity

# # Lasso and Ridge regression
# lasso = LassoCV().fit(X, y)
# ridge = RidgeCV().fit(X, y)

# lasso_importance = np.abs(lasso.coef_)
# ridge_importance = np.abs(ridge.coef_)

# features_lasso_sorted = sorted(zip(X.columns, lasso_importance), key=lambda x: x[1], reverse=True)
# features_ridge_sorted = sorted(zip(X.columns, ridge_importance), key=lambda x: x[1], reverse=True)

# features_lasso_df = pd.DataFrame(features_lasso_sorted, columns=['Feature', 'Lasso Importance'])
# features_ridge_df = pd.DataFrame(features_ridge_sorted, columns=['Feature', 'Ridge Importance'])

# Random Forest for feature importance
rf = RandomForestRegressor()
rf.fit(X, y)

features_rf_sorted = sorted(zip(X.columns, rf.feature_importances_), key=lambda x: x[1], reverse=True)
features_rf_df = pd.DataFrame(features_rf_sorted, columns=['Feature', 'RF Importance'])

# # Combining all the data into a single dataframe without merging by feature
# final_df = pd.concat([correlations, t_tests_df.drop('Feature', axis=1), features_lasso_df.drop('Feature', axis=1), 
#                       features_ridge_df.drop('Feature', axis=1), features_rf_df.drop('Feature', axis=1)], axis=1)

# Calculating correlations for Forward Selection


top_5_features = correlations.sort_values(by='Correlation', ascending=False).head(5)['Feature']
X_top5 = X[top_5_features]
y = df1['target']
X_train, X_test, y_train, y_test = train_test_split(X_top5, y, test_size=0.25, random_state=42)

# 모델 초기화
models = {
    'SVC': SVC(),
    'LogisticRegression': LogisticRegression(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'RandomForestClassifier': RandomForestClassifier()
}

# 점수를 저장할 딕셔너리
scores = {
    'Model': [],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1_Score': []
}

# 각 모델에 대해 학습 및 평가
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    scores['Model'].append(model_name)
    scores['Accuracy'].append(accuracy_score(y_test, y_pred))
    scores['Precision'].append(precision_score(y_test, y_pred, average='macro'))
    scores['Recall'].append(recall_score(y_test, y_pred, average='macro'))
    scores['F1_Score'].append(f1_score(y_test, y_pred, average='macro'))

# 점수 딕셔너리를 데이터프레임으로 변환
scores_df = pd.DataFrame(scores)
scores_df

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Model,Accuracy,Precision,Recall,F1_Score
0,SVC,0.5,0.25,0.5,0.333333
1,LogisticRegression,0.548387,0.572998,0.548387,0.506818
2,DecisionTreeClassifier,0.516129,0.51728,0.516129,0.507937
3,RandomForestClassifier,0.516129,0.520261,0.516129,0.490132
