In [24]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

df= pd.read_csv("./전처리/df_stock.csv")
df_stock = df.drop(['Unnamed: 0','Code'],axis = 1)

# 1월3일의 결과를 1월 2일의 기사를 토대로 예측해야 하므로 한칸 위로
df_stock['target']=df_stock['ChangeCode'].shift(-1)
# shift가 발생하면서 1월2일의 라벨들이 12월 30이롤 이동하기 때문에 삭제
check = (df_stock['Date'] == '2019-12-30')
# ~ 기호를 이용하여, 해당하지 않은 데이터만 남긴다
df_stock = df_stock[~check]

# 라벨인코딩을 통해서, 해당 타겟 값을 바꿔준다.
encoder = LabelEncoder()
encoder.fit(df_stock['target'])
labels = encoder.transform(df_stock['target'])
df_stock['target'] = labels

In [9]:
from pycaret.classification import *

# main - feature 500

df_news = pd.read_csv('./전처리/TF-IDF/main_tf-idf_500.csv')

# 주식 데이터와 뉴스 데이터를 맞추기 위해서,
# 회사의 이름과 날짜를 하나의 id(키) 역할로 만듬
df_stock['Date'] = df_stock['Date'].str.replace('-','.')
df_stock['id'] = df_stock['Name']+df_stock['Date']
df_news['id'] = df_news['Company']+df_news['Day']

total_df = pd.merge(df_stock, df_news, on = 'id').drop(['Name','Date','id','Company','Day','Unnamed: 0','ChangeCode'],axis=1)
total_df = total_df.dropna()

setup_clf = setup(data=total_df, target='target', session_id = 1, use_gpu = True)
top5_main_500 = compare_models(sort='Accuracy', n_select=5, exclude=['catboost','gbc']) # 해당 모델들이 시간이 너무 걸려서 그냥 빼버림 ㅠㅠ

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
qda,Quadratic Discriminant Analysis,0.5437,0.5498,0.6433,0.5573,0.5971,0.0773,0.0784,1.034
rf,Random Forest Classifier,0.5312,0.5301,0.6454,0.5458,0.5914,0.0505,0.0515,3.275
lr,Logistic Regression,0.5299,0.5271,0.6876,0.5418,0.606,0.0433,0.0452,0.576
ridge,Ridge Classifier,0.5275,0.0,0.6581,0.5417,0.5942,0.0413,0.0425,0.105
lightgbm,Light Gradient Boosting Machine,0.526,0.5262,0.6375,0.5418,0.5857,0.0402,0.041,1.605
lda,Linear Discriminant Analysis,0.5256,0.5259,0.643,0.5411,0.5876,0.0388,0.0396,1.365
knn,K Neighbors Classifier,0.5243,0.5317,0.5559,0.5468,0.5513,0.0452,0.0452,8.672
svm,SVM - Linear Kernel,0.5235,0.0,0.8233,0.5296,0.6395,0.0148,0.0214,0.522
et,Extra Trees Classifier,0.5211,0.5185,0.6433,0.5373,0.5854,0.0293,0.03,2.467
nb,Naive Bayes,0.518,0.5169,0.6024,0.537,0.5677,0.0269,0.0272,0.11


In [10]:
# head - feature 500
df_news = pd.read_csv('./전처리/TF-IDF/head_tf-idf_500.csv')
df_stock['Date'] = df_stock['Date'].str.replace('-','.')
df_stock['id'] = df_stock['Name']+df_stock['Date']
df_news['id'] = df_news['Company']+df_news['Day']
total_df = pd.merge(df_stock, df_news, on = 'id').drop(['Name','Date','id','Company','Day','Unnamed: 0','ChangeCode'],axis=1)
total_df = total_df.dropna()
target = total_df['target']
test = total_df.drop(['target'], axis=1)

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(test, target, test_size=0.3, shuffle=False, random_state=100)

lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
pred = lr_clf.predict(X_test)

In [14]:
print(pd.DataFrame(pred).value_counts())

print(y_test.value_counts())

1    4449
0    2185
dtype: int64
1    3484
0    3150
Name: target, dtype: int64


In [19]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score

print("오차 행렬\n", confusion_matrix(y_test, pred))
print("정확도 : ", accuracy_score(y_test, pred))
print("정밀도 : ", precision_score(y_test, pred))
print("재현율 : ", recall_score(y_test, pred))
print("f1 : ", f1_score(y_test,pred))
print("roc : ", roc_auc_score(y_test,pred))

오차 행렬
 [[1108 2042]
 [1077 2407]]
정확도 :  0.5298462466083811
정밀도 :  0.5410204540346145
재현율 :  0.6908725602755453
f1 :  0.6068322198411698
roc :  0.5213092960107885


In [20]:
import eli5
from eli5.sklearn  import PermutationImportance

perm_lr = PermutationImportance(lr_clf, scoring = "f1", random_state = 42).fit(X_test, y_test)
eli5.show_weights(perm_lr, top = 100, feature_names = X_test.columns.tolist())

Weight,Feature
0.0053  ± 0.0009,마감
0.0050  ± 0.0012,코스피
0.0040  ± 0.0015,거래일
0.0032  ± 0.0022,순매도
0.0032  ± 0.0044,시황
0.0030  ± 0.0016,출발
0.0024  ± 0.0010,매수
0.0023  ± 0.0013,쇼크
0.0022  ± 0.0020,미래
0.0022  ± 0.0030,증시
