# 주성분 분석(PCA)
주성분 분석은 특성들이 통계적으로 상관관계가 없도록 데이터셋을 회전시키는 기술입니다.  
PCA를 적용하기 전에 StandardScaler를 사용해 각 특성의 평균이 0, 분산이 1이 되도록 데이터의 스케일을 조정합니다.  
왜냐하면 PCA는 분산이 가장 큰 방향을 주성분으로 찾기 때문에, 특성의 스케일이 다르면 값의 범위가 큰 특성이 결과를 좌우하여 올바른 주성분을 찾을 수 없기 때문입니다.  
유튜브: https://youtu.be/JZD_nbg4HBc

In [1]:
# Load scikit-learn's bundled breast-cancer dataset; `cancer.data` is the
# feature matrix that the following cells scale and project with PCA.
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

In [3]:
# Display the raw feature matrix; the recorded output shows columns on very
# different scales (values around 1e+02 next to values around 1e-02).
cancer.data

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [4]:
# Create the scaler used to standardize the features before PCA.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [5]:
# Fit the scaler on the full feature matrix and standardize that same matrix.
# fit() returns the scaler itself, so the two steps are chained.
X_scaled = scaler.fit(cancer.data).transform(cancer.data)

In [6]:
# Display the standardized feature matrix.
X_scaled

array([[ 1.09706398, -2.07333501,  1.26993369, ...,  2.29607613,
         2.75062224,  1.93701461],
       [ 1.82982061, -0.35363241,  1.68595471, ...,  1.0870843 ,
        -0.24388967,  0.28118999],
       [ 1.57988811,  0.45618695,  1.56650313, ...,  1.95500035,
         1.152255  ,  0.20139121],
       ...,
       [ 0.70228425,  2.0455738 ,  0.67267578, ...,  0.41406869,
        -1.10454895, -0.31840916],
       [ 1.83834103,  2.33645719,  1.98252415, ...,  2.28998549,
         1.91908301,  2.21963528],
       [-1.80840125,  1.22179204, -1.81438851, ..., -1.74506282,
        -0.04813821, -0.75120669]])

In [9]:
# Configure PCA to keep only the first two principal components.
from sklearn.decomposition import PCA
pca = PCA(n_components=2)

In [10]:
# Fit PCA on the standardized data and project that same data onto the
# retained components (fit() returns the estimator, so the call is chained).
X_pca = pca.fit(X_scaled).transform(X_scaled)

In [11]:
# Compare dimensionality before and after the projection.
# print() already stringifies its arguments, so no explicit str() is needed.
print('원본 데이터 형태: ', X_scaled.shape)
print('축소된 데이터 형태: ', X_pca.shape)

원본 데이터 형태:  (569, 30)
축소된 데이터 형태:  (569, 2)


In [12]:
# First sample in the standardized feature space (30 values).
X_scaled[0]

array([ 1.09706398, -2.07333501,  1.26993369,  0.9843749 ,  1.56846633,
        3.28351467,  2.65287398,  2.53247522,  2.21751501,  2.25574689,
        2.48973393, -0.56526506,  2.83303087,  2.48757756, -0.21400165,
        1.31686157,  0.72402616,  0.66081994,  1.14875667,  0.90708308,
        1.88668963, -1.35929347,  2.30360062,  2.00123749,  1.30768627,
        2.61666502,  2.10952635,  2.29607613,  2.75062224,  1.93701461])

In [13]:
# The same sample after projection onto the two principal components.
X_pca[0]

array([9.19283683, 1.94858307])

# PCA

In [36]:
# Imported here so this section also runs top-to-bottom in a fresh kernel;
# the notebook's shared import cell only appears further down.
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the trade log and keep the 11 numeric feature columns.
df = pd.read_csv('data.csv')
X = df[['매매순서','최대거래대금','직전 거래대금','양봉개수','10이격도','20이격도','60이격도','매수등락률','시가등락률','뉴스기사','분봉전고점']]
X = X.to_numpy()

# The next cells scale and project X_train, so build the training split here.
# random_state=42 is the same seed the modelling section uses for its split.
X_train, X_test = train_test_split(X, random_state=42)

In [37]:
# Standardize the training split. Per the recorded output this is 122 rows by
# 11 features, i.e. X_train rather than the full X loaded above.
scaler = StandardScaler().fit(X_train)
X_scaled = scaler.transform(X_train)

In [39]:
# Configure PCA to keep six principal components of the trading features.
from sklearn.decomposition import PCA
pca = PCA(n_components=6)

In [40]:
# Fit PCA on the standardized training data and project it in one chained
# call; fit() returns the fitted estimator.
X_pca = pca.fit(X_scaled).transform(X_scaled)

In [41]:
# Report the shape before and after dimensionality reduction.
# print() converts the shape tuples to text on its own.
print('원본 데이터 형태: ', X_scaled.shape)
print('축소된 데이터 형태: ', X_pca.shape)

원본 데이터 형태:  (122, 11)
축소된 데이터 형태:  (122, 6)


# 딥러닝

In [611]:
import pandas as pd
import numpy as np

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adagrad

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

In [612]:
# Load the trade log and preview its first rows.
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,주문일자,종목명,매수가격,매수시간,매도시간,수익률,매매순서,최대거래대금 시간,최대거래대금,직전 거래대금,양봉개수,10이격도,20이격도,60이격도,종목코드,매수등락률,시가등락률,뉴스기사,분봉전고점
0,2022-03-29,고려시멘트,5230,09:09:07,09:21:27,2.57,1,09:06:00,113,63,1,4.38,9.72,13.32,198440,16.22,6.22,1,0.38
1,2022-03-29,고려시멘트,5410,09:28:09,09:32:19,1.55,2,09:06:00,113,16,2,0.93,1.72,11.55,198440,20.22,6.22,1,1.29
2,2022-03-29,영진약품,5490,09:51:11,09:55:24,0.36,3,09:50:00,62,62,1,1.57,1.94,4.43,3520,6.6,0.19,1,3.28
3,2022-03-29,버킷스튜디오,5060,09:58:16,09:59:56,-0.89,4,09:22:00,72,26,0,0.87,2.06,4.17,66410,11.58,3.09,1,0.99
4,2022-03-29,우리바이오,4390,10:00:40,10:00:50,-0.87,5,10:00:00,44,43,3,3.56,4.06,4.23,82850,10.86,4.17,1,1.94


In [613]:
# Turn the return column into a binary label: 1 when the return is positive,
# 0 otherwise. The comparison is vectorized instead of applied row by row;
# int64 is requested explicitly to match the dtype in the recorded output.
df['수익률'] = (df['수익률'] > 0).astype('int64')
df['수익률'].unique()

array([1, 0], dtype=int64)

In [614]:
# Feature matrix: the 11 numeric columns used as model inputs.
X = df.loc[:, [
    '매매순서',
    '최대거래대금',
    '직전 거래대금',
    '양봉개수',
    '10이격도',
    '20이격도',
    '60이격도',
    '매수등락률',
    '시가등락률',
    '뉴스기사',
    '분봉전고점',
]]
# Target: the binarized return column.
y = df.loc[:, '수익률']

In [615]:
# Convert both pandas objects to NumPy arrays, then hold out a test set.
# random_state is fixed so the split is reproducible.
X, y = X.to_numpy(), y.to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [616]:
# Fit the scaler on the training split only, then apply the same
# transformation to both splits so no test-set statistics are used.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

### PCA

In [617]:
# Keep four principal components; the network defined below is built with
# input_dim=4 to match.
from sklearn.decomposition import PCA
pca = PCA(n_components=4)

In [618]:
# Fit the components on the training split only, then project both splits.
# X_train and X_test are overwritten with their PCA projections.
pca.fit(X_train)
X_train, X_test = pca.transform(X_train), pca.transform(X_test)

In [619]:
# Binary classifier on the four PCA components: two hidden ReLU layers of
# 10 units each, followed by a single sigmoid output unit.
model = Sequential([
    Dense(10, input_dim=4, activation='relu'),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid'),
])
optimizer = Adagrad()

# Binary cross-entropy loss, with accuracy reported during fit/evaluate.
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [620]:
# Print each layer's output shape and parameter count.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_136 (Dense)            (None, 10)                50        
_________________________________________________________________
dense_137 (Dense)            (None, 10)                110       
_________________________________________________________________
dense_138 (Dense)            (None, 1)                 11        
Total params: 171
Trainable params: 171
Non-trainable params: 0
_________________________________________________________________


In [621]:
# Train on the PCA-projected training split: 200 epochs, 20 samples per batch.
model.fit(X_train, y_train, epochs=200, batch_size=20)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200

Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
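Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200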

Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<keras.callbacks.History at 0x1737477ef98>

In [622]:
# evaluate() yields the loss followed by the compiled metric (accuracy).
# The loss is bound to a real name rather than `_`: in IPython/Jupyter,
# assigning to `_` shadows the shell's "last result" variable and stops it
# from being updated for the rest of the session.
test_loss, accuracy = model.evaluate(X_test, y_test)
print(accuracy)

0.7560975667906971


In [623]:
# Predict on the held-out split; np.rint rounds the sigmoid outputs to the
# nearest integer so they can be compared with the 0/1 labels.
y_pred = model.predict(X_test)
confusion_matrix = metrics.confusion_matrix(
    y_true=y_test,
    y_pred=np.rint(y_pred),
)
confusion_matrix

array([[24,  3],
       [ 7,  7]], dtype=int64)

# Conclusion
주성분 분석을 활용하니 테스트 정확도(score)가 크게 개선되었다. 이전에는 0.6 정도가 한계였는데, PCA를 적용한 뒤에는 위 평가 결과처럼 약 0.76까지 나온다.