## 股票分析
- Model: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
- Grid Search: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

df = pd.read_csv("stock_apple.csv")
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2018-04-30,1030.01001,1037.0,1016.849976,1017.330017,1017.330017,1664113
1,2018-04-27,1046.0,1049.5,1025.589966,1030.050049,1030.050049,1611500
2,2018-04-26,1029.51001,1047.97998,1018.190002,1040.040039,1040.040039,2079500
3,2018-04-25,1025.52002,1032.48999,1015.309998,1021.179993,1021.179993,2391100
4,2018-04-24,1052.0,1057.0,1010.590027,1019.97998,1019.97998,4760300


In [2]:
# 1.定義昨日收盤價、昨日成交量欄位
df["Yes Adj Close"] = df["Adj Close"].shift(-1)
df["Yes Volume"] = df["Volume"].shift(-1)

# 2.定義增益欄位(今天收盤價 - 昨天收盤價)
df["Gain"] = df["Adj Close"] - df["Yes Adj Close"] 

# 3.定義漲跌類別欄位 如果(今天收盤價 - 昨天收盤價) > 設為1， 反之設為0
df["Up"] = df["Gain"].apply(lambda x: 1 if x > 0 else 0)

print(df.shape)
df.head()

(503, 11)


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Yes Adj Close,Yes Volume,Gain,Up
0,2018-04-30,1030.01001,1037.0,1016.849976,1017.330017,1017.330017,1664113,1030.050049,1611500.0,-12.720032,0
1,2018-04-27,1046.0,1049.5,1025.589966,1030.050049,1030.050049,1611500,1040.040039,2079500.0,-9.98999,0
2,2018-04-26,1029.51001,1047.97998,1018.190002,1040.040039,1040.040039,2079500,1021.179993,2391100.0,18.860046,1
3,2018-04-25,1025.52002,1032.48999,1015.309998,1021.179993,1021.179993,2391100,1019.97998,4760300.0,1.200013,1
4,2018-04-24,1052.0,1057.0,1010.590027,1019.97998,1019.97998,4760300,1067.449951,2341300.0,-47.469971,0


In [3]:
# 4.將缺失值處理掉
df = df.dropna()
df = df.reset_index(drop=True)
print(df.shape)

df.head()

(502, 11)


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Yes Adj Close,Yes Volume,Gain,Up
0,2018-04-30,1030.01001,1037.0,1016.849976,1017.330017,1017.330017,1664113,1030.050049,1611500.0,-12.720032,0
1,2018-04-27,1046.0,1049.5,1025.589966,1030.050049,1030.050049,1611500,1040.040039,2079500.0,-9.98999,0
2,2018-04-26,1029.51001,1047.97998,1018.190002,1040.040039,1040.040039,2079500,1021.179993,2391100.0,18.860046,1
3,2018-04-25,1025.52002,1032.48999,1015.309998,1021.179993,1021.179993,2391100,1019.97998,4760300.0,1.200013,1
4,2018-04-24,1052.0,1057.0,1010.590027,1019.97998,1019.97998,4760300,1067.449951,2341300.0,-47.469971,0


In [4]:
df_normal = pd.DataFrame()
df_normal["Volume Ratio"] = df["Volume"].apply(pd.to_numeric) / df["Yes Volume"].apply(pd.to_numeric) 
df_normal["Close Price Ratio"] = df["Adj Close"].apply(pd.to_numeric) / df["Yes Adj Close"].apply(pd.to_numeric)

df_normal["Target"] = df["Up"].shift(1)

df_normal = df_normal.dropna()
df_normal = df_normal.reset_index(drop=True)
df_normal.head()

Unnamed: 0,Volume Ratio,Close Price Ratio,Target
0,0.774946,0.990395,0.0
1,0.869683,1.018469,0.0
2,0.5023,1.001177,1.0
3,2.033187,0.95553,1.0
4,1.23898,0.994865,0.0


In [5]:
# 定義X和y
X = df_normal[["Volume Ratio", "Close Price Ratio"]]
y = df_normal["Target"]

# 將資料集 8:2分成訓練資料及測試資料(假設random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) #random_state 種子值

# 將訓練資料做標準化
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

# 5. 建立模型
model = SVC()
model.fit(X_train, y_train)

X_test = scaler.transform(X_test)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
num_correct_samples = accuracy_score(y_test, y_pred, normalize=False)
con_matrix = confusion_matrix(y_test, y_pred)

print("Number of correct sample: {}".format(num_correct_samples))
print("Accuracy: {:.3f}".format(accuracy))
print("Confusion matrix: {}".format(con_matrix))

Number of correct sample: 62
Accuracy: 0.614
Confusion matrix: [[ 1 38]
 [ 1 61]]


In [6]:
# 調整參數
param_grid = {"kernel":["linear", "poly", "rbf"], "C":[0.01,1,100]} #default kernel:rbf, C:1
grid_search = GridSearchCV(model, param_grid, scoring="accuracy", cv=5, n_jobs=-1)
search_result = grid_search.fit(X_train, y_train)
print("Best Accuracy: :{:.3f} using {}".format(search_result.best_score_, search_result.best_params_))

Best Accuracy: :0.543 using {'C': 0.01, 'kernel': 'linear'}


In [7]:
bestmodel = grid_search.best_estimator_
bestmodel.fit(X_train, y_train)
y_pred_best = bestmodel.predict(X_test)

accuracy_best = accuracy_score(y_pred_best, y_pred)
num_correct_samples_best = accuracy_score(y_pred_best, y_pred, normalize=False)
con_matrix_best = confusion_matrix(y_pred_best, y_pred)

print("Number of correct sample: {}".format(num_correct_samples_best))
print("Accuracy: {:.3f}".format(accuracy_best))
print("Confusion matrix: {}".format(con_matrix_best))

Number of correct sample: 99
Accuracy: 0.980
Confusion matrix: [[ 0  0]
 [ 2 99]]
