In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import graphviz
from sklearn.tree import export_graphviz
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC

In [2]:
data = pd.read_csv("data.csv")

In [3]:
data = data.drop(['date','symbol','label','period','Unnamed: 0',"calendarYear","epsgrowth","epsdilutedGrowth","weightedAverageSharesGrowth","weightedAverageSharesDilutedGrowth","dividendsperShareGrowth","operatingCashFlowGrowth","freeCashFlowGrowth","tenYRevenueGrowthPerShare","fiveYRevenueGrowthPerShare","threeYRevenueGrowthPerShare","tenYOperatingCFGrowthPerShare","fiveYOperatingCFGrowthPerShare","threeYOperatingCFGrowthPerShare","tenYNetIncomeGrowthPerShare","fiveYNetIncomeGrowthPerShare","threeYNetIncomeGrowthPerShare","tenYShareholdersEquityGrowthPerShare","fiveYShareholdersEquityGrowthPerShare","threeYShareholdersEquityGrowthPerShare","tenYDividendperShareGrowthPerShare","fiveYDividendperShareGrowthPerShare","threeYDividendperShareGrowthPerShare","receivablesGrowth","inventoryGrowth","assetGrowth","bookValueperShareGrowth","debtGrowth","rdexpenseGrowth","sgaexpensesGrowth"], axis=1)

In [4]:
# 刪除NaN值
data.dropna(inplace=True)

In [5]:
# 標籤建立
data['Target'] = (data['open'].shift(-90) - data['open']) / data['open'] >= 0.1
data['Target'] = data['Target'].astype(int)

# 特徵工程
# 計算移動平均線
data['MA_15'] = data['close'].rolling(window=15).mean()
data['MA_30'] = data['close'].rolling(window=30).mean()
# 計算相對強弱指標 (RSI)
def calculate_rsi(data, window=14):
    delta = data['close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    rs = gain / loss
    return 100 - (100 / (1 + rs))

data['RSI'] = calculate_rsi(data)

# 計算移動平均收斂與發散指標 (MACD)
def calculate_macd(data, fastperiod=12, slowperiod=26, signalperiod=9):
    exp12 = data['close'].ewm(span=fastperiod, adjust=False).mean()
    exp26 = data['close'].ewm(span=slowperiod, adjust=False).mean()
    macd = exp12 - exp26
    signal = macd.ewm(span=signalperiod, adjust=False).mean()
    return macd - signal

data['MACD'] = calculate_macd(data)
# 刪除NaN值
data.dropna(inplace=True)

In [None]:
data.to_csv("t.csv")

In [6]:
# 檢查缺失值
missing_values = data.isnull().sum()

# 輸出有缺失值的 column
print("Columns with missing values:")
print(missing_values[missing_values > 0])

Columns with missing values:
Series([], dtype: int64)


In [7]:
# 分割資料
X = data.drop(['Target'], axis=1)
y = data['Target']
# 數據標準化
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
# 去除最後90個資料
X = X[:-90]
y = y[:-90]
# train_size = int(0.7 * len(X))
# X_train, X_test, y_train, y_test = X[:train_size], X[train_size:], y[:train_size], y[train_size:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

In [8]:
# 選擇模型
model = RandomForestClassifier()

# 訓練模型
model.fit(X_train, y_train)

# 預測
predictions = model.predict(X_test)

# 評估模型
print("Accuracy:", accuracy_score(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))

Accuracy: 0.9801980198019802
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       263
           1       0.97      0.88      0.92        40

    accuracy                           0.98       303
   macro avg       0.98      0.94      0.95       303
weighted avg       0.98      0.98      0.98       303



In [13]:
# 創建SVM
svm_classifier = SVC()

# 定義網格參數
param_grid = {'C': [ 1, 10, 100],
              'kernel': ['rbf', 'poly', 'sigmoid']}

# 網格搜索
grid_search = GridSearchCV(svm_classifier, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# 輸出最佳參數
print("最佳参数：", grid_search.best_params_)

# 使用最佳參數
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# 計算準確率
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy：", accuracy)
print("Classification Report:\n", classification_report(y_test, predictions))

最佳参数： {'C': 1, 'kernel': 'rbf'}
Accuracy： 0.8679867986798679
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       263
           1       0.97      0.88      0.92        40

    accuracy                           0.98       303
   macro avg       0.98      0.94      0.95       303
weighted avg       0.98      0.98      0.98       303

