In [None]:
import pandas as pd  # import the package under the alias pd

# Read 'BTCUSDT_1M.csv' (decoded as UTF-8) into the DataFrame `df`.
df = pd.read_csv('BTCUSDT_1M.csv', encoding='utf8') 

In [None]:
# Display the freshly loaded frame for a visual check.
df

In [None]:
# Drop the rows that contain missing values and rebind `df` to the result,
# then display it.
df = df.dropna()
df

## 將資料欄位重新命名

In [None]:
# Rename the raw lower-case columns to capitalised OHLCV names;
# 'timestamp' becomes 'Date'.
df = df.rename(
    columns={
        'timestamp': 'Date',
        'open': 'Open',
        'high': 'High',
        'low': 'Low',
        'close': 'Close',
        'volume': 'Volume',
    }
)
# Show the distinct values found in the renamed 'Date' column.
df['Date'].unique()

In [None]:
# Discard the columns that are not used further on (this includes 'Date'),
# then display the remaining frame.
df = df.drop(
    columns=[
        "close_time",
        "quote_av",
        "trades",
        "tb_base_av",
        "tb_quote_av",
        "ignore",
        "Date",
    ]
)
df

In [None]:
# TaLib的安裝方式

!wget http://prdownloads.sourceforge.net/ta-lib/ta-lib-0.4.0-src.tar.gz
!tar -xzvf ta-lib-0.4.0-src.tar.gz
%cd ta-lib
!./configure --prefix=/usr
!make
!make install
!pip install Ta-Lib
!pip install mpl_finance
!pip install matplotlib

In [None]:
import os
import talib # technical-indicator generator
import sklearn.preprocessing as preprocessing  # data preprocessing (scikit-learn)
from sklearn.model_selection import train_test_split # train/test splitting
from sklearn.svm import SVC # SVC, the SVM classifier

# # Change the directory the data is read from
# path = '/content'
# os.getcwd() # get the current working directory
# os.chdir(path) # change the working directory

In [None]:
df.tail() # check the last rows

# 以下處理技術指標

需要10個特徵 ('MA10' , 'WMA10', 'MOM' , 'K' , 'D', 'RSI-10' , 'dea值' , '威廉指標R' , 'AD線' , 'CCI指標' )

In [None]:
# Open / High / Low / Close / Volume: bind each price column to its own variable.
# Open = opening price, High = highest price, Low = lowest price,
# Close = closing price, Volume = traded volume.
Open, High, Low, Close, Volume = (
    df[column] for column in ("Open", "High", "Low", "Close", "Volume")
)

##### 技術指標--均線

In [None]:
# Three moving averages of the close, each with timeperiod=10:
# talib.MA, talib.WMA (weighted) and talib.EMA (exponential).
# 'EMA10' is stored on the frame but is not part of the `features` list defined later.
df['MA10']  = talib.MA(Close, timeperiod=10)
df['WMA10'] = talib.WMA(Close, timeperiod=10)
df['EMA10'] = talib.EMA(Close, timeperiod=10)

##### 技術指標--動量線

In [None]:
# Momentum of the close with timeperiod=10, stored as column 'MOM'.
df['MOM'] = talib.MOM(Close, timeperiod=10)

##### 技術指標--KD

In [None]:
# talib.STOCHF (fast stochastic) is called on high/low/close and returns two
# series, unpacked here as fastk and fastd; they become the 'K' and 'D' columns.
fastk, fastd = talib.STOCHF( High ,Low, Close, fastk_period=5, fastd_period=3, fastd_matype=0)
df['K'] = fastk
df['D'] = fastd

##### 技術指標--RSI

In [None]:
# RSI of the close for two periods (5 and 10).
# Only 'RSI-10' appears in the `features` list defined later.
df['RSI-5']  = talib.RSI(Close,timeperiod=5)
df['RSI-10'] = talib.RSI(Close,timeperiod=10)

##### 技術指標--MACD

In [None]:
# talib.MACD returns three series in this order:
#   1. the MACD line (fast EMA - slow EMA)
#   2. the signal line (EMA of the MACD line)
#   3. the histogram (MACD line - signal line)
# In the DIF / DEA / MACD naming used for the columns below, DIF is the MACD
# line, DEA is the signal line and the "MACD" value is the histogram, so the
# result has to be unpacked as (dif, dea, macd).
dif, dea, macd = talib.MACD(Close, fastperiod=12, slowperiod=26, signalperiod=9)
df['dif值'] = dif
df['macd值'] = macd
df['dea值'] = dea

##### 技術指標--威廉R指標

In [None]:
# Williams %R from high/low/close with timeperiod=14.
df['威廉指標R'] = talib.WILLR(High, Low, Close, timeperiod=14)

##### 技術指標--AD線

In [None]:
# Output of talib.AD (A/D line) computed from high, low, close and volume.
df['AD線'] = talib.AD(High, Low, Close, Volume)

##### 技術指標--CCI線

In [None]:
# CCI from high/low/close with timeperiod=14.
df['CCI指標'] = talib.CCI(High, Low, Close, timeperiod=14)

##### 計算報酬率  (LABEL採用)

In [None]:
# One-step return of the close: (close[t] - close[t-1]) / close[t-1].
R = Close.sub(Close.shift(1)).div(Close.shift(1))
# Equivalent formulation once shifted: (next close - current close) / current close
# R = (Close.shift(-1)-Close)/Close

# Shift by -1 so that every row carries the return of the following row;
# the last row has no following row and is left as NaN.
df['報酬率(Return)'] = R.shift(-1)
df['報酬率(Return)']

In [None]:
df

## 判斷漲跌：1 是漲（報酬率 >= 0），0 是跌

In [None]:
import numpy as np

# Binary label: 1 where the forward return is >= 0, otherwise 0.
# NOTE(review): a NaN return compares as False, so the row(s) without a
# forward return (the last one after the shift) are labelled 0 as well.
tp = np.where(df['報酬率(Return)'].ge(0), 1, 0)
tp

In [None]:
# Attach the 0/1 array `tp` to the frame as the label column "Trading Point".
df["Trading Point"] = tp

In [None]:
# Display the frame, now including the "Trading Point" column.
df

### 整理階段

In [None]:
# Plain assignment (`df_0 = df`) only adds a second name for the same object,
# so any later in-place change to `df` would also alter the "backup".
# Take real copies instead.
df_0 = df.copy()  # snapshot of the frame as it is at this point
df_1 = df.copy()  # separate copy intended for training

In [None]:
features = ['MA10','WMA10','MOM','K','D','RSI-10','dea值','威廉指標R','AD線','CCI指標']
# Feature selection: keep the column names in a list first; the data is read with it later
Labels = ['Trading Point']
# Target selection: keep the column name in a list first; the data is read with it later

In [None]:
# X + Y column names together, i.e. the whole "exam sheet".
# NOTE(review): `Col` is not referenced again in the visible cells.
Col = features+Labels

In [None]:
# Select the feature columns and discard every row in which any of them is NaN.
df1 = df.loc[:, features].dropna(axis=0, how='any')
df1.head()  # check

### 資料前處理階段 -----  最大最小法 (+1 ~ -1)

In [None]:
# Preprocessor: min-max scaler configured for the output range (-1, 1).
minmax_scale = preprocessing.MinMaxScaler(feature_range=(-1,1))

In [None]:
# data_minmax = pd.DataFrame(minmax_scale.fit_transform(df1), index=df1.index, columns=df1.columns)
# Fit the scaler on df1 and transform it in one step (output range +1 ~ -1).
# The scaler returns a NumPy array, which is converted back to pandas in the next cell.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so test-period min/max values influence the training features;
# consider fitting on the training part only.
minmax_learn = minmax_scale.fit_transform(df1) 

In [None]:
# Convert the scaled array back to a pandas DataFrame, reusing df1's index and column names.
data_minmax = pd.DataFrame(minmax_learn,index=df1.index, columns=df1.columns)

In [None]:
# Preview the scaled features.
data_minmax.head()

In [None]:
# Add the label column(s) from `df` to the scaled feature frame.
# NOTE(review): data_minmax has fewer rows than df (NaN rows were dropped);
# presumably rows are matched by index here — confirm.
data_minmax[Labels] = df[Labels]

In [None]:
# Preview again, now with the label column attached.
data_minmax.head()

#### 資料分割

In [None]:
from sklearn.model_selection import train_test_split # train/test splitting

# Split without shuffling, so the original row order is preserved and the
# test part is the trailing 20% of the rows.
# Earlier variant with a different ratio:
# data_minmax_train , data_minmax_test = train_test_split(data_minmax, test_size=0.552, random_state=2,shuffle=False)
data_minmax_train, data_minmax_test = train_test_split(
    data_minmax,
    test_size=0.2,
    shuffle=False,
    random_state=2,
)

# train_size and test_size are complementary; setting test_size is enough.
# This ratio is just an example; it does not have to be 7:3 or 8:2.
# shuffle=True can be used to shuffle the data.

# The data can also be split by hand (e.g. slicing with pandas, or two separate Excel files).

#### 機器學習建立

In [None]:
data_minmax_train # check the training part

In [None]:
data_minmax_test # check the test part

In [None]:
# Look at the sizes directly: one (rows, columns) tuple per line, train first.
print(data_minmax_train.shape, data_minmax_test.shape, sep='\n')

In [None]:
# Feature matrices for both parts of the split.
X_train = data_minmax_train[features]
X_test = data_minmax_test[features]
# `Labels` is a list, so indexing with it yields a one-column DataFrame.
# The classifiers used below expect a 1-D target; squeeze the single column
# into a Series (a multi-column selection would be left unchanged).
y_train = data_minmax_train[Labels].squeeze(axis=1)
y_test = data_minmax_test[Labels].squeeze(axis=1)

In [None]:
# Only the last expression of a cell is rendered: the .head() result is
# discarded and the full y_train is what gets displayed.
y_train.head()
y_train

In [None]:
# Only the last expression of a cell is rendered: the .head() result is
# discarded and the full X_train is what gets displayed.
X_train.head()
X_train

In [None]:
# Only the last expression of a cell is rendered: the .head() result is
# discarded and the full y_test is what gets displayed.
y_test.head()
y_test

In [None]:
# Only the last expression of a cell is rendered: the .head() result is
# discarded and the full X_test is what gets displayed.
X_test.head()
X_test

# 機器學習

## XGBoost

In [None]:
from xgboost import XGBClassifier

# Create the XGBClassifier and train it on the training data in one chained
# call (fit returns the fitted estimator).
xgboostModel = XGBClassifier(learning_rate=0.3, n_estimators=100).fit(X_train, y_train)

# Class predictions for the training data
predicted = xgboostModel.predict(X_train)

# Share of correct predictions on the training set and on the test set
print('訓練集: ', xgboostModel.score(X_train, y_train))
print('測試集: ', xgboostModel.score(X_test, y_test))

## DecisionTree

In [None]:
from sklearn import metrics, tree

# Build the classifier. A decision tree permutes the candidate features
# randomly while searching for splits, so the seed is fixed to make the
# reported accuracy reproducible (same seed as the random-forest cell).
clf = tree.DecisionTreeClassifier(random_state=0)
iris_clf = clf.fit(X_train, y_train)  # fit returns the fitted estimator itself

# Predict on the test split
test_y_predicted = iris_clf.predict(X_test)
print(test_y_predicted)

# Ground truth, for comparison
# print(y_test)

# Accuracy: share of test rows whose predicted label equals the true label
accuracy = metrics.accuracy_score(y_test, test_y_predicted)
print(accuracy)

## RF模型

In [None]:
from sklearn.ensemble import RandomForestClassifier # 隨機森林分類器

In [None]:
# Random-forest classifier. Two arguments of the original call were changed so
# that it runs on current scikit-learn releases:
#   * min_impurity_split was removed from the estimator, so it is no longer
#     passed (min_impurity_decrease is its replacement and is kept).
#   * max_features='auto' was deprecated and later removed; for classifiers it
#     meant 'sqrt', which is now spelled out.
cf = RandomForestClassifier(
    n_estimators=100,  # number of trees in the forest
    criterion='gini',  # split-quality measure used by each tree
    max_depth=None,  # maximum depth of each tree; None = no limit
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='sqrt',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=None,
    random_state=0,  # fixed seed for reproducible results
    verbose=0,
    warm_start=False,
    class_weight=None,
    ccp_alpha=0.0,
    max_samples=None,
)

# In this RF definition the first parameter is the number of decision trees:
# RF is a random forest, and the first parameter sets the size of the forest.

cf.fit(X_train, y_train)

In [None]:
# Predict labels for the test split with the fitted random forest `cf`
# (rebinds the name previously used for the decision-tree predictions).
test_y_predicted = cf.predict(X_test)

In [None]:
from sklearn import metrics

# Accuracy of the predictions currently held in test_y_predicted against the test labels
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=test_y_predicted)
print(accuracy)

## LightGBM Classifier

In [None]:
# Build the LightGBM model with default parameters and fit it on the training split.
# NOTE(review): this rebinds `clf`, which the decision-tree cell also assigned.
import lightgbm as lgb
clf = lgb.LGBMClassifier()
clf.fit(X_train, y_train)

In [None]:
# Predict labels for the test split with the fitted LightGBM classifier.
y_pred=clf.predict(X_test)

In [None]:
# Accuracy of the LightGBM predictions against the test labels
# (`metrics` was imported in an earlier cell).
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)