In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import keras
%matplotlib inline

In [80]:
# Input files for every ticker; file names start with the numeric ticker code.
legal_files = glob.glob(pathname='./data/legal/[0-9]*')
feature_files = glob.glob(pathname='./data/feature/[0-9]*')

# glob returns files in arbitrary (filesystem) order. Sorting makes the join
# order below -- and therefore the base index and column order of `covid` --
# reproducible across machines and kernel restarts.
covid_files = sorted(glob.glob(pathname='./data/COVID-19/*'))
if not covid_files:
    raise FileNotFoundError("No COVID-19 files found under './data/COVID-19/'")

# Feature / legal-person frames for ticker 1301, indexed by the '年月日' date column.
df_1301 = pd.read_csv('./data/feature/1301_feature.csv', parse_dates=True, index_col='年月日')
dfl_1301 = pd.read_csv('./data/legal/1301_legal.csv', parse_dates=True, index_col='年月日')
# Non-inplace drop so re-running the cell always starts from the freshly read frame.
df_1301 = df_1301.drop(['2021-11-24'], axis=0)

# One frame per COVID file, each column prefixed with '<iso_code>_'.
cl = []
for file in covid_files:
    df = pd.read_csv(file, parse_dates=True, index_col='日期')
    # .iloc[0]: the index holds dates, so `series[0]` would be deprecated positional access.
    country = df['iso_code'].iloc[0] + '_'
    df = df.add_prefix(country)
    df = df.drop(columns=[country + '洲名', country + '國家', country + 'ID', country + 'iso_code'])
    cl.append(df)

# Cumulative left join: after the loop cl[k] holds cl[0] joined with cl[1..k],
# so the result keeps the index of the first file.
for i in range(len(cl) - 1):
    cl[i + 1] = cl[i].join(cl[i + 1])
covid = cl[-1]
# Discard every row from 2021-11-24 onwards.
covid = covid.drop(index=covid.loc['2021-11-24':].index)

##### 投信拉抬定義
##### 投信買賣超/在外流通股數 = 投本比
##### np.linspace(投本比min, 投本比max, 1000) = threshold產出(-1,0,1)的信號

In [81]:
# Remove string and comma in Dataframe
# dfl_1301 = dfl_1301.drop(columns='簡稱',axis=1)
def drop_name_comma(df, dfl):
    """Drop the security identifier/name columns and strip commas from string columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame. The '證券代碼' column is removed if present, and every
        string column has its ',' characters removed (values stay strings).
    dfl : pandas.DataFrame
        Legal-person frame. The '證券名稱' and '簡稱' columns are removed if present.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        New ``(df, dfl)`` frames; the inputs are not modified. When ``dfl`` has
        a non-empty '證券名稱' column, its first value becomes the
        ``columns.name`` of both returned frames.
    """
    # Read the security name before the column holding it is dropped. Guarding
    # the lookup keeps the function safe to call again on already-cleaned frames.
    has_name = '證券名稱' in dfl.columns and len(dfl) > 0
    security_name = dfl['證券名稱'].iloc[0] if has_name else None

    # errors='ignore' drops whichever columns exist; a single missing column no
    # longer prevents the remaining ones from being dropped.
    df = df.drop(columns='證券代碼', errors='ignore')
    dfl = dfl.drop(columns=['證券名稱', '簡稱'], errors='ignore')

    if has_name:
        # rename_axis returns new frames, so the caller's objects keep their metadata.
        df = df.rename_axis(columns=security_name)
        dfl = dfl.rename_axis(columns=security_name)

    for col in df.columns:
        try:
            df[col] = df[col].str.replace(',', '', regex=False)
        except (AttributeError, TypeError):
            # Column has no usable .str accessor (e.g. numeric dtype): leave it as is.
            continue

    return df, dfl
# Clean the ticker-1301 frames (drop name/code columns, remove commas); both names are rebound to the results.
df_1301,dfl_1301 = drop_name_comma(df_1301,dfl_1301)

In [82]:
#Create needed columns
# thresholds = np.linspace(dfl_1301['投本比%'].min(),dfl_1301['投本比%'].max(),1000)
# Approach 1: -1,0,1
# Approach 2: minmax scale(-1~1)

from sklearn.preprocessing import MinMaxScaler
def label_creator(dfl):
    """Add the ratio columns and the two label columns to ``dfl``.

    Columns written (``dfl`` is modified in place and also returned):

    * '在外流通股數(千股)' = '投信持股數(千股)' / '投信持股率%'
    * '投本比%'           = '投信買賣超(千股)' / '在外流通股數(千股)' * 100
    * 'signal_arr1'      = sign of '投本比%' as an int in {-1, 0, 1}; NaN maps to 0
    * 'signal_arr2'      = '投本比%' min-max scaled to the range [-1, 1]

    NOTE(review): '投信持股率%' looks like a percentage, in which case both ratio
    columns are off by a constant factor of 100 -- confirm against the data
    source. Both labels are unaffected by a constant positive rescaling.

    Parameters
    ----------
    dfl : pandas.DataFrame
        Must contain '投信持股數(千股)', '投信持股率%' and '投信買賣超(千股)'.

    Returns
    -------
    pandas.DataFrame
        The same ``dfl`` object with the four columns above added.
    """
    dfl['在外流通股數(千股)'] = dfl['投信持股數(千股)'] / dfl['投信持股率%']
    dfl['投本比%'] = (dfl['投信買賣超(千股)'] / dfl['在外流通股數(千股)']) * 100
    ratio = dfl['投本比%']

    # Vectorised sign. fillna(0) keeps the rule that a NaN ratio yields signal 0.
    dfl['signal_arr1'] = np.sign(ratio.fillna(0)).astype(int)

    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaled = scaler.fit_transform(ratio.values.reshape(-1, 1))
    # ravel() flattens the (n, 1) result for any n instead of a fixed row count.
    dfl['signal_arr2'] = pd.Series(scaled.ravel(), index=dfl.index)

    return dfl
# Add the ratio columns and the signal_arr1 / signal_arr2 label columns to the 1301 legal frame.
dfl_1301 = label_creator(dfl_1301)

疫情爆發日為2020-01-22

In [83]:
def _rolling_samples(features, labels, window, horizon):
    """Pair each `window`-row slice of `features` with one later row of `labels`."""
    label_cols = labels[['signal_arr1', 'signal_arr2']]
    data = []
    signal = []
    for start in range(len(features) - (window + horizon)):
        data.append(features.iloc[start:start + window])
        # One-row DataFrame taken `window + horizon` positions after `start`.
        signal.append(label_cols.iloc[[start + window + horizon]])
    return data, signal


def before_rolling(df: pd.DataFrame, dfl: pd.DataFrame, window: int = 60,
                   horizon: int = 5, split_date: str = '2020-01-22') -> tuple:
    """Build rolling (features, label) samples from rows up to and including `split_date`.

    Sample ``i`` uses rows ``i .. i+window-1`` of ``df`` as features and row
    ``i+window+horizon`` of ``dfl`` (columns 'signal_arr1', 'signal_arr2') as
    the label. Rows are paired by position, not by date.

    NOTE(review): the label row is ``horizon + 1`` rows after the last feature
    row -- confirm this is the intended "t+5" target.
    NOTE(review): label-based slicing is inclusive, so a row dated exactly
    `split_date` is also included by `after_rolling`.

    Parameters
    ----------
    df : feature frame with a sorted date index.
    dfl : label frame with a sorted date index and the two signal columns.
    window : number of rows per feature window (default 60).
    horizon : offset of the label row past the window (default 5).
    split_date : last date included (default '2020-01-22').

    Returns
    -------
    tuple
        ``(b_data, b_signal)``: a list of `window`-row DataFrames and a list of
        one-row label DataFrames of equal length.
    """
    # Slice once up front instead of on every loop iteration.
    return _rolling_samples(df.loc[:split_date], dfl.loc[:split_date], window, horizon)


def after_rolling(df: pd.DataFrame, dfl: pd.DataFrame, window: int = 60,
                  horizon: int = 5, split_date: str = '2020-01-22') -> tuple:
    """Build rolling (features, label) samples from rows on or after `split_date`.

    Same windowing as `before_rolling`: sample ``i`` uses rows
    ``i .. i+window-1`` of the sliced ``df`` as features and row
    ``i+window+horizon`` of the sliced ``dfl`` as the label, paired by position.

    Parameters
    ----------
    df : feature frame with a sorted date index.
    dfl : label frame with a sorted date index and the two signal columns.
    window : number of rows per feature window (default 60).
    horizon : offset of the label row past the window (default 5).
    split_date : first date included (default '2020-01-22').

    Returns
    -------
    tuple
        ``(a_data, a_signal)``: a list of `window`-row DataFrames and a list of
        one-row label DataFrames of equal length.
    """
    return _rolling_samples(df.loc[split_date:], dfl.loc[split_date:], window, horizon)

def after_covid_rolling(df, covid, window=60, horizon=5):
    """Join the COVID features onto `df` and build rolling feature windows.

    `covid` is left-joined onto `df`, every row dated up to and including
    2020-01-24 is removed, and window ``i`` covers rows ``i .. i+window-1`` of
    the joined frame.

    The number of windows is derived from the joined frame. `covid` can have a
    different number of rows than `df`, so sizing the loop from `covid` could
    produce truncated or empty trailing windows.

    NOTE(review): the cutoff here is '2020-01-24' while the other rolling
    helpers split at '2020-01-22'; the windows only line up with their labels
    if `df` has no rows between those dates -- confirm.

    Parameters
    ----------
    df : feature frame with a sorted date index.
    covid : COVID feature frame with a date index.
    window : number of rows per feature window (default 60).
    horizon : extra rows reserved past each window for the label (default 5).

    Returns
    -------
    tuple
        ``(dfc, c_data)``: the joined, trimmed frame and the list of
        `window`-row DataFrames.
    """
    dfc = df.join(covid)
    dfc = dfc.drop(index=dfc.loc[:'2020-01-24'].index)
    c_data = [
        dfc.iloc[start:start + window]
        for start in range(len(dfc) - (window + horizon))
    ]
    return dfc, c_data

In [88]:
# Train/test split -- first build the rolling samples for each regime:
# pre-COVID (b_*), post-COVID (a_*), and post-COVID with the COVID features joined (dfc_1301, c_data).
b_data,b_signal = before_rolling(df_1301,dfl_1301)
a_data,a_signal = after_rolling(df_1301,dfl_1301)
dfc_1301, c_data = after_covid_rolling(df_1301,covid)

def train_test_split(data, signal, train_ratio=0.7):
    """Split rolled samples chronologically into a training and a validation part.

    The first ``round(len(data) * train_ratio)`` items go to the training set
    and the rest to the validation set; no shuffling is done. The cut position
    is computed from ``data`` only and applied to both sequences, so ``data``
    and ``signal`` are expected to have the same length.

    Parameters
    ----------
    data : sliceable sequence of feature windows.
    signal : sliceable sequence of labels, aligned with ``data``.
    train_ratio : fraction of samples used for training (default 0.7, i.e. 7:3).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    cut = round(len(data) * train_ratio)
    X_train, X_test = data[:cut], data[cut:]
    y_train, y_test = signal[:cut], signal[cut:]
    return X_train, X_test, y_train, y_test

In [89]:
# 1. Train on pre-COVID, validate on pre-COVID -- 1730 rows in total (dfl_1301[:'2020-01-22'])
X_train_b,X_test_b,y_train_b,y_test_b = train_test_split(b_data,b_signal)



In [91]:
# Number of training windows in scenario 1.
len(X_train_b)

1166

In [None]:
# 2. Train on pre-COVID, validate on post-COVID -- 2180 rows in total (dfl_1301)
# The right-hand side follows the (X_train, X_test, y_train, y_test) order used
# by train_test_split: features go to the X_* names and labels to the y_* names.
# Previously b_signal (labels) was bound to X_test_b and a_data (features) to y_train_a.
# NOTE(review): the _b/_a suffixes no longer describe the contents (X_test_b
# holds post-COVID windows, y_train_a holds pre-COVID labels); the names are
# kept so any later cell that reads them still resolves.
X_train_b,X_test_b,y_train_a,y_test_a = b_data,a_data,b_signal,a_signal


In [None]:
# 3. Train on post-COVID, validate on post-COVID -- 450 rows in total (df_1301['2020-01-22':])
X_train_a,X_test_a,y_train_a,y_test_a = train_test_split(a_data,a_signal)

In [None]:
# 4. Post-COVID windows with the COVID features added, validated on post-COVID labels.
# Rebinds y_train_a / y_test_a, which scenario 3 also assigns.
X_train_c,X_test_c,y_train_a,y_test_a = train_test_split(c_data,a_signal)

In [None]:
#5.不分疫情前後直接7:3 without covid feature predict