In [1]:
import pandas as pd
import numpy as np

In [2]:
df_raw = pd.read_csv('data.csv')

In [3]:
df_raw.describe()

Unnamed: 0,last,volume
count,451113.0,451113.0
mean,2751.772257,3469636.0
std,3988.203423,8373189.0
min,63.0,0.0
25%,1099.1458,786100.0
50%,1901.4451,1614400.0
75%,3169.704,3595100.0
max,109550.0,728934100.0


# Generate features and returns of different time horizons

In [4]:
%%time
ticker_list = df_raw['ticker'].unique().tolist()
df_list = []
for ticker in ticker_list:
    df_ticker = df_raw[df_raw['ticker']==ticker]
    df_ticker.set_index('date')
    for d in [-1,-2,-3,-5,-10,-20,-30,-60,-120,-360,-720]: 
        exec('df_ticker["R_{}"]=df_ticker["last"].pct_change(periods={})'.format(-d,-d))
        exec('df_ticker["V_{}"]=df_ticker["volume"].pct_change(periods={})'.format(-d,-d))
    for d in [1,3,5,10]: 
        exec('df_ticker["Y_{}"]=df_ticker["R_{}"].shift(periods={})'.format(d,d,-d))

    df_list.append(df_ticker.set_index('date'))
    
df = pd.concat(df_list)
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df = df.dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


CPU times: user 13.4 s, sys: 369 ms, total: 13.7 s
Wall time: 14.2 s


# Generate labels of different time horizons and returns 

In [5]:
for d in [1,3,5,10]:
    exec('y{} = df["Y_{}"].tolist()'.format(d,d))
    for r in [0.005, 0.01, 0.03]:
        exec('df["Y_{}_{}"] = [1 if yy > {} else 0 for yy in y{}]'.format(d,r,r,d))
df.to_csv('df.csv')

# Divide Train and Test Set

In [6]:
df_train = df[df.index <= '2018-12-15']
df_test = df[df.index > '2018-12-31']

df_train.to_csv('train.csv')
df_test.to_csv('test.csv')

# Build Machine learning model to generate trading signals

In [7]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score, precision_recall_fscore_support, classification_report

  from numpy.core.umath_tests import inner1d


In [8]:
feature_list = []
for d in [-1,-2,-3,-5,-10,-20,-30,-60,-120,-360,-720]:
    feature_list.append('R_{}'.format(-d))
    feature_list.append('V_{}'.format(-d))

x_train = np.array(df_train[feature_list])
x_test = np.array(df_test[feature_list])

for d in [1,3,5,10]:
    for r in [0.005, 0.01, 0.03]:
        exec('clf = LogisticRegression(random_state=0,class_weight="balanced").fit(x_train, df_train["Y_{}_{}"])'.format(d,r))
        predict = clf.predict(x_test)
        predict_score = clf.predict_proba(x_test)
        #print(predict_score[:,1])
        exec('df_test["predict_Y_{}_{}"] = predict_score[:,1]'.format(d,r))
        exec('accuracy = accuracy_score(predict,df_test["Y_{}_{}"])'.format(d,r))
        exec('precision = precision_score(predict,df_test["Y_{}_{}"])'.format(d,r))
        exec('recall = recall_score(predict,df_test["Y_{}_{}"])'.format(d,r))
        print('#################################')
        print(f"For the Y_{d}_{r} label with LogisticRegression model, performance metrics:")
        print(f"accuracy: {accuracy}")
        print(f"precision: {precision}")
        print(f"recall: {recall}")
        print('#################################')
df_test.to_csv('test.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


#################################
For the Y_1_0.005 label with LogisticRegression model, performance metrics:
accuracy: 0.518910982168129
precision: 0.5047779384914519
recall: 0.3885167813406822
#################################
#################################
For the Y_1_0.01 label with LogisticRegression model, performance metrics:
accuracy: 0.5581570195301444
precision: 0.4919109878499736
recall: 0.30121504963305906
#################################
#################################
For the Y_1_0.03 label with LogisticRegression model, performance metrics:
accuracy: 0.6703757429946221
precision: 0.4962396265560166
recall: 0.10285422489787142
#################################
#################################
For the Y_3_0.005 label with LogisticRegression model, performance metrics:
accuracy: 0.5042633739031984
precision: 0.5539320407916482
recall: 0.44765301801361607
#################################
#################################
For the Y_3_0.01 label with LogisticRegression

For the Y_10_0.005 label with LogisticRegression model, performance metrics:
accuracy: 0.5131350834984433
precision: 0.5923163921679508
recall: 0.5117955138874044

## Y_10_0.005 obtains the best model performance and design strategy accordingly 

# Backtest Strategy Description
## Buy at least one stock with the highest prediction score
## Hold ten days and re-select the new stock every 10 days

In [9]:
r_list = []
stock_list = []
date_list = []
COMM = 0.001
df = pd.read_csv('test.csv')

In [10]:
%%time
date_test = df['date'].unique().tolist()

for i in range(0,len(date_test),10):
    date = date_test[i]
    df1 = df[df['date']==date]
    max_predict = df1['predict_Y_10_0.005'].max()
    ticker = df1[df1['predict_Y_10_0.005'] == max_predict]['ticker'].tolist()[0]
    r = df1[df1['predict_Y_10_0.005'] == max_predict]['Y_10'].tolist()[0] - 2*COMM
    r_list.append(r)
    stock_list.append(ticker)
    date_list.append(date)

CPU times: user 444 ms, sys: 6.8 ms, total: 451 ms
Wall time: 453 ms


In [11]:
result = pd.DataFrame(columns = ['date','hold_stock', 'real_return','Cumulative_return'])
result['date'] = date_list
result['hold_stock'] = stock_list
result['real_return'] = [0]+r_list[:-1]
result['Cumulative_return'] = result['real_return'].add(1).cumprod().fillna(1)  
result.to_csv('backtest_result.csv')