In [1]:
#Import our libraries
import numpy as np
import pandas as pd
from datetime import datetime
from dateutil.parser import parse

#Import our csv tables
df_trade = pd.read_csv('trades.csv')
df_equity = pd.read_csv('equity.csv')
df_cashflow = pd.read_csv('cashmovements.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
"""
The get_table function returns only information of the specified user_id
The get_features function takes the output of the get_table function and returns engineered features as output
the get_target function takes the output of the get_table function and returns binary encoded classes as output
The engineer_table function combines the 3 previous functions and outputs both the target and features in a 1-step process
"""
def get_table(trade, equity, cashflow, user_id):
    #Extract useful columns from each table
    trade = trade.loc[:, ['account_id', 'quantity', 'opentime', 'openperiod', 'pnl']]
    equity = equity.loc[:, ['account_id', 'equity', 'timeperiod']]
    cashflow = cashflow.loc[:, ['account_id', 'amount', 'timestamp', 'timeperiod']]
    #Extract rows for each user
    trade = trade.loc[trade['account_id'] == user_id]
    equity = equity.loc[equity['account_id'] == user_id]
    cashflow = cashflow.loc[cashflow['account_id'] == user_id]
    #Rename columns 
    trade = trade.rename(columns={'openperiod': 'timeperiod', 'opentime': 'trade_timestamp'})
    #Set timeperiod as index
    trade = trade.drop('account_id', axis=1)
    trade = trade.set_index('timeperiod')
    trade = trade.sort_values('trade_timestamp', ascending=True)
    equity = equity.set_index('timeperiod')
    cashflow = cashflow.drop('account_id', axis=1)
    cashflow = cashflow.sort_values('timestamp', ascending=True)
    cashflow = cashflow.reset_index(drop=True)
    #Merge the equity and trade tables
    temp = equity.join(trade)
    temp['timeperiod'] = temp.index
    temp = temp.dropna(how='any')
    temp = temp.reset_index(drop=True)
    #Factor in the cash movement into equity
    for index, row in temp.iloc[1:].iterrows():
        for index2, row2 in cashflow.iterrows():
            if temp.iloc[index]['timeperiod'] == cashflow.iloc[index2]['timeperiod']:
                if parse(temp.iloc[index]['trade_timestamp']).time() > parse(cashflow.iloc[index2]['timestamp']).time():
                    temp.set_value(index, 'equity', temp.iloc[index]['equity'] + cashflow.iloc[index2]['amount'])
                else:
                    pass
            else:
                pass
    temp['%pnl'] = temp['pnl']/temp['equity']
    return temp

def get_features(table):
    feature=pd.DataFrame(columns=['Day_1','Day_2','Day_3','Day_4','Day_5','Day_6','Day_7','Day_8','Day_9','Day_10'])
    count=10
    for i in range(10,len(table)):
        data={}
        for j in range(1,11):
            data['Day'+'_'+str(j)]=table['%pnl'][i-10+j]
        feature.loc[count]=data
        count +=1
    return feature

def get_target(table):
    target=[]
    for i in range(10,len(table)):
        if (table['quantity'][i]>(table['quantity'][i-10:i].mean()+table['quantity'][i-10:i].std()) or table['quantity'][i]<(table['quantity'][i-10:i].mean()-table['quantity'][i-10:i].std())) and table['%pnl'][i]<table['%pnl'][i-1]:
            target.append(1)
        else:
            target.append(0)
    return target

def engineer_table(trade, equity, cashflow, user_id):
    temp_table = get_table(trade, equity, cashflow, user_id)
    features = get_features(temp_table)
    target = get_target(temp_table)
    return features, target

In [3]:
"""
We demonstrate the usage of the pre-defined functions above with user_id 2881.
As mentioned above, the engineer_table function returns two variables: features and target.
"""
features, target = engineer_table(df_trade, df_equity, df_cashflow, 2881)

In [4]:
"""
Now we shall import our required libraries for machine learning implementation
"""
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier



In [5]:
"""
We shall split our features and target into training and cross-validation sets using the train_test_split function, and setting
a random_state of 0 for reproducibility of results. Short comments of the two datasets are provided.
"""
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.2, random_state = 0)
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

Training set has 695 samples.
Testing set has 174 samples.


In [6]:
# Benchmark model
clf0 = DummyClassifier(strategy='constant', constant=1)
clf0.fit(X_train, y_train)
clf0_predictions = clf0.predict(X_test)
# find accuracy and f1 score of our benchmark model
dummy_accuracy = accuracy_score(y_test, clf0_predictions)
dummy_F1 = f1_score(y_test, clf0_predictions)
# print the results 
print("Benchmark Model: [Accuracy score: {:.4f}, F-score: {:.4f}]".format(dummy_accuracy, dummy_F1))

Benchmark Model: [Accuracy score: 0.0747, F-score: 0.1390]


In [9]:
"""
Performs grid search over the parameters 'criterion', 'max_features', 'n_estimators', 'min_samples_split',
'min_samples_leaf' and 'bootstrap'for a random forest classifier trained on the input data features and labeled target.
Parameters for tuning are as follows:
    1) criterion - The function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity
                   and “entropy” for the information gain. Note: this parameter is tree-specific.
    2) max_features - The number of features to consider when looking for the best split:
                a) If “auto”, then max_features=sqrt(n_features).
                b) If “sqrt”, then max_features=sqrt(n_features) (same as “auto”).
                c) If “log2”, then max_features=log2(n_features).
                d) If None, then max_features=n_features.
    3) n_estimators - The number of trees in the forest.
    4) min_samples_split - The minimum number of samples required to split an internal node, minimal is 2.
    5) min_samples_leaf - The minimum number of samples required to be at a leaf node, minimal is 1.
    6) bootstrap - Whether bootstrap samples are used when building trees, 1 for True, 0 for False.
"""
def fit_RandomForest(features, target):  
    
    # Choose RandomForest as the algorithm for optimization with GridSearch
    clf = RandomForestClassifier(random_state=0)
    
    # Create a dictionary for the parameters
    parameters_RF = {'criterion':('entropy', 'gini'),
                    'max_features':('auto', 'sqrt', 'log2'),
                    'n_estimators':[5, 10, 20],
                    'min_samples_split':[2, 3, 4, 5],
                    'min_samples_leaf':[1, 2, 3, 4],
                    'bootstrap':[0, 1]
                    }
    
    # Define a scoring function
    scorer = make_scorer(f1_score)
    
    # Create the GridSearch object"""
    grid_obj_RF = GridSearchCV(estimator=clf, param_grid=parameters_RF, scoring=scorer)
    
    # Fit the grid search object to the data to compute the optimal model
    grid_fit_RF = grid_obj_RF.fit(X_train, y_train)
    
    # Return the optimal model after fitting the data
    best_clf_RF = grid_fit_RF.best_estimator_
    
    # Make predictions with the optimal model
    best_predictions_RF = best_clf_RF.predict(X_test)
    
    # Get the accuracy and f1_score of the optimized model
    clf_optimized_accuracy = accuracy_score(y_test, best_predictions_RF)
    clf_optimized_f1 = f1_score(y_test, best_predictions_RF)
    
    print("RF Optimized [Accuracy score: {:.4f}, f1-score: {:.4f}]".format(clf_optimized_accuracy, clf_optimized_f1))

    # Return the optimal model after fitting the data
    return best_clf_RF

In [10]:
"""
We demonstrate the usage of the pre-defined fit_RandomForest function detailed above.
Our grid_search operator finds the optimal parameters by the F1-score metric which is a more robust model evaluation metric
in classification problems where the output classes are not balanced. In this case, class 0 predominates with 97% of all classes
(by looking back at our benchmark model's accuracy which always predicts 1). Since F1-score requires the model to predict at
least one positive class, if the model only predicts 0 then the F1-score would be considered 'ill-defined' with 'no predicted
samples'. 
"""
fit_RandomForest(features, target)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


RF Optimized [Accuracy score: 0.9023, f1-score: 0.0000]


RandomForestClassifier(bootstrap=1, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=5, min_weight_fraction_leaf=0.0,
            n_estimators=5, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)