In [349]:
import pandas as pd
import numpy as np
import quandl
from datetime import datetime, timedelta
import json
import matplotlib.pyplot as plt


from dateutil.parser import parse
from datetime import datetime
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, KFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import log_loss, precision_score, confusion_matrix, precision_recall_curve
from sklearn.metrics import roc_auc_score, f1_score, make_scorer, recall_score, average_precision_score

from sklearn.utils.multiclass import unique_labels
from sklearn.utils.fixes import signature
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

from function_library import write_merged_frames, stack_frames, create_labels, clean_features
from function_library import encode_sectors, prepare_partitions
from function_library import create_hard_classes

In [13]:
# create list of 'surprise' files

surp_files = (['surp_1q14.csv',
               'surp_2q14.csv',
               'surp_3q14.csv',
               'surp_4q14.csv',
               'surp_1q15.csv',
               'surp_2q15.csv',
               'surp_3q15.csv',
               'surp_4q15.csv',
               'surp_1q16.csv',
               'surp_2q16.csv',
               'surp_3q16.csv',
               'surp_4q16.csv',
               'surp_1q17.csv',
               'surp_2q17.csv',
               'surp_3q17.csv',
               'surp_4q17.csv',
               'surp_1q18.csv',
               'surp_2q18.csv',
               'surp_3q18.csv'])

In [14]:
# create list of 'features' files

features_files = (['features_1q14.csv',
                   'features_2q14.csv',
                   'features_3q14.csv',
                   'features_4q14.csv',
                   'features_1q15.csv',
                   'features_2q15.csv',
                   'features_3q15.csv',
                   'features_4q15.csv',
                   'features_1q16.csv',
                   'features_2q16.csv',
                   'features_3q16.csv',
                   'features_4q16.csv',
                   'features_1q17.csv',
                   'features_2q17.csv',
                   'features_3q17.csv',
                   'features_4q17.csv',
                   'features_1q18.csv',
                   'features_2q18.csv',
                   'features_3q18.csv'])


In [350]:
# data pipeline script

combined_frames = write_merged_frames(surp_files, features_files)
combined_full = stack_frames(combined_frames)
create_labels('combined_full_set')
clean_features('combined_full_set')
encode_sectors('combined_clean')


In [323]:
X_train, X_test, y_train, y_test, r_train, r_test = prepare_partitions('combined_clean')

### Random Forest

In [324]:
clf = RandomForestClassifier(n_estimators=1000, 
                             criterion='gini',
                             max_features=6)

In [325]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=6, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [326]:
skf = StratifiedKFold(n_splits=5, shuffle=True)

In [327]:
cv_log_loss = cross_val_score(clf, # model
                             X_train, # Feature matrix
                             y_train, # Target vector
                             cv=skf, # Cross-validation technique
                             scoring='neg_log_loss', # Loss function
                             n_jobs=-1) # Use all CPU scores

In [328]:
cv_log_loss

array([-0.23177323, -0.2361727 , -0.22680134, -0.23883525, -0.23431099])

In [329]:
p_hat = clf.predict_proba(X_test)

In [330]:
p_hat

array([[0.912, 0.088],
       [0.964, 0.036],
       [0.983, 0.017],
       ...,
       [0.996, 0.004],
       [0.99 , 0.01 ],
       [0.983, 0.017]])

In [284]:
def calc_profit_curve(y_test, y_predict, max_threshold):
    thresholds = []
    results = []
    for i in range(0, max_threshold+1):
        thresh = float(i/100)
        hard_classes = create_hard_classes(y_predict, thresh)
        tn, fp, fn, tp = confusion_matrix(y_test, hard_classes).ravel()
        profit = (5*tp*0.09)+(5*fp*-0.01)
        thresholds.append(i)
        result_entry = [tp, fp, profit]
        results.append(result_entry)
        
    return dict(zip(thresholds, results))

In [331]:
calc_profit_curve(y_test, p_hat, 20)

{0: [368, 4937, -81.25],
 1: [364, 4760, -74.20000000000002],
 2: [354, 4291, -55.25000000000003],
 3: [337, 3859, -41.30000000000001],
 4: [318, 3372, -25.5],
 5: [294, 2902, -12.800000000000011],
 6: [269, 2505, -4.200000000000003],
 7: [245, 2094, 5.549999999999997],
 8: [220, 1755, 11.25],
 9: [190, 1465, 12.25],
 10: [166, 1219, 13.75],
 11: [141, 1007, 13.099999999999994],
 12: [122, 853, 12.25],
 13: [102, 713, 10.25],
 14: [92, 586, 12.099999999999998],
 15: [80, 490, 11.5],
 16: [68, 413, 9.949999999999996],
 17: [57, 335, 8.899999999999999],
 18: [48, 260, 8.599999999999998],
 19: [39, 217, 6.700000000000001],
 20: [35, 168, 7.35]}

In [332]:
hard_classes = create_hard_classes(p_hat, 0.10)

In [272]:
len(p_hat)

4739

In [273]:
sum(hard_classes)

442

In [221]:
rtns = list(r_test.values)

trades = []

for i, j in zip(hard_classes, rtns):
    if i == 1:
        profit = 5 * -j/100
        trades.append(profit)

In [222]:
sum(trades)

45.88500000000002

In [333]:
def simulate_perf(y_test, y_predict, max_threshold, trade_size=5, min_threshold=5):
    rtns = list(r_test.values)
    
    output = {}
    total_output = {}
    
    for i in range(min_threshold, max_threshold+1):
        thresh = float(i/100)
        hard_classes = create_hard_classes(y_predict, thresh)
        
        for label, rtn in zip(hard_classes, rtns):
            if label == 1:
                
                profit = trade_size * -rtn/100
                
                
                if thresh in output.keys():
                    output[thresh].append(profit)
                else:
                    output[thresh] = [profit]
    
    for key, val in output.items():
        total_output[key] = np.sum(val)
    
        
    return total_output

In [334]:
simulate_perf(y_test, p_hat, 20)

{0.05: 69.26999999999998,
 0.06: 58.72999999999999,
 0.07: 58.865,
 0.08: 57.01499999999999,
 0.09: 61.5,
 0.1: 58.550000000000004,
 0.11: 52.26499999999999,
 0.12: 47.76499999999999,
 0.13: 45.255,
 0.14: 39.675000000000004,
 0.15: 35.765,
 0.16: 32.535,
 0.17: 28.105000000000004,
 0.18: 20.069999999999997,
 0.19: 16.965,
 0.2: 6.54}

In [223]:
trades

[0.66,
 0.24,
 -0.18,
 -0.105,
 -0.625,
 0.025,
 0.135,
 0.15,
 0.29,
 -0.07,
 -0.385,
 -0.405,
 -0.045,
 0.31,
 -0.04,
 1.985,
 0.25,
 0.5,
 -0.18,
 0.125,
 -0.145,
 -0.365,
 -0.24,
 0.03,
 1.335,
 0.365,
 0.305,
 -0.96,
 -0.295,
 -0.0,
 -0.12,
 0.065,
 0.045,
 0.105,
 0.16,
 0.09,
 -0.02,
 -0.22,
 0.255,
 0.415,
 0.105,
 -0.075,
 -0.35,
 -0.85,
 -0.33,
 -0.395,
 -0.485,
 -0.165,
 0.3,
 0.24,
 -0.06,
 -0.18,
 0.4,
 0.615,
 0.91,
 0.125,
 0.015,
 -0.32,
 -0.32,
 -0.34,
 -0.315,
 -1.075,
 0.255,
 0.345,
 0.045,
 0.08,
 0.845,
 -0.52,
 0.19,
 0.33,
 0.405,
 -0.24,
 0.575,
 0.125,
 -0.42,
 0.205,
 -0.41,
 0.26,
 -0.195,
 -0.175,
 -2.02,
 0.205,
 0.27,
 -0.72,
 -0.055,
 -0.345,
 0.3,
 0.505,
 -0.645,
 0.225,
 0.37,
 0.035,
 1.175,
 0.195,
 -0.055,
 -0.295,
 -0.09,
 0.045,
 -0.215,
 0.1,
 -0.33,
 -0.29,
 0.17,
 -0.215,
 -0.24,
 0.145,
 -0.265,
 -0.295,
 -0.0,
 -0.155,
 0.02,
 -0.3,
 -0.055,
 -0.21,
 0.09,
 0.315,
 0.545,
 -0.75,
 0.315,
 0.355,
 -0.22,
 -1.025,
 -0.28,
 0.185,
 -0.11,
 -0.1

In [121]:
### Gradient Boosting Classifier

In [335]:
gb_clf = GradientBoostingClassifier(n_estimators=3000,
                                    learning_rate=0.005, 
                                    max_depth=4,
                                    subsample = 0.5,
                                    random_state=1970)

In [336]:
gb_clf.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.005, loss='deviance', max_depth=4,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=3000,
              n_iter_no_change=None, presort='auto', random_state=1970,
              subsample=0.5, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [337]:
gb_p_hat = gb_clf.predict_proba(X_test)

In [338]:
log_loss(y_test, gb_clf.predict(X_test))

2.45450079111322

In [339]:
calc_profit_curve(y_test, gb_p_hat, 40)

{0: [368, 4937, -81.25],
 1: [368, 4931, -80.95000000000002],
 2: [360, 4592, -67.6],
 3: [332, 3734, -37.30000000000001],
 4: [298, 2862, -9.0],
 5: [253, 2137, 6.999999999999986],
 6: [215, 1572, 18.14999999999999],
 7: [176, 1166, 20.9],
 8: [147, 885, 21.89999999999999],
 9: [130, 706, 23.199999999999996],
 10: [112, 577, 21.549999999999997],
 11: [96, 486, 18.899999999999995],
 12: [88, 408, 19.2],
 13: [83, 346, 20.05],
 14: [73, 290, 18.35],
 15: [64, 251, 16.249999999999996],
 16: [58, 214, 15.399999999999997],
 17: [50, 184, 13.299999999999999],
 18: [44, 163, 11.65],
 19: [34, 143, 8.149999999999999],
 20: [29, 123, 6.899999999999999],
 21: [23, 105, 5.1],
 22: [22, 90, 5.4],
 23: [18, 74, 4.3999999999999995],
 24: [15, 66, 3.4499999999999997],
 25: [14, 61, 3.2499999999999996],
 26: [14, 57, 3.4499999999999997],
 27: [12, 52, 2.7999999999999994],
 28: [12, 49, 2.9499999999999993],
 29: [11, 41, 2.9000000000000004],
 30: [8, 35, 1.8499999999999996],
 31: [8, 34, 1.89999999999

In [319]:
gb_hard_classes = create_hard_classes(gb_p_hat, 0.07)

In [340]:
simulate_perf(y_test, p_hat, 20)

{0.05: 69.26999999999998,
 0.06: 58.72999999999999,
 0.07: 58.865,
 0.08: 57.01499999999999,
 0.09: 61.5,
 0.1: 58.550000000000004,
 0.11: 52.26499999999999,
 0.12: 47.76499999999999,
 0.13: 45.255,
 0.14: 39.675000000000004,
 0.15: 35.765,
 0.16: 32.535,
 0.17: 28.105000000000004,
 0.18: 20.069999999999997,
 0.19: 16.965,
 0.2: 6.54}

In [165]:
sum(gb_trades)

2.440000000000001

In [166]:
gb_trades

[0.83,
 -1.58,
 -0.62,
 0.405,
 0.69,
 -0.475,
 0.07,
 -0.47,
 -0.375,
 0.415,
 0.1,
 0.66,
 -0.36,
 0.62,
 -0.105,
 -0.395,
 1.045,
 -1.09,
 0.455,
 -1.09,
 -0.09,
 -0.275,
 0.285,
 -0.225,
 1.095,
 0.06,
 -0.305,
 0.15,
 -0.015,
 0.545,
 -0.2,
 1.545,
 -0.4,
 0.365,
 -0.095,
 2.395,
 0.005,
 1.73,
 0.545,
 -0.93,
 0.2,
 -0.305,
 -0.33,
 -0.435,
 0.135,
 -0.035,
 0.24,
 0.86,
 -0.28,
 0.37,
 0.175,
 0.86,
 -0.16,
 -0.125,
 0.055,
 -0.44,
 -1.725,
 -0.2,
 -0.475,
 0.355,
 1.085,
 -0.13,
 -0.295,
 0.545,
 0.6,
 0.11,
 -0.245,
 0.465,
 0.54,
 -0.625,
 0.255,
 -0.595,
 0.395,
 0.075,
 0.435,
 -0.385,
 -0.4,
 0.285,
 0.155,
 0.785,
 -0.055,
 0.01,
 0.06,
 0.385,
 -0.105,
 0.05,
 -0.41,
 -0.215,
 -0.52,
 -0.29,
 0.305,
 -0.285,
 0.425,
 0.455,
 -0.69,
 0.345,
 -0.33,
 -0.355,
 -0.045,
 -0.705,
 0.275,
 1.01,
 -1.145,
 -0.185,
 0.26,
 0.02,
 0.515,
 -0.4,
 -0.0,
 -0.185,
 0.82,
 0.355,
 -0.23,
 -0.28,
 -0.545,
 0.08,
 -0.22,
 0.29,
 0.655,
 0.575,
 -1.18,
 0.54,
 0.145,
 0.625,
 -1.17,
 -0.3