In [1]:
%matplotlib inline
from matplotlib import pyplot
import numpy as np
from odo import odo
import pandas as pd
import seaborn as sns

In [2]:
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test_2.csv')

X = df.copy()
X_test = df_test.copy()
Y = df.loc[:,'Ret_121':'Ret_PlusTwo'].copy()

In [3]:
# winton evaluation function
import re
import numpy as np
from sklearn.metrics.regression import _check_reg_targets
from sklearn.metrics import make_scorer

# use regex to rename columns for simple selection
wmae_cols = list(Y.columns)
wmae_cols = map(lambda x: re.sub('Ret_[0-9]+', 'Weight_Intraday', x), wmae_cols)
wmae_cols = map(lambda x: re.sub('Ret.*', 'Weight_Daily', x), wmae_cols)
wmae_weights = df[wmae_cols]

def weighted_mean_absolute_error(y_pred, y_true, weights):
    output_errors = np.multiply(np.abs(y_pred - y_true), weights)
    return np.average(np.average(output_errors, axis=1))

# test, result should be 2.875
# weighted_mean_absolute_error(
#      np.array([[1.5,2.5],[6,8]]),
#      np.array([[1,2],[3,4]]),
#      np.array([2,1]))

def wmae_score(estimator, X, y_true):
    y_pred = estimator.predict(X)
    weights = wmae_weights.iloc[X.index]
    return weighted_mean_absolute_error(y_pred, y_true, weights)

def write_predictions(Y_hat):
    Y_hat = odo(Y_hat, pd.DataFrame)
    Y_hat.columns += 1
    Y_hat.index += 1
    Y_hat = Y_hat.stack().reset_index()
    Y_hat = odo(Y_hat, pd.DataFrame)
    Y_hat = Y_hat.rename(columns={0:'Predicted'})
    Y_hat.index = Y_hat[['level_0', 'level_1']].applymap(str).apply(
        lambda x: '_'.join(x), axis=1)
    Y_hat.index.name = 'Id'
    Y_hat = Y_hat.drop(['level_0', 'level_1'], axis=1)
    Y_hat.to_csv('predictions.csv')

In [4]:
fmt = lambda x: 'Feature_{}'.format(x)

# some features seem to be categories
_features = [1,5,8,9,10,13,16,20]
feature_cats = {fmt(i): df[fmt(i)].dropna().unique() for i in _features}
feature_cats

# model feature distribution
from scipy import stats
features_dist = {
    'norm': [2,3,4,11,14,17,18,19,21,22,23,24,25],
    'beta': [12],
    'uniform': [7],
    'expon': [6, 15]
}

features_coeff = {}
for _dist, _features in features_dist.items():
    for _feature in _features:
        _coeff = getattr(stats, _dist).fit(df[fmt(_feature)].dropna())
        features_coeff[fmt(_feature)] = {
            'type': _dist,
            'coeff': _coeff
        }

In [5]:
def corrpair(df):
    df = df.drop(['Id','Weight_Daily','Weight_Intraday'], 1)
    dfc = df.corr()
    dfcs = dfc.stack().sort_index()
    dfcs = dfcs[(dfcs<1)&(dfcs>-1)&(~dfcs.duplicated())]
    return abs(dfcs).sort_values(ascending=False)

# get highest correlation pairs
dfcs = corrpair(df)
top_cp = dfcs.head(20)

nuke_features = set([i[1] for i in top_cp[top_cp>0.75].index])

In [6]:
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils.validation import check_array, check_consistent_length

# sample from a series probabilistically according to the distribution of values
def sample_per_distribution(s):
    return s.dropna().sample(n=s.size, replace=True).reset_index(drop=True)

def sample(s):
    _d = features_coeff[s.name]
    sample_fn = getattr(stats, _d['type']).rvs
    return odo(sample_fn(*_d['coeff'], size=s.size), pd.Series)

def fillall0(df):
    return df.fillna(0)

def fillall(df):
    _x_base = df.loc[:,'Feature_1':'Feature_25'].copy()
    _x_base = _x_base.fillna(_x_base[feature_cats.keys()].apply(sample_per_distribution))
    _x_base = _x_base.fillna(_x_base[features_coeff.keys()].apply(sample))
    return _x_base

def fillall2(df):
    _x = df.copy()
    for features, samplefn in [(feature_cats.keys(), sample_per_distribution),
                               (features_coeff.keys(), sample)]:
        if not set(features).issubset(_x.columns):
            continue
        _x = _x.fillna(_x[features].apply(samplefn))

    start_ret = 'Ret_121'
    if 'Ret_2' in _x.columns:
        start_ret = 'Ret_2'
    end_ret = 'Ret_120'
    if 'Ret_180' in _x.columns:
        end_ret = 'Ret_180'

    _x.loc[:,start_ret:end_ret].fillna(method='ffill', inplace=True)
    _x.fillna(0,inplace=True)
    return _x

class NormRegressor(BaseEstimator, RegressorMixin):

    def fit(self, X, y, sample_weight=None):
        y = check_array(y, ensure_2d=False)
        if len(y) == 0:
            raise ValueError("y must not be empty.")
        
        check_consistent_length(X, y, sample_weight)
        
        self._coeff = odo(y, pd.DataFrame).apply(stats.norm.fit)
        return self

    def predict(self, X):
        if not hasattr(self, "_coeff"):
            raise ValueError("NormRegressor not fitted.")

        out = self._coeff.apply(
            lambda x: pd.Series(stats.norm.rvs(x[0], 0.001*x[1], size=X.shape[0]))).T
        return out.values

    
class NormRowRegressor(BaseEstimator, RegressorMixin):

    def fit(self, X, y, sample_weight=None):
        return self

    def predict(self, X):
        out = pd.DataFrame()

        daily_mean = .001*X.loc[:,'Ret_MinusTwo':'Ret_MinusOne'].mean(axis=1)
        out['Ret_PlusOne'] = out['Ret_PlusTwo'] = daily_mean   

        intra_mean = .001*X.loc[:, 'Ret_2':'Ret_120'].mean(axis=1)
        for idx in xrange(121,181):
            out['Feature_{}'.format(idx)] = intra_mean

        return out.values


# creates a regressor with customizable inputs/features,
# predicts the median of the outputs/targets
class FeatureMeanRegressor(BaseEstimator, RegressorMixin):
    
    def __init__(self, features=None, regressor=None):
        if regressor is None:
            regressor = LinearRegression()
        self._regressor = regressor
        if features is None:
            features = filter(lambda x: x.startswith('Feature'),list(df.columns))
        self._features = list(features)
        super(FeatureMeanRegressor, self).__init__()
    
    def fit(self, X, y, sample_weight=None):
        _y = X.loc[:,'Ret_2':'Ret_180'].median(axis=1)
        _x = X[self._features]
        self._clf = self._regressor.fit(_x, _y)
        return self
    
    def predict(self, X):
        _x = X[self._features]
        out = pd.DataFrame() 
        means = self._clf.predict(_x)
        for idx in xrange(121,181):
            out['Ret_{}'.format(idx)] = means
        out['Ret_PlusOne'] = out['Ret_PlusTwo'] = 0
        return out.values


# creates a regressor with customizable inputs/features,
# predicts the median of the outputs/targets
class FeatureMeanRegressor2(BaseEstimator, RegressorMixin):
    
    def __init__(self, features=None, outputs=None, regressor=None):
        if regressor is None:
            regressor = LinearRegression()
        self._regressor = regressor
        if features is None:
            features = filter(lambda x: x.startswith('Feature'),list(df.columns))
        self._features = list(features)
        if outputs is None:
            outputs = ['Ret_121','Ret_180']
        self._output_start = outputs[0]
        self._output_end = outputs[1]
        super(FeatureMeanRegressor2, self).__init__()

    def _get_output(self, X):
        return X.loc[:,self._output_start:self._output_end]
    
    def fit(self, X, y, sample_weight=None):
        _y = self._get_output(X).median(axis=1)
        _x = X[self._features]
        self._clf = self._regressor.fit(_x, _y)
        return self
    
    def predict(self, X):
        _x = X[self._features]
        out = pd.DataFrame(columns=df.loc[:,'Ret_MinusTwo':'Ret_PlusTwo'].columns)
        means = self._clf.predict(_x)
        
        subcols = out.loc[:,self._output_start:self._output_end].columns
        for subcol in subcols:
            out[subcol] = means

        #out = out.fillna(0)
        return out.loc[:,'Ret_121':'Ret_PlusTwo'].values    


# creates a regressor with customizable inputs/features,
# predicts the median of the outputs/targets
class RetMeanRegressor(BaseEstimator, RegressorMixin):
    
    def __init__(self, returns=None, features=None, outputs=None, regressor=None):
        if regressor is None:
            regressor = LinearRegression()
        self._regressor = regressor
        if returns is None:
            returns = ['Ret_2','Ret_120']
        self._returns = returns
        if features is None:
            features = filter(lambda x: x.startswith('Feature'),list(df.columns))
        self._features = list(features)
        if outputs is None:
            outputs = ['Ret_2','Ret_180']
        self._output_start = outputs[0]
        self._output_end = outputs[1]
        super(RetMeanRegressor, self).__init__()

    def _get_output(self, X):
        return X.loc[:,self._output_start:self._output_end]
    
    def _get_returns(self, X):
        return odo(X.loc[:,self._returns[0]:self._returns[1]].median(axis=1), pd.DataFrame)
    
    def _get_features(self, X):
        rets = self._get_returns(X)
        return pd.concat([rets, X[self._features]], axis=1)
    
    def fit(self, X, y, sample_weight=None):
        _y = self._get_output(X).median(axis=1)
        _x = self._get_features(X)
        self._clf = self._regressor.fit(_x, _y)
        return self
    
    def predict(self, X):
        _x = self._get_features(X)
        out = pd.DataFrame(columns=df.loc[:,'Ret_MinusTwo':'Ret_PlusTwo'].columns)
        means = self._clf.predict(_x)
        
        subcols = out.loc[:,self._output_start:self._output_end].columns
        for subcol in subcols:
            out[subcol] = means

        #out = out.fillna(0)
        return out.loc[:,'Ret_121':'Ret_PlusTwo'].values    
    

# combines DummyRegressor and NormRowRegressor
class JointRegressor(BaseEstimator, RegressorMixin):
    
    def fit(self, X, y, weights=None):
        self._weights = weights
        self._dr = DummyRegressor(strategy='median')
        self._dr.fit(X, y)
        self._nrr = NormRowRegressor()
        self._nrr.fit(X, y)
        return self
    
    def predict(self, X):
        out1 = self._dr.predict(X)
        out2 = self._nrr.predict(X)
        return np.average(np.swapaxes(np.array([out1, out2]), 0, 1),
                          axis=1, weights=self._weights)


# combines DummyRegressor, NormRowRegressor, and other arbitrary regressors with fixed weighting
class JointRegressor2(BaseEstimator, RegressorMixin):
    
    def _get_xrtns(self, X):
        X_returns = X.loc[:,'Ret_MinusTwo':'Ret_120'].reset_index(drop=True)
        return X_returns.fillna(X_returns.apply(sample_per_distribution))
    
    def fit(self, X, y, weights=None, other_clf=None):
        X_returns = self._get_xrtns(X)
        self._weights = weights
        self._dr = DummyRegressor(strategy='median')
        self._dr.fit(X, y)
        self._nrr = NormRowRegressor()
        self._nrr.fit(X, y)
        
        self._other_clf = {}
        
        if other_clf:
            for feature, clf in other_clf.items():
                _x = X_returns.copy()
                _x[feature] = X[feature]
                _x = _x.dropna(subset=[feature])

                if not _x.size:
                    continue

                _y = y.loc[_x.index]
                clf.fit(_x, _y)
                self._other_clf[feature] = clf

        return self
    
    def predict(self, X):
        X_returns = self._get_xrtns(X)
        out1 = self._dr.predict(X_returns)
        out2 = self._nrr.predict(X_returns)
        avg = odo(np.average(np.swapaxes(np.array([out1, out2]), 0, 1),
                             axis=1, weights=self._weights),pd.DataFrame)
    
        avg1 = pd.DataFrame()
        for feature, clf in self._other_clf.items():
            _x = X_returns.copy()
            _x[feature] = X[feature]
            _x = _x.dropna(subset=[feature])
            
            if not _x.size:
                continue

            _p = odo(clf.predict(_x), pd.DataFrame)
            _p.index = _x.index
            avg1 = (avg1).add(_p, fill_value=0)
            
        avg = (.999*avg).add(.001*avg1, fill_value=0)
        
        return avg.values


# combines arbitrary regressors, can pass in weights, fillna functions, and fit_params
class JointRegressor3(BaseEstimator, RegressorMixin):
    
    def fit(self, X, y, regressors=None, weights=None, fillfn=fillall0, fit_params=None):
        self._fillfn = fillfn
        X = self._fillfn(X)
        y = self._fillfn(y)
        if regressors is None:
            raise Exception('You done goofed!')        
        self._weights = weights
        self._regressors = []       
        
        if fit_params is None:
            fit_params = [{}]*len(regressors)

        reg_and_params = zip(regressors, fit_params)
        for clf, _fit_params in reg_and_params:
            clf.fit(X, y, **_fit_params)
            self._regressors.append(clf)
        return self
    
    def predict(self, X):
        X = self._fillfn(X)
        outs = []
        for regressor in self._regressors:
            outs.append(regressor.predict(X))
        
        arr = np.swapaxes(np.array(outs,dtype=np.float32), 0, 1)
        marr = np.ma.masked_array(arr,np.isnan(arr))
        avg = np.ma.average(marr, axis=1, weights=self._weights)
        return avg.filled(0)


from statsmodels.tsa.arima_model import ARMA

class ARMARegressor(BaseEstimator, RegressorMixin):
    def fit(self, X, y, sample_weight=None):
        return self

    def predict(self, X):
        out = pd.DataFrame()

        def arma_predict(s):
            try:
                am = ARMA(s.values, (3, 2)).fit(warn_convergence=False)
                p = am.predict(119,178)
                return odo(np.concatenate([p,[0,0]]), pd.Series)
            except:
                return odo([0]*62, pd.Series)

        _X = X.loc[:, 'Ret_2':'Ret_120']
        r = _X.apply(arma_predict, axis=1)
        return r.values

In [7]:
# baseline using a simple regressor that returns the median of target values
from sklearn import cross_validation
clf = DummyRegressor(strategy='median')
scores = cross_validation.cross_val_score(
    clf, X.fillna(0), Y.fillna(0), cv=5, scoring=wmae_score)
print("%s Accuracy: %0.5f (+/- %0.5f)" % (
        clf.__class__.__name__, scores.mean(), scores.std() * 2))

DummyRegressor Accuracy: 1773.78450 (+/- 27.11001)


In [9]:
from sklearn import cross_validation

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

feats1 = set(df.loc[:,'Feature_1':'Feature_25'].columns) - nuke_features
catfeats1 = set(feature_cats) - nuke_features

def testit(clf, fit_params={}):
    scores = cross_validation.cross_val_score(
        clf, X, Y, cv=5, scoring=wmae_score, fit_params=fit_params)
    print("%s Accuracy: %0.5f (+/- %0.5f)" % (
            clf.__class__.__name__, scores.mean(), scores.std() * 2))

fit_params = {
    'weights':[1,1],
    'fillfn':fillall2,
    'regressors': [
        DummyRegressor(strategy='median'),
        NormRowRegressor()
    ]
}

clf = JointRegressor3()
testit(clf, fit_params)

JointRegressor3 Accuracy: 1773.83220 (+/- 26.93983)


In [10]:
fit_params = {
    'weights':[1,1,1,1],
    'regressors': [
        DummyRegressor(strategy='median'),
        FeatureMeanRegressor2(feats1),
        FeatureMeanRegressor2(feats1, ['Ret_PlusTwo','Ret_PlusTwo'],
                              DecisionTreeRegressor(min_samples_leaf=100)),
        FeatureMeanRegressor2(feats1, ['Ret_PlusOne','Ret_PlusOne'],
                              DecisionTreeRegressor(min_samples_leaf=100))
    ]
}

clf = JointRegressor3()
testit(clf, fit_params)

JointRegressor3 Accuracy: 1737.13655 (+/- 30.25603)


In [11]:
msl = 200
n_estimators = 100

fit_params = {
    'weights':[8,1, 1,1,1,1,1,1,1, 1,1,1,1,1,1, 1,1],
    'regressors': [
        DummyRegressor(strategy='median'),
        FeatureMeanRegressor2(feats1),

        RetMeanRegressor(['Ret_MinusTwo','Ret_MinusOne']),
        RetMeanRegressor(['Ret_2','Ret_120'], ['Ret_121','Ret_130']),
        RetMeanRegressor(['Ret_12','Ret_120'], ['Ret_131','Ret_140']),
        RetMeanRegressor(['Ret_22','Ret_120'], ['Ret_141','Ret_150']),
        RetMeanRegressor(['Ret_32','Ret_120'], ['Ret_151','Ret_160']),
        RetMeanRegressor(['Ret_42','Ret_120'], ['Ret_161','Ret_170']),
        RetMeanRegressor(['Ret_52','Ret_120'], ['Ret_171','Ret_180']),
        
        FeatureMeanRegressor2(feats1, ['Ret_121','Ret_130'],
                              DecisionTreeRegressor(min_samples_leaf=msl)),
        FeatureMeanRegressor2(feats1, ['Ret_131','Ret_140'],
                              DecisionTreeRegressor(min_samples_leaf=msl)),
        FeatureMeanRegressor2(feats1, ['Ret_141','Ret_150'],
                              DecisionTreeRegressor(min_samples_leaf=msl)),
        FeatureMeanRegressor2(feats1, ['Ret_151','Ret_160'],
                              DecisionTreeRegressor(min_samples_leaf=msl)),
        FeatureMeanRegressor2(feats1, ['Ret_161','Ret_170'],
                              DecisionTreeRegressor(min_samples_leaf=msl)),
        FeatureMeanRegressor2(feats1, ['Ret_171','Ret_180'],
                              DecisionTreeRegressor(min_samples_leaf=msl)),

        FeatureMeanRegressor2(
            feats1, ['Ret_PlusTwo','Ret_PlusTwo'],
            GradientBoostingRegressor(n_estimators=1000, min_samples_leaf=msl)),        
        FeatureMeanRegressor2(
            feats1, ['Ret_PlusOne','Ret_PlusOne'],
            GradientBoostingRegressor(n_estimators=1000, min_samples_leaf=msl))
    ]
}

clf = JointRegressor3()
testit(clf, fit_params)

JointRegressor3 Accuracy: 1764.69820 (+/- 26.99498)


In [12]:
msl = 200
ne = 100

def build_fit_params():
    fit_params = {
        'weights':[1,1,1,1,1],
        'fillfn':fillall2,
        'regressors': [
            FeatureMeanRegressor2(feats1),
            FeatureMeanRegressor2(
                feats1, ['Ret_PlusTwo','Ret_PlusTwo'],
                DecisionTreeRegressor(min_samples_leaf=msl)),
            FeatureMeanRegressor2(
                feats1, ['Ret_PlusOne','Ret_PlusOne'],
                DecisionTreeRegressor(min_samples_leaf=msl)),
            FeatureMeanRegressor2(
                feats1, ['Ret_PlusTwo','Ret_PlusTwo'],
                GradientBoostingRegressor(n_estimators=ne,min_samples_leaf=msl)),
            FeatureMeanRegressor2(
                feats1, ['Ret_PlusOne','Ret_PlusOne'],
                GradientBoostingRegressor(n_estimators=ne,min_samples_leaf=msl))
        ]
    }
    return fit_params


fit_params = {
    'weights':[16,1,1,1],
    'regressors': [
        DummyRegressor(strategy='median'),
        JointRegressor3(),
        JointRegressor3(),
        JointRegressor3()
    ],
    'fit_params': [
        {},
        build_fit_params(),
        build_fit_params(),
        build_fit_params()
    ]
}

clf = JointRegressor3()
testit(clf, fit_params)

JointRegressor3 Accuracy: 1765.09410 (+/- 27.59929)


In [None]:
# write out predictions to file
clf.fit(X, Y, **fit_params)
Y_hat = clf.predict(X_test)
write_predictions(Y_hat)