In [1]:
%matplotlib inline
import pandas
import os
import collections
import numpy

from train_test import find_files, parse_dates, load_series, load_data, separate_output

# Load the training set from disk with resample_interval="1H" (the resulting
# index is hourly). The recorded run of this cell reports that load_data took
# 1.4 minutes, so avoid re-running it needlessly.
data = load_data(
    "../data/v2/train_set/",
    resample_interval="1H",
    filter_null_power=True,
    derived_features=False,
)

# Split the loaded frame into the model inputs and the targets used by the
# cross-validation cells below.
X_train, Y_train = separate_output(data)

Using Theano backend.


load_data took 1.4 minutes


In [2]:
# Summarize the loaded frame: index range, number of columns, and the
# non-null count and dtype of every column.
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 49125 entries, 2008-08-22 00:00:00 to 2014-04-13 23:00:00
Data columns (total 82 columns):
NPWD2372                                   49125 non-null float64
NPWD2401                                   49125 non-null float64
NPWD2402                                   49125 non-null float64
NPWD2451                                   49125 non-null float64
NPWD2471                                   49125 non-null float64
NPWD2472                                   49125 non-null float64
NPWD2481                                   49125 non-null float64
NPWD2482                                   49125 non-null float64
NPWD2491                                   49125 non-null float64
NPWD2501                                   49125 non-null float64
NPWD2531                                   49125 non-null float64
NPWD2532                                   49125 non-null float64
NPWD2551                                   49125 non-null float6

In [3]:
import sklearn.cross_validation
import sklearn.preprocessing
import sklearn.linear_model
import sklearn.dummy
from operator import itemgetter
from train_test import score_feature, rms_error

# Seven unshuffled folds over the rows of the loaded frame; the same splits
# object is reused for the baseline and for every per-feature score so the
# numbers are comparable.
n_rows = data.shape[0]
splits = sklearn.cross_validation.KFold(n_rows, 7, shuffle=False)

# Baseline: a dummy regressor using the "mean" strategy, scored with rms_error
# on each fold. The fold scores are averaged and the sign is flipped.
# NOTE(review): presumably rms_error returns negated errors (greater-is-better
# scorer convention) - confirm in train_test.
baseline_model = sklearn.dummy.DummyRegressor("mean")
baseline_fold_scores = sklearn.cross_validation.cross_val_score(
    baseline_model,
    X_train,
    Y_train,
    scoring=rms_error,
    cv=splits,
)
baseline_rms = -baseline_fold_scores.mean()

# One score per input column, each computed from that single column alone.
feature_scores = dict(
    (f, score_feature(X_train[f], Y_train, splits))
    for f in X_train.columns
)



In [4]:
# Rank the features by ascending score and print each one's margin against the
# baseline (baseline_rms - score), so the largest margins are listed first.
# print(...) with a single argument and dict.items() behave the same on
# Python 2 and Python 3, unlike the print statement and dict.iteritems().
ranked_features = sorted(feature_scores.items(), key=itemgetter(1))
for feature, score in ranked_features:
    print("{:30s}: {:.4f} over baseline".format(feature, baseline_rms - score))

eclipseduration_min           : 0.0042 over baseline
eclipseduration_min_rolling_2d: 0.0042 over baseline
eclipseduration_min_rolling_5d: 0.0042 over baseline
sz                            : 0.0027 over baseline
EVTF_altitude                 : 0.0024 over baseline
days_in_space                 : 0.0022 over baseline
sx                            : 0.0020 over baseline
flagcomms_rolling_1h          : 0.0019 over baseline
occultationduration_min       : 0.0017 over baseline
flagcomms_rolling_2h          : 0.0016 over baseline
EVTF_IN_MAR_UMBRA_rolling_1h  : 0.0016 over baseline
SAAF_stddev_8d                : 0.0015 over baseline
SAAF_stddev_1d                : 0.0013 over baseline
FTL_SLEW_rolling_1h           : 0.0012 over baseline
FTL_EARTH_rolling_1h          : 0.0010 over baseline
sa                            : 0.0009 over baseline
DMOP_ACFE03A_under_0.5h_ago_rolling_1h: 0.0009 over baseline
FTL_SLEW_rolling_2h           : 0.0009 over baseline
FTL_EARTH_rolling_2h          : 0.0008

In [5]:
from collections import defaultdict

# For every input feature, score a set of candidate transforms of that feature
# and record how each compares with the untransformed column. The stored value
# is feature_scores[f] - score_feature(transformed), so a positive number means
# the transformed column scored lower than the raw one.

# Rolling-mean window lengths, counted in rows of X_train.
ROLLING_WINDOWS = [10, 25, 50, 100, 200, 400, 800, 1600]

derived_scores = defaultdict(dict)
for f in X_train.columns:
    column = X_train[f]
    base_score = feature_scores[f]

    # (key, transformed series) pairs, kept in a list so they are scored in a
    # fixed order.
    candidates = []
    for win in ROLLING_WINDOWS:
        # Trailing mean; the leading NaNs left by the incomplete windows are
        # back-filled from the first complete window.
        smoothed = column.rolling(win).mean().bfill()
        candidates.append(("rolling_{:3d}".format(win), smoothed))

    # NOTE(review): log yields NaN/-inf for values <= -1 and sqrt yields NaN for
    # negative values; how score_feature treats those is not visible here.
    candidates.append(("log", numpy.log(column + 1)))
    candidates.append(("sqrt", numpy.sqrt(column)))
    candidates.append(("square", numpy.square(column)))
    candidates.append(("tanh", numpy.tanh(column)))

    # numpy.gradient returns a plain ndarray, so the index has to be restored
    # explicitly; otherwise this candidate alone would carry a 0..n-1 index
    # instead of the index shared by X_train and every other candidate.
    gradient = pandas.Series(numpy.gradient(column.values), index=column.index)
    candidates.append(("gradient", gradient))

    for transform, transformed in candidates:
        derived_scores[f][transform] = base_score - score_feature(transformed, Y_train, splits)


In [6]:
# For each base feature (alphabetical order), list the transforms whose gain
# over the raw feature exceeds min_gain, best first. Each line also expresses
# that gain as a percentage of the raw feature's own margin over the baseline.
# Uses print(...) and dict.items() so the cell runs on Python 2 and Python 3.
min_gain = 0.0002
for base_feature in sorted(derived_scores):
    print(base_feature)

    # Margin of the untransformed feature against the baseline; this is the
    # denominator of the relative figure and is the same for every transform.
    base_gain = baseline_rms - feature_scores[base_feature]

    ranked = sorted(derived_scores[base_feature].items(), key=itemgetter(1), reverse=True)
    for transform, score in ranked:
        if score > min_gain:
            # A feature that exactly ties the baseline has no margin to compare
            # against; report an infinite relative gain instead of dividing by
            # zero (score is positive here because it exceeds min_gain).
            rel_improvement = score / base_gain if base_gain != 0 else float("inf")
            print("\t{:10s}: {:.5f} over base. Relative to base feature vs baseline {:.0f}%".format(
                transform,
                score,
                100 * rel_improvement))

DMOP_ACFE03A_under_0.5h_ago_rolling_1h
DMOP_MMMF01A0_under_0.5h_ago_rolling_1h
	rolling_800: 0.00184 over base. Relative to base feature vs baseline 1085%
	rolling_1600: 0.00170 over base. Relative to base feature vs baseline 1004%
	rolling_400: 0.00149 over base. Relative to base feature vs baseline 877%
	rolling_200: 0.00117 over base. Relative to base feature vs baseline 688%
	rolling_100: 0.00099 over base. Relative to base feature vs baseline 584%
	rolling_ 25: 0.00090 over base. Relative to base feature vs baseline 531%
	rolling_ 50: 0.00086 over base. Relative to base feature vs baseline 508%
	gradient  : 0.00075 over base. Relative to base feature vs baseline 443%
	rolling_ 10: 0.00071 over base. Relative to base feature vs baseline 419%
DMOP_MMMF10A0_under_0.5h_ago_rolling_1h
	gradient  : 0.00197 over base. Relative to base feature vs baseline 1214%
DMOP_event_counts
	gradient  : 0.00050 over base. Relative to base feature vs baseline 128%
DMOP_event_counts_rolling_2h
	gradien

In [10]:
import numpy
# Scratch check of numpy.maximum(a, 0) as an element-wise clamp at zero:
# draw a 5x2 array of values in [-0.5, 0.5) and print it next to its clamped
# copy. No seed is set, so the printed numbers differ on every run.
# print(...) with a single argument works on both Python 2 and Python 3.
a = numpy.random.rand(5, 2) - 0.5
print(a)

a_clamped = numpy.maximum(a, 0)
print(a_clamped)

[[-0.08414971  0.4672089 ]
 [-0.08916447  0.17019319]
 [ 0.3670955   0.30133396]
 [-0.43574754  0.16329424]
 [ 0.36180119 -0.15390023]]
[[ 0.          0.4672089 ]
 [ 0.          0.17019319]
 [ 0.3670955   0.30133396]
 [ 0.          0.16329424]
 [ 0.36180119  0.        ]]
