In [1]:
%matplotlib inline
import pandas
import os
import collections
import numpy

from train_test import find_files, parse_dates, load_series, load_data, separate_output

# Load the v2 training set at a one-hour resample interval, with null-power
# filtering enabled and the loader's derived features disabled, then split
# the frame into model inputs and targets.
data = load_data(
    "../data/v2/train_set/",
    resample_interval="1H",
    filter_null_power=True,
    derived_features=False,
)
X_train, Y_train = separate_output(data)

Using Theano backend.


Reduced data from 49,464 rows to 49,125


In [2]:
# Sanity check: show the class name of the object returned by load_data.
data.__class__.__name__

'DataFrame'

In [3]:
# Summarise the loaded frame: index range, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 49125 entries, 2008-08-22 00:00:00 to 2014-04-13 23:00:00
Data columns (total 89 columns):
NPWD2372                                  49125 non-null float64
NPWD2401                                  49125 non-null float64
NPWD2402                                  49125 non-null float64
NPWD2451                                  49125 non-null float64
NPWD2471                                  49125 non-null float64
NPWD2472                                  49125 non-null float64
NPWD2481                                  49125 non-null float64
NPWD2482                                  49125 non-null float64
NPWD2491                                  49125 non-null float64
NPWD2501                                  49125 non-null float64
NPWD2531                                  49125 non-null float64
NPWD2532                                  49125 non-null float64
NPWD2551                                  49125 non-null float64
NPWD2552   

In [4]:
import sklearn.cross_validation
import sklearn.preprocessing
import sklearn.linear_model
import sklearn.dummy
from operator import itemgetter
from train_test import score_feature, mse_to_rms

# Seed for the shuffled K-fold split. Without a fixed random_state the folds
# (and therefore every score computed from them) change on each kernel
# restart, which makes the rankings below impossible to reproduce.
CV_SEED = 0
splits = sklearn.cross_validation.KFold(data.shape[0], 5, shuffle=True, random_state=CV_SEED)

# Baseline: a regressor that always predicts the training mean, scored on the
# same folds and converted from per-fold MSE to RMS by mse_to_rms before
# averaging.
baseline_model = sklearn.dummy.DummyRegressor("mean")
baseline_mse = sklearn.cross_validation.cross_val_score(
    baseline_model, X_train, Y_train, scoring="mean_squared_error", cv=splits)
baseline_rms = mse_to_rms(baseline_mse).mean()

# Score every input column on its own, reusing the same folds so that the
# per-feature scores are comparable with each other and with the baseline.
feature_scores = {
    name: score_feature(X_train[name], Y_train, splits)
    for name in X_train.columns
}



In [5]:
# Rank the features by the score returned from score_feature (ascending) and
# report each one's margin relative to the constant-mean baseline.
# print(...) with a single argument and dict.items() produce the same output
# on Python 2 and also run on Python 3, unlike the print statement and
# dict.iteritems().
ranked_features = sorted(feature_scores.items(), key=itemgetter(1))
for feature, score in ranked_features:
    print("{:30s}: {:.4f} over baseline".format(feature, baseline_rms - score))

eclipseduration_min           : 0.0056 over baseline
eclipseduration_min_rolling_2d: 0.0055 over baseline
eclipseduration_min_rolling_5d: 0.0055 over baseline
days_in_space                 : 0.0042 over baseline
EVTF_event_counts             : 0.0034 over baseline
sa                            : 0.0028 over baseline
flagcomms                     : 0.0027 over baseline
sunmarsearthangle_deg         : 0.0026 over baseline
EVTF_altitude                 : 0.0026 over baseline
EVTF_event_counts_rolling_2h  : 0.0025 over baseline
flagcomms_rolling_1h          : 0.0024 over baseline
sx                            : 0.0023 over baseline
occultationduration_min       : 0.0022 over baseline
flagcomms_rolling_2h          : 0.0022 over baseline
EVTF_IN_MAR_UMBRA             : 0.0020 over baseline
DMOP_event_counts             : 0.0020 over baseline
FTL_EARTH                     : 0.0020 over baseline
sz                            : 0.0019 over baseline
sunmars_km                    : 0.0019 over ba

In [13]:
from collections import defaultdict

# Window lengths (in rows of X_train) tried for the rolling-mean transform.
ROLLING_WINDOWS = [10, 25, 50, 100, 200, 400]


def _gain_over_raw(feature_name, transformed):
    """Return the raw feature's score minus the score of its transformed version.

    Both scores come from score_feature on the shared `splits`, so the value
    is positive when the transformed column scores lower than the raw one.
    """
    return feature_scores[feature_name] - score_feature(transformed, Y_train, splits)


# derived_scores[feature][transform] -> gain of that transform over the raw feature.
derived_scores = defaultdict(dict)
for f in X_train.columns:
    column = X_train[f]

    for win in ROLLING_WINDOWS:
        # Back-fill the leading NaNs that the rolling window leaves at the start.
        smoothed = column.rolling(win).mean().fillna(method="bfill")
        derived_scores[f]["rolling_{:3d}".format(win)] = _gain_over_raw(f, smoothed)

    derived_scores[f]["log"] = _gain_over_raw(f, numpy.log(column + 1))
    derived_scores[f]["sqrt"] = _gain_over_raw(f, numpy.sqrt(column))
    derived_scores[f]["square"] = _gain_over_raw(f, numpy.square(column))

    # numpy.gradient returns a bare ndarray; wrapping it in a Series without an
    # explicit index would replace the original index with 0..n-1 and leave the
    # derived column labelled differently from Y_train. Re-attach the source
    # index for both array-based transforms so every candidate is aligned the
    # same way.
    derived_scores[f]["tanh"] = _gain_over_raw(
        f, pandas.Series(numpy.tanh(column.values), index=column.index))
    derived_scores[f]["gradient"] = _gain_over_raw(
        f, pandas.Series(numpy.gradient(column.values), index=column.index))


In [14]:
# Only transforms that beat the raw feature by more than this margin are listed.
MIN_GAIN = 0.0002

# For every base feature, list the transforms that improved on it (largest
# gain first), together with that gain expressed as a percentage of the raw
# feature's own margin over the baseline.
# Written with print(...) and dict.items() so the cell produces the same
# output on Python 2 and also runs on Python 3.
for base_feature in sorted(derived_scores):
    print(base_feature)

    # Margin of the raw feature over the baseline; constant for all transforms
    # of this feature, so compute it once.
    base_gain = baseline_rms - feature_scores[base_feature]

    ranked = sorted(derived_scores[base_feature].items(), key=itemgetter(1), reverse=True)
    for transform, score in ranked:
        if score > MIN_GAIN:
            # A feature that exactly ties the baseline has no margin to compare
            # against; report NaN instead of dividing by zero.
            rel_improvement = score / base_gain if base_gain != 0 else float("nan")
            print("\t{:10s}: {:.5f} over base. Relative to base feature vs baseline {:.0f}%".format(
                transform, score, 100 * rel_improvement))

DMOP_event_counts
	log       : 0.00049 over base. Relative to base feature vs baseline 25%
	sqrt      : 0.00048 over base. Relative to base feature vs baseline 24%
DMOP_event_counts_rolling_2h
	log       : 0.00034 over base. Relative to base feature vs baseline 22%
	sqrt      : 0.00029 over base. Relative to base feature vs baseline 18%
DMOP_event_counts_rolling_5h
	gradient  : 0.00063 over base. Relative to base feature vs baseline 102%
	rolling_400: 0.00036 over base. Relative to base feature vs baseline 58%
	rolling_200: 0.00022 over base. Relative to base feature vs baseline 36%
EVTF_IN_MAR_UMBRA
	rolling_400: 0.00213 over base. Relative to base feature vs baseline 107%
	rolling_200: 0.00153 over base. Relative to base feature vs baseline 77%
	rolling_100: 0.00134 over base. Relative to base feature vs baseline 67%
	rolling_ 50: 0.00124 over base. Relative to base feature vs baseline 63%
	rolling_ 25: 0.00096 over base. Relative to base feature vs baseline 49%
	rolling_ 10: 0.00087

In [20]:
# Scratch check: draw 3 distinct values from range(5).
# NOTE(review): no seed is set in the visible cells, so the recorded output is not reproducible.
numpy.random.choice(5, 3, replace=False)

array([2, 4, 3])