Permalink
Browse files

Hash caching fix. New feature tests

  • Loading branch information...
1 parent a1bac75 commit 8f92a5cb9a8d49ec8db980d63092d051a630a6bc @kvh committed Oct 31, 2012
Showing with 34 additions and 84 deletions.
  1. +11 −12 ramp/features/base.py
  2. +0 −59 ramp/new_tests/test_features.py
  3. +23 −13 ramp/tests/test_features.py
View
@@ -118,7 +118,6 @@ def __init__(self, features):
if cname.endswith('Feature'):
cname = cname[:-7]
self._name = cname
- self._hash_cache = None
def __getstate__(self):
# shallow copy dict and keep references
@@ -132,12 +131,8 @@ def __repr__(self):
return stable_repr(self)
def _hash(self):
- if self._hash_cache is not None:
- return self._hash_cache
s = repr(self)
- self._hash_cache = md5(s).hexdigest()[:self.hash_length]
- return self._hash_cache
-
+ return md5(s).hexdigest()[:self.hash_length]
@property
def unique_name(self):
@@ -162,7 +157,7 @@ def __str__(self):
def _remove_hashes(self, s):
return self.re_hsh.sub('', s)
- def column_rename(self, existing_name):
+ def column_rename(self, existing_name, hsh=None):
"""
like unique_name, but in addition must be unique to each column of this
feature. accomplishes this by prepending readable string to existing
@@ -172,11 +167,13 @@ def column_rename(self, existing_name):
existing_name = str(existing_name)
except UnicodeEncodeError:
pass
+ if hsh is None:
+ hsh = self._hash()
if self._name:
return '%s(%s) [%s]' %(self._name, self._remove_hashes(existing_name),
- self._hash())
+ hsh)
return '%s [%s]'%(self._remove_hashes(existing_name),
- self._hash())
+ hsh)
def depends_on_y(self):
return any([f.depends_on_y() for f in self.features])
@@ -222,7 +219,7 @@ def create_key(self):
tindex = get_np_hashable(self.context.train_index) if self.depends_on_y() else ''
pindex = get_np_hashable(self.context.prep_index) if self.depends_on_other_x() else ''
return self.unique_name + '--' + md5('%s--%s--%s' % (s, tindex, pindex)).hexdigest()
-
+
def create(self, context, force=False):
""" This is the prep for creating features. Has caching logic. """
@@ -254,7 +251,8 @@ def create(self, context, force=False):
def _create(self, datas):
data = self.combine(datas)
- data.columns = data.columns.map(self.column_rename)
+ hsh = self._hash() # cache this so we don't recompute for every column
+ data.columns = data.columns.map(lambda x: self.column_rename(x, hsh))
return data
@@ -268,7 +266,8 @@ def create_data(self, force):
data = self.feature.create(self.context, force)
#data = DataFrame(data.copy())
data = self._create(data)
- data.columns = data.columns.map(self.column_rename)
+ hsh = self._hash() # cache this so we don't recompute for every column
+ data.columns = data.columns.map(lambda x: self.column_rename(x, hsh))
return data
def _create(self, data):
@@ -1,59 +0,0 @@
-import sys
-sys.path.append('../..')
-from ramp import context, store
-from ramp.features import base
-import unittest
-from pandas import DataFrame, Series, Index
-import pandas
-import tempfile
-
-from sklearn import linear_model
-import numpy as np
-import os, sys
-
-from pandas.util.testing import assert_almost_equal
-
-
-def strip_hash(s):
- return s[:-11]
-
-def make_data(n):
- data = pandas.DataFrame(
- np.random.randn(n, 3),
- columns=['a','b','c'],
- index=range(10, n+10))
- data['const'] = np.zeros(n)
- data['ints'] = range(n)
- data['y'] = data['a'] ** 2
- return data
-
-
-class TestBasicFeature(unittest.TestCase):
- def setUp(self):
- self.data = make_data(10)
-
- def test_create_cache(self):
- f = base.Normalize(base.F(10) + base.F('a'))
- ctx = context.DataContext(store.MemoryStore('test', verbose=True), self.data)
- r = f.create(ctx)
- r = r[r.columns[0]]
- self.assertAlmostEqual(r.mean(), 0)
- self.assertAlmostEqual(r.std(), 1)
-
- # now add some new data
- idx = len(self.data) + 1000
- ctx.data = ctx.data.append(DataFrame([100, 200], columns=['a'], index=Index([idx, idx+1])))
- r = f.create(ctx)
- r = r[r.columns[0]]
- self.assertAlmostEqual(r[idx], (100 - self.data['a'].mean()) / self.data['a'].std())
-
- # drop all the other data ... should still use old prep data
- ctx.data = ctx.data.ix[[idx, idx+1]]
- r = f.create(ctx)
- r = r[r.columns[0]]
- self.assertAlmostEqual(r[idx], (100 - self.data['a'].mean()) / self.data['a'].std())
-
-
-
-if __name__ == '__main__':
- unittest.main()
@@ -1,9 +1,10 @@
import sys
sys.path.append('../..')
+from ramp import context, store
+from ramp.features import base
from ramp.features.base import *
-from ramp import store
-from ramp.context import *
import unittest
+from pandas import DataFrame, Series, Index
import pandas
import tempfile
@@ -12,7 +13,6 @@
import os, sys
from pandas.util.testing import assert_almost_equal
-from test_models import lm
def strip_hash(s):
@@ -30,6 +30,8 @@ def make_data(n):
class TestBasicFeature(unittest.TestCase):
+ def setUp(self):
+ self.data = make_data(10)
def test_basefeature(self):
f = BaseFeature('col1')
@@ -70,21 +72,29 @@ def test_feature(self):
self.assertEqual(repr(f), "Feature(_name='',"
"feature='col1',features=['col1'])")
+ def test_create_cache(self):
+ f = base.Normalize(base.F(10) + base.F('a'))
+ ctx = context.DataContext(store.MemoryStore('test', verbose=True), self.data)
+ r = f.create(ctx)
+ r = r[r.columns[0]]
+ self.assertAlmostEqual(r.mean(), 0)
+ self.assertAlmostEqual(r.std(), 1)
+ # now add some new data
+ idx = len(self.data) + 1000
+ ctx.data = ctx.data.append(DataFrame([100, 200], columns=['a'], index=Index([idx, idx+1])))
+ r = f.create(ctx)
+ r = r[r.columns[0]]
+ self.assertAlmostEqual(r[idx], (100 - self.data['a'].mean()) / self.data['a'].std())
-class TestFeatureCreate(unittest.TestCase):
-
- def setUp(self):
- n = 100
- self.n = n
- self.data = make_data(n)
- self.context = DataContext(store.MemoryStore(), self.data)
-
-
+ # drop all the other data ... should still use old prep data
+ ctx.data = ctx.data.ix[[idx, idx+1]]
+ r = f.create(ctx)
+ r = r[r.columns[0]]
+ self.assertAlmostEqual(r[idx], (100 - self.data['a'].mean()) / self.data['a'].std())
if __name__ == '__main__':
unittest.main()
-

0 comments on commit 8f92a5c

Please sign in to comment.