Skip to content

Commit

Permalink
allow recasting dt variables
Browse files Browse the repository at this point in the history
  • Loading branch information
jpn-- committed Jan 19, 2017
1 parent f4ff30a commit a6f1d49
Show file tree
Hide file tree
Showing 4 changed files with 166 additions and 9 deletions.
64 changes: 57 additions & 7 deletions py/dt/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2506,11 +2506,44 @@ def new_idco(self, name, expression, dtype=numpy.float64, *, overwrite=False, ti
if title is not None:
self.idco[name]._v_attrs.TITLE = title

def recast_idco(self, name, newtype, invalid_values=()):
    """
    Replace an existing idco variable with a copy cast to another dtype.

    The current values are read and converted with ``astype``, the old
    node is removed, a new node is created from the converted array, and
    the attributes saved from the old node are written onto the new one.

    Parameters
    ----------
    name : str
        The name of the idco variable to recast.
    newtype : dtype
        The dtype to which the existing values are coerced.
    invalid_values : tuple
        Values to overwrite with NaN after the cast.  Only applied when
        the resulting array has a floating point dtype; ignored otherwise.
    """
    recast = self.idco[name][:].astype(newtype)
    if recast.dtype.kind == 'f':
        for bad in invalid_values:
            recast[recast == bad] = numpy.nan

    # Save the node attributes before the node is removed.  CLASS and
    # VERSION are skipped -- presumably they are maintained by PyTables
    # itself for the newly created node (TODO confirm).
    old_attrs = self.idco[name]._v_attrs
    saved = {
        key: old_attrs[key]
        for key in old_attrs._v_attrnames
        if key not in ('CLASS', 'VERSION')
    }

    try:
        self.h5f.remove_node(self.idco._v_node, name, False)
    except _tb.exceptions.NoSuchNodeError:
        # Nothing to remove; carry on and create the node below.
        pass

    self.new_idco_from_array(name, recast)

    new_attrs = self.idco[name]._v_attrs
    for key, value in saved.items():
        new_attrs[key] = value



def new_blank_idco(self, name, dtype=None, overwrite=False, title=None):
    """
    Create a new idco variable filled with zeros.

    Parameters
    ----------
    name : str
        The name of the new variable.
    dtype : dtype, optional
        The dtype of the zero array.  Defaults to ``numpy.float64`` when
        not given.
    overwrite : bool
        Passed through to ``new_idco_from_array``.
    title : str, optional
        Passed through to ``new_idco_from_array``.

    Returns
    -------
    Whatever ``new_idco_from_array`` returns.
    """
    # Test against None explicitly rather than using ``dtype or default``:
    # a non-structured ``numpy.dtype`` instance has length zero and can
    # therefore evaluate as false, which would silently discard it.
    if dtype is None:
        dtype = numpy.float64
    zer = numpy.zeros(self.nAllCases(), dtype=dtype)
    return self.new_idco_from_array(name, zer, overwrite=overwrite, title=title)

def new_idco_from_array(self, name, arr, *, overwrite=False, original_source=None, rel_original_source=True, title=None):
def new_idco_from_array(self, name, arr, *, overwrite=False, original_source=None, rel_original_source=True, title=None, dictionary=None):
"""Create a new :ref:`idco` variable.
Creating a new variable in the data might be convenient in some instances.
Expand Down Expand Up @@ -2579,13 +2612,15 @@ def convert_datetime_time_to_epoch_seconds(tm):
h5var[:] = arr
else:
raise
if rel_original_source and original_source:
if rel_original_source and original_source and original_source[0]!='=':
basedir = os.path.dirname(self.source_filename)
original_source = os.path.relpath(original_source, start=basedir)
if original_source is not None:
self.idco[name]._v_attrs.ORIGINAL_SOURCE = original_source
if title is not None:
self.idco[name]._v_attrs.TITLE = title
if dictionary is not None:
self.idco[name]._v_attrs.DICTIONARY = dictionary


def merge_into_idco_from_dataframe(self, other, self_on, other_on, dupe_suffix="_copy", original_source=None, names=None, log=lambda *x: None,):
Expand Down Expand Up @@ -2861,7 +2896,7 @@ def new_blank_idca(self, name, nalts=None, dtype=None, overwrite=False, title=No
zer = numpy.zeros([self.nAllCases(), nalts], dtype=dtype or numpy.float64)
return self.new_idca_from_array(name, zer, overwrite=overwrite, title=title)

def new_idca_from_array(self, name, arr, overwrite=False, original_source=None, rel_original_source=True, title=None):
def new_idca_from_array(self, name, arr, overwrite=False, original_source=None, rel_original_source=True, title=None, dictionary=None):
"""Create a new :ref:`idca` variable.
Creating a new variable in the data might be convenient in some instances.
Expand Down Expand Up @@ -2893,13 +2928,15 @@ def new_idca_from_array(self, name, arr, overwrite=False, original_source=None,
if overwrite:
self.delete_data(name)
self.h5f.create_carray(self.idca._v_node, name, obj=arr)
if rel_original_source and original_source:
if rel_original_source and original_source and original_source[0]!='=':
basedir = os.path.dirname(self.source_filename)
original_source = os.path.relpath(original_source, start=basedir)
if original_source is not None:
self.idca[name]._v_attrs.ORIGINAL_SOURCE = original_source
if title is not None:
self.idca[name]._v_attrs.TITLE = title
if dictionary is not None:
self.idca[name]._v_attrs.DICTIONARY = dictionary

def new_idco_from_keyed_array(self, name, arr_val, arr_index, title=None):
"""Create a new :ref:`idco` variable.
Expand Down Expand Up @@ -3888,6 +3925,7 @@ def TempCopy(cls, filename, *args, **kwargs):

@classmethod
def Concat(cls, *subs, tags=None, tagname='casesource', **kwargs):
from ..util.arraytools import failable_iter_to_unique
self = cls(**kwargs)

def _getshape1(z):
Expand Down Expand Up @@ -3919,7 +3957,12 @@ def _getshape1(z):
present = numpy.asarray([(varname in sub.idco) for sub in subs])
if numpy.all(present):
arr = numpy.hstack(sub.idco[varname][:] for sub in subs)
self.new_idco_from_array(varname, arr)

title = failable_iter_to_unique(subs, lambda i: i.idco[varname]._v_attrs.TITLE)
original_source = failable_iter_to_unique(subs, lambda i: i.idco[varname]._v_attrs.ORIGINAL_SOURCE)
dictionary = failable_iter_to_unique(subs, lambda i: i.idco[varname]._v_attrs.DICTIONARY)

self.new_idco_from_array(varname, arr, title=title, original_source=original_source, dictionary=dictionary)
else:
#warnings.warn('idco variable "{}" in DT {} is lost'.format(varname, 0))
idco_lost[0].add(varname)
Expand All @@ -3938,8 +3981,15 @@ def _getshape1(z):
shapes_match = numpy.asarray([(_getshape1(sub.idca[varname])==shape0) for sub in subs])
if numpy.all(shapes_match):
if subnum==0:
arr = numpy.vstack(sub_.idca[varname][:] for sub_ in subs)
self.new_idca_from_array(varname, arr)
try:
arr = numpy.vstack(sub_.idca[varname][:] for sub_ in subs)
except:
idca_lost[subnum].add( (varname, _getshape1(sub.idca[varname])) )
else:
title = failable_iter_to_unique(subs, lambda i: i.idca[varname]._v_attrs.TITLE)
original_source = failable_iter_to_unique(subs, lambda i: i.idca[varname]._v_attrs.ORIGINAL_SOURCE)
dictionary = failable_iter_to_unique(subs, lambda i: i.idca[varname]._v_attrs.DICTIONARY)
self.new_idca_from_array(varname, arr, title=title, original_source=original_source, dictionary=dictionary)
# else: we did this already for sub 0
else:
#warnings.warn('idca variable "{}" with shape {} in DT {} is lost'.format(varname, _getshape1(sub.idca[varname]), subnum))
Expand Down
67 changes: 65 additions & 2 deletions py/dt/analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ def look_site(self, directory=None, screen="None"):
f_top_table_headrow = f_top_table.put(tag="thead").put(tag="tr")
f_top_table_headrow.put(tag="th", text='Variable')
f_top_table_headrow.put(tag="th", text='dtype')
f_top_table_headrow.put(tag="th", text='shape')
f_top_table_headrow.put(tag="th", text='Original Source')
f_top_table_headrow.put(tag="th", text='Description')

for slot in range(len(names)):

Expand All @@ -54,13 +57,42 @@ def look_site(self, directory=None, screen="None"):

try:
z_dtype = self.idco[name].dtype
except (TypeError,tables.NoSuchNodeError):
except (TypeError,tables.NoSuchNodeError,AttributeError):
try:
z_dtype = self.idco[name]._values_.dtype
except:
z_dtype = "?"

trow.put(tag="td", text=str(z_dtype))


try:
z_shape = self.idco[name].shape
except (TypeError,tables.NoSuchNodeError,AttributeError):
try:
z_shape = self.idco[name]._index_.shape[0:1] + self.idco[name]._values_.shape[1:]
except:
z_shape = "?"
trow.put(tag="td", text=str(z_shape))


try:
z_source = self.idco[name]._v_attrs.ORIGINAL_SOURCE
except (TypeError,tables.NoSuchNodeError,AttributeError):
try:
z_source = self.idco[name]._values_._v_attrs.ORIGINAL_SOURCE
except:
z_source = "?"
trow.put(tag="td", text=str(z_source))

try:
z_descrip = self.idco[name]._v_attrs.TITLE
except (TypeError,tables.NoSuchNodeError,AttributeError):
try:
z_descrip = self.idco[name]._values_._v_attrs.TITLE
except:
z_descrip = ""
trow.put(tag="td", text=str(z_descrip))

with XHTML(fname, overwrite=True, view_on_exit=False) as f:
navbar = Elem(tag='span', attrib={'style':'font:Roboto, monospace; font-size:80%; font-weight:900;'}, text='', tail=' ')
Expand Down Expand Up @@ -89,6 +121,9 @@ def look_site(self, directory=None, screen="None"):
f_top_table_headrow = f_top_table.put(tag="thead").put(tag="tr")
f_top_table_headrow.put(tag="th", text='Variable')
f_top_table_headrow.put(tag="th", text='dtype')
f_top_table_headrow.put(tag="th", text='shape')
f_top_table_headrow.put(tag="th", text='Original Source')
f_top_table_headrow.put(tag="th", text='Description')

for slot in range(len(names)):

Expand All @@ -99,13 +134,41 @@ def look_site(self, directory=None, screen="None"):
trow.put(tag="td").put(tag="a", attrib={'href':'./idca/'+name+".html"}, text=name)
try:
z_dtype = self.idca[name].dtype
except (TypeError,tables.NoSuchNodeError):
except (TypeError,tables.NoSuchNodeError,AttributeError):
try:
z_dtype = self.idca[name]._values_.dtype
except:
z_dtype = "?"
trow.put(tag="td", text=str(z_dtype))

try:
z_shape = self.idca[name].shape
except (TypeError,tables.NoSuchNodeError,AttributeError):
try:
z_shape = self.idca[name]._index_.shape[0:1] + self.idca[name]._values_.shape[1:]
except:
z_shape = "?"
trow.put(tag="td", text=str(z_shape))

try:
z_source = self.idca[name]._v_attrs.ORIGINAL_SOURCE
except (TypeError,tables.NoSuchNodeError,AttributeError):
try:
z_source = self.idca[name]._values_._v_attrs.ORIGINAL_SOURCE
except:
z_source = "?"
trow.put(tag="td", text=str(z_source))

try:
z_descrip = self.idca[name]._v_attrs.TITLE
except (TypeError,tables.NoSuchNodeError,AttributeError):
try:
z_descrip = self.idca[name]._values_._v_attrs.TITLE
except:
z_descrip = ""
trow.put(tag="td", text=str(z_descrip))


with XHTML(fname, overwrite=True, view_on_exit=False) as f:
navbar = Elem(tag='span', attrib={'style':'font:Roboto, monospace; font-size:80%; font-weight:900;'}, text='', tail=' ')
if slot==0:
Expand Down
4 changes: 4 additions & 0 deletions py/dt/groupnode.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,8 +219,12 @@ def __contains__(self, arg):
def __repr__(self, *arg, **kwarg):
    """Return a header line with the node's path, then its sorted child keys one per line."""
    header = "<larch.DT:GroupNode> " + self._v_node._v_pathname
    child_names = sorted(self._v_children_keys_including_extern)
    listing = "\n ".join(child_names)
    return header + "\n " + listing
def __getitem__(self, key):
    """Item access delegates to attribute access; slice keys raise TypeError."""
    if not isinstance(key, slice):
        return self.__getattr__(key)
    raise TypeError('cannot slice a group (yet)')
def __setitem__(self, key, value):
    """Item assignment delegates to attribute assignment; slice keys raise TypeError."""
    if not isinstance(key, slice):
        return self.__setattr__(key, value)
    raise TypeError('cannot slice a group (yet)')

def add_group_node(self, name):
Expand Down
40 changes: 40 additions & 0 deletions py/util/arraytools.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,3 +131,43 @@ def convert_float_to_int_if_lossless(arr, inttype=numpy.int32):
return arr.astype(inttype)
return arr


def failable_iter_to_set(iterable, transformer):
    """
    Collect the distinct results of applying `transformer` to each item.

    Items for which `transformer` raises AttributeError are skipped; any
    other exception propagates to the caller.

    Parameters
    ----------
    iterable : iterable
        The items to transform.
    transformer : callable
        Called with one item at a time; results must be hashable.

    Returns
    -------
    set
        The results of every successful call.
    """
    collected = set()
    for item in iterable:
        try:
            value = transformer(item)
        except AttributeError:
            continue
        collected.add(value)
    return collected

def failable_iter_to_unique(iterable, transformer):
    """
    Return the single value that every successful transform agrees on.

    `transformer` is applied to each item in turn.  Items for which it
    raises AttributeError are skipped; any other exception propagates.

    Parameters
    ----------
    iterable : iterable
        The items to transform.
    transformer : callable
        Called with one item at a time.

    Returns
    -------
    object or None
        The common result if all successful calls produced equal values.
        None if two results differ, or if no call succeeded.
    """
    # A private sentinel marks "no result seen yet", so that a transformer
    # legitimately returning None is still compared against later results
    # instead of being mistaken for the unset state.
    unset = object()
    agreed = unset
    for item in iterable:
        try:
            value = transformer(item)
        except AttributeError:
            continue
        if agreed is unset:
            agreed = value
        elif agreed != value:
            return None
    return None if agreed is unset else agreed

def unique_successful_transform(iterable, transformer, accept_longest=False):
    """
    Reduce the successful transforms of `iterable` to a single value.

    Parameters
    ----------
    iterable : iterable
        The items to transform.
    transformer : callable
        Called with one item at a time; items for which it raises
        AttributeError are skipped.  Results must be hashable.
    accept_longest : bool
        When the successful results are not all identical, return the one
        whose ``str()`` form is longest instead of giving up.

    Returns
    -------
    object or None
        The sole distinct result if there is exactly one.  Otherwise the
        longest candidate when `accept_longest` is true, else None.
    """
    # The set-building helper is required here: the logic below needs the
    # collection of distinct results (len / pop / iteration), not the
    # single-value-or-None that failable_iter_to_unique returns.
    results = failable_iter_to_set(iterable, transformer)
    if len(results) == 1:
        return results.pop()
    if accept_longest:
        candidate = None
        candidate_len = 0
        for value in results:
            value_len = len(str(value))
            if value_len > candidate_len:
                candidate_len = value_len
                candidate = value
        return candidate
    return None


0 comments on commit a6f1d49

Please sign in to comment.