allow recasting dt variables

jpn-- · Jan 19, 2017 · a6f1d49 · a6f1d49
1 parent f4ff30a
commit a6f1d49
Show file tree

Hide file tree

Showing 4 changed files with 166 additions and 9 deletions.
diff --git a/py/dt/__init__.py b/py/dt/__init__.py
@@ -2506,11 +2506,44 @@ def new_idco(self, name, expression, dtype=numpy.float64, *, overwrite=False, ti
 		if title is not None:
 			self.idco[name]._v_attrs.TITLE = title
 
+	def recast_idco(self, name, newtype, invalid_values=()):
+		"""
+		Recast an existing idco variable into a new fundamental dtype.
+		
+		The stored values are read in full, cast with ``astype``, and written back
+		under the same name; the node's attributes (other than CLASS and VERSION)
+		are copied onto the replacement node.
+		
+		Parameters
+		----------
+		name : str
+			The name of the existing idco variable to recast.
+		newtype : dtype
+			The new dtype to which the existing values will be coerced.
+		invalid_values : tuple
+			Values that are invalid and will be changed to NaN (if the new dtype is 
+			some kind of float, otherwise this is ignored).  The comparison is made
+			against the already-cast array, not against the stored values.
+		"""
+		arr = self.idco[name][:].astype(newtype)
+		# NaN substitution is only attempted when the cast produced a float array.
+		if arr.dtype.kind=='f':
+			for invalid_value in invalid_values:
+				arr[arr==invalid_value] = numpy.nan
+		# Save the node's attributes before it is removed, so they can be restored below.
+		attrnames = self.idco[name]._v_attrs._v_attrnames
+		attrs={}
+		for attrname in attrnames:
+			# CLASS and VERSION are deliberately not carried over
+			# (presumably they are set afresh when the node is re-created -- TODO confirm).
+			if attrname not in ('CLASS','VERSION'):
+				attrs[attrname] = self.idco[name]._v_attrs[attrname]
+		# NOTE(review): from here until new_idco_from_array succeeds the variable does not
+		# exist; nothing in this method puts the old data back if that call raises.
+		try:
+			# Third positional argument is False; assuming self.h5f is a PyTables File,
+			# that is the `recursive` flag -- TODO confirm.
+			self.h5f.remove_node(self.idco._v_node, name, False)
+		except _tb.exceptions.NoSuchNodeError:
+			pass
+		self.new_idco_from_array(name, arr)
+		# Put the saved attributes onto the newly created node.
+		for attrname in attrs.keys():
+			self.idco[name]._v_attrs[attrname] = attrs[attrname]
+
+
+
 	def new_blank_idco(self, name, dtype=None, overwrite=False, title=None):
 		zer = numpy.zeros(self.nAllCases(), dtype=dtype or numpy.float64)
 		return self.new_idco_from_array(name, zer, overwrite=overwrite, title=title)
 
-	def new_idco_from_array(self, name, arr, *, overwrite=False, original_source=None, rel_original_source=True, title=None):
+	def new_idco_from_array(self, name, arr, *, overwrite=False, original_source=None, rel_original_source=True, title=None, dictionary=None):
 		"""Create a new :ref:`idco` variable.
 		
 		Creating a new variable in the data might be convenient in some instances.
@@ -2579,13 +2612,15 @@ def convert_datetime_time_to_epoch_seconds(tm):
 				h5var[:] = arr
 			else:
 				raise
-		if rel_original_source and original_source:
+		if rel_original_source and original_source and original_source[0]!='=':
 			basedir = os.path.dirname(self.source_filename)
 			original_source = os.path.relpath(original_source, start=basedir)
 		if original_source is not None:
 			self.idco[name]._v_attrs.ORIGINAL_SOURCE = original_source
 		if title is not None:
 			self.idco[name]._v_attrs.TITLE = title
+		if dictionary is not None:
+			self.idco[name]._v_attrs.DICTIONARY = dictionary
 
 
 	def merge_into_idco_from_dataframe(self, other, self_on, other_on, dupe_suffix="_copy", original_source=None, names=None, log=lambda *x: None,):
@@ -2861,7 +2896,7 @@ def new_blank_idca(self, name, nalts=None, dtype=None, overwrite=False, title=No
 		zer = numpy.zeros([self.nAllCases(), nalts], dtype=dtype or numpy.float64)
 		return self.new_idca_from_array(name, zer, overwrite=overwrite, title=title)
 
-	def new_idca_from_array(self, name, arr, overwrite=False, original_source=None, rel_original_source=True, title=None):
+	def new_idca_from_array(self, name, arr, overwrite=False, original_source=None, rel_original_source=True, title=None, dictionary=None):
 		"""Create a new :ref:`idca` variable.
 		
 		Creating a new variable in the data might be convenient in some instances.
@@ -2893,13 +2928,15 @@ def new_idca_from_array(self, name, arr, overwrite=False, original_source=None,
 		if overwrite:
 			self.delete_data(name)
 		self.h5f.create_carray(self.idca._v_node, name, obj=arr)
-		if rel_original_source and original_source:
+		if rel_original_source and original_source and original_source[0]!='=':
 			basedir = os.path.dirname(self.source_filename)
 			original_source = os.path.relpath(original_source, start=basedir)
 		if original_source is not None:
 			self.idca[name]._v_attrs.ORIGINAL_SOURCE = original_source
 		if title is not None:
 			self.idca[name]._v_attrs.TITLE = title
+		if dictionary is not None:
+			self.idca[name]._v_attrs.DICTIONARY = dictionary
 
 	def new_idco_from_keyed_array(self, name, arr_val, arr_index, title=None):
 		"""Create a new :ref:`idco` variable.
@@ -3888,6 +3925,7 @@ def TempCopy(cls, filename, *args, **kwargs):
 
 	@classmethod
 	def Concat(cls, *subs, tags=None, tagname='casesource', **kwargs):
+		from ..util.arraytools import failable_iter_to_unique
 		self = cls(**kwargs)
 
 		def _getshape1(z):
@@ -3919,7 +3957,12 @@ def _getshape1(z):
 			present = numpy.asarray([(varname in sub.idco) for sub in subs])
 			if numpy.all(present):
 				arr = numpy.hstack(sub.idco[varname][:] for sub in subs)
-				self.new_idco_from_array(varname, arr)
+
+				title = failable_iter_to_unique(subs, lambda i: i.idco[varname]._v_attrs.TITLE)
+				original_source = failable_iter_to_unique(subs, lambda i: i.idco[varname]._v_attrs.ORIGINAL_SOURCE)
+				dictionary = failable_iter_to_unique(subs, lambda i: i.idco[varname]._v_attrs.DICTIONARY)
+
+				self.new_idco_from_array(varname, arr, title=title, original_source=original_source, dictionary=dictionary)
 			else:
 				#warnings.warn('idco variable "{}" in DT {} is lost'.format(varname, 0))
 				idco_lost[0].add(varname)
@@ -3938,8 +3981,15 @@ def _getshape1(z):
 					shapes_match = numpy.asarray([(_getshape1(sub.idca[varname])==shape0) for sub in subs])
 					if numpy.all(shapes_match):
 						if subnum==0:
-							arr = numpy.vstack(sub_.idca[varname][:] for sub_ in subs)
-							self.new_idca_from_array(varname, arr)
+							try:
+								arr = numpy.vstack(sub_.idca[varname][:] for sub_ in subs)
+							except:
+								idca_lost[subnum].add(  (varname, _getshape1(sub.idca[varname]))  )
+							else:
+								title = failable_iter_to_unique(subs, lambda i: i.idca[varname]._v_attrs.TITLE)
+								original_source = failable_iter_to_unique(subs, lambda i: i.idca[varname]._v_attrs.ORIGINAL_SOURCE)
+								dictionary = failable_iter_to_unique(subs, lambda i: i.idca[varname]._v_attrs.DICTIONARY)
+								self.new_idca_from_array(varname, arr, title=title, original_source=original_source, dictionary=dictionary)
 						# else: we did this already for sub 0
 					else:
 						#warnings.warn('idca variable "{}" with shape {} in DT {} is lost'.format(varname, _getshape1(sub.idca[varname]), subnum))

diff --git a/py/dt/analyze.py b/py/dt/analyze.py
@@ -43,6 +43,9 @@ def look_site(self, directory=None, screen="None"):
 			f_top_table_headrow = f_top_table.put(tag="thead").put(tag="tr")
 			f_top_table_headrow.put(tag="th", text='Variable')
 			f_top_table_headrow.put(tag="th", text='dtype')
+			f_top_table_headrow.put(tag="th", text='shape')
+			f_top_table_headrow.put(tag="th", text='Original Source')
+			f_top_table_headrow.put(tag="th", text='Description')
 
 			for slot in range(len(names)):
 
@@ -54,13 +57,42 @@ def look_site(self, directory=None, screen="None"):
 
 				try:
 					z_dtype = self.idco[name].dtype
-				except (TypeError,tables.NoSuchNodeError):
+				except (TypeError,tables.NoSuchNodeError,AttributeError):
 					try:
 						z_dtype = self.idco[name]._values_.dtype
 					except:
 						z_dtype = "?"
 
 				trow.put(tag="td", text=str(z_dtype))
+
+
+				try:
+					z_shape = self.idco[name].shape
+				except (TypeError,tables.NoSuchNodeError,AttributeError):
+					try:
+						z_shape = self.idco[name]._index_.shape[0:1] + self.idco[name]._values_.shape[1:]
+					except:
+						z_shape = "?"
+				trow.put(tag="td", text=str(z_shape))
+
+
+				try:
+					z_source = self.idco[name]._v_attrs.ORIGINAL_SOURCE
+				except (TypeError,tables.NoSuchNodeError,AttributeError):
+					try:
+						z_source = self.idco[name]._values_._v_attrs.ORIGINAL_SOURCE
+					except:
+						z_source = "?"
+				trow.put(tag="td", text=str(z_source))
+
+				try:
+					z_descrip = self.idco[name]._v_attrs.TITLE
+				except (TypeError,tables.NoSuchNodeError,AttributeError):
+					try:
+						z_descrip = self.idco[name]._values_._v_attrs.TITLE
+					except:
+						z_descrip = ""
+				trow.put(tag="td", text=str(z_descrip))
 
 				with XHTML(fname, overwrite=True, view_on_exit=False) as f:
 					navbar = Elem(tag='span', attrib={'style':'font:Roboto, monospace; font-size:80%; font-weight:900;'}, text='', tail=' ')
@@ -89,6 +121,9 @@ def look_site(self, directory=None, screen="None"):
 			f_top_table_headrow = f_top_table.put(tag="thead").put(tag="tr")
 			f_top_table_headrow.put(tag="th", text='Variable')
 			f_top_table_headrow.put(tag="th", text='dtype')
+			f_top_table_headrow.put(tag="th", text='shape')
+			f_top_table_headrow.put(tag="th", text='Original Source')
+			f_top_table_headrow.put(tag="th", text='Description')
 
 			for slot in range(len(names)):
 
@@ -99,13 +134,41 @@ def look_site(self, directory=None, screen="None"):
 				trow.put(tag="td").put(tag="a", attrib={'href':'./idca/'+name+".html"}, text=name)
 				try:
 					z_dtype = self.idca[name].dtype
-				except (TypeError,tables.NoSuchNodeError):
+				except (TypeError,tables.NoSuchNodeError,AttributeError):
 					try:
 						z_dtype = self.idca[name]._values_.dtype
 					except:
 						z_dtype = "?"
 				trow.put(tag="td", text=str(z_dtype))
 
+				try:
+					z_shape = self.idca[name].shape
+				except (TypeError,tables.NoSuchNodeError,AttributeError):
+					try:
+						z_shape = self.idca[name]._index_.shape[0:1] + self.idca[name]._values_.shape[1:]
+					except:
+						z_shape = "?"
+				trow.put(tag="td", text=str(z_shape))
+
+				try:
+					z_source = self.idca[name]._v_attrs.ORIGINAL_SOURCE
+				except (TypeError,tables.NoSuchNodeError,AttributeError):
+					try:
+						z_source = self.idca[name]._values_._v_attrs.ORIGINAL_SOURCE
+					except:
+						z_source = "?"
+				trow.put(tag="td", text=str(z_source))
+
+				try:
+					z_descrip = self.idca[name]._v_attrs.TITLE
+				except (TypeError,tables.NoSuchNodeError,AttributeError):
+					try:
+						z_descrip = self.idca[name]._values_._v_attrs.TITLE
+					except:
+						z_descrip = ""
+				trow.put(tag="td", text=str(z_descrip))
+
+
 				with XHTML(fname, overwrite=True, view_on_exit=False) as f:
 					navbar = Elem(tag='span', attrib={'style':'font:Roboto, monospace; font-size:80%; font-weight:900;'}, text='', tail=' ')
 					if slot==0:

diff --git a/py/dt/groupnode.py b/py/dt/groupnode.py
@@ -219,8 +219,12 @@ def __contains__(self, arg):
 	def __repr__(self, *arg, **kwarg):
 		return "<larch.DT:GroupNode> "+self._v_node._v_pathname+"\n  "+"\n  ".join(sorted(self._v_children_keys_including_extern))
 	def __getitem__(self, key):
+		if isinstance(key,slice):  # slice keys are rejected here rather than being forwarded to __getattr__
+			raise TypeError('cannot slice a group (yet)')
 		return self.__getattr__(key)
 	def __setitem__(self, key, value):
+		if isinstance(key,slice):  # slice keys are rejected here rather than being forwarded to __setattr__
+			raise TypeError('cannot slice a group (yet)')
 		return self.__setattr__(key,value)
 
 	def add_group_node(self, name):

diff --git a/py/util/arraytools.py b/py/util/arraytools.py
@@ -131,3 +131,43 @@ def convert_float_to_int_if_lossless(arr, inttype=numpy.int32):
 				return arr.astype(inttype)
 	return arr
 
+
+def failable_iter_to_set(iterable, transformer):
+	"""
+	Collect the distinct results of transformer(i) for every i in iterable.
+	
+	Items for which an AttributeError is raised are skipped.
+	Returns a (possibly empty) set.
+	"""
+	collected = set()
+	for item in iterable:
+		try:
+			collected.add(transformer(item))
+		except AttributeError:
+			continue
+	return collected
+
+def failable_iter_to_unique(iterable, transformer):
+	"""
+	Return the value of transformer(i) if it is the same for every usable i, else None.
+	
+	Items for which the transform raises AttributeError are skipped.  None is also
+	returned when no item yields a value.  A None result from the transform is
+	treated the same as "nothing seen yet".
+	"""
+	agreed = None
+	for item in iterable:
+		try:
+			value = transformer(item)
+		except AttributeError:
+			continue
+		if agreed is None:
+			# First usable value (or all earlier ones were None): remember it.
+			agreed = value
+		elif agreed!=value:
+			# Two different values seen, so there is no unique answer.
+			return None
+	return agreed
+
+def unique_successful_transform(iterable, transformer, accept_longest=False):
+	"""
+	Return the one distinct value of transformer(i) over iterable, if there is exactly one.
+	
+	Items for which the transform raises AttributeError are skipped.  When the
+	distinct values number zero or more than one, None is returned, unless
+	accept_longest is true, in which case the value whose str() form is longest
+	is returned (None if there are no values at all).
+	"""
+	# The set of distinct values is required here: failable_iter_to_unique returns a bare
+	# value or None, on which the len()/pop() calls below cannot work.
+	s = failable_iter_to_set(iterable, transformer)
+	if len(s)==1:
+		return s.pop()
+	if accept_longest:
+		# `default=None` covers the case where every transform failed and s is empty.
+		return max(s, key=lambda v: len(str(v)), default=None)
+	return None
+
+