Skip to content

Commit

Permalink
support for removing unused levels (internally)
Browse files Browse the repository at this point in the history
  • Loading branch information
jreback committed Mar 22, 2017
1 parent 4c880ce commit aa6190f
Show file tree
Hide file tree
Showing 3 changed files with 108 additions and 16 deletions.
53 changes: 48 additions & 5 deletions pandas/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1175,7 +1175,7 @@ def from_product(cls, iterables, sortorder=None, names=None):
labels = cartesian_product(labels)
return MultiIndex(levels, labels, sortorder=sortorder, names=names)

def _reconstruct(self, sort=False):
def _reconstruct(self, sort=False, remove_unused=False):
"""
reconstruct the MultiIndex
Expand All @@ -1186,21 +1186,33 @@ def _reconstruct(self, sort=False):
----------
sort: boolean, default False
monotonically sort the levels
remove_unused: boolean, default False
remove unused levels
Returns
-------
MultiIndex
"""

if sort and remove_unused:
raise ValueError("only support one of sort / remove_unused")

if not (sort or remove_unused):
raise ValueError("must supply one of sort / remove_unsued")

levels = self.levels
labels = self.labels

new_levels = []
new_labels = []

if sort:

if self.is_monotonic:
if self.is_lexsorted() and self.is_monotonic:
return self

for lev, lab in zip(self.levels, self.labels):
for lev, lab in zip(levels, labels):

if lev.is_monotonic:
new_levels.append(lev)
Expand All @@ -1218,8 +1230,39 @@ def _reconstruct(self, sort=False):
new_levels.append(lev)
new_labels.append(lab)

else:
return self
elif remove_unused:

changed = np.zeros(self.nlevels, dtype=bool)
for i, (lev, lab) in enumerate(zip(levels, labels)):

uniques = np.sort(algos.unique(lab))

# nothing unused
if len(uniques) == len(lev):
new_levels.append(lev)
new_labels.append(lab)
changed[i] = True
continue

unused = list(reversed(sorted(set(
np.arange(len(lev))) - set(uniques))))

# new levels are simple
lev = lev.take(uniques)

# new labels, we remove the unused
# by decrementing the labels for that value
# prob a better way
for u in unused:

lab = np.where(lab > u, lab - 1, lab)

new_levels.append(lev)
new_labels.append(lab)

# nothing changed
if not changed.any():
return self

return MultiIndex(new_levels, new_labels,
names=self.names, sortorder=self.sortorder,
Expand Down
49 changes: 41 additions & 8 deletions pandas/tests/indexes/test_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -2411,6 +2411,18 @@ def test_is_monotonic(self):

self.assertFalse(i.is_monotonic)

def test_reconstruct_api(self):
    # _reconstruct must reject calls that do not pick exactly one mode
    outer = ['A', 'A', 'B', 'B', 'B']
    inner = [1, 2, 1, 2, 3]
    index = MultiIndex.from_arrays([outer, inner])

    # neither sort nor remove_unused requested
    with pytest.raises(ValueError):
        index._reconstruct()

    # both sort and remove_unused requested at once
    with pytest.raises(ValueError):
        index._reconstruct(sort=True, remove_unused=True)

def test_reconstruct_sort(self):

# starts off lexsorted & monotonic
Expand All @@ -2428,14 +2440,6 @@ def test_reconstruct_sort(self):
assert mi.equals(recons)
assert Index(mi.values).equals(Index(recons.values))

recons = mi._reconstruct(sort=False)
assert recons.is_lexsorted()
assert recons.is_monotonic
assert mi is recons

assert mi.equals(recons)
assert Index(mi.values).equals(Index(recons.values))

# cannot convert to lexsorted
mi = pd.MultiIndex.from_tuples([('z', 'a'), ('x', 'a'), ('y', 'b'),
('x', 'b'), ('y', 'a'), ('z', 'b')],
Expand Down Expand Up @@ -2464,6 +2468,35 @@ def test_reconstruct_sort(self):
assert mi.equals(recons)
assert Index(mi.values).equals(Index(recons.values))

def test_reconstruct_remove_unused(self):
    # xref to GH 2770
    rows = [['deleteMe', 1, 9],
            ['keepMe', 2, 9],
            ['keepMeToo', 3, 9]]
    frame = DataFrame(rows, columns=['first', 'second', 'third'])
    indexed = frame.set_index(['first', 'second'], drop=False)
    filtered = indexed[indexed['first'] != 'deleteMe']

    # dropping a row keeps the now-unreferenced level values around;
    # only the labels reflect the filtering
    with_unused = MultiIndex(
        levels=[['deleteMe', 'keepMe', 'keepMeToo'], [1, 2, 3]],
        labels=[[1, 2], [1, 2]],
        names=['first', 'second'])
    original = filtered.index
    tm.assert_index_equal(original, with_unused)

    # reconstructing with remove_unused drops the stale level values
    # and renumbers the labels accordingly
    compacted = MultiIndex(
        levels=[['keepMe', 'keepMeToo'], [2, 3]],
        labels=[[0, 1], [0, 1]],
        names=['first', 'second'])
    trimmed = original._reconstruct(remove_unused=True)
    tm.assert_index_equal(trimmed, compacted)

    # idempotent: a second pass has nothing to remove and must hand
    # back the very same object
    trimmed_again = trimmed._reconstruct(remove_unused=True)
    tm.assert_index_equal(trimmed_again, compacted)
    assert trimmed_again is trimmed

def test_isin(self):
values = [('foo', 2), ('bar', 3), ('quux', 4)]

Expand Down
22 changes: 19 additions & 3 deletions pandas/tests/test_multilevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -2559,16 +2559,32 @@ def test_sort_index_and_reconstruction(self):
assert result.columns.is_lexsorted()
assert result.columns.is_monotonic

def test_sort_index_and_reconstruction_doc_example(self):
# doc example
df = DataFrame({'value': [1, 2, 3, 4]},
index=MultiIndex(
levels=[['a', 'b'], ['bb', 'aa']],
labels=[[0, 0, 1, 1], [1, 0, 1, 0]]))
result = df.sort_index()
labels=[[0, 0, 1, 1], [0, 1, 0, 1]]))
assert df.index.is_lexsorted()
assert not df.index.is_monotonic

# sort it
expected = DataFrame({'value': [2, 1, 4, 3]},
index=MultiIndex(
levels=[['a', 'b'], ['aa', 'bb']],
labels=[[0, 0, 1, 1], [1, 0, 1, 0]]))
labels=[[0, 0, 1, 1], [0, 1, 0, 1]]))
result = df.sort_index()
assert not result.index.is_lexsorted()
assert result.index.is_monotonic

tm.assert_frame_equal(result, expected)

# reconstruct
result = df.sort_index().copy()
result.index = result.index._reconstruct(sort=True)
assert result.index.is_lexsorted()
assert result.index.is_monotonic

tm.assert_frame_equal(result, expected)

def test_sort_index_reorder_on_ops(self):
Expand Down

0 comments on commit aa6190f

Please sign in to comment.