diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 488748403dcf35..7de13513d8a146 100755 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -17,11 +17,10 @@ from pandas.compat import (range, lrange, StringIO, lzip, u, product as cart_product, zip) import pandas as pd - import pandas._libs.index as _index -class TestMultiLevel(tm.TestCase): +class Base(object): def setUp(self): @@ -56,6 +55,10 @@ def setUp(self): inplace=True) self.ymd.index.set_names(['year', 'month', 'day'], inplace=True) + + +class TestMultiLevel(Base, tm.TestCase): + def test_append(self): a, b = self.frame[:5], self.frame[5:] @@ -212,50 +215,6 @@ def test_reindex_preserve_levels(self): chunk = ymdT.loc[:, new_index] self.assertIs(chunk.columns, new_index) - def test_sort_index_preserve_levels(self): - result = self.frame.sort_index() - self.assertEqual(result.index.names, self.frame.index.names) - - def test_sorting_repr_8017(self): - - np.random.seed(0) - data = np.random.randn(3, 4) - - for gen, extra in [([1., 3., 2., 5.], 4.), ([1, 3, 2, 5], 4), - ([Timestamp('20130101'), Timestamp('20130103'), - Timestamp('20130102'), Timestamp('20130105')], - Timestamp('20130104')), - (['1one', '3one', '2one', '5one'], '4one')]: - columns = MultiIndex.from_tuples([('red', i) for i in gen]) - df = DataFrame(data, index=list('def'), columns=columns) - df2 = pd.concat([df, - DataFrame('world', index=list('def'), - columns=MultiIndex.from_tuples( - [('red', extra)]))], axis=1) - - # check that the repr is good - # make sure that we have a correct sparsified repr - # e.g. 
only 1 header of read - self.assertEqual(str(df2).splitlines()[0].split(), ['red']) - - # GH 8017 - # sorting fails after columns added - - # construct single-dtype then sort - result = df.copy().sort_index(axis=1) - expected = df.iloc[:, [0, 2, 1, 3]] - tm.assert_frame_equal(result, expected) - - result = df2.sort_index(axis=1) - expected = df2.iloc[:, [0, 2, 1, 4, 3]] - tm.assert_frame_equal(result, expected) - - # setitem then sort - result = df.copy() - result[('red', extra)] = 'world' - - result = result.sort_index(axis=1) - tm.assert_frame_equal(result, expected) def test_repr_to_string(self): repr(self.frame) @@ -479,18 +438,6 @@ def test_getitem_setitem_tuple_plus_columns(self): expected = df.loc[2000, 1, 6][['A', 'B', 'C']] tm.assert_series_equal(result, expected) - def test_getitem_multilevel_index_tuple_unsorted(self): - index_columns = list("abc") - df = DataFrame([[0, 1, 0, "x"], [0, 0, 1, "y"]], - columns=index_columns + ["data"]) - df = df.set_index(index_columns) - query_index = df.index[:1] - rs = df.loc[query_index, "data"] - - xp_idx = MultiIndex.from_tuples([(0, 1, 0)], names=['a', 'b', 'c']) - xp = Series(['x'], index=xp_idx, name='data') - tm.assert_series_equal(rs, xp) - def test_xs(self): xs = self.frame.xs(('bar', 'two')) xs2 = self.frame.loc[('bar', 'two')] @@ -710,14 +657,6 @@ def test_getitem_partial(self): expected.columns = expected.columns.droplevel(0).droplevel(0) tm.assert_frame_equal(result, expected) - def test_getitem_slice_not_sorted(self): - df = self.frame.sort_index(level=1).T - - # buglet with int typechecking - result = df.iloc[:, :np.int32(3)] - expected = df.reindex(columns=df.columns[:3]) - tm.assert_frame_equal(result, expected) - def test_setitem_change_dtype(self): dft = self.frame.T s = dft['foo', 'two'] @@ -774,41 +713,6 @@ def test_getitem_partial_column_select(self): self.assertRaises(KeyError, df.loc.__getitem__, (('a', 'foo'), slice(None, None))) - def test_sort_index_level(self): - df = self.frame.copy() - 
df.index = np.arange(len(df)) - - # axis=1 - - # series - a_sorted = self.frame['A'].sort_index(level=0) - - # preserve names - self.assertEqual(a_sorted.index.names, self.frame.index.names) - - # inplace - rs = self.frame.copy() - rs.sort_index(level=0, inplace=True) - tm.assert_frame_equal(rs, self.frame.sort_index(level=0)) - - def test_sort_index_level_large_cardinality(self): - - # #2684 (int64) - index = MultiIndex.from_arrays([np.arange(4000)] * 3) - df = DataFrame(np.random.randn(4000), index=index, dtype=np.int64) - - # it works! - result = df.sort_index(level=0) - self.assertTrue(result.index.lexsort_depth == 3) - - # #2684 (int32) - index = MultiIndex.from_arrays([np.arange(4000)] * 3) - df = DataFrame(np.random.randn(4000), index=index, dtype=np.int32) - - # it works! - result = df.sort_index(level=0) - self.assertTrue((result.dtypes.values == df.dtypes.values).all()) - self.assertTrue(result.index.lexsort_depth == 3) def test_delevel_infer_dtype(self): tuples = [tuple @@ -833,81 +737,6 @@ def test_reset_index_with_drop(self): deleveled = self.series.reset_index(drop=True) tm.assertIsInstance(deleveled, Series) - def test_sort_index_level_by_name(self): - self.frame.index.names = ['first', 'second'] - result = self.frame.sort_index(level='second') - expected = self.frame.sort_index(level=1) - tm.assert_frame_equal(result, expected) - - def test_sort_index_level_mixed(self): - sorted_before = self.frame.sort_index(level=1) - - df = self.frame.copy() - df['foo'] = 'bar' - sorted_after = df.sort_index(level=1) - tm.assert_frame_equal(sorted_before, - sorted_after.drop(['foo'], axis=1)) - - dft = self.frame.T - sorted_before = dft.sort_index(level=1, axis=1) - dft['foo', 'three'] = 'bar' - - sorted_after = dft.sort_index(level=1, axis=1) - tm.assert_frame_equal(sorted_before.drop([('foo', 'three')], axis=1), - sorted_after.drop([('foo', 'three')], axis=1)) - - def test_sort_index_and_reconstruction(self): - - # 15622 - # lexsortedness should be identical - 
# across MultiIndex consruction methods - - df = DataFrame([[1, 1], [2, 2]], index=list('ab')) - expected = DataFrame([[1, 1], [2, 2], [1, 1], [2, 2]], - index=MultiIndex.from_tuples([(0.5, 'a'), - (0.5, 'b'), - (0.8, 'a'), - (0.8, 'b')])) - assert expected.index.is_lexsorted() - - result = DataFrame( - [[1, 1], [2, 2], [1, 1], [2, 2]], - index=MultiIndex.from_product([[0.5, 0.8], list('ab')])) - result = result.sort_index() - assert result.index.is_lexsorted() - assert result.index.is_monotonic - - tm.assert_frame_equal(result, expected) - - result = DataFrame( - [[1, 1], [2, 2], [1, 1], [2, 2]], - index=MultiIndex(levels=[[0.5, 0.8], ['a', 'b']], - labels=[[0, 0, 1, 1], [0, 1, 0, 1]])) - result = result.sort_index() - assert result.index.is_lexsorted() - - tm.assert_frame_equal(result, expected) - - concatted = pd.concat([df, df], keys=[0.8, 0.5]) - result = concatted.sort_index() - - # this will be monotonic, but not lexsorted! - assert not result.index.is_lexsorted() - assert result.index.is_monotonic - - tm.assert_frame_equal(result, expected) - - # 14015 - df = DataFrame([[1, 2], [6, 7]], - columns=MultiIndex.from_tuples( - [(0, Timestamp('20160811 12:00:00')), - (0, Timestamp('20160809 12:00:00'))], - names=['l1', 'Date'])) - assert not df.columns.is_lexsorted() - assert not df.columns.is_monotonic - result = df.sort_index(axis=1) - assert result.columns.is_lexsorted() - assert result.columns.is_monotonic def test_count_level(self): def _check_counts(frame, axis=0): @@ -1479,22 +1308,6 @@ def test_alignment(self): exp = x.reindex(exp_index) - y.reindex(exp_index) tm.assert_series_equal(res, exp) - def test_is_lexsorted(self): - levels = [[0, 1], [0, 1, 2]] - - index = MultiIndex(levels=levels, - labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]) - self.assertTrue(index.is_lexsorted()) - - index = MultiIndex(levels=levels, - labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]]) - self.assertFalse(index.is_lexsorted()) - - index = MultiIndex(levels=levels, - 
labels=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]]) - self.assertFalse(index.is_lexsorted()) - self.assertEqual(index.lexsort_depth, 0) - def test_frame_getitem_view(self): df = self.frame.T.copy() @@ -1520,43 +1333,6 @@ def f(): pass self.assertTrue((df['foo', 'one'] == 0).all()) - def test_frame_getitem_not_sorted(self): - df = self.frame.T - df['foo', 'four'] = 'foo' - - arrays = [np.array(x) for x in zip(*df.columns.values)] - - result = df['foo'] - result2 = df.loc[:, 'foo'] - expected = df.reindex(columns=df.columns[arrays[0] == 'foo']) - expected.columns = expected.columns.droplevel(0) - tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(result2, expected) - - df = df.T - result = df.xs('foo') - result2 = df.loc['foo'] - expected = df.reindex(df.index[arrays[0] == 'foo']) - expected.index = expected.index.droplevel(0) - tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(result2, expected) - - def test_series_getitem_not_sorted(self): - arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] - tuples = lzip(*arrays) - index = MultiIndex.from_tuples(tuples) - s = Series(randn(8), index=index) - - arrays = [np.array(x) for x in zip(*index.values)] - - result = s['qux'] - result2 = s.loc['qux'] - expected = s[arrays[0] == 'qux'] - expected.index = expected.index.droplevel(0) - tm.assert_series_equal(result, expected) - tm.assert_series_equal(result2, expected) - def test_count(self): frame = self.frame.copy() frame.index.names = ['a', 'b'] @@ -1631,32 +1407,6 @@ def aggf(x): tm.assert_frame_equal(leftside, rightside) - def test_sort_index_reorder_on_ops(self): - # 15687 - df = pd.DataFrame( - np.random.randn(8, 2), - index=MultiIndex.from_product( - [['a', 'b'], - ['big', 'small'], - ['red', 'blu']], - names=['letter', 'size', 'color']), - columns=['near', 'far']) - df = df.sort_index() - - def my_func(group): - group.index = ['newz', 'newa'] - return group - - result 
= df.groupby(level=['letter', 'size']).apply( - my_func).sort_index() - expected = MultiIndex.from_product( - [['a', 'b'], - ['big', 'small'], - ['newa', 'newz']], - names=['letter', 'size', None]) - - tm.assert_index_equal(result.index, expected) - def test_stat_op_corner(self): obj = Series([10.0], index=MultiIndex.from_tuples([(2, 3)])) @@ -2558,3 +2308,278 @@ def test_iloc_mi(self): for r in range(5)]) tm.assert_frame_equal(result, expected) + + +class TestSorted(Base, tm.TestCase): + """ everything you wanted to test about sorting """ + + def test_sort_index_preserve_levels(self): + result = self.frame.sort_index() + self.assertEqual(result.index.names, self.frame.index.names) + + def test_sorting_repr_8017(self): + + np.random.seed(0) + data = np.random.randn(3, 4) + + for gen, extra in [([1., 3., 2., 5.], 4.), ([1, 3, 2, 5], 4), + ([Timestamp('20130101'), Timestamp('20130103'), + Timestamp('20130102'), Timestamp('20130105')], + Timestamp('20130104')), + (['1one', '3one', '2one', '5one'], '4one')]: + columns = MultiIndex.from_tuples([('red', i) for i in gen]) + df = DataFrame(data, index=list('def'), columns=columns) + df2 = pd.concat([df, + DataFrame('world', index=list('def'), + columns=MultiIndex.from_tuples( + [('red', extra)]))], axis=1) + + # check that the repr is good + # make sure that we have a correct sparsified repr + # e.g. 
only 1 header of red + self.assertEqual(str(df2).splitlines()[0].split(), ['red']) + + # GH 8017 + # sorting fails after columns added + + # construct single-dtype then sort + result = df.copy().sort_index(axis=1) + expected = df.iloc[:, [0, 2, 1, 3]] + tm.assert_frame_equal(result, expected) + + result = df2.sort_index(axis=1) + expected = df2.iloc[:, [0, 2, 1, 4, 3]] + tm.assert_frame_equal(result, expected) + + # setitem then sort + result = df.copy() + result[('red', extra)] = 'world' + + result = result.sort_index(axis=1) + tm.assert_frame_equal(result, expected) + + def test_getitem_multilevel_index_tuple_unsorted(self): + index_columns = list("abc") + df = DataFrame([[0, 1, 0, "x"], [0, 0, 1, "y"]], + columns=index_columns + ["data"]) + df = df.set_index(index_columns) + query_index = df.index[:1] + rs = df.loc[query_index, "data"] + + xp_idx = MultiIndex.from_tuples([(0, 1, 0)], names=['a', 'b', 'c']) + xp = Series(['x'], index=xp_idx, name='data') + tm.assert_series_equal(rs, xp) + + def test_getitem_slice_not_sorted(self): + df = self.frame.sort_index(level=1).T + + # buglet with int typechecking + result = df.iloc[:, :np.int32(3)] + expected = df.reindex(columns=df.columns[:3]) + tm.assert_frame_equal(result, expected) + + def test_sort_index_level(self): + df = self.frame.copy() + df.index = np.arange(len(df)) + + # axis=1 + + # series + a_sorted = self.frame['A'].sort_index(level=0) + + # preserve names + self.assertEqual(a_sorted.index.names, self.frame.index.names) + + # inplace + rs = self.frame.copy() + rs.sort_index(level=0, inplace=True) + tm.assert_frame_equal(rs, self.frame.sort_index(level=0)) + + def test_sort_index_level_large_cardinality(self): + + # #2684 (int64) + index = MultiIndex.from_arrays([np.arange(4000)] * 3) + df = DataFrame(np.random.randn(4000), index=index, dtype=np.int64) + + # it works! 
+ result = df.sort_index(level=0) + self.assertTrue(result.index.lexsort_depth == 3) + + # #2684 (int32) + index = MultiIndex.from_arrays([np.arange(4000)] * 3) + df = DataFrame(np.random.randn(4000), index=index, dtype=np.int32) + + # it works! + result = df.sort_index(level=0) + self.assertTrue((result.dtypes.values == df.dtypes.values).all()) + self.assertTrue(result.index.lexsort_depth == 3) + + def test_sort_index_level_by_name(self): + self.frame.index.names = ['first', 'second'] + result = self.frame.sort_index(level='second') + expected = self.frame.sort_index(level=1) + tm.assert_frame_equal(result, expected) + + def test_sort_index_level_mixed(self): + sorted_before = self.frame.sort_index(level=1) + + df = self.frame.copy() + df['foo'] = 'bar' + sorted_after = df.sort_index(level=1) + tm.assert_frame_equal(sorted_before, + sorted_after.drop(['foo'], axis=1)) + + dft = self.frame.T + sorted_before = dft.sort_index(level=1, axis=1) + dft['foo', 'three'] = 'bar' + + sorted_after = dft.sort_index(level=1, axis=1) + tm.assert_frame_equal(sorted_before.drop([('foo', 'three')], axis=1), + sorted_after.drop([('foo', 'three')], axis=1)) + + def test_sort_index_and_reconstruction(self): + + # 15622 + # lexsortedness should be identical + # across MultiIndex construction methods + + df = DataFrame([[1, 1], [2, 2]], index=list('ab')) + expected = DataFrame([[1, 1], [2, 2], [1, 1], [2, 2]], + index=MultiIndex.from_tuples([(0.5, 'a'), + (0.5, 'b'), + (0.8, 'a'), + (0.8, 'b')])) + assert expected.index.is_lexsorted() + + result = DataFrame( + [[1, 1], [2, 2], [1, 1], [2, 2]], + index=MultiIndex.from_product([[0.5, 0.8], list('ab')])) + result = result.sort_index() + assert result.index.is_lexsorted() + assert result.index.is_monotonic + + tm.assert_frame_equal(result, expected) + + result = DataFrame( + [[1, 1], [2, 2], [1, 1], [2, 2]], + index=MultiIndex(levels=[[0.5, 0.8], ['a', 'b']], + labels=[[0, 0, 1, 1], [0, 1, 0, 1]])) + result = result.sort_index() + assert 
result.index.is_lexsorted() + + tm.assert_frame_equal(result, expected) + + concatted = pd.concat([df, df], keys=[0.8, 0.5]) + result = concatted.sort_index() + + # this will be monotonic, but not lexsorted! + assert not result.index.is_lexsorted() + assert result.index.is_monotonic + + tm.assert_frame_equal(result, expected) + + # 14015 + df = DataFrame([[1, 2], [6, 7]], + columns=MultiIndex.from_tuples( + [(0, '20160811 12:00:00'), + (0, '20160809 12:00:00')], + names=['l1', 'Date'])) + + df.columns.set_levels(pd.to_datetime(df.columns.levels[1]), + level=1, + inplace=True) + assert not df.columns.is_lexsorted() + assert not df.columns.is_monotonic + result = df.sort_index(axis=1) + assert result.columns.is_lexsorted() + assert result.columns.is_monotonic + result = df.sort_index(axis=1, level=1) + assert result.columns.is_lexsorted() + assert result.columns.is_monotonic + +# def test_ +#df = pd.DataFrame({'col1': ['b','d','b','a'], 'col2': [3,1,1,2], 'data':['one','two','three','four']}) + +#In [121]: df2 = df.set_index(['col1','col2']) + +#In [122]: df2.index.set_levels(['b','d','a'], level='col1', inplace=True) + +#In [123]: df2.index.set_labels([0,1,0,2], level='col1', inplace=True) + def test_is_lexsorted(self): + levels = [[0, 1], [0, 1, 2]] + + index = MultiIndex(levels=levels, + labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]) + self.assertTrue(index.is_lexsorted()) + + index = MultiIndex(levels=levels, + labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]]) + self.assertFalse(index.is_lexsorted()) + + index = MultiIndex(levels=levels, + labels=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]]) + self.assertFalse(index.is_lexsorted()) + self.assertEqual(index.lexsort_depth, 0) + + def test_frame_getitem_not_sorted(self): + df = self.frame.T + df['foo', 'four'] = 'foo' + + arrays = [np.array(x) for x in zip(*df.columns.values)] + + result = df['foo'] + result2 = df.loc[:, 'foo'] + expected = df.reindex(columns=df.columns[arrays[0] == 'foo']) + expected.columns = 
expected.columns.droplevel(0) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result2, expected) + + df = df.T + result = df.xs('foo') + result2 = df.loc['foo'] + expected = df.reindex(df.index[arrays[0] == 'foo']) + expected.index = expected.index.droplevel(0) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result2, expected) + + def test_series_getitem_not_sorted(self): + arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + tuples = lzip(*arrays) + index = MultiIndex.from_tuples(tuples) + s = Series(randn(8), index=index) + + arrays = [np.array(x) for x in zip(*index.values)] + + result = s['qux'] + result2 = s.loc['qux'] + expected = s[arrays[0] == 'qux'] + expected.index = expected.index.droplevel(0) + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result2, expected) + + def test_sort_index_reorder_on_ops(self): + # 15687 + df = pd.DataFrame( + np.random.randn(8, 2), + index=MultiIndex.from_product( + [['a', 'b'], + ['big', 'small'], + ['red', 'blu']], + names=['letter', 'size', 'color']), + columns=['near', 'far']) + df = df.sort_index() + + def my_func(group): + group.index = ['newz', 'newa'] + return group + + result = df.groupby(level=['letter', 'size']).apply( + my_func).sort_index() + expected = MultiIndex.from_product( + [['a', 'b'], + ['big', 'small'], + ['newa', 'newz']], + names=['letter', 'size', None]) + + tm.assert_index_equal(result.index, expected)