Skip to content

Commit

Permalink
BUG: Dense ranking with percent now uses 100% basis (pandas-dev#15639)
Browse files Browse the repository at this point in the history
  • Loading branch information
rouzazari authored and jreback committed Mar 9, 2018
1 parent f9fd540 commit 1d73cf3
Show file tree
Hide file tree
Showing 4 changed files with 139 additions and 8 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -907,6 +907,7 @@ Offsets

Numeric
^^^^^^^
- Bug in :meth:`DataFrame.rank` and :meth:`Series.rank` when ``method='dense'`` and ``pct=True`` in which percentile ranks were not being used with the number of distinct observations (:issue:`15630`)
- Bug in :class:`Series` constructor with an int or float list where specifying ``dtype=str``, ``dtype='str'`` or ``dtype='U'`` failed to convert the data elements to strings (:issue:`16605`)
- Bug in :class:`Index` multiplication and division methods where operating with a ``Series`` would return an ``Index`` object instead of a ``Series`` object (:issue:`19042`)
- Bug in the :class:`DataFrame` constructor in which data containing very large positive or very large negative numbers was causing ``OverflowError`` (:issue:`18584`)
Expand Down
10 changes: 8 additions & 2 deletions pandas/_libs/algos_rank_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,10 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
sum_ranks = dups = 0
{{endif}}
if pct:
return ranks / count
if tiebreak == TIEBREAK_DENSE:
return ranks / total_tie_count
else:
return ranks / count
else:
return ranks

Expand Down Expand Up @@ -385,7 +388,10 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average',
ranks[i, argsorted[i, z]] = total_tie_count
sum_ranks = dups = 0
if pct:
ranks[i, :] /= count
if tiebreak == TIEBREAK_DENSE:
ranks[i, :] /= total_tie_count
else:
ranks[i, :] /= count
if axis == 0:
return ranks.T
else:
Expand Down
43 changes: 37 additions & 6 deletions pandas/tests/frame/test_rank.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
# -*- coding: utf-8 -*-
import pytest
from datetime import timedelta, datetime
from distutils.version import LooseVersion
from numpy import nan
import numpy as np
import pandas.util.testing as tm

from pandas import Series, DataFrame
from distutils.version import LooseVersion
from datetime import timedelta, datetime
from numpy import nan

from pandas.compat import product
from pandas.util.testing import assert_frame_equal
import pandas.util.testing as tm
from pandas.tests.frame.common import TestData
from pandas import Series, DataFrame
from pandas.compat import product


class TestRank(TestData):
Expand Down Expand Up @@ -266,3 +266,34 @@ def _check2d(df, expected, method='average', axis=0):
continue
frame = df if dtype is None else df.astype(dtype)
_check2d(frame, results[method], method=method, axis=axis)


@pytest.mark.parametrize(
"method,exp", [("dense",
[[1., 1., 1.],
[1., 0.5, 2. / 3],
[1., 0.5, 1. / 3]]),
("min",
[[1. / 3, 1., 1.],
[1. / 3, 1. / 3, 2. / 3],
[1. / 3, 1. / 3, 1. / 3]]),
("max",
[[1., 1., 1.],
[1., 2. / 3, 2. / 3],
[1., 2. / 3, 1. / 3]]),
("average",
[[2. / 3, 1., 1.],
[2. / 3, 0.5, 2. / 3],
[2. / 3, 0.5, 1. / 3]]),
("first",
[[1. / 3, 1., 1.],
[2. / 3, 1. / 3, 2. / 3],
[3. / 3, 2. / 3, 1. / 3]])])
def test_rank_pct_true(method, exp):
# see gh-15630.

df = DataFrame([[2012, 66, 3], [2012, 65, 2], [2012, 65, 1]])
result = df.rank(method=method, pct=True)

expected = DataFrame(exp)
tm.assert_frame_equal(result, expected)
93 changes: 93 additions & 0 deletions pandas/tests/series/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,3 +376,96 @@ def test_rank_modify_inplace(self):
s.rank()
result = s
assert_series_equal(result, expected)


# GH15630, pct should be on 100% basis when method='dense'

@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8'])
@pytest.mark.parametrize('ser, exp', [
([1], [1.]),
([1, 2], [1. / 2, 2. / 2]),
([2, 2], [1., 1.]),
([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
([1, 2, 2], [1. / 2, 2. / 2, 2. / 2]),
([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],),
([1, 1, 5, 5, 3], [1. / 3, 1. / 3, 3. / 3, 3. / 3, 2. / 3]),
([1, 1, 3, 3, 5, 5], [1. / 3, 1. / 3, 2. / 3, 2. / 3, 3. / 3, 3. / 3]),
([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])])
def test_rank_dense_pct(dtype, ser, exp):
s = Series(ser).astype(dtype)
result = s.rank(method='dense', pct=True)
expected = Series(exp).astype(result.dtype)
assert_series_equal(result, expected)


@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8'])
@pytest.mark.parametrize('ser, exp', [
([1], [1.]),
([1, 2], [1. / 2, 2. / 2]),
([2, 2], [1. / 2, 1. / 2]),
([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
([1, 2, 2], [1. / 3, 2. / 3, 2. / 3]),
([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],),
([1, 1, 5, 5, 3], [1. / 5, 1. / 5, 4. / 5, 4. / 5, 3. / 5]),
([1, 1, 3, 3, 5, 5], [1. / 6, 1. / 6, 3. / 6, 3. / 6, 5. / 6, 5. / 6]),
([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])])
def test_rank_min_pct(dtype, ser, exp):
s = Series(ser).astype(dtype)
result = s.rank(method='min', pct=True)
expected = Series(exp).astype(result.dtype)
assert_series_equal(result, expected)


@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8'])
@pytest.mark.parametrize('ser, exp', [
([1], [1.]),
([1, 2], [1. / 2, 2. / 2]),
([2, 2], [1., 1.]),
([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
([1, 2, 2], [1. / 3, 3. / 3, 3. / 3]),
([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],),
([1, 1, 5, 5, 3], [2. / 5, 2. / 5, 5. / 5, 5. / 5, 3. / 5]),
([1, 1, 3, 3, 5, 5], [2. / 6, 2. / 6, 4. / 6, 4. / 6, 6. / 6, 6. / 6]),
([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])])
def test_rank_max_pct(dtype, ser, exp):
s = Series(ser).astype(dtype)
result = s.rank(method='max', pct=True)
expected = Series(exp).astype(result.dtype)
assert_series_equal(result, expected)


@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8'])
@pytest.mark.parametrize('ser, exp', [
([1], [1.]),
([1, 2], [1. / 2, 2. / 2]),
([2, 2], [1.5 / 2, 1.5 / 2]),
([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
([1, 2, 2], [1. / 3, 2.5 / 3, 2.5 / 3]),
([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],),
([1, 1, 5, 5, 3], [1.5 / 5, 1.5 / 5, 4.5 / 5, 4.5 / 5, 3. / 5]),
([1, 1, 3, 3, 5, 5],
[1.5 / 6, 1.5 / 6, 3.5 / 6, 3.5 / 6, 5.5 / 6, 5.5 / 6]),
([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])])
def test_rank_average_pct(dtype, ser, exp):
s = Series(ser).astype(dtype)
result = s.rank(method='average', pct=True)
expected = Series(exp).astype(result.dtype)
assert_series_equal(result, expected)


@pytest.mark.parametrize('dtype', ['f8', 'i8'])
@pytest.mark.parametrize('ser, exp', [
([1], [1.]),
([1, 2], [1. / 2, 2. / 2]),
([2, 2], [1. / 2, 2. / 2.]),
([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
([1, 2, 2], [1. / 3, 2. / 3, 3. / 3]),
([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],),
([1, 1, 5, 5, 3], [1. / 5, 2. / 5, 4. / 5, 5. / 5, 3. / 5]),
([1, 1, 3, 3, 5, 5], [1. / 6, 2. / 6, 3. / 6, 4. / 6, 5. / 6, 6. / 6]),
([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])])
def test_rank_first_pct(dtype, ser, exp):
s = Series(ser).astype(dtype)
result = s.rank(method='first', pct=True)
expected = Series(exp).astype(result.dtype)
assert_series_equal(result, expected)

0 comments on commit 1d73cf3

Please sign in to comment.