Skip to content

Commit

Permalink
Remove dropna and fillna
Browse files Browse the repository at this point in the history
  • Loading branch information
has2k1 committed Nov 2, 2017
1 parent 7410868 commit 3f50e96
Show file tree
Hide file tree
Showing 5 changed files with 11 additions and 284 deletions.
2 changes: 0 additions & 2 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,6 @@ One table verbs
define
distinct
do
dropna
fillna
group_by
group_indices
head
Expand Down
5 changes: 5 additions & 0 deletions doc/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -76,12 +76,17 @@ API Changes
- The internal function used with :class:`~plydata.one_table_verbs.summarize` to
count the number of elements in the current group has changed from
``{n}`` to ``n()``.

- You can now use piping with the two table verbs (the joins).

- ``modify_where`` and ``define_where`` helper verbs have been removed.
Using the new expression helper functions :class:`~plydata.expressions.case_when`
and :class:`~plydata.expressions.if_else` is more readable.

- Removed ``dropna`` and ``fillna`` in favour of using
:class:`~plydata.helper_verbs.call` with :meth:`pandas.DataFrame.dropna` and
:meth:`pandas.DataFrame.fillna`.


v0.2.1
------
Expand Down
32 changes: 4 additions & 28 deletions plydata/dataframe/one_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,15 @@
import pandas as pd

from ..types import GroupedDataFrame
from ..options import get_option, options
from ..options import get_option
from ..utils import Q, get_empty_env, regular_index, unique
from .common import Evaluator, Selector
from .common import _get_groups, _get_base_dataframe

__all__ = ['arrange', 'create', 'define', 'distinct', 'do',
'dropna', 'fillna', 'group_by', 'group_indices',
'head', 'mutate', 'query', 'rename', 'sample_frac',
'sample_n', 'select', 'summarize', 'tail', 'ungroup',
'unique']
'group_by', 'group_indices', 'head', 'mutate',
'query', 'rename', 'sample_frac', 'sample_n',
'select', 'summarize', 'tail', 'ungroup', 'unique']


def define(verb):
Expand Down Expand Up @@ -194,29 +193,6 @@ def tail(verb):
return data


def dropna(verb):
    """
    Drop rows or columns of ``verb.data`` that contain missing values

    Thin wrapper around :meth:`pandas.DataFrame.dropna`.

    Parameters
    ----------
    verb : object
        Verb whose ``data`` attribute is the dataframe to filter and
        whose ``axis``, ``how``, ``thresh`` and ``subset`` attributes
        are forwarded to pandas.

    Returns
    -------
    out : dataframe
        Whatever ``verb.data.dropna(...)`` returns.
    """
    kwargs = {'axis': verb.axis, 'subset': verb.subset}
    # ``how`` and ``thresh`` are alternative criteria. pandas >= 2.0
    # raises a TypeError when both are passed, and earlier versions
    # ignored ``how`` whenever ``thresh`` was given, so forward only
    # the one that is in effect.
    if verb.thresh is not None:
        kwargs['thresh'] = verb.thresh
    else:
        kwargs['how'] = verb.how
    return verb.data.dropna(**kwargs)


def fillna(verb):
    """
    Fill missing values in ``verb.data``

    Thin wrapper around :meth:`pandas.DataFrame.fillna`. Whether the
    input dataframe is modified in place is decided by the
    ``modify_input_data`` option.

    Parameters
    ----------
    verb : object
        Verb whose ``data`` attribute is the dataframe to fill and
        whose ``value``, ``method``, ``axis``, ``limit`` and
        ``downcast`` attributes are the fill options.

    Returns
    -------
    out : dataframe
        The filled dataframe. This is ``verb.data`` itself when the
        fill is done in place.
    """
    inplace = get_option('modify_input_data')
    # Forward only the options that were actually set. All of them
    # default to None in pandas, so leaving the unset ones out is
    # equivalent, and it keeps a plain ``fillna(value)`` working on
    # newer pandas releases that deprecate ``method`` and ``downcast``.
    candidates = {
        'value': verb.value,
        'method': verb.method,
        'axis': verb.axis,
        'limit': verb.limit,
        'downcast': verb.downcast,
    }
    kwargs = {
        name: option
        for name, option in candidates.items()
        if option is not None
    }
    result = verb.data.fillna(inplace=inplace, **kwargs)
    # An in-place fill does not hand back a new dataframe, so return
    # the (now modified) input instead.
    return verb.data if inplace else result


# Aggregations functions

def _nth(arr, n):
Expand Down
234 changes: 1 addition & 233 deletions plydata/one_table_verbs.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
__all__ = ['define', 'create', 'sample_n', 'sample_frac', 'select',
'rename', 'distinct', 'unique', 'arrange', 'group_by',
'ungroup', 'group_indices', 'summarize',
'query', 'do', 'head', 'tail', 'dropna', 'fillna',
'query', 'do', 'head', 'tail',
# Aliases
'summarise', 'mutate', 'transmute',
]
Expand Down Expand Up @@ -987,238 +987,6 @@ def __init__(self, n=5):
self.n = n


class dropna(DataOperator):
    """
    Drop rows or columns that contain missing values

    A thin wrapper around :meth:`pandas.DataFrame.dropna`. It exists
    because ``NaN`` values cannot be filtered out with :class:`query`.

    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns'}, or tuple/list thereof
        Axis along which to drop. Pass a tuple or list to drop on
        multiple axes.
    how : {'any', 'all'}
        * any : drop the label if any NA values are present
        * all : drop the label only if all of its values are NA
    thresh : int, default None
        Require that many non-NA values for a label to be kept.
    subset : array-like
        Labels along the other axis to consider, e.g. when dropping
        rows these would be a list of columns to include.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> df = pd.DataFrame({
    ...     'w': [1, 2, np.nan, 4, 5],
    ...     'x': [np.nan, 2, np.nan, 4, 5],
    ...     'y': [np.nan] * 4 + [5],
    ...     'z': [np.nan] * 5
    ... })
    >>> df
         w    x    y   z
    0  1.0  NaN  NaN NaN
    1  2.0  2.0  NaN NaN
    2  NaN  NaN  NaN NaN
    3  4.0  4.0  NaN NaN
    4  5.0  5.0  5.0 NaN

    Drop rows with any ``NaN`` values

    >>> df >> dropna()
    Empty DataFrame
    Columns: [w, x, y, z]
    Index: []

    Drop rows with all ``NaN`` values

    >>> df >> dropna(how='all')
         w    x    y   z
    0  1.0  NaN  NaN NaN
    1  2.0  2.0  NaN NaN
    3  4.0  4.0  NaN NaN
    4  5.0  5.0  5.0 NaN

    Drop rows with ``NaN`` values in the *x* column.

    >>> df >> dropna(subset=['x'])
         w    x    y   z
    1  2.0  2.0  NaN NaN
    3  4.0  4.0  NaN NaN
    4  5.0  5.0  5.0 NaN

    Keep only the rows with at least 3 ``non-NaN`` values

    >>> df >> dropna(thresh=3)
         w    x    y   z
    4  5.0  5.0  5.0 NaN

    Drop columns with all ``NaN`` values

    >>> df >> dropna(axis=1, how='all')
         w    x    y
    0  1.0  NaN  NaN
    1  2.0  2.0  NaN
    2  NaN  NaN  NaN
    3  4.0  4.0  NaN
    4  5.0  5.0  5.0

    Drop columns with any ``NaN`` values in row 3.

    >>> df >> dropna(axis=1, subset=[3])
         w    x
    0  1.0  NaN
    1  2.0  2.0
    2  NaN  NaN
    3  4.0  4.0
    4  5.0  5.0
    """

    def __init__(self, axis=0, how='any', thresh=None, subset=None):
        # The arguments are stored untouched on the verb
        self.axis, self.how = axis, how
        self.thresh, self.subset = thresh, subset


class fillna(DataOperator):
    """
    Fill NA/NaN values using the specified value or method

    A thin wrapper around :meth:`pandas.DataFrame.fillna`.

    Parameters
    ----------
    value : scalar, dict, Series, or DataFrame
        Value to use to fill holes (e.g. 0), alternately a
        dict/Series/DataFrame of values specifying which value to
        use for each index (for a Series) or column (for a
        DataFrame). Values not in the dict/Series/DataFrame will not
        be filled. This value cannot be a list.
    method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
        Method to use for filling holes.

        * pad / ffill : propagate the last valid observation forward
        * backfill / bfill : use the next valid observation to fill
          the gap
    axis : {0 or 'index', 1 or 'columns'}
        Axis along which to fill.
    limit : int, default None
        If method is specified, this is the maximum number of
        consecutive NaN values to forward/backward fill. In other
        words, if there is a gap with more than this number of
        consecutive NaNs, it will only be partially filled. If method
        is not specified, this is the maximum number of entries along
        the entire axis where NaNs will be filled. Must be greater
        than 0 if not None.
    downcast : dict, default is None
        A dict of item->dtype of what to downcast if possible, or the
        string 'infer' which will try to downcast to an appropriate
        equal type (e.g. float64 to int64 if possible).

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> df = pd.DataFrame({
    ...     'w': [1, 2, np.nan, 4, 5],
    ...     'x': [np.nan, 2, np.nan, 4, 5],
    ...     'y': [np.nan] * 4 + [5],
    ...     'z': [np.nan] * 5
    ... })
    >>> df
         w    x    y   z
    0  1.0  NaN  NaN NaN
    1  2.0  2.0  NaN NaN
    2  NaN  NaN  NaN NaN
    3  4.0  4.0  NaN NaN
    4  5.0  5.0  5.0 NaN

    Replace all ``NaN`` values with -1.

    >>> df >> fillna(-1)
         w    x    y    z
    0  1.0 -1.0 -1.0 -1.0
    1  2.0  2.0 -1.0 -1.0
    2 -1.0 -1.0 -1.0 -1.0
    3  4.0  4.0 -1.0 -1.0
    4  5.0  5.0  5.0 -1.0

    Replace all ``NaN`` values with the first ``non-NaN`` value *above
    in column*

    >>> df >> fillna(method='ffill')
         w    x    y   z
    0  1.0  NaN  NaN NaN
    1  2.0  2.0  NaN NaN
    2  2.0  2.0  NaN NaN
    3  4.0  4.0  NaN NaN
    4  5.0  5.0  5.0 NaN

    Replace all ``NaN`` values with the first ``non-NaN`` value *below
    in column*

    >>> df >> fillna(method='bfill')
         w    x    y   z
    0  1.0  2.0  5.0 NaN
    1  2.0  2.0  5.0 NaN
    2  4.0  4.0  5.0 NaN
    3  4.0  4.0  5.0 NaN
    4  5.0  5.0  5.0 NaN

    Replace at most 2 ``NaN`` values with the first ``non-NaN`` value
    *below in column*

    >>> df >> fillna(method='bfill', limit=2)
         w    x    y   z
    0  1.0  2.0  NaN NaN
    1  2.0  2.0  NaN NaN
    2  4.0  4.0  5.0 NaN
    3  4.0  4.0  5.0 NaN
    4  5.0  5.0  5.0 NaN

    Replace all ``NaN`` values with the first ``non-NaN`` value to the
    *left in the row*

    >>> df >> fillna(method='ffill', axis=1)
         w    x    y    z
    0  1.0  1.0  1.0  1.0
    1  2.0  2.0  2.0  2.0
    2  NaN  NaN  NaN  NaN
    3  4.0  4.0  4.0  4.0
    4  5.0  5.0  5.0  5.0

    Replace all ``NaN`` values with the first ``non-NaN`` value to the
    *right in the row*

    >>> df >> fillna(method='bfill', axis=1)
         w    x    y   z
    0  1.0  NaN  NaN NaN
    1  2.0  2.0  NaN NaN
    2  NaN  NaN  NaN NaN
    3  4.0  4.0  NaN NaN
    4  5.0  5.0  5.0 NaN

    Note
    ----
    There is no ``inplace`` parameter. If
    :obj:`plydata.options.modify_input_data` is ``True``,
    :class:`fillna` will modify the original dataframe.
    """

    def __init__(self, value=None, method=None, axis=None, limit=None,
                 downcast=None):
        # The arguments are stored untouched on the verb
        self.value, self.method = value, method
        self.axis, self.limit = axis, limit
        self.downcast = downcast


# Aliases
mutate = define
transmute = create
Expand Down
22 changes: 1 addition & 21 deletions plydata/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
rename, distinct, arrange, group_by, ungroup,
group_indices, summarize, query, do, head, tail,
tally, count, add_tally, add_count,
fillna, call,
call,
arrange_all, arrange_at, arrange_if,
create_all, create_at, create_if,
group_by_all, group_by_at, group_by_if,
Expand Down Expand Up @@ -761,16 +761,6 @@ def test_add_count():
assert isinstance(result, GroupedDataFrame)


def test_dropna():
    """Nothing to assert: wraps around pandas and doctests are sufficient."""


def test_fillna():
    """Nothing to assert: wraps around pandas and doctests are adequate."""


def test_call():
def remove_column_a(df):
_df = df.copy()
Expand Down Expand Up @@ -847,11 +837,6 @@ def test_data_mutability():
df >> group_by(z='x**2')
assert 'z' not in df

df2 = df.copy()
df2['x'] = np.nan
df2 >> fillna(-1)
assert all(df2['x'].isnull())

set_option('modify_input_data', True)

df2 = df.copy()
Expand All @@ -862,11 +847,6 @@ def test_data_mutability():
df2 >> group_by(z='x**2')
assert 'z' in df2

df2 = df.copy()
df2['x'] = np.nan
df2 >> fillna(-1)
assert all(df2['x'] == -1)

# Not mutable
df2 = df.copy()
df2 >> create(z='x**2')
Expand Down

0 comments on commit 3f50e96

Please sign in to comment.