From 8e3e3acb0c99343e92490b36a824d83bf70a39eb Mon Sep 17 00:00:00 2001 From: Jonas Schulze Date: Sat, 10 Mar 2018 16:05:23 +0100 Subject: [PATCH] DOC: Improve pandas.DataFrame.plot.density docstring This docstring is quite similar to the one of [PR20041](https://github.com/pandas-dev/pandas/pull/20041/). Unfortunately, I was not able to compute a kernel estimate of a two-dimensional random variable. Hence, the example is more of an analysis of some independent data series. --- pandas/plotting/_core.py | 69 ++++++++++++++++++++++++++++++++++------ 1 file changed, 60 insertions(+), 9 deletions(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 98fdcf8f94ae00..a7564340565add 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -2791,26 +2791,77 @@ def hist(self, by=None, bins=10, **kwds): def kde(self, bw_method=None, ind=None, **kwds): """ - Kernel Density Estimate plot + Generate Kernel Density Estimate plot using Gaussian kernels. + + In statistics, kernel density estimation (KDE) is a non-parametric way + to estimate the probability density function (PDF) of a random + variable. This function uses Gaussian kernels and includes automatic + bandwidth determination. Parameters ---------- - bw_method: str, scalar or callable, optional - The method used to calculate the estimator bandwidth. This can be + bw_method : str, scalar or callable, optional + The method used to calculate the estimator bandwidth. This can be 'scott', 'silverman', a scalar constant or a callable. If None (default), 'scott' is used. See :class:`scipy.stats.gaussian_kde` for more information. ind : NumPy array or integer, optional - Evaluation points. If None (default), 1000 equally spaced points - are used. If `ind` is a NumPy array, the kde is evaluated at the - points passed. If `ind` is an integer, `ind` number of equally - spaced points are used. - `**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.DataFrame.plot`. 
+ Evaluation points for the estimated PDF. If None (default), + 1000 equally spaced points are used. If `ind` is a NumPy array, the + kde is evaluated at the points passed. If `ind` is an integer, + `ind` number of equally spaced points are used. + **kwds : optional + Additional keyword arguments are documented in + :meth:`pandas.DataFrame.plot`. Returns ------- axes : matplotlib.AxesSubplot or np.array of them + + See Also + -------- + scipy.stats.gaussian_kde : Representation of a kernel-density + estimate using Gaussian kernels. This is the function used + internally to estimate the PDF. + pandas.Series.plot.kde : Generate a KDE plot for a Series. + + Examples + -------- + Given several Series of points randomly sampled from unknown + distributions, estimate their distributions using KDE with automatic + bandwidth determination and plot the results, evaluating them at + 1000 equally spaced points (default): + + .. plot:: + :context: close-figs + + >>> df = pd.DataFrame({ + ... 'x': [1, 2, 2.5, 3, 3.5, 4, 5], + ... 'y': [4, 4, 4.5, 5, 5.5, 6, 6], + ... }) + >>> ax = df.plot.kde() + + A scalar bandwidth can be specified. Using a small bandwidth value can + lead to overfitting, while using a large bandwidth value can result + in underfitting: + + .. plot:: + :context: close-figs + + >>> ax = df.plot.kde(bw_method=0.3) + + .. plot:: + :context: close-figs + + >>> ax = df.plot.kde(bw_method=3) + + Finally, the `ind` parameter determines the evaluation points for the + plot of the estimated PDF: + + .. plot:: + :context: close-figs + + >>> ax = df.plot.kde(ind=[1, 2, 3, 4, 5, 6]) """ return self(kind='kde', bw_method=bw_method, ind=ind, **kwds)