forked from rocketlaunchr/dataframe-go
/
describe.go
115 lines (92 loc) · 3 KB
/
describe.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
// Copyright 2018-20 PJ Engineering and Business Solutions Pty. Ltd. All rights reserved.
// Package pandas contains functionality that mirrors python's popular pandas library.
package pandas
import (
"context"
"fmt"
"strconv"
dataframe "github.com/rocketlaunchr/dataframe-go"
)
// DescribeOutput contains statistical data for a DataFrame or Series.
// Despite the fields being exported, it is not intended to be inspected.
// Use the String function to view the information in a table format.
//
// String reads the slices positionally: element i of a slice is the value
// shown for headers[i].
type DescribeOutput struct {
// Count is shown as the "count" entry; String reads one element per header.
// NOTE(review): presumably the number of values in each Series — confirm
// against the code that fills it in.
Count []int
// NilCount is shown as the "nil count" entry; String reads one element per
// header.
// NOTE(review): presumably the number of nil values in each Series — confirm.
NilCount []int
// Median is shown as the "median" entry. When the slice is empty, String
// prints "NaN" instead.
Median []float64
// Mean is shown as the "mean" entry. When the slice is empty, String prints
// "NaN" instead.
Mean []float64
// StdDev is shown as the "std dev" entry. When the slice is empty, String
// prints "NaN" instead.
StdDev []float64
// Min is shown as the "min" entry. When the slice is empty, String prints
// "NaN" instead.
Min []float64
// Max is shown as the "max" entry. When the slice is empty, String prints
// "NaN" instead.
Max []float64
// Percentiles is indexed first by header position and then by position in
// the unexported percentiles field: Percentiles[h][i] is the value shown
// for percentiles[i] under headers[h].
Percentiles [][]float64
// percentiles lists the percentiles that were computed. String multiplies
// each value by 100 and appends "%" to build its label.
percentiles []float64
// headers drives String's output: one set of statistics is emitted per
// entry, and the slice itself is passed on to printMap.
headers []string
}
// String implements the Stringer interface in the fmt package.
//
// For every header it collects the "count", "nil count", "median", "mean",
// "std dev", "min" and "max" values plus one value per requested percentile
// (labelled as 100*p followed by "%"), and passes the result to printMap
// together with the headers.
//
// Count and NilCount must hold one element per header. A float statistic or
// percentile that has no element for a given header is reported as "NaN"
// rather than causing an index-out-of-range panic.
func (do DescribeOutput) String() string {
	// statAt returns vals[idx], or the "NaN" placeholder when vals has no
	// element at idx. Checking idx against the length (instead of merely
	// checking for a non-empty slice) keeps a short slice from panicking.
	statAt := func(vals []float64, idx int) interface{} {
		if idx < len(vals) {
			return vals[idx]
		}
		return "NaN"
	}

	// The percentile labels do not depend on the header, so format them once.
	pctKeys := make([]string, len(do.percentiles))
	for i, p := range do.percentiles {
		pctKeys[i] = strconv.FormatFloat(100*p, 'f', -1, 64) + "%"
	}

	out := map[string][]interface{}{}
	for idx := range do.headers {
		out["count"] = append(out["count"], do.Count[idx])
		out["nil count"] = append(out["nil count"], do.NilCount[idx])

		out["median"] = append(out["median"], statAt(do.Median, idx))
		out["mean"] = append(out["mean"], statAt(do.Mean, idx))
		out["std dev"] = append(out["std dev"], statAt(do.StdDev, idx))
		out["min"] = append(out["min"], statAt(do.Min, idx))
		out["max"] = append(out["max"], statAt(do.Max, idx))

		for i, key := range pctKeys {
			var cell interface{} = "NaN"
			if idx < len(do.Percentiles) && i < len(do.Percentiles[idx]) {
				cell = do.Percentiles[idx][i]
			}
			out[key] = append(out[key], cell)
		}
	}

	return printMap(do.headers, out)
}
// DescribeOptions configures what Describe should return or display.
type DescribeOptions struct {
// Percentiles sets which Quantiles to return.
// When it is nil on the first option passed to Describe (or no options are
// passed at all), Describe uses {.2, .4, .6, .8}.
// NOTE(review): values appear to be fractions rather than percentages, as in
// that default — confirm against the quantile computation.
Percentiles []float64
// Whitelist sets which Series to provide statistics for.
// NOTE(review): the accepted element types are not visible here (presumably
// Series names and/or indexes) — confirm against the code that consumes it.
Whitelist []interface{}
// Blacklist sets which Series to NOT provide statistics for.
// NOTE(review): element types presumably match Whitelist; how the two
// interact when both are set is not visible here — confirm.
Blacklist []interface{}
}
// Describe outputs various statistical information about a Series or DataFrame.
//
// sdf must be a dataframe.Series or a *dataframe.DataFrame; any other type
// (including nil) causes a panic. Only the first element of opts receives a
// default: when no options are supplied, or the first option has nil
// Percentiles, the percentiles default to .2, .4, .6 and .8. The slice passed
// by the caller is never written to.
//
// See: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.describe.html#pandas.DataFrame.describe
func Describe(ctx context.Context, sdf interface{}, opts ...DescribeOptions) (DescribeOutput, error) {
	switch {
	case len(opts) == 0:
		opts = []DescribeOptions{{Percentiles: []float64{.2, .4, .6, .8}}}
	case opts[0].Percentiles == nil:
		// A call of the form Describe(ctx, x, s...) makes opts share s's
		// backing array, so copy before filling in the default to avoid
		// modifying the caller's slice.
		opts = append([]DescribeOptions(nil), opts...)
		opts[0].Percentiles = []float64{.2, .4, .6, .8}
	}

	switch v := sdf.(type) {
	case dataframe.Series:
		return describeSeries(ctx, v, opts...)
	case *dataframe.DataFrame:
		return describeDataframe(ctx, v, opts...)
	}

	// Mirrors the runtime's message for a failed type assertion.
	panic(fmt.Sprintf("interface conversion: %T is not a valid Series or DataFrame", sdf))
}