-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocessing.go
81 lines (69 loc) · 1.74 KB
/
preprocessing.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
package godf
import (
"reflect"
)
// Preprocessing names the preprocessing operations offered by this package:
// Standardize, Normalize and OneHotEncode. As declared here, each method takes
// no arguments and returns nothing.
//
// NOTE(review): the *dataframe methods in this file accept variadic headers,
// and Standardize/Normalize also return *dataframe, so their signatures differ
// from the ones declared here and *dataframe does not appear to satisfy this
// interface as written — confirm which side reflects the intended API.
type Preprocessing interface {
Standardize()
Normalize()
OneHotEncode()
}
// Standardize replaces the selected columns, in place, with the values
// produced by the package's standardize helper and returns the receiver so
// calls can be chained.
//
// If headers are given, only columns whose header appears in headers are
// processed. If no headers are provided, every column whose first value is
// not a string is standardized; empty columns and columns whose first value
// is nil are skipped because their type cannot be determined.
//
// NOTE: preprocessing is still not working, do not use.
func (d *dataframe) Standardize(headers ...string) *dataframe {
	all := len(headers) == 0
	for i, col := range d.data {
		if all {
			// Without an explicit selection, pick columns by the type of their
			// first value, mirroring the check OneHotEncode uses for strings.
			if len(col) == 0 {
				continue
			}
			if t := reflect.TypeOf(col[0]); t == nil || t.Kind() == reflect.String {
				continue
			}
		} else if i >= len(d.headers) || !inArrayString(d.headers[i], headers) {
			// Columns without a header, or with a header that was not
			// requested, are left untouched.
			continue
		}

		values := standardize(col)
		// Re-box every value so the column stays a []interface{}.
		out := make([]interface{}, 0, len(col))
		for _, v := range values {
			out = append(out, v)
		}
		d.data[i] = out
	}
	return d
}
// Normalize replaces, in place, every column whose header is reported by
// inArrayString as present in headers with the values returned by the
// package's normalize helper, re-boxed as []interface{}. It returns the
// receiver itself (not a copy), so calls can be chained.
//
// NOTE(review): there is no special case for an empty headers list here;
// presumably nothing matches and the call is a no-op — confirm against
// inArrayString.
func (d *dataframe) Normalize(headers ...string) *dataframe {
	for idx := range d.headers {
		if !inArrayString(d.headers[idx], headers) {
			continue
		}

		normalized := normalize(d.data[idx])
		column := []interface{}{}
		for _, value := range normalized {
			column = append(column, value)
		}
		d.data[idx] = column
	}
	return d
}
// OneHotEncode encodes categorical (string) columns into numerical data in
// place, delegating each column to oneHotEncode, which assigns every distinct
// string an integer code.
//
// If no headers are given, every string column is encoded. Otherwise only the
// columns whose header appears in headers are considered, and of those only
// the ones holding strings are encoded.
//
// A column counts as a string column when its first value has kind string.
// Empty columns and columns whose first value is nil are skipped rather than
// causing a panic, as are columns that have no corresponding header when
// headers are given.
func (d *dataframe) OneHotEncode(headers ...string) {
	all := len(headers) == 0
	for i, col := range d.data {
		if !all && (i >= len(d.headers) || !inArrayString(d.headers[i], headers)) {
			continue
		}
		if len(col) == 0 {
			// Nothing to inspect and nothing to encode.
			continue
		}
		// reflect.TypeOf returns nil for a nil interface value; calling Kind
		// on that would panic, so check it first.
		if t := reflect.TypeOf(col[0]); t != nil && t.Kind() == reflect.String {
			d.data[i] = oneHotEncode(col)
		}
	}
}
// oneHotEncode returns a slice of the same length as data in which every
// string is replaced by an int code. Codes are handed out in order of first
// appearance, starting at 0, so equal strings always share a code.
//
// Every element of data must be a string; any other value (including nil)
// makes the type assertion panic.
func oneHotEncode(data []interface{}) []interface{} {
	codes := make(map[string]int)
	out := make([]interface{}, 0, len(data))
	for _, item := range data {
		label := item.(string)
		code, seen := codes[label]
		if !seen {
			// The next unused code equals the number of labels seen so far.
			code = len(codes)
			codes[label] = code
		}
		out = append(out, code)
	}
	return out
}