-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.go
249 lines (210 loc) · 6.04 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
package main
import (
"bufio"
"context"
_ "embed"
"encoding/csv"
"flag"
"fmt"
"io"
"log"
"os"
"runtime"
"sync"
"time"
"github.com/jackc/pgx/v4"
"github.com/jackc/pgx/v4/pgxpool"
"github.com/jszwec/csvutil"
"github.com/johnsiilver/pipelines/stagedpipe"
"github.com/johnsiilver/pipelines/stagedpipe/examples/etl/bostonFoodViolations/pipelined/etl"
)
// Command-line flags. Both are parsed by flag.Parse() at the top of main.
var (
// filePath is the location of the CSV file to load (flag "-file").
filePath = flag.String("file", "../violations.csv", "The path to the file to parse")
// connStr is the Postgres connection string handed to pgxpool.Connect (flag "-connStr").
// It defaults to the empty string.
connStr = flag.String("connStr", "", "The connection string to your postgres database")
)
// concurrency is the number of concurrent pipelines we have running. Each pipeline is running
// X stages in parallel, where X is the number of stages. It is set to runtime.NumCPU() and is
// passed to stagedpipe.New() in main.
var concurrency = runtime.NumCPU()
// dropTable holds the contents of etl/drop.sql, embedded at build time. main executes it
// against the database before executing createTable.
//go:embed etl/drop.sql
var dropTable string
// createTable holds the contents of etl/create.sql, embedded at build time. main executes it
// right after dropTable.
//go:embed etl/create.sql
var createTable string
// main loads the CSV file named by the -file flag into the Postgres database named by
// the -connStr flag. It executes the embedded drop and create SQL, opens a single
// transaction, streams the file through the staged pipeline in blocks of 100000 rows and
// commits the transaction once every block has been processed. If the pipeline reports an
// error, the request context is cancelled, the transaction is rolled back and the program
// exits with status 1.
func main() {
	log.SetFlags(log.LstdFlags | log.Lshortfile)
	flag.Parse()

	ctx := context.Background()

	// Setup DB connection.
	connCtx, connCancel := context.WithTimeout(ctx, 10*time.Second)
	pool, err := pgxpool.Connect(connCtx, *connStr)
	if err != nil {
		log.Fatalf("cannot connect to Postgres: %s", err)
	}
	connCancel()

	_, err = pool.Exec(ctx, dropTable)
	if err != nil {
		log.Fatalf("could not drop an existing violations table: %s", err)
	}
	_, err = pool.Exec(ctx, createTable)
	if err != nil {
		log.Fatalf("could not create the violations table: %s", err)
	}

	// Setup DB transaction.
	tx, err := pool.BeginTx(ctx, pgx.TxOptions{})
	if err != nil {
		log.Fatalf("cannot start a transaction: %s", err)
	}
	txMutex := &sync.Mutex{}

	// Setup our file access to read in.
	f, err := newCSVBlocks[etl.Row](*filePath, 100000)
	if err != nil {
		log.Fatalf("cannot open our file: %s", err)
	}

	// Setup our pipeline.
	sm, err := etl.NewSM()
	if err != nil {
		log.Fatalf("cannot start state machine: %s", err)
	}
	pipeline, err := stagedpipe.New[etl.Data]("boston food violations", concurrency, sm)
	if err != nil {
		log.Fatalf("cannot create a pipeline: %s", err)
	}
	defer pipeline.Close()

	// Setup our request group to send our data on.
	rg := pipeline.NewRequestGroup()
	reqCtx, reqCancel := context.WithCancel(ctx)
	defer reqCancel() // Not really needed, but for consistency

	start := time.Now()

	// Gets all the output from the pipeline and checks for errors. We don't use
	// any of the output otherwise, as the pipeline writes the data to the database.
	// done receives the first pipeline error (if any) and is then closed, so a receive
	// from it yields nil when processing succeeded.
	done := make(chan error, 1)
	go func() {
		var streamErr error
		defer func() {
			defer close(done)
			if streamErr != nil {
				done <- streamErr
			}
		}()

		// A RequestGroup must always drain its .Out() channel. If we receive an error and
		// want to stop processing, we can cancel the Context and wait for everything to stop.
		// Here we capture the error so that we can report it. If we get an error, we also
		// rollback the transaction.
		for out := range rg.Out() {
			if streamErr != nil {
				// Already failed: keep draining, but only the first error is reported.
				continue
			}
			if out.Err != nil {
				reqCancel()
				streamErr = out.Err
				log.Printf("pipeline had error in stream: %s", out.Err)
			}
		}
		if streamErr != nil {
			// Surface a failed rollback instead of silently dropping it.
			if rbErr := tx.Rollback(ctx); rbErr != nil {
				log.Printf("transaction rollback failure: %s", rbErr)
			}
		}
	}()

	fmt.Println("Starting to process records...")

	// Send blocks of data into the pipeline.
	items := 0
	for block := range f.blocks(reqCtx) {
		if block.Err != nil {
			// blocks() forwards ctx.Err() unwrapped, so a direct comparison is enough.
			if block.Err == context.Canceled {
				log.Println("fileBlock received context.Canceled")
				break
			}
			log.Fatalf("problem reading csv block: %s", block.Err)
		}
		items += len(block.Data)

		req := etl.NewRequest(reqCtx, etl.Data{Rows: block.Data, Tx: tx, TxMutex: txMutex})
		if err := rg.Submit(req); err != nil {
			log.Fatalf("problem submitting request to pipeline: %s", err)
		}
	}
	// Tell the pipeline that this request group is done.
	rg.Close()

	// We have processed all output.
	processingErr := <-done
	if processingErr != nil {
		fmt.Printf("Pipeline had processing error(DB transaction rolled back): %s\n", processingErr)
		os.Exit(1)
	}

	fmt.Println("Pipeline has completed processing")
	fmt.Printf("Processed %d records into Postgres\n", items)
	fmt.Println("Committing Transaction to Postgres...")
	if err := tx.Commit(ctx); err != nil {
		log.Fatalf("transaction commit failure: %s", err)
	}
	end := time.Since(start)
	fmt.Println("Commit complete! We are DONE!")
	fmt.Println("Time taken: ", end)
}
// streamResp holds any streaming response object for a channel. In this file it is the
// element type of the channel returned by csvBlocks.blocks(), where each response carries
// either Data or Err, never both.
type streamResp[T any] struct {
// Data is the payload of the response.
Data T
// Err is non-nil when the producer failed or stopped early; Data is then the zero value.
Err error
}
// csvBlocks reads a CSV file that must contain a header through a bufio.Reader and
// returns blocks of up to "items" decoded records of type T, where T should be a struct.
// Values are created with newCSVBlocks() and consumed via blocks().
// NOTE(review): nothing in the visible code closes f — confirm whether that is intended.
type csvBlocks[T any] struct {
// f is the file opened by newCSVBlocks().
f io.ReadCloser
// buf is the buffered reader wrapping f that the CSV reader pulls from.
buf *bufio.Reader
// dec decodes CSV records into values of type T.
dec *csvutil.Decoder
// items is the maximum number of records placed in a single block.
items int
}
// newCSVBlocks opens the CSV file at "path" and prepares it to be streamed in blocks of
// up to "items" records decoded into type T, where T should be a struct. "items" must be
// >= 100, otherwise an error is returned before the file is touched.
//
// The file is read through a 10 MiB bufio.Reader. If the file cannot be opened, or the
// csvutil decoder cannot be created on top of it, the zero csvBlocks and the error are
// returned; in the latter case the file is closed first so the descriptor is not leaked.
func newCSVBlocks[T any](path string, items int) (csvBlocks[T], error) {
	if items < 100 {
		return csvBlocks[T]{}, fmt.Errorf("newCSVBlocks() cannot be called with items set to less than 100")
	}

	f, err := os.Open(path)
	if err != nil {
		return csvBlocks[T]{}, err
	}

	buf := bufio.NewReaderSize(f, 10*1024*1024)
	dec, err := csvutil.NewDecoder(csv.NewReader(buf))
	if err != nil {
		// Best-effort close: the decoder error is the one worth reporting to the caller.
		_ = f.Close()
		return csvBlocks[T]{}, err
	}

	return csvBlocks[T]{
		f:     f,
		buf:   buf,
		items: items,
		dec:   dec,
	}, nil
}
// blocks starts a goroutine that decodes the file into slices of at most c.items records
// and delivers them on the returned channel, which is closed once the goroutine finishes.
// A cancelled Context is checked between blocks and reported as a final Err response; any
// decode error other than io.EOF is also reported as a final Err response. On io.EOF the
// partially filled block (if non-empty) is delivered before the channel closes. The caller
// must keep receiving until the channel is closed, even after cancelling the Context.
func (c csvBlocks[T]) blocks(ctx context.Context) chan streamResp[[]T] {
	ch := make(chan streamResp[[]T], 1)

	go func() {
		defer close(ch)

		for {
			// Cancellation is only observed between blocks, never mid-block.
			if cancelErr := ctx.Err(); cancelErr != nil {
				ch <- streamResp[[]T]{Err: cancelErr}
				return
			}

			// Fill one block, stopping early at the first decode error (including io.EOF).
			batch := []T{}
			var readErr error
			for len(batch) < c.items {
				var rec T
				if readErr = c.dec.Decode(&rec); readErr != nil {
					break
				}
				batch = append(batch, rec)
			}

			// A real decode failure discards the partial block and ends the stream.
			if readErr != nil && readErr != io.EOF {
				ch <- streamResp[[]T]{Err: readErr}
				return
			}

			if len(batch) > 0 {
				ch <- streamResp[[]T]{Data: batch}
			}
			if readErr == io.EOF {
				return
			}
		}
	}()

	return ch
}