Add CSV to DB bulk dictionary data import tool (--import flag).
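With this change, running the built binary with the new flag, eg. ./dictpress --import=data.csv (assuming the binary is named dictpress and a valid config and database are already set up), bulk-imports the CSV entries, definitions, and relations into the Postgres database and then exits.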
knadh committed Jun 26, 2022
1 parent 8ac725d commit 38a1927
Showing 2 changed files with 319 additions and 2 deletions.
15 changes: 13 additions & 2 deletions cmd/dictpress/main.go
@@ -8,6 +8,7 @@ import (

"github.com/jmoiron/sqlx"
"github.com/knadh/dictpress/internal/data"
"github.com/knadh/dictpress/internal/importer"
"github.com/knadh/goyesql"
goyesqlx "github.com/knadh/goyesql/sqlx"
"github.com/knadh/koanf"
@@ -57,7 +58,7 @@ type App struct {

var (
logger = log.New(os.Stdout, "", log.Ldate|log.Ltime|log.Lshortfile)
ko = koanf.New(".")
)

func init() {
@@ -74,6 +75,7 @@ func init() {
"path to one or more config files (will be merged in order)")
f.String("site", "", "path to a site theme. If left empty, only HTTP APIs will be available.")
f.Bool("install", false, "run first time DB installation")
f.String("import", "", "import a CSV file into the database. eg: --import=data.csv")
f.Bool("yes", false, "assume 'yes' to prompts, eg: during --install")
f.Bool("version", false, "current version of the build")

@@ -149,7 +151,6 @@ func main() {

// Map queries to the query container.
var q data.Queries

if err := goyesqlx.ScanToStruct(&q, qMap, db.Unsafe()); err != nil {
logger.Fatalf("no SQL queries loaded: %v", err)
}
@@ -160,6 +161,16 @@
logger.Fatal("0 languages in config")
}

// Run the CSV importer.
if fPath := ko.String("import"); fPath != "" {
imp := importer.New(langs, q.InsertSubmissionEntry, q.InsertSubmissionRelation, db, logger)
logger.Printf("importing data from %s ...", fPath)
if err := imp.Import(fPath); err != nil {
logger.Fatal(err)
}
os.Exit(0)
}

app.data = data.New(&q, langs)
app.queries = &q

306 changes: 306 additions & 0 deletions internal/importer/importer.go
@@ -0,0 +1,306 @@
// Package importer imports a dictionary CSV into the database.
package importer

import (
"encoding/csv"
"fmt"
"io"
"log"
"os"
"regexp"
"strings"

"github.com/jmoiron/sqlx"
"github.com/knadh/dictpress/internal/data"
"github.com/lib/pq"
)

const (
insertBatchSize = 5000

typeEntry = "-"
typeDef = "^"
)

// entry represents a single row read from the CSV. The CSV columns are:
// entry_type, word, initial, language, notes, tsvector_language, [tsvector_tokens], [tags], [phones], definition_type
// Array columns like [tsvector_tokens], [tags], and [phones] are pipe (|) separated.
//
// entry_type = '-' marks a main entry; the '^' rows that follow it are its definitions.
// definition_type (the last field) should only be set in definition (^) rows.
// It holds the part-of-speech types defined in the config, eg: noun, verb.
//
// tsvector_language = name of the Postgres language tokenizer, if a built-in one exists for the language.
// If this is set, content is tokenized automatically by Postgres and [tsvector_tokens] can be left empty.
// If the language has no Postgres tokenizer, leave tsvector_language empty and set [tsvector_tokens] manually.
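//
// For illustration only (the language key "english" and the definition type
// "noun" are hypothetical and must be defined in the config), a main entry
// followed by a single definition row could look like:
//
// -,apple,A,english,,english,,fruit|food,/ˈæp.əl/,
// ^,a round pome fruit,A,english,,english,,,,noun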
type entry struct {
// Comments show CSV column positions.
Type string // 0
Content string // 1
Initial string // 2
Lang string // 3
Notes string // 4
TSVectorLang string // 5
TSVectorTokens string // 6
Tags []string // 7
Phones []string // 8
DefTypes []string // 9 - Only read in definition (^) entries.

defs []entry
}

// Importer imports CSV entries into the database.
type Importer struct {
langs data.LangMap

db *sqlx.DB
stmtInsertEntry *sqlx.Stmt
stmtInsertRel *sqlx.Stmt
lo *log.Logger
}

var (
reSpaces = regexp.MustCompile(`\s+`)
)

// New returns a new instance of the CSV importer.
func New(langs data.LangMap, stmtInsertEntry *sqlx.Stmt, stmtInsertRel *sqlx.Stmt, db *sqlx.DB, lo *log.Logger) *Importer {
return &Importer{
langs: langs,
stmtInsertEntry: stmtInsertEntry,
stmtInsertRel: stmtInsertRel,
db: db,
lo: lo,
}
}

// Import imports a CSV file into the DB.
func (im *Importer) Import(filePath string) error {
fp, err := os.Open(filePath)
if err != nil {
return fmt.Errorf("error opening file %s: %v", filePath, err)
}
defer fp.Close()

var (
// Holds all main entries.
entries []entry
n = 0
numMain = 0
numDefs = 0
)

rd := csv.NewReader(fp)
rd.FieldsPerRecord = -1
for {
row, err := rd.Read()
if err != nil {
if err == io.EOF {
break
}

return fmt.Errorf("error reading CSV file %s: %v", filePath, err)
}

if n == 0 && row[0] != typeEntry {
return fmt.Errorf("line %d: the first row in the file should be of type '-'", n+1)
}
n++

e, err := im.readEntry(row)
if err != nil {
return fmt.Errorf("error reading line %d: %v", n, err)
}

// First entry is always a main entry.
if len(entries) == 0 {
entries = append(entries, e)
continue
}

// Add all definitions to the last main entry in the list.
if e.Type == typeDef {
i := len(entries) - 1
entries[i].defs = append(entries[i].defs, e)
numDefs++
continue
}

// On hitting the batchsize, insert to DB.
if len(entries)%insertBatchSize == 0 {
if err := im.insertEntries(entries, numMain); err != nil {
return fmt.Errorf("error inserting entries to DB: %v", err)
}

numMain += len(entries)
entries = []entry{}

im.lo.Printf("imported %d entries and %d definitions", numMain, numDefs)
}

// New main entry.
entries = append(entries, e)
}

if len(entries) > 0 {
if err := im.insertEntries(entries, numMain); err != nil {
return fmt.Errorf("error inserting entries to DB: %v", err)
}
}

im.lo.Printf("finished. imported %d entries and %d definitions", numMain+len(entries), numDefs)
return nil
}

// readEntry parses a single CSV row into an entry. Columns: entry_type, word, initial, language, notes, tsvector_language, [tsvector_tokens], [tags], [phones], definition_type.
func (im *Importer) readEntry(r []string) (entry, error) {
if len(r) != 10 {
return entry{}, fmt.Errorf("every line should have exactly 10 columns. Found %d", len(r))
}

typ := cleanString(r[0])
if typ != typeEntry && typ != typeDef {
return entry{}, fmt.Errorf("unknown type '%s' in column 0. Should be '-' (entry) or '^' (definition)", typ)
}

e := entry{
Type: typ,
Content: cleanString(r[1]),
Initial: cleanString(r[2]),
Lang: cleanString(r[3]),
Notes: cleanString(r[4]),
TSVectorLang: cleanString(r[5]),
TSVectorTokens: cleanString(r[6]),
Tags: splitString(cleanString(r[7])),
Phones: splitString(cleanString(r[8])),
}

lang, ok := im.langs[e.Lang]
if !ok {
return e, fmt.Errorf("unknown language '%s' at column 3", e.Lang)
}

if e.Content == "" {
return e, fmt.Errorf("empty content (word) at column 1")
}

// If no initial is given, derive it from the first character (rune) of the content.
if e.Initial == "" {
e.Initial = strings.ToUpper(string([]rune(e.Content)[0]))
}

defTypeStr := cleanString(r[9])
if typ == typeDef {
defTypes := splitString(defTypeStr)
for _, t := range defTypes {
if _, ok := lang.Types[t]; !ok {
return e, fmt.Errorf("unknown type '%s' for language '%s'", t, e.Lang)
}
}
e.DefTypes = defTypes
} else if defTypeStr != "" {
return e, fmt.Errorf("column 10, definition type (part of speec) should only be set of definition entries (^)")
}

return e, nil
}

// insertEntries inserts a batch of main entries, their definition entries, and the relations linking them, committing each step in its own transaction.
func (im *Importer) insertEntries(entries []entry, lineStart int) error {
var (
tx *sqlx.Tx
stmt *sqlx.Stmt
err error
)

// Insert entries.
entryIDs := make([]int, len(entries))
if tx, err = im.db.Beginx(); err != nil {
return err
}
stmt = tx.Stmtx(im.stmtInsertEntry)
for i, e := range entries {
if err := stmt.Get(&entryIDs[i],
e.Content,
e.Initial,
lineStart,
e.TSVectorTokens,
e.TSVectorLang,
e.Lang,
pq.StringArray(e.Tags),
pq.StringArray(e.Phones),
e.Notes,
data.StatusEnabled); err != nil {
return err
}
lineStart++
}

if err := tx.Commit(); err != nil {
return err
}

// Insert definition entries and collect their IDs for every main entry.
relIDs := make([][]int, len(entries))

if tx, err = im.db.Beginx(); err != nil {
return err
}
stmt = tx.Stmtx(im.stmtInsertEntry)

// Iterate through all main entries again, inserting their definition entries.
for i, mainEntry := range entries {
relIDs[i] = make([]int, len(mainEntry.defs))

for j, e := range mainEntry.defs {
// Insert the definition entry and record the resulting ID
// against the parent ID.
if err := stmt.Get(&relIDs[i][j],
e.Content,
e.Initial,
i+j,
e.TSVectorTokens,
e.TSVectorLang,
e.Lang,
pq.StringArray{},
pq.StringArray(e.Phones),
"",
data.StatusEnabled); err != nil {
return err
}
}
}

if err := tx.Commit(); err != nil {
return err
}

// Insert relationships.
if tx, err = im.db.Beginx(); err != nil {
return err
}
stmt = tx.Stmtx(im.stmtInsertRel)
for i, defIDs := range relIDs {
for j, toID := range defIDs {
d := entries[i].defs[j]
if _, err := stmt.Exec(entryIDs[i], toID, pq.StringArray(d.DefTypes), pq.StringArray(d.Tags), d.Notes, j, data.StatusEnabled); err != nil {
return err
}
}
}
if err := tx.Commit(); err != nil {
return err
}

return nil
}

// cleanString trims a string and collapses whitespace runs into single spaces.
func cleanString(s string) string {
return reSpaces.ReplaceAllString(strings.TrimSpace(s), " ")
}

// splitString splits a pipe (|) separated value, trimming each item.
// An empty string yields nil instead of a single empty item.
func splitString(s string) []string {
if s == "" {
return nil
}

out := strings.Split(s, "|")
for n, v := range out {
out[n] = strings.TrimSpace(v)
}

return out
}
