Skip to content

Commit

Permalink
Add support for importing meta{} field from CSVs. Closes #35.
Browse files Browse the repository at this point in the history
  • Loading branch information
knadh committed Nov 16, 2023
1 parent 7263d6f commit 8af7b4b
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 29 deletions.
39 changes: 20 additions & 19 deletions docs/documentation/docs/import.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@ For instance, if there are multiple `Apple (English)` entries, it is inserted on

## Sample CSV format
```csv
-,A,Apple,english,Optional note,english,"",optional-tag1|tag2,"ˈæp.əl|aapl",""
^,"","round, red or yellow, edible fruit of a small tree",english,"","","","","",noun
^,"","the tree, cultivated in most temperate regions.",english,"","","","","",noun
^,"","il pomo.",italian,"","","","","",sost
-,A,Application,english,Optional note,italian,"","","aplɪˈkeɪʃ(ə)n",""
^,"","the act of putting to a special use or purpose",english,"","","","","",noun
^,"","le applicazione",italian,"","","","","",sost
-,A,Apple,english,Optional note,english,"",optional-tag1|tag2,"ˈæp.əl|aapl","","{""etym"": ""ml""}"
^,"","round, red or yellow, edible fruit of a small tree",english,"","","","","",noun,""
^,"","the tree, cultivated in most temperate regions.",english,"","","","","",noun,""
^,"","il pomo.",italian,"","","","","",sost,""
-,A,Application,english,Optional note,italian,"","","aplɪˈkeɪʃ(ə)n","",""
^,"","the act of putting to a special use or purpose",english,"","","","","",noun,""
^,"","le applicazione",italian,"","","","","",sost,""
```

Expand All @@ -26,18 +26,19 @@ English and Italian definitions below them.

## CSV fields

| Column | Field | |
|--------|------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| 0 | type | `-` represents a main entry. `^` under it represents a definition entry. |
| 1 | initial | The uppercase first character of the entry. Eg: `A` for Apple. If left empty, it is automatically picked up. |
| 2 | content | The entry content (word or phrase). |
| 3 | language | Language of the entry (as defined in the config). |
| 4 | notes | Optional notes describing the entry. |
| 5 | tsvector_language | If the language has a built in Postgres fulltext tokenizer, the name of the tokenizer language. For languages that do not have Postgres tokenizers, this should be empty. |
| 6 | tsVector_tokens | Postgres fulltext search tokens for the entry (Content). If `tsvector_language` is specified, this field can be left empty as the tokens are automatically created in the database using `TO_TSVECTOR($tsvector_language, $content)`. For languages without Postgres tokenizers, the [tsvector](https://www.postgresql.org/docs/10/datatype-textsearch.html#DATATYPE-TSVECTOR) token string should be computed externally and provided here. |
| 7 | tags | Optional tags describing the entry. Separate multiple tags by `\|`. |
| 8 | phones | Optional phonetic notations representing the pronunciations of the entry. Separate multiple phones by `\|`. |
| 9 | definition-types | This should only be set for definition entries that are marked with `Type = ^`. One or more parts-of-speech types separated by `\|`. Example `noun\|verb`. |
| Column | Field | | | |
|:-------|:------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------|:-------|
| 0 | type | `-` represents a main entry. `^` under it represents a definition entry. | | |
| 1 | initial | The uppercase first character of the entry. Eg: `A` for Apple. If left empty, it is automatically picked up. | | |
| 2 | content | The entry content (word or phrase). | | |
| 3 | language | Language of the entry (as defined in the config). | | |
| 4 | notes | Optional notes describing the entry. | | |
| 5 | tsvector_language | If the language has a built in Postgres fulltext tokenizer, the name of the tokenizer language. For languages that do not have Postgres tokenizers, this should be empty. | | |
| 6 | tsVector_tokens | Postgres fulltext search tokens for the entry (Content). If `tsvector_language` is specified, this field can be left empty as the tokens are automatically created in the database using `TO_TSVECTOR($tsvector_language, $content)`. For languages without Postgres tokenizers, the [tsvector](https://www.postgresql.org/docs/10/datatype-textsearch.html#DATATYPE-TSVECTOR) token string should be computed externally and provided here. | | |
| 7 | tags | Optional tags describing the entry. Separate multiple tags by `\|`. | | |
| 8 | phones | Optional phonetic notations representing the pronunciations of the entry. Separate multiple phones by `\|`. | | |
| 9 | definition-types | This should only be set for definition entries that are marked with `Type = ^`. One or more parts-of-speech types separated by `\|`. Example `noun\|verb`. | | |
| 10 | meta | Optional JSON metadata. Quotes inside JSON are escaped by doubling them. Eg: `{"etym": "ml"} => {""etym"": ""ml""}` | | |


# Importing with SQL
Expand Down
12 changes: 9 additions & 3 deletions internal/data/data.go
Original file line number Diff line number Diff line change
Expand Up @@ -369,9 +369,15 @@ func (d *Data) GetStats() (Stats, error) {
return out, err
}

err := json.Unmarshal(b, &out)
if err := json.Unmarshal(b, &out); err != nil {
return out, nil
}

if out.Languages == nil {
out.Languages = map[string]int{}
}

return out, err
return out, nil
}

// ApproveSubmission approves a pending submission (entry, relations, related entries).
Expand Down Expand Up @@ -417,7 +423,7 @@ func (d *Data) insertEntry(e Entry, stmt *sqlx.Stmt) (int, error) {
}

var id int
err := stmt.Get(&id, e.Content, e.Initial, e.Weight, tokens, tsVectorLang, e.Lang, e.Tags, e.Phones, e.Notes, e.Status)
err := stmt.Get(&id, e.Content, e.Initial, e.Weight, tokens, tsVectorLang, e.Lang, e.Tags, e.Phones, e.Notes, e.Meta, e.Status)
return id, err
}

Expand Down
11 changes: 6 additions & 5 deletions internal/importer/importer.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (

const (
insertBatchSize = 5000
colCount = 11

typeEntry = "-"
typeDef = "^"
Expand Down Expand Up @@ -171,8 +172,8 @@ func (im *Importer) readEntry(r []string) (entry, error) {
Phones: splitString(cleanString(r[8])),
}

if len(r) != 10 {
return e, fmt.Errorf("every line should have exactly 10 columns. Found %d", len(r))
if len(r) != colCount {
return e, fmt.Errorf("every line should have exactly %d columns. Found %d", colCount, len(r))
}

lang, ok := im.langs[e.Lang]
Expand Down Expand Up @@ -213,10 +214,10 @@ func (im *Importer) readEntry(r []string) (entry, error) {
}

e.Meta = strings.TrimSpace(e.Meta)
if e.Meta[0:1] != "{" {
return e, fmt.Errorf("column 11, meta JSON should begin with `{`")
} else if e.Meta == "" {
if e.Meta == "" {
e.Meta = "{}"
} else if e.Meta[0:1] != "{" {
return e, fmt.Errorf("column 11, meta JSON should begin with `{`")
}

return e, nil
Expand Down
5 changes: 3 additions & 2 deletions queries.sql
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ WITH w AS (
-- for the initial of the given word and add +1 to it.
SELECT MAX(weight) + 1 AS weight FROM entries WHERE $3=0 AND (initial=$2 AND lang=$6)
)
INSERT INTO entries (content, initial, weight, tokens, lang, tags, phones, notes, status)
INSERT INTO entries (content, initial, weight, tokens, lang, tags, phones, notes, meta, status)
VALUES(
$1,
$2,
Expand All @@ -152,7 +152,8 @@ INSERT INTO entries (content, initial, weight, tokens, lang, tags, phones, notes
$7,
$8,
$9,
$10
$10,
$11
)
RETURNING id;

Expand Down

0 comments on commit 8af7b4b

Please sign in to comment.