Skip to content

Commit

Permalink
Rewrite ADIF field parser
Browse files Browse the repository at this point in the history
Summary:

A byte-level parser is implemented to pick up ADIF tags and fields. This
simplifies the parsing function. The old, now-unused parser functions are removed.
The API used by adifparser applications remains the same.

ADIF field values are now accepted byte-for-byte, as is. This prevents
processing errors when a field contains non-ASCII data, which commonly
happens in logs downloaded from QRZ.com.

List of changes:

* baseADIFReader.rdr is changed from io.Reader to *bufio.Reader
  - This does not break any adifparser application
* baseADIFReader.version is changed from float64 to string
  - The current version string is "3.1.3", which cannot be represented as a float
* Add readElement() parser (as a method of baseADIFReader)
  - Add elementData struct as the return value definition
* ReadRecord() is changed to use readElement()
* readHeader() is changed to use readElement()
* Removed the following unused functions:
  - readChunk()
  - readRecord() (the unexported one, beginning with a lowercase r)
  - trimLotwEof()
  - ParseADIFRecord()
  - getNextField()
* Revised the tests
  - Added tests for new functions
  - Removed tests for non-existing functions
  - Edited the old tests for new functions
* Implemented non-Unicode tolower and toupper functions
  - These are required to preserve the length of a byte slice
  - Go's bytes.ToLower() and bytes.ToUpper() perform UTF-8-aware case conversion, which can change the byte length of the processed byte slice, so these functions cannot be used
  • Loading branch information
jj1bdx committed Oct 10, 2022
1 parent 0cd0e28 commit f287508
Show file tree
Hide file tree
Showing 5 changed files with 427 additions and 245 deletions.
274 changes: 181 additions & 93 deletions adifreader.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ package adifparser

import (
"bufio"
"bytes"
"errors"
"io"
"strconv"
)
Expand All @@ -15,14 +15,14 @@ type ADIFReader interface {

// baseADIFReader is the real implementation of ADIFReader.
//
// NOTE(review): this listing is a rendered diff without +/- markers, so the
// rdr and version fields each appear twice (the old declaration followed by
// its replacement). The excess field appears to belong to the old side only;
// confirm against the actual file before relying on it.
type baseADIFReader struct {
// Underlying io Reader
rdr io.Reader
// Underlying bufio Reader (init wraps the caller's io.Reader in bufio.NewReader)
rdr *bufio.Reader
// Set by init to true when the first byte of the input is '<',
// i.e. the input starts directly with a tag and has no header text
noHeader bool
// Whether or not the header has been read (init sets it to noHeader,
// readHeader sets it to true once <eoh> has been found)
headerRead bool
// Version of the adif file
version float64
// Excess read data
excess []byte
// Version string of the adif file ("2.0" until an adif_ver header field
// replaces it)
version string
// Record count (incremented by ReadRecord for every record it returns)
records int
}
Expand All @@ -33,29 +33,58 @@ type dedupeADIFReader struct {
seen map[string]bool
}

// elementData describes one "<...>" element (a tag, optionally followed by
// its value) as returned by readElement.
type elementData struct {
// ADIF field name (in ASCII, set to lowercase)
name string
// ADIF field value; stays the empty string when the tag has no value
// (hasValue is false), because only the field name exists then
value string
// ADIF data type indicator (optional, set to uppercase);
// 0 when the tag carries no type byte
typecode byte
// ADIF specifier always has a corresponding value
// If hasValue is false, string inside "<>" is
// a tag without a value.
// It becomes true as soon as the first ':' is seen inside the tag
hasValue bool
// ADIF specifier can optionally have a type.
// It becomes true as soon as a second ':' is seen inside the tag
hasType bool
// Length of value bytes/string, as declared by the length part of the tag
valueLength int
}

// ReadRecord returns the next ADIF record from the input.
//
// If the header has not been consumed yet it calls readHeader first. It then
// pulls elements with readElement until it meets an "eor" tag that has no
// value; every element that carries a value is stored in the record under
// its (lowercased) field name. On success the record counter is incremented.
// Any error from readElement, including io.EOF, is returned with a nil
// record; errors other than io.EOF are also logged through adiflog.
//
// NOTE(review): this listing is a rendered diff without +/- markers. The
// lines that use buf, ardr.readRecord and ParseADIFRecord appear to be the
// removed chunk-based implementation, interleaved with the readElement loop.
func (ardr *baseADIFReader) ReadRecord() (ADIFRecord, error) {
record := NewADIFRecord()

if !ardr.headerRead {
ardr.readHeader()
}
buf, err := ardr.readRecord()
if err != nil {
if err != io.EOF {
adiflog.Printf("readRecord: %v", err)

// Collect fields until the end-of-record tag is found
foundeor := false
for !foundeor {
element, err := ardr.readElement()
if err != nil {
// io.EOF is the normal end of input, so it is not logged
if err != io.EOF {
adiflog.Printf("readElement: %v", err)
}
return nil, err
}
// A valueless <eor> tag terminates the record
if element.name == "eor" && !element.hasValue {
foundeor = true
break
}
// Tags without a value (other than <eor>) are ignored
if element.hasValue {
// TODO: accommodate types
record.values[element.name] = element.value
}
return nil, err
}
if len(bytes.TrimSpace(buf)) == 0 {
// No data left
return nil, io.EOF
}
record, err := ParseADIFRecord(buf)
if err == nil {
ardr.records += 1
return record, nil
}
return record, err
// Successfully parsed the record
ardr.records++
return record, nil
}

// Sentinel errors reported while parsing the inside of a "<...>" tag.
var (
	// InvalidFieldLength is returned when the length part of a tag
	// contains a byte that is not an ASCII digit.
	InvalidFieldLength = errors.New("Invalid field length.")

	// TypeCodeExceedOneByte is returned when more than one byte follows
	// the second colon of a tag.
	TypeCodeExceedOneByte = errors.New("Type Code exceeds one byte.")

	// UnknownColons is returned when the colon counter of the tag parser
	// holds a value it does not know how to handle.
	UnknownColons = errors.New("Unknown colons in the tag.")
)

func (ardr *dedupeADIFReader) ReadRecord() (ADIFRecord, error) {
for true {
record, err := ardr.baseADIFReader.ReadRecord()
Expand Down Expand Up @@ -86,95 +115,154 @@ func NewDedupeADIFReader(r io.Reader) *dedupeADIFReader {

// init prepares the reader for input r: it wraps r in a bufio.Reader, sets
// the default version and a zero record count, and then peeks at the first
// byte (without consuming it) to decide whether the input has a header.
// When the first byte is '<' the input starts directly with a tag, so
// noHeader is set and the header is treated as already read. If the peek
// fails, init returns early and leaves noHeader and headerRead untouched.
//
// NOTE(review): this listing is a rendered diff without +/- markers. The
// nested "func ... readHeader()" line and the statements that use chunk,
// eoh, adif_version and ardr.excess appear to be the removed header parser,
// interleaved with the lines of init.
func (ardr *baseADIFReader) init(r io.Reader) {
ardr.rdr = bufio.NewReader(r)
ardr.headerRead = false
// Assumption
ardr.version = 2
ardr.version = "2.0"
ardr.records = 0
}

func (ardr *baseADIFReader) readHeader() {
ardr.headerRead = true
eoh := []byte("<eoh>")
adif_version := []byte("<adif_ver:")
chunk, err := ardr.readChunk()
// check header
// Peek does not advance the reader, so the first byte is still available
// to the parser afterwards
filestart, err := ardr.rdr.Peek(1)
if err != nil {
// TODO: Log the error somewhere
return
}
if bytes.HasPrefix(chunk, []byte("<")) {
if bytes.HasPrefix(bytes.ToLower(chunk), adif_version) {
ver_len_str_end := bytes.Index(chunk, []byte(">"))
ver_len_str := string(chunk[len(adif_version):ver_len_str_end])
ver_len, err := strconv.Atoi(ver_len_str)
if err != nil {
adiflog.Fatal(err)
}
ver_len_end := ver_len_str_end + 1 + ver_len
ardr.version, err = strconv.ParseFloat(
string(chunk[ver_len_str_end+1:ver_len_end]), 0)
excess := chunk[ver_len_end:]
eoh_end := bIndexCI(excess, eoh) + len(eoh)
excess = excess[eoh_end:]
ardr.excess = excess[tagStartPos(excess):]
} else {
ardr.excess = chunk
// An input whose very first byte opens a tag has no header
ardr.noHeader = filestart[0] == '<'
// if header does not exist, header can be skipped
// and treated as read
ardr.headerRead = ardr.noHeader
}

// readHeader consumes the header of the input: it reads elements with
// readElement until it meets an "eoh" tag that has no value, and stores the
// value of any "adif_ver" field as the reader's version. headerRead is set
// only after <eoh> has been found; if readElement fails first, readHeader
// returns silently and headerRead keeps its previous value.
//
// NOTE(review): this listing is a rendered diff without +/- markers. The
// stray "return" and the statements that use chunk, eoh and ardr.excess
// appear to be the tail of the removed implementation.
func (ardr *baseADIFReader) readHeader() {
foundeoh := false
for !foundeoh {
element, err := ardr.readElement()
if err != nil {
// TODO: Log the error somewhere
return
}
// A valueless <eoh> tag marks the end of the header
if element.name == "eoh" && !element.hasValue {
foundeoh = true
break
}
// Remember the ADIF version announced by the header
if element.name == "adif_ver" && element.hasValue {
ardr.version = element.value
}
return
}
for !bContainsCI(chunk, eoh) {
newchunk, _ := ardr.readChunk()
chunk = append(chunk, newchunk...)
}
offset := bIndexCI(chunk, eoh) + len(eoh)
chunk = chunk[offset:]
ardr.excess = chunk[tagStartPos(chunk):]

ardr.headerRead = true
}

// NOTE(review): this listing is a rendered diff without +/- markers. The
// readChunk lines below are listed among the removed functions in the commit
// message; they share the closing brace with RecordCount in this view.
// readChunk read up to 1024 bytes from the underlying reader and returned
// the bytes actually read.
func (ardr *baseADIFReader) readChunk() ([]byte, error) {
chunk := make([]byte, 1024)
n, err := ardr.rdr.Read(chunk)
if err != nil {
return nil, err
}
return chunk[:n], nil
// RecordCount returns the value of the reader's record counter, which
// ReadRecord increments for every record it returns successfully.
func (ardr *baseADIFReader) RecordCount() int {
return ardr.records
}

// NOTE(review): this listing is a rendered diff without +/- markers. The
// readRecord and trimLotwEof lines, the second RecordCount header, and the
// statements that use buf, newchunk, eor and ardr.excess appear to be removed
// code interleaved with the body of readElement.
func (ardr *baseADIFReader) readRecord() ([]byte, error) {
eor := []byte("<eor>")
buf := bytes.TrimSpace(ardr.excess)
ardr.excess = nil
for !bContainsCI(buf, eor) {
newchunk, err := ardr.readChunk()
// readElement reads the next "<...>" element from the input one byte at a
// time and returns its description.
//
// Bytes before the next '<' are skipped. Inside the tag, bytes up to the
// first ':' form the field name, bytes between the first and second ':' must
// be ASCII digits and form the value length, and a single byte after the
// second ':' is the type code. After the closing '>' exactly valueLength
// bytes are read as the value, unmodified.
//
// Errors: any error from the underlying reader (including io.EOF) is
// returned as is; InvalidFieldLength for a non-digit in the length;
// TypeCodeExceedOneByte for a second type byte; the strconv.Atoi error when
// the length cannot be converted, which includes an empty length such as
// "<name:>". On error the returned element is nil.
func (ardr *baseADIFReader) readElement() (*elementData, error) {
var c byte
var err error
var fieldname []byte
var fieldvalue []byte
var fieldtype byte
var fieldlenstr []byte
var fieldlength int = 0

data := &elementData{}
data.name = ""
data.value = ""
data.typecode = 0
data.valueLength = 0

// Look for "<" (open tag) first
// Everything before it is discarded
foundopentag := false
for !foundopentag {
// Read a byte (aka character)
c, err = ardr.rdr.ReadByte()
if err != nil {
ardr.excess = nil
if err == io.EOF {
buf = trimLotwEof(buf)
// Expected, pass it up the chain
if len(buf) > 0 {
return bytes.TrimSpace(buf), nil
return nil, err
}
foundopentag = c == '<'
}

// Get field name
data.hasValue = false
data.hasType = false
// Look for ">" (close tag) next
// foundcolonnum counts the ':' separators seen so far and selects which
// part of the tag (name, length, type) the current byte belongs to
foundclosetag := false
foundcolonnum := 0
foundtype := false
for !foundclosetag {
// Read a byte (aka character)
c, err = ardr.rdr.ReadByte()
if err != nil {
return nil, err
}
foundclosetag = c == '>'
if foundclosetag {
break
}
switch foundcolonnum {
case 0:
// no colon yet: append the byte to the field name
if c == ':' {
foundcolonnum++
data.hasValue = true
} else {
fieldname = append(fieldname, c)
}
break
case 1:
// 1 colon found:
// handle the byte as a digit in the length
if c == ':' {
foundcolonnum++
data.hasType = true
} else {
if c >= '0' && c <= '9' {
fieldlenstr = append(fieldlenstr, c)
} else {
return nil, InvalidFieldLength
}
return nil, err
}
adiflog.Println(err)
return nil, err
break
case 2:
// 2 colons found:
// pick up only one byte and use it as a field type
if !foundtype {
fieldtype = c
foundtype = true
} else {
return nil, TypeCodeExceedOneByte
}
break
// This code should not be reached...
default:
return nil, UnknownColons
}
buf = append(buf, newchunk...)
}
buf = trimLotwEof(buf)
record_end := bIndexCI(buf, eor)
ardr.excess = buf[record_end+len(eor):]
return buf[:record_end], nil
}

func trimLotwEof(buf []byte) []byte {
// LotW ends their files with a non-standard EOF tag.
lotwEOF := []byte("<app_lotw_eof>")
if eofIndex := bIndexCI(buf, lotwEOF); eofIndex != -1 {
buf = buf[:eofIndex]
// Make the field name lowercase
data.name = string(bStrictToLower(fieldname))
// Make the field type name uppercase
if foundtype {
data.typecode = charToUpper(fieldtype)
}
return buf
}

func (ardr *baseADIFReader) RecordCount() int {
return ardr.records
// Get field length
// Only tags that contained a ':' carry a length and a value
if data.hasValue {
fieldlength, err = strconv.Atoi(string(fieldlenstr))
if err != nil {
return nil, err
}
data.valueLength = fieldlength

// Get field value/content,
// with the byte length specified by the field length
// The bytes are copied as they are, without any decoding
for i := 0; i < fieldlength; i++ {
c, err = ardr.rdr.ReadByte()
if err != nil {
return nil, err
}
fieldvalue = append(fieldvalue, c)
}
data.value = string(fieldvalue)
}

return data, nil
}
Loading

0 comments on commit f287508

Please sign in to comment.