From b88939ac72a98eea37c69a26c766c5db99b283ed Mon Sep 17 00:00:00 2001 From: Vadim Zapolski Date: Wed, 18 Oct 2023 20:27:48 +0300 Subject: [PATCH] add new option SetCustomParseMediaType (#308) * add new option SetCustomParseMediaType to customise mediatype parsing --- detect.go | 21 ++++++------ detect_test.go | 8 +++-- envelope.go | 6 ++-- options.go | 13 +++++++ options_test.go | 90 +++++++++++++++++++++++++++++++++++++++++++++++++ parser.go | 4 +++ part.go | 15 ++++++--- 7 files changed, 136 insertions(+), 21 deletions(-) create mode 100644 options_test.go diff --git a/detect.go b/detect.go index 81543d4e..1dda9211 100644 --- a/detect.go +++ b/detect.go @@ -4,14 +4,13 @@ import ( "strings" inttp "github.com/jhillyerd/enmime/internal/textproto" - "github.com/jhillyerd/enmime/mediatype" ) // detectMultipartMessage returns true if the message has a recognized multipart Content-Type header func detectMultipartMessage(root *Part, multipartWOBoundaryAsSinglepart bool) bool { // Parse top-level multipart ctype := root.Header.Get(hnContentType) - mtype, params, _, err := mediatype.Parse(ctype) + mtype, params, _, err := root.parseMediaType(ctype) if err != nil { return false } @@ -35,27 +34,27 @@ func detectMultipartMessage(root *Part, multipartWOBoundaryAsSinglepart bool) bo // - Content-Disposition: attachment; filename="frog.jpg" // - Content-Disposition: inline; filename="frog.jpg" // - Content-Type: attachment; filename="frog.jpg" -func detectAttachmentHeader(header inttp.MIMEHeader) bool { - mtype, params, _, _ := mediatype.Parse(header.Get(hnContentDisposition)) +func detectAttachmentHeader(root *Part, header inttp.MIMEHeader) bool { + mtype, params, _, _ := root.parseMediaType(header.Get(hnContentDisposition)) if strings.ToLower(mtype) == cdAttachment || (strings.ToLower(mtype) == cdInline && len(params) > 0) { return true } - mtype, _, _, _ = mediatype.Parse(header.Get(hnContentType)) + mtype, _, _, _ = root.parseMediaType(header.Get(hnContentType)) return 
strings.ToLower(mtype) == cdAttachment } // detectTextHeader returns true, if the the MIME headers define a valid 'text/plain' or 'text/html' // part. If the emptyContentTypeIsPlain argument is set to true, a missing Content-Type header will // result in a positive plain part detection. -func detectTextHeader(header inttp.MIMEHeader, emptyContentTypeIsText bool) bool { +func detectTextHeader(root *Part, header inttp.MIMEHeader, emptyContentTypeIsText bool) bool { ctype := header.Get(hnContentType) if ctype == "" && emptyContentTypeIsText { return true } - if mtype, _, _, err := mediatype.Parse(ctype); err == nil { + if mtype, _, _, err := root.parseMediaType(ctype); err == nil { switch mtype { case ctTextPlain, ctTextHTML: return true @@ -68,23 +67,23 @@ func detectTextHeader(header inttp.MIMEHeader, emptyContentTypeIsText bool) bool // detectBinaryBody returns true if the mail header defines a binary body. func detectBinaryBody(root *Part) bool { header := inttp.MIMEHeader(root.Header) // Use internal header methods. - if detectTextHeader(header, true) { + if detectTextHeader(root, header, true) { // It is text/plain, but an attachment. // Content-Type: text/plain; name="test.csv" // Content-Disposition: attachment; filename="test.csv" // Check for attachment only, or inline body is marked // as attachment, too. - mtype, _, _, _ := mediatype.Parse(header.Get(hnContentDisposition)) + mtype, _, _, _ := root.parseMediaType(header.Get(hnContentDisposition)) return strings.ToLower(mtype) == cdAttachment } - isBin := detectAttachmentHeader(header) + isBin := detectAttachmentHeader(root, header) if !isBin { // This must be an attachment, if the Content-Type is not // 'text/plain' or 'text/html'. 
// Example: // Content-Type: application/pdf; name="doc.pdf" - mtype, _, _, _ := mediatype.Parse(header.Get(hnContentType)) + mtype, _, _, _ := root.parseMediaType(header.Get(hnContentType)) mtype = strings.ToLower(mtype) if mtype != ctTextPlain && mtype != ctTextHTML { return true diff --git a/detect_test.go b/detect_test.go index b092db8e..aba9b1ff 100644 --- a/detect_test.go +++ b/detect_test.go @@ -141,8 +141,10 @@ func TestDetectAttachmentHeader(t *testing.T) { }, } + root := &Part{parser: &defaultParser} + for _, s := range htests { - got := detectAttachmentHeader(s.header) + got := detectAttachmentHeader(root, s.header) if got != s.want { t.Errorf("detectAttachmentHeader(%v) == %v, want: %v", s.header, got, s.want) } @@ -192,8 +194,10 @@ func TestDetectTextHeader(t *testing.T) { }, } + root := &Part{parser: &defaultParser} + for _, s := range htests { - got := detectTextHeader(s.header, s.emptyIsPlain) + got := detectTextHeader(root, s.header, s.emptyIsPlain) if got != s.want { t.Errorf("detectTextHeader(%v, %v) == %v, want: %v", s.header, s.emptyIsPlain, got, s.want) diff --git a/envelope.go b/envelope.go index aeb7cc33..3c4bb444 100644 --- a/envelope.go +++ b/envelope.go @@ -12,8 +12,6 @@ import ( "github.com/jaytaylor/html2text" "github.com/jhillyerd/enmime/internal/coding" inttp "github.com/jhillyerd/enmime/internal/textproto" - "github.com/jhillyerd/enmime/mediatype" - "github.com/pkg/errors" ) @@ -232,7 +230,7 @@ func parseTextOnlyBody(root *Part, e *Envelope) error { var charset string var isHTML bool if ctype := root.Header.Get(hnContentType); ctype != "" { - if mediatype, mparams, _, err := mediatype.Parse(ctype); err == nil { + if mediatype, mparams, _, err := root.parseMediaType(ctype); err == nil { isHTML = (mediatype == ctTextHTML) if mparams[hpCharset] != "" { charset = mparams[hpCharset] @@ -271,7 +269,7 @@ func parseTextOnlyBody(root *Part, e *Envelope) error { func parseMultiPartBody(root *Part, e *Envelope) error { // Parse top-level 
multipart ctype := root.Header.Get(hnContentType) - mediatype, params, _, err := mediatype.Parse(ctype) + mediatype, params, _, err := root.parseMediaType(ctype) if err != nil { return fmt.Errorf("unable to parse media type: %v", err) } diff --git a/options.go b/options.go index 5ae24b41..ea9afe40 100644 --- a/options.go +++ b/options.go @@ -63,3 +63,16 @@ type rawContentOption bool func (o rawContentOption) apply(p *Parser) { p.rawContent = bool(o) } + +// SetCustomParseMediaType if provided, will be used to parse media type instead of the default ParseMediaType +// function. This may be used to parse media type parameters that would otherwise be considered malformed. +// By default parsing happens using ParseMediaType +func SetCustomParseMediaType(customParseMediaType CustomParseMediaType) Option { + return parseMediaTypeOption(customParseMediaType) +} + +type parseMediaTypeOption CustomParseMediaType + +func (o parseMediaTypeOption) apply(p *Parser) { + p.customParseMediaType = CustomParseMediaType(o) +} diff --git a/options_test.go b/options_test.go new file mode 100644 index 00000000..8a7010b9 --- /dev/null +++ b/options_test.go @@ -0,0 +1,90 @@ +package enmime + +import ( + "fmt" + "strings" + "testing" +) + +func TestSetCustomParseMediaType(t *testing.T) { + alwaysReturnHTML := func(ctype string) (mtype string, params map[string]string, invalidParams []string, err error) { + return "text/html", nil, nil, err + } + changeAndUtilizeDefault := func(ctype string) (mtype string, params map[string]string, invalidParams []string, err error) { + modifiedStr := strings.ReplaceAll(ctype, "application/Pamir Viewer", "application/PamirViewer") + return ParseMediaType(modifiedStr) + } + tcases := []struct { + ctype string + want string + customParseMediaType CustomParseMediaType + }{ + { + ctype: "text/plain", + want: "text/plain", + customParseMediaType: nil, + }, + { + ctype: "text/plain", + want: "text/html", + customParseMediaType: alwaysReturnHTML, + }, + { + ctype: 
"text/plain; charset=utf-8", + want: "text/html", + customParseMediaType: alwaysReturnHTML, + }, + { + ctype: "application/Pamir Viewer; name=\"2023-384.pmrv\"", + want: "application/pamirviewer", + customParseMediaType: changeAndUtilizeDefault, + }, + } + + for _, tcase := range tcases { + p := &Part{parser: NewParser(SetCustomParseMediaType(tcase.customParseMediaType))} + + got, _, _, _ := p.parseMediaType(tcase.ctype) + if got != tcase.want { + t.Errorf("Parser.parseMediaType(%v) == %v, want: %v", + tcase.ctype, got, tcase.want) + } + } +} + +func ExampleSetCustomParseMediaType() { + // for the sake of simplicity replaces space in a very specific invalid content-type: "application/Pamir Viewer" + replaceSpecificContentType := func(ctype string) (mtype string, params map[string]string, invalidParams []string, err error) { + modifiedStr := strings.ReplaceAll(ctype, "application/Pamir Viewer", "application/PamirViewer") + + return ParseMediaType(modifiedStr) + } + + invalidMessageContent := `From: +Content-Type: multipart/mixed; + boundary="----=_NextPart_000_000F_01D9FAC6.09EB3B60" + +------=_NextPart_000_000F_01D9FAC6.09EB3B60 +Content-Type: application/Pamir Viewer; + name="2023-10-13.pmrv" +Content-Transfer-Encoding: base64 +Content-Disposition: attachment; + filename="2023-10-13.pmrv" + +f6En7vFpNql3tfMkoKABP1iBEf+M/qF6LCAIvyRbpH6uDCqcKKGmH3e6OiqN5eCfqUk= +` + + p := NewParser(SetCustomParseMediaType(replaceSpecificContentType)) + e, err := p.ReadEnvelope(strings.NewReader(invalidMessageContent)) + + fmt.Println(err) + fmt.Println(len(e.Attachments)) + fmt.Println(e.Attachments[0].ContentType) + fmt.Println(e.Attachments[0].FileName) + + // Output: + // + // 1 + // application/pamirviewer + // 2023-10-13.pmrv +} diff --git a/parser.go b/parser.go index e3ade8d9..4894fbf1 100644 --- a/parser.go +++ b/parser.go @@ -13,6 +13,9 @@ func AllowCorruptTextPartErrorPolicy(p *Part, err error) bool { return false } +// CustomParseMediaType parses media type. 
See ParseMediaType for more details +type CustomParseMediaType func(ctype string) (mtype string, params map[string]string, invalidParams []string, err error) + // Parser parses MIME. // Default parser is a valid one. type Parser struct { @@ -21,6 +24,7 @@ type Parser struct { readPartErrorPolicy ReadPartErrorPolicy skipMalformedParts bool rawContent bool + customParseMediaType CustomParseMediaType } // defaultParser is a Parser with default configuration. diff --git a/part.go b/part.go index 3989407c..4649fe5f 100644 --- a/part.go +++ b/part.go @@ -15,8 +15,6 @@ import ( "github.com/gogs/chardet" "github.com/jhillyerd/enmime/internal/coding" inttp "github.com/jhillyerd/enmime/internal/textproto" - "github.com/jhillyerd/enmime/mediatype" - "github.com/pkg/errors" ) @@ -126,7 +124,7 @@ func (p *Part) setupHeaders(r *bufio.Reader, defaultContentType string) error { ctype = defaultContentType } // Parse Content-Type header. - mtype, mparams, minvalidParams, err := mediatype.Parse(ctype) + mtype, mparams, minvalidParams, err := p.parseMediaType(ctype) if err != nil { return err } @@ -149,7 +147,7 @@ func (p *Part) setupHeaders(r *bufio.Reader, defaultContentType string) error { func (p *Part) setupContentHeaders(mediaParams map[string]string) { header := inttp.MIMEHeader(p.Header) // Determine content disposition, filename, character set. 
- disposition, dparams, _, err := mediatype.Parse(header.Get(hnContentDisposition)) + disposition, dparams, _, err := p.parseMediaType(header.Get(hnContentDisposition)) if err == nil { // Disposition is optional p.Disposition = disposition @@ -327,6 +325,15 @@ func (p *Part) decodeContent(r io.Reader, readPartErrorPolicy ReadPartErrorPolic return nil } +// parseMediaType parses ctype with the parser's custom media type parser when one is set, and with the default ParseMediaType otherwise. +func (p *Part) parseMediaType(ctype string) (mtype string, params map[string]string, invalidParams []string, err error) { + if p.parser == nil || p.parser.customParseMediaType == nil { + return ParseMediaType(ctype) + } + + return p.parser.customParseMediaType(ctype) +} + // IsBase64CorruptInputError returns true when err is of type base64.CorruptInputError. // // It can be used to create ReadPartErrorPolicy functions.