user/user.go

// Copyright 2016 The Upspin Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package user provides tools for parsing and validating user names.
package user // import "upspin.io/user"

import (
	"strings"

	"golang.org/x/text/secure/precis"

	"upspin.io/errors"
	"upspin.io/upspin"
)

// Parse splits an upspin.UserName into its user and domain components and
// returns them. It also returns the "+" suffix of the user component, if
// there is one. For example, given the user name
//	ann+backup@example.com
// it returns the strings
// 	"ann+backup" "backup" "example.com"
//
// Parse validates the name as an e-mail address and lower-cases the domain
// so that the result is canonical.
//
// The rules are:
//
// 	<name> := <user name>@<domain name>
//
// 	<domain name> :=
//
// 	- each . separated token < 64 characters
// 	- character set for tokens [a-z0-9\-] (okDomainChar currently lets "+" through too)
// 	- final token at least two characters
// 	- whole name < 254 characters
// 	- characters are case insensitive
// 	- final period is OK, but we remove it
//
// We ignore the rules of punycode, which is defined in https://tools.ietf.org/html/rfc3490 .
//
// 	<user name> :=
//
// Names are validated and canonicalized by the UsernameCasePreserved profile
// of RFC 7613, "Preparation, Enforcement, and Comparison of Internationalized
// Strings", also known as PRECIS.
//
// Further restrictions are added here. The only ASCII punctuation characters
// that are legal are "!#.$%&'*+-/=?^_{|}~", and a name that consists solely of
// ASCII punctuation is rejected.
//
// As a special case for use in Access and Group files, the user component "*"
// is allowed and is returned unchanged with an empty suffix.
//
// Case is significant and spaces are not allowed.
//
// The username suffix is tightly constrained: it uses the same character
// set as domains, but of course the spacing of periods is irrelevant.
//
// Facebook and Google constrain usernames to [a-zA-Z0-9+-.],
// ignoring the period and, in Google only, ignoring everything
// from a plus sign onwards. We accept a superset of this but do not
// follow the "ignore" rules.
func Parse(userName upspin.UserName) (user, suffix, domain string, err error) {
	const op = errors.Op("user.Parse")
	full := string(userName)
	if len(userName) >= 254 {
		return "", "", "", errors.E(op, errors.Invalid, userName, "name too long")
	}
	if strings.Count(full, "@") != 1 {
		return "", "", "", errors.E(op, errors.Invalid, userName, errors.Str("user name must contain one @ symbol"))
	}
	// Exactly one '@' is present, so this index is always valid.
	at := strings.IndexByte(full, '@')
	user = full[:at]
	// The wildcard "*" is an important special case that bypasses
	// user-name validation entirely; everything else is checked.
	if user != "*" {
		if user, suffix, err = parseUser(op, userName, user); err != nil {
			return "", "", "", err
		}
	}
	if domain, err = parseDomain(op, userName, full[at+1:]); err != nil {
		return "", "", "", err
	}
	return user, suffix, domain, nil
}

// ParseUser parses the component of a user name before the '@', that is, the
// user component of an email address. The rules are defined in the
// documentation for Parse except that "*" is not a valid user and the user name
// itself must be less than 255 bytes long.
func ParseUser(user string) (userName, suffix string, err error) {
	return parseUser(errors.Op("user.ParseUser"), upspin.UserName(user), user)
}

// parseUser is the implementation of ParseUser, also called by Parse.
// It takes the full UserName as well as the user component, to aid in error reporting.
//
// On success it returns the canonicalized user component and the text after
// its "+" (empty if there is no "+"). On failure both strings are empty and
// the error has kind errors.Invalid.
func parseUser(op errors.Op, userName upspin.UserName, user string) (string, string, error) {
	if len(user) >= 255 {
		return errParseUser(op, userName, "user name too long")
	}
	if user == "" {
		return errParseUser(op, userName, "missing user name")
	}
	// Check first because PRECIS dislikes + at end of string.
	if strings.IndexByte(user, '+') == len(user)-1 {
		return errParseUser(op, userName, "empty +suffix in user name")
	}
	// Validate and canonicalize the user name - and maybe suffix, but
	// the suffix is checked more thoroughly below. We include the suffix
	// here because PRECIS will prevent things like "+" or "ann+" or
	// "+ann" as the full name. That is, we do PRECIS validation on
	// the full user+suffix.
	user, err := canonicalize(user)
	if err != nil {
		// Report the full name, as every other error path here does.
		return "", "", errors.E(op, errors.Invalid, userName, err)
	}
	// Locate the + in the canonical form, not the input: canonicalization
	// may return a string with different byte offsets, and slicing it with
	// an index taken from the input could yield the wrong suffix or panic.
	plus := strings.IndexByte(user, '+')
	if plus < 0 {
		return user, "", nil
	}
	if plus == 0 {
		return errParseUser(op, userName, "user name cannot start with +suffix")
	}
	suffix := user[plus+1:]
	if suffix == "" {
		return errParseUser(op, userName, "empty +suffix in user name")
	}
	// Any further + is an error, including one immediately after the first.
	if strings.IndexByte(suffix, '+') >= 0 {
		return errParseUser(op, userName, "multiple +suffixes in user name")
	}
	for _, c := range suffix {
		if !okDomainChar(c) {
			return errParseUser(op, userName, "bad symbol in +suffix")
		}
	}
	return user, suffix, nil
}

// ParseDomain parses the component of a user name after the '@', that is, the
// domain component of an email address. The rules are defined in the
// documentation for Parse except the domain name itself must be less than 255
// bytes long.
func ParseDomain(domain string) (string, error) {
	return parseDomain(errors.Op("user.ParseDomain"), upspin.UserName(domain), domain)
}

// parseDomain is the implementation of ParseDomain, also called by Parse.
// It takes the full UserName as well as the domain component, to aid in error reporting.
//
// On success it returns the domain with any single trailing period removed
// and upper-case ASCII letters lower-cased. On failure it returns an empty
// string and an error of kind errors.Invalid.
func parseDomain(op errors.Op, userName upspin.UserName, domain string) (string, error) {
	if len(domain) >= 255 {
		return errParseDomain(op, userName, "domain name too long")
	}
	// Final period in domain is legal but is dropped.
	domain = strings.TrimSuffix(domain, ".")
	if domain == "" {
		return errParseDomain(op, userName, "missing domain name")
	}
	if !strings.Contains(domain, ".") {
		return errParseDomain(op, userName, "domain name must contain a period")
	}
	// Valid domain name?
	period := -1 // First time through loop will fail if first byte is a period.
	isUpper := false
	for i, c := range domain {
		if !okDomainChar(c) {
			return errParseDomain(op, userName, "bad symbol in domain name")
		}
		if c == '.' {
			// The element between the previous period and this one must
			// be non-empty and shorter than 64 bytes.
			if i-1 == period || i-1 >= period+64 {
				return errParseDomain(op, userName, "invalid domain name element")
			}
			period = i
		}
		if 'A' <= c && c <= 'Z' {
			isUpper = true
		}
	}
	// The loop only measures elements that are followed by a period, so
	// the final element is checked here.
	last := len(domain) - period - 1
	// Last domain element must be at least two bytes (".co")
	if last < 2 {
		return errParseDomain(op, userName, "invalid domain name")
	}
	// ... and, like every other element, shorter than 64 bytes.
	if last >= 64 {
		return errParseDomain(op, userName, "invalid domain name element")
	}
	// Lower-case the domain name if necessary.
	if isUpper {
		domain = strings.ToLower(domain)
	}
	return domain, nil
}

// errParseUser builds the failure result for parseUser: two empty strings and
// an errors.Invalid error carrying op, the full user name and msg.
func errParseUser(op errors.Op, userName upspin.UserName, msg string) (u, s string, err error) {
	err = errors.E(op, errors.Invalid, userName, msg)
	return "", "", err
}

// errParseDomain builds the failure result for parseDomain: an empty string
// and an errors.Invalid error carrying op, the full user name and msg.
func errParseDomain(op errors.Op, userName upspin.UserName, msg string) (d string, err error) {
	err = errors.E(op, errors.Invalid, userName, msg)
	return "", err
}

// canonicalize validates the user component of a name and returns its
// canonical form. It rejects names containing ASCII punctuation we forbid and
// names made up solely of permitted ASCII punctuation. Names consisting only
// of ASCII letters and digits are returned as is; anything else is handed to
// the PRECIS UsernameCasePreserved profile.
func canonicalize(user string) (string, error) {
	// PRECIS allows any ASCII character, but we are more restrictive.
	// The ASCII scan is cheap and is almost always all that is needed.
	onlyPunct, needsPRECIS := true, false
	for _, r := range user {
		if illegalASCIIPunctuation(r) {
			return "", errors.Errorf("illegal character %q", r)
		}
		onlyPunct = onlyPunct && legalASCIIPunctuation(r)
		needsPRECIS = needsPRECIS || !simpleUserNameChar(r)
	}
	switch {
	case onlyPunct:
		return "", errors.Errorf("user name contains only punctuation")
	case needsPRECIS:
		return precis.UsernameCasePreserved.String(user)
	}
	return user, nil
}

// simpleUserNameChar reports whether r is an ASCII letter or digit.
// canonicalize uses it to spot names that need no PRECIS processing.
// Punctuation is deliberately not treated as simple: identifiers allow
// punctuation only in certain places, so PRECIS is left to judge it.
func simpleUserNameChar(r rune) bool {
	lower := 'a' <= r && r <= 'z'
	upper := 'A' <= r && r <= 'Z'
	digit := '0' <= r && r <= '9'
	return lower || upper || digit
}

// illegalASCIIPunctuation reports whether r is an ASCII punctuation character
// (or space) that PRECIS would accept but that we forbid within a user name.
// The '@' is in the set because only the user part is examined here, never
// the domain name.
func illegalASCIIPunctuation(r rune) bool {
	const forbidden = " @\"(),:;<>[\\]`"
	return strings.IndexRune(forbidden, r) >= 0
}

// legalASCIIPunctuation reports whether r is one of the ASCII punctuation
// characters that we permit within a user name.
func legalASCIIPunctuation(r rune) bool {
	const permitted = "!#.$%&'*+-/=?^_{|}~"
	return strings.IndexRune(permitted, r) >= 0
}

// okDomainChar reports whether r may appear in a domain name or in a
// user-name "+" suffix: an ASCII letter, an ASCII digit, or one of "+-.".
// See the documentation for Parse for the rules this supports.
func okDomainChar(r rune) bool {
	if ('a' <= r && r <= 'z') || ('A' <= r && r <= 'Z') || ('0' <= r && r <= '9') {
		return true
	}
	return strings.ContainsRune("+-.", r)
}

// Clean returns userName in the canonical form described in the documentation
// for Parse. It returns an error if Parse rejects the name. When the name is
// already canonical the argument is returned unchanged.
func Clean(userName upspin.UserName) (upspin.UserName, error) {
	user, _, domain, err := Parse(userName)
	if err != nil {
		return "", err
	}
	// Compare the parsed pieces with the input first, so that a name that
	// is already canonical is returned without building a new string.
	// Parse succeeded, so the name is known to contain an '@'.
	raw := string(userName)
	at := strings.IndexByte(raw, '@')
	unchanged := raw[:at] == user && raw[at+1:] == domain
	if !unchanged {
		return upspin.UserName(user + "@" + domain), nil
	}
	return userName, nil
}