Permalink
Find file
9e47576 Oct 11, 2010
405 lines (303 sloc) 12.2 KB
<?php
#
# RFC 822/2822/5322 Email Parser
#
# By Cal Henderson <cal@iamcal.com>
#
# This code is dual licensed:
# CC Attribution-ShareAlike 2.5 - http://creativecommons.org/licenses/by-sa/2.5/
# GPLv3 - http://www.gnu.org/copyleft/gpl.html
#
# $Revision$
#
##################################################################################
#
# Validate an email address (a bare addr-spec such as 'user@example.com')
# against the RFC 822/2822/5322 grammar, plus a few practical restrictions
# (overall/segment length limits, domain-literal IP checks, DNS label rules).
#
# $email   - the address to check.
# $options - optional array of named flags; anything not supplied (or null)
#            falls back to the default:
#              'allow_comments'  (default true)  collapse nested comments before
#                                                matching, and strip comments from
#                                                the local part and domain before
#                                                the length/label/IP checks.
#              'public_internet' (default true)  also reject single-label domains
#                                                (e.g. user@localhost) and domains
#                                                whose last label is all digits.
#
# Returns 1 if the address is accepted, 0 otherwise.
#
function is_valid_email_address($email, $options=array()){

	#
	# you can pass a few different named options as a second argument,
	# but the defaults are usually a good choice.
	#

	$defaults = array(
		'allow_comments'	=> true,
		'public_internet'	=> true, # turn this off for 'strict' mode
	);

	$opts = array();
	foreach ($defaults as $k => $v){
		$opts[$k] = isset($options[$k]) ? $options[$k] : $v;
	}
	$options = $opts;


	####################################################################################
	#
	# NO-WS-CTL       =       %d1-8 /         ; US-ASCII control characters
	#                         %d11 /          ;  that do not include the
	#                         %d12 /          ;  carriage return, line feed,
	#                         %d14-31 /       ;  and white space characters
	#                         %d127
	# ALPHA          =  %x41-5A / %x61-7A   ; A-Z / a-z
	# DIGIT          =  %x30-39

	$no_ws_ctl	= "[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x7f]";
	$alpha		= "[\\x41-\\x5a\\x61-\\x7a]";
	$digit		= "[\\x30-\\x39]";
	$cr		= "\\x0d";
	$lf		= "\\x0a";
	$crlf		= "(?:$cr$lf)";


	####################################################################################
	#
	# obs-char        =       %d0-9 / %d11 /          ; %d0-127 except CR and
	#                         %d12 / %d14-127         ;  LF
	# obs-text        =       *LF *CR *(obs-char *LF *CR)
	# text            =       %d1-9 /         ; Characters excluding CR and LF
	#                         %d11 /
	#                         %d12 /
	#                         %d14-127 /
	#                         obs-text
	# obs-qp          =       "\" (%d0-127)
	# quoted-pair     =       ("\" text) / obs-qp
	#
	# there's an issue with the RFC definition of 'text': 'obs-text' can be
	# blank, and that allows quoted-pairs with no character after the slash.
	# we're treating that as bad, so 'text' is not translated literally -
	# instead it requires at least one (non-CRLF) character.
	#

	$obs_char	= "[\\x00-\\x09\\x0b\\x0c\\x0e-\\x7f]";
	$text		= "(?:$lf*$cr*$obs_char$lf*$cr*)";
	$obs_qp		= "(?:\\x5c[\\x00-\\x7f])";
	$quoted_pair	= "(?:\\x5c$text|$obs_qp)";


	####################################################################################
	#
	# obs-FWS         =       1*WSP *(CRLF 1*WSP)
	# FWS             =       ([*WSP CRLF] 1*WSP) /   ; Folding white space
	#                         obs-FWS
	# ctext           =       NO-WS-CTL /     ; Non white space controls
	#                         %d33-39 /       ; The rest of the US-ASCII
	#                         %d42-91 /       ;  characters not including "(",
	#                         %d93-126        ;  ")", or "\"
	# ccontent        =       ctext / quoted-pair / comment
	# comment         =       "(" *([FWS] ccontent) [FWS] ")"
	# CFWS            =       *([FWS] comment) (([FWS] comment) / FWS)
	#
	# note: we translate ccontent only partially to avoid an infinite loop.
	# instead, we'll recursively strip *nested* comments before processing
	# the input. that will leave 'plain old comments' to be matched during
	# the main parse.
	#

	$wsp		= "[\\x20\\x09]";
	$obs_fws	= "(?:$wsp+(?:$crlf$wsp+)*)";
	$fws		= "(?:(?:(?:$wsp*$crlf)?$wsp+)|$obs_fws)";
	$ctext		= "(?:$no_ws_ctl|[\\x21-\\x27\\x2A-\\x5b\\x5d-\\x7e])";
	$ccontent	= "(?:$ctext|$quoted_pair)";
	$comment	= "(?:\\x28(?:$fws?$ccontent)*$fws?\\x29)";
	$cfws		= "(?:(?:$fws?$comment)*(?:$fws?$comment|$fws))";

	#
	# these are the rules for removing *nested* comments. we'll just detect
	# an outer comment and replace it with an empty comment, and recurse until
	# we stop.
	#

	$outer_ccontent_dull	= "(?:$fws?$ctext|$quoted_pair)";
	$outer_ccontent_nest	= "(?:$fws?$comment)";
	$outer_comment		= "(?:\\x28$outer_ccontent_dull*(?:$outer_ccontent_nest$outer_ccontent_dull*)+$fws?\\x29)";


	####################################################################################
	#
	# atext           =       ALPHA / DIGIT / ; Any character except controls,
	#                         "!" / "#" /     ;  SP, and specials.
	#                         "$" / "%" /     ;  Used for atoms
	#                         "&" / "'" /
	#                         "*" / "+" /
	#                         "-" / "/" /
	#                         "=" / "?" /
	#                         "^" / "_" /
	#                         "`" / "{" /
	#                         "|" / "}" /
	#                         "~"
	# atom            =       [CFWS] 1*atext [CFWS]

	$atext		= "(?:$alpha|$digit|[\\x21\\x23-\\x27\\x2a\\x2b\\x2d\\x2f\\x3d\\x3f\\x5e\\x5f\\x60\\x7b-\\x7e])";
	$atom		= "(?:$cfws?(?:$atext)+$cfws?)";


	####################################################################################
	#
	# qtext           =       NO-WS-CTL /     ; Non white space controls
	#                         %d33 /          ; The rest of the US-ASCII
	#                         %d35-91 /       ;  characters not including "\"
	#                         %d93-126        ;  or the quote character
	# qcontent        =       qtext / quoted-pair
	# quoted-string   =       [CFWS]
	#                         DQUOTE *([FWS] qcontent) [FWS] DQUOTE
	#                         [CFWS]
	# word            =       atom / quoted-string
	#
	# note: the RFC's '*' repetition inside quoted-string is deliberately
	# translated as '+', to require that quoted strings are not empty.
	#

	$qtext		= "(?:$no_ws_ctl|[\\x21\\x23-\\x5b\\x5d-\\x7e])";
	$qcontent	= "(?:$qtext|$quoted_pair)";
	$quoted_string	= "(?:$cfws?\\x22(?:$fws?$qcontent)+$fws?\\x22$cfws?)";
	$word		= "(?:$atom|$quoted_string)";


	####################################################################################
	#
	# obs-local-part  =       word *("." word)
	# obs-domain      =       atom *("." atom)

	$obs_local_part	= "(?:$word(?:\\x2e$word)*)";
	$obs_domain	= "(?:$atom(?:\\x2e$atom)*)";


	####################################################################################
	#
	# dot-atom-text   =       1*atext *("." 1*atext)
	# dot-atom        =       [CFWS] dot-atom-text [CFWS]

	$dot_atom_text	= "(?:$atext+(?:\\x2e$atext+)*)";
	$dot_atom	= "(?:$cfws?$dot_atom_text$cfws?)";


	####################################################################################
	#
	# domain-literal  =       [CFWS] "[" *([FWS] dcontent) [FWS] "]" [CFWS]
	# dcontent        =       dtext / quoted-pair
	# dtext           =       NO-WS-CTL /     ; Non white space controls
	#
	#                         %d33-90 /       ; The rest of the US-ASCII
	#                         %d94-126        ;  characters not including "[",
	#                                         ;  "]", or "\"

	$dtext		= "(?:$no_ws_ctl|[\\x21-\\x5a\\x5e-\\x7e])";
	$dcontent	= "(?:$dtext|$quoted_pair)";
	$domain_literal	= "(?:$cfws?\\x5b(?:$fws?$dcontent)*$fws?\\x5d$cfws?)";


	####################################################################################
	#
	# local-part      =       dot-atom / quoted-string / obs-local-part
	# domain          =       dot-atom / domain-literal / obs-domain
	# addr-spec       =       local-part "@" domain
	#
	# these are the only capturing groups in the whole pattern: 1-4 belong to
	# the local part, 5-8 to the domain.
	#

	$local_part	= "(($dot_atom)|($quoted_string)|($obs_local_part))";
	$domain		= "(($dot_atom)|($domain_literal)|($obs_domain))";
	$addr_spec	= "$local_part\\x40$domain";


	#
	# this was previously 256 based on RFC3696, but dominic's errata was accepted.
	#

	if (strlen($email) > 254) return 0;


	#
	# we need to strip nested comments first - we replace them with a simple comment
	#

	if ($options['allow_comments']){

		$email = email_strip_comments($outer_comment, $email, "(x)");

		# nothing left to match (null input, or the stripper gave up)
		if ($email === null) return 0;
	}


	#
	# now match what's left. the 'D' modifier makes '$' match only at the very
	# end of the subject - without it, '$' also matches just before a trailing
	# newline, so "user@example.com\n" would be accepted.
	#

	if (!preg_match('!^' . $addr_spec . '$!D', $email, $m)){

		return 0;
	}

	$bits = array(
		'local'			=> isset($m[1]) ? $m[1] : '',
		'local-atom'		=> isset($m[2]) ? $m[2] : '',
		'local-quoted'		=> isset($m[3]) ? $m[3] : '',
		'local-obs'		=> isset($m[4]) ? $m[4] : '',
		'domain'		=> isset($m[5]) ? $m[5] : '',
		'domain-atom'		=> isset($m[6]) ? $m[6] : '',
		'domain-literal'	=> isset($m[7]) ? $m[7] : '',
		'domain-obs'		=> isset($m[8]) ? $m[8] : '',
	);


	#
	# we need to now strip comments from $bits[local] and $bits[domain],
	# since we know they're in the right place and we want them out of the
	# way for checking IPs, label sizes, etc
	#

	if ($options['allow_comments']){

		$bits['local']	= email_strip_comments($comment, $bits['local']);
		$bits['domain']	= email_strip_comments($comment, $bits['domain']);

		# fail closed if stripping did not hand back a string (PCRE error)
		if (!is_string($bits['local']) || !is_string($bits['domain'])) return 0;
	}


	#
	# length limits on segments
	#

	if (strlen($bits['local']) > 64) return 0;
	if (strlen($bits['domain']) > 255) return 0;


	#
	# restrictions on domain-literals from RFC2821 section 4.1.3
	#
	# RFC4291 changed the meaning of :: in IPv6 addresses - it can mean one or
	# more zero groups (updated from 2 or more).
	#

	if (strlen($bits['domain-literal'])){

		$Snum			= '(\d{1,3})';
		$IPv4_address_literal	= "$Snum\.$Snum\.$Snum\.$Snum";

		$IPv6_hex		= "(?:[0-9a-fA-F]{1,4})";

		$IPv6_full		= "IPv6\:$IPv6_hex(?:\:$IPv6_hex){7}";

		$IPv6_comp_part		= "(?:$IPv6_hex(?:\:$IPv6_hex){0,7})?";
		$IPv6_comp		= "IPv6\:($IPv6_comp_part\:\:$IPv6_comp_part)";

		$IPv6v4_full		= "IPv6\:$IPv6_hex(?:\:$IPv6_hex){5}\:$IPv4_address_literal";

		$IPv6v4_comp_part	= "$IPv6_hex(?:\:$IPv6_hex){0,5}";
		$IPv6v4_comp		= "IPv6\:((?:$IPv6v4_comp_part)?\:\:(?:$IPv6v4_comp_part\:)?)$IPv4_address_literal";

		# every literal form is anchored between the square brackets
		$open	= '!^\[';
		$close	= '\]$!D';

		if (preg_match($open . $IPv4_address_literal . $close, $bits['domain'], $m)){

			#
			# IPv4 is simple - groups 1-4 are the octets
			#

			foreach (array_slice($m, 1, 4) as $octet){
				if (intval($octet) > 255) return 0;
			}

		}elseif (preg_match($open . $IPv6_full . $close, $bits['domain'])){

			#
			# all 8 groups written out - nothing more to check
			#

		}elseif (preg_match($open . $IPv6_comp . $close, $bits['domain'], $m)){

			#
			# compressed IPv6 - the '::' stands for at least one group, so
			# at most 7 groups may be written out
			#

			list($a, $b) = explode('::', $m[1]);
			$folded = (strlen($a) && strlen($b)) ? "$a:$b" : "$a$b";
			$groups = explode(':', $folded);
			if (count($groups) > 7) return 0;

		}elseif (preg_match($open . $IPv6v4_full . $close, $bits['domain'], $m)){

			#
			# 6 hex groups followed by an IPv4 address - groups 1-4 are the octets
			#

			foreach (array_slice($m, 1, 4) as $octet){
				if (intval($octet) > 255) return 0;
			}

		}elseif (preg_match($open . $IPv6v4_comp . $close, $bits['domain'], $m)){

			#
			# compressed hex groups followed by an IPv4 address - group 1 is
			# the hex part, groups 2-5 are the octets
			#

			list($a, $b) = explode('::', $m[1]);
			$b = substr($b, 0, -1); # remove the trailing colon before the IPv4 address
			$folded = (strlen($a) && strlen($b)) ? "$a:$b" : "$a$b";
			$groups = explode(':', $folded);
			if (count($groups) > 5) return 0;

			# the octets need the same range check as every other IPv4 form
			foreach (array_slice($m, 2, 4) as $octet){
				if (intval($octet) > 255) return 0;
			}

		}else{

			#
			# not a literal form we recognise
			#

			return 0;
		}

	}else{

		#
		# the domain is either dot-atom or obs-domain - either way, it's
		# made up of simple labels and we split on dots
		#

		$labels = explode('.', $bits['domain']);


		#
		# this is allowed by both dot-atom and obs-domain, but is un-routeable on the
		# public internet, so we'll fail it (e.g. user@localhost)
		#

		if ($options['public_internet']){
			if (count($labels) == 1) return 0;
		}


		#
		# checks on each label
		#

		foreach ($labels as $label){

			if (strlen($label) > 63) return 0;
			if (substr($label, 0, 1) === '-') return 0;
			if (substr($label, -1) === '-') return 0;
		}


		#
		# last label can't be all numeric
		#

		if ($options['public_internet']){
			if (preg_match('!^[0-9]+$!', $labels[count($labels) - 1])) return 0;
		}
	}

	return 1;
}
##################################################################################
#
# Repeatedly replace every match of a pattern in a string until a pass no
# longer changes the string's length.
#
# $comment - a PCRE pattern fragment (no delimiters); it is wrapped in '!'
#            delimiters here, so it must not contain a literal '!'.
# $email   - the text to process.
# $replace - the replacement for each match (default: remove the match).
#
# Returns the processed text. If PCRE reports an error (e.g. the backtrack
# limit is hit), the text as it stood before the failing pass is returned
# rather than feeding preg_replace()'s null back into the loop.
#
function email_strip_comments($comment, $email, $replace=''){

	while (1){

		$new = preg_replace("!$comment!", $replace, $email);

		# null means the regex engine gave up - stop with what we have
		if ($new === null){
			return $email;
		}

		# an unchanged length means nothing (more) was replaced
		if (strlen($new) == strlen($email)){
			return $email;
		}

		$email = $new;
	}
}
##################################################################################
?>