Skip to content

Commit

Permalink
normalize certain known referrers that have multiple domains under a …
Browse files Browse the repository at this point in the history
…single host. see #43
  • Loading branch information
dannyvankooten committed Jan 22, 2020
1 parent fa492f6 commit cd4a743
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 5 deletions.
21 changes: 18 additions & 3 deletions src/class-aggregator.php
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@ public function aggregate() {
// increment referrals
if ( $referrer_url !== '' && ! $this->in_blacklist( $referrer_url, $blacklist ) ) {

$referrer_url = $this->sanitize_url( $referrer_url );
$referrer_url = $this->clean_url( $referrer_url );
$referrer_url = $this->normalize_url( $referrer_url );

if ( ! isset( $referrer_stats[ $referrer_url ] ) ) {
$referrer_stats[ $referrer_url ] = array(
Expand Down Expand Up @@ -202,8 +203,7 @@ private function in_blacklist( $url, array $blacklist ) {
return false;
}

public function sanitize_url( $url ) {
$whitelisted_params = array( 'page_id', 'p', 'cat', 'product' );
public function clean_url( $url ) {

// remove # from URL
$url = preg_replace( '/#.*$/', '', $url );
Expand All @@ -216,6 +216,8 @@ public function sanitize_url( $url ) {
$params = array();
parse_str( $query_str, $params );

// strip all non-whitelisted params from url
$whitelisted_params = array( 'page_id', 'p', 'cat', 'product' );
$new_params = array_intersect_key( $params, array_flip( $whitelisted_params ) );
$new_query_str = http_build_query( $new_params );
$new_url = substr( $url, 0, $pos + 1 ) . $new_query_str;
Expand All @@ -230,4 +232,17 @@ public function sanitize_url( $url ) {
return $url;
}

/**
 * Collapses well-known referrer URLs onto a single canonical URL per site, so
 * that e.g. every Google search result page is counted as one referrer.
 *
 * Every pattern is anchored at the start of a domain label. Without that
 * anchor the short sub-domain alternatives ("m", "i", "www") also matched the
 * tail of a longer label, which corrupted unrelated hosts:
 * "https://im.baidu.com/x" used to become "https://iwww.baidu.com".
 *
 * @param string $url Referrer URL to normalize.
 * @return string The normalized URL, or the unmodified input if no rule
 *                matched or the regex engine reported an error.
 */
public function normalize_url( $url ) {
    // Negative lookbehind: the preceding character may not be part of a
    // host label, i.e. the match has to start right after "//" or a ".".
    $label_start = '(?<![\w-])';

    $aggregations = array(
        '/' . $label_start . '(google|bing)\.([a-z]{2,3}(?:\.[a-z]{2,3})?)\/(?:search|url)/' => '$1.$2',
        '/' . $label_start . '(?:i|m)\.facebook\.com/' => 'facebook.com',
        '/' . $label_start . 'pinterest\.com\/pin\/.*/' => 'pinterest.com',
        '/' . $label_start . 'linkedin\.com\/feed.*/' => 'linkedin.com',
        '/' . $label_start . '(?:www|m)\.baidu\.com.*/' => 'www.baidu.com',
        '/' . $label_start . 'yandex\.ru\/clck.*/' => 'yandex.ru',
    );

    $normalized = preg_replace( array_keys( $aggregations ), array_values( $aggregations ), $url );

    // preg_replace() returns null on a PCRE runtime error; fall back to the
    // original URL instead of handing null to the caller.
    return $normalized === null ? $url : $normalized;
}

}
27 changes: 25 additions & 2 deletions tests/AggregatorTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

final class AggregatorTest extends TestCase
{
public function testSanitizeUrl() : void
public function test_clean_url() : void
{
$a = new Aggregator();

Expand All @@ -18,7 +18,30 @@ public function testSanitizeUrl() : void
];

foreach ($tests as $input => $output) {
$this->assertEquals($output, $a->sanitize_url($input));
$this->assertEquals($output, $a->clean_url($input));
}
}

/**
 * Checks normalize_url() against a table of referrer URLs, keyed by the raw
 * referrer and valued by the URL it is expected to be reduced to.
 */
public function test_normalize_url() : void
{
    // raw referrer => expected normalized referrer
    $expectations = [
        // URLs without a matching rule pass through untouched
        'https://wordpress.org/plugins/koko-analytics/' => 'https://wordpress.org/plugins/koko-analytics/',

        // search engines, including a two-part TLD
        'https://www.google.com/search' => 'https://www.google.com',
        'https://www.google.co.uk/search' => 'https://www.google.co.uk',
        'https://www.google.nl/url' => 'https://www.google.nl',

        // social networks
        'https://m.facebook.com' => 'https://facebook.com',
        'https://m.facebook.com/profile/whatever' => 'https://facebook.com/profile/whatever',
        'https://pinterest.com/pin/foobar' => 'https://pinterest.com',
        'https://www.linkedin.com/feed' => 'https://www.linkedin.com',
        'https://www.linkedin.com/feed/update/urn:li:activity:6620280880285921280' => 'https://www.linkedin.com',

        // baidu and yandex redirect pages
        'https://www.baidu.com/link' => 'https://www.baidu.com',
        'https://m.baidu.com/from=844b/bd_page_type=1/ssid=98c26c6f6e676d65697869620b/uid=0/pu=usm%402%2Csz%40320_1001%2Cta%40iphone_2_9.0_24_79.0/baiduid=B24A174BB75A8A37CEA414106EC583CB/w=0_10_/t=iphone/l=1/tc' => 'https://www.baidu.com',
        'https://yandex.ru/clck/jsredir' => 'https://yandex.ru',
    ];

    $aggregator = new Aggregator();

    foreach ($expectations as $referrer => $expected) {
        $actual = $aggregator->normalize_url($referrer);
        $this->assertEquals($expected, $actual);
    }
}
}

0 comments on commit cd4a743

Please sign in to comment.