Skip to content

Commit

Permalink
playing with examples - added readability library for feature process
Browse files Browse the repository at this point in the history
  • Loading branch information
hasantayyar committed May 6, 2012
1 parent 65e9dbe commit fdfe3a5
Show file tree
Hide file tree
Showing 5 changed files with 1,212 additions and 28 deletions.
22 changes: 11 additions & 11 deletions examples/xsites-example.php → examples/example.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,38 +5,38 @@
*/

require_once(__DIR__.'/../autoload.php');
require_once(__DIR__.'/xsites-functions.php');
require_once(__DIR__.'/functions.php');

define('DICTIONARY_POLICY', Noop\Bayes\Tokenizer\Html::POLICY_METAS);
define('MATCH_POLICY', Noop\Bayes\Tokenizer\Html::POLICY_TEXTS | Noop\Bayes\Tokenizer\Html::POLICY_METAS | Noop\Bayes\Tokenizer\Html::POLICY_HEADERS | Noop\Bayes\Tokenizer\Html::POLICY_LINKS);

xsites_log('Loading dictionary');
printf('Loading dictionary');

$bayes_dic = xsites_get_dictionary();

xsites_log('Matching now');
printf('Matching now');

$tokenizer = new Noop\Bayes\Tokenizer\Html;
$tokenizer->setPolicy(MATCH_POLICY);

foreach (array_slice($dic, floor(count($dic) / 2)) as $site) {
$contents = xsites_get_site($site);
if ($contents == '' && strlen($contents) < 1000) {
//xsites_log('Site not responsible, skipping');
//printf('Site not responsible, skipping');
} else {
xsites_log('Matching "%s"', $site);
printf('Probability: %.6f'.PHP_EOL, $bayes_dic->match($tokenizer->tokenize($contents)));
printf("\nMatching \"%s\"", $site);
printf("\nProbability: %.6f".PHP_EOL, $bayes_dic->match($tokenizer->tokenize($contents)));
}
}

xsites_log('Matching common sites');
printf("\nMatching common sites");

foreach (array('rus.delfi.lv', 'youtube.com', 'google.com', 'wikipedia.com') as $site) {
foreach (array('dailyporno.tumblr.com','blogcu.com', 'youtube.com', 'google.com', 'wikipedia.com') as $site) {
$contents = xsites_get_site($site);
if ($contents == '' && strlen($contents) < 1000) {
//xsites_log('Site not responsible, skipping');
//printf('Site not responsible, skipping');
} else {
xsites_log('Matching "%s"', $site);
printf('Probability: %.6f'.PHP_EOL, $bayes_dic->match($tokenizer->tokenize($contents)));
printf("\nMatching '%s'", $site);
printf("\nProbability: %.6f".PHP_EOL, $bayes_dic->match($tokenizer->tokenize($contents)));
}
}
16 changes: 5 additions & 11 deletions examples/xsites-functions.php → examples/functions.php
Original file line number Diff line number Diff line change
@@ -1,14 +1,8 @@
<?php

$dic = file_get_contents(__DIR__.'/xsites.txt');
$dic = file_get_contents(__DIR__.'/sites.txt');
$dic = explode(PHP_EOL, $dic);

function xsites_log() {
print '--> ';
call_user_func_array('printf', func_get_args());
print PHP_EOL;
}

function xsites_get_dictionary() {
global $dic;

Expand All @@ -32,19 +26,19 @@ function xsites_get_dictionary() {
}

function xsites_get_site($url) {
$cache = sys_get_temp_dir() . '/xsite-cache-'.trim($url);
$cache = __DIR__.'/data/xsite-cache-'.trim($url);
if (is_readable($cache)) {
return file_get_contents($cache);
} else {
xsites_log('Caching site "%s" to "%s"', $url, $cache);

printf("\nCaching site \"%s\" to \"%s\"", $url, $cache);
$context = stream_context_create(array(
'http' => array(
'timeout' => 1 // Timeout in seconds
)
));
$contents = file_get_contents('http://'.$url, 0, $context);
ini_set('default_socket_timeout', 1);
$contents = file_get_contents('http://'.$url, 0, $context);
// if(strlen($contents)>0) will be better
file_put_contents($cache, $contents);
return $contents;
}
Expand Down
7 changes: 1 addition & 6 deletions examples/xsites.txt → examples/sites.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ www.orgasm.com
www.xvideos.comtagsporn
www.youjizz.com
xhamster.com
www.alphaporno.com
www.keezmovies.com
www.wildmaturemoms.com
www.slutload.com
Expand All @@ -23,9 +22,7 @@ www.hotpornshow.com
www.premiumhdv.com
indianpornvideos.com
www.porntube.com
elite-forum.org
www.hardsextube.com
www.bullporn.com
www.freefuckvidz.com
www.brazzers.com
www.persiankitty.com
Expand All @@ -36,7 +33,6 @@ www.freeporn.com
www.jizzbo.com
www.porn.to
www.pinkworld.com
porn.hu
www.shufuni.com
allrusamateurs.com
www.yobt.com
Expand All @@ -48,5 +44,4 @@ www.porn8.com
euroteenmovs.com
www.rawtube.com
www.porn.org
foodporndaily.com
www.empflix.com
foodporndaily.com
110 changes: 110 additions & 0 deletions examples/tools/readbility/JSLikeHtmlElement.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
<?php
/**
 * JavaScript-like HTML DOM Element
 *
 * This class extends PHP's DOMElement to allow
 * users to get and set the innerHTML property of
 * HTML elements in the same way it's done in
 * JavaScript.
 *
 * Example usage:
 * @code
 * require_once 'JSLikeHtmlElement.php';
 * header('Content-Type: text/plain');
 * $doc = new DOMDocument();
 * $doc->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
 * $doc->loadHTML('<div><p>Para 1</p><p>Para 2</p></div>');
 * $elem = $doc->getElementsByTagName('div')->item(0);
 *
 * // print innerHTML
 * echo $elem->innerHTML; // prints '<p>Para 1</p><p>Para 2</p>'
 * echo "\n\n";
 *
 * // set innerHTML
 * $elem->innerHTML = '<a href="http://fivefilters.org">FiveFilters.org</a>';
 * echo $elem->innerHTML; // prints '<a href="http://fivefilters.org">FiveFilters.org</a>'
 * echo "\n\n";
 *
 * // print document (with our changes)
 * echo $doc->saveXML();
 * @endcode
 *
 * @author Keyvan Minoukadeh - http://www.keyvan.net - keyvan@keyvan.net
 * @see http://fivefilters.org (the project this was written for)
 */
class JSLikeHTMLElement extends DOMElement
{
    /**
     * Used for setting innerHTML like it's done in JavaScript:
     * @code
     * $div->innerHTML = '<h2>Chapter 2</h2><p>The story begins...</p>';
     * @endcode
     *
     * The element is always emptied first. The new markup is then parsed as
     * well-formed XML; if that fails it is re-parsed with the lenient HTML
     * parser. If both parsers reject it, the element is left empty.
     *
     * Any property other than innerHTML raises an E_USER_NOTICE and is ignored.
     *
     * @param string $name  Property being written; only 'innerHTML' is supported.
     * @param mixed  $value New inner markup (cast to string; '' just empties the element).
     * @return void
     */
    public function __set($name, $value)
    {
        if ($name !== 'innerHTML') {
            $trace = debug_backtrace();
            trigger_error('Undefined property via __set(): '.$name.' in '.$trace[0]['file'].' on line '.$trace[0]['line'], E_USER_NOTICE);
            return;
        }

        // First, empty the element.
        while ($this->firstChild !== null) {
            $this->removeChild($this->firstChild);
        }

        $value = (string) $value;
        if ($value === '') {
            return;
        }

        // appendXML() expects well-formed markup (XHTML); @ suppresses the
        // PHP warnings it raises when the markup is not well-formed.
        $fragment = $this->ownerDocument->createDocumentFragment();
        if (@$fragment->appendXML($value)) {
            // Appending an empty fragment raises a warning, so check first.
            if ($fragment->hasChildNodes()) {
                $this->appendChild($fragment);
            }
            return;
        }

        // $value is probably ill-formed: fall back to the HTML parser.
        // Non-ASCII characters are turned into numeric entities so the parser
        // does not have to guess the input encoding. mb_encode_numericentity()
        // replaces the 'HTML-ENTITIES' pseudo-encoding of mb_convert_encoding(),
        // which is deprecated as of PHP 8.2; the parsed DOM is the same.
        $encoded = mb_encode_numericentity($value, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');

        // Using <htmlfragment> will generate a warning, but so will bad HTML
        // (and by this point, bad HTML is what we've got).
        // We use it (and suppress the warning) because an HTML fragment will
        // be wrapped in <html><body> tags which we don't really want to keep.
        // Note: despite the warning, if loadHTML succeeds it will return true.
        $html = new DOMDocument();
        if (!@$html->loadHTML('<htmlfragment>'.$encoded.'</htmlfragment>')) {
            // Oh well, we tried, we really did. This element is now empty.
            return;
        }

        $wrapper = $html->getElementsByTagName('htmlfragment')->item(0);
        if ($wrapper === null) {
            // The parser dropped our wrapper; nothing we can import.
            return;
        }

        // Nodes belong to the temporary document, so copy them into ours.
        foreach ($wrapper->childNodes as $child) {
            $this->appendChild($this->ownerDocument->importNode($child, true));
        }
    }

    /**
     * Used for getting innerHTML like it's done in JavaScript:
     * @code
     * $string = $div->innerHTML;
     * @endcode
     *
     * Any property other than innerHTML raises an E_USER_NOTICE.
     *
     * @param string $name Property being read; only 'innerHTML' is supported.
     * @return string|null Child nodes serialised with saveXML() and concatenated,
     *                     or null for an unsupported property.
     */
    public function __get($name)
    {
        if ($name === 'innerHTML') {
            $inner = '';
            foreach ($this->childNodes as $child) {
                $inner .= $this->ownerDocument->saveXML($child);
            }
            return $inner;
        }

        $trace = debug_backtrace();
        trigger_error('Undefined property via __get(): '.$name.' in '.$trace[0]['file'].' on line '.$trace[0]['line'], E_USER_NOTICE);
        return null;
    }

    /**
     * Short debugging representation: the tag name in square brackets, e.g. "[div]".
     *
     * @return string
     */
    public function __toString()
    {
        return '['.$this->tagName.']';
    }
}
?>
Loading

0 comments on commit fdfe3a5

Please sign in to comment.