From 279b317850753cab7b74e3fd5a5f5a40d424fd09 Mon Sep 17 00:00:00 2001 From: Jacob Andresen Date: Sat, 7 Jan 2012 13:49:05 +0100 Subject: [PATCH] cleaning --- README.md | 3 - Rakefile | 9 ++ cli/clear.php | 13 -- cli/configuration.php | 10 -- cli/crawl.php | 17 -- cli/index.php | 22 --- cli/search.php | 20 --- cli/setup.php | 27 ---- database/account.sql | 22 +-- database/content.sql | 69 ++++----- database/job.sql | 20 +++ php/YASE/Account.php | 109 ------------- php/YASE/Collection.php | 115 -------------- php/YASE/CollectionDomain.php | 52 ------- php/YASE/Configuration.php.example | 10 -- php/YASE/Crawler.php | 122 --------------- php/YASE/Document.php | 58 ------- php/YASE/Encoding.php | 36 ----- php/YASE/Framework.php | 19 --- php/YASE/HTMLRobot.php | 61 -------- php/YASE/HTTPClient.php | 172 --------------------- php/YASE/Indexer.php | 86 ----------- php/YASE/PDFRobot.php | 34 ---- php/YASE/Searcher.php | 48 ------ php/YASE/URL.php | 116 -------------- spec/javascripts/ApplicationSpec.js | 19 +++ spec/javascripts/CollectionGridSpec.js | 15 ++ spec/javascripts/JobWindowSpec.js | 15 ++ spec/javascripts/LoginWindowSpec.js | 7 + spec/javascripts/support/jasmine.yml | 73 +++++++++ spec/javascripts/support/jasmine_config.rb | 23 +++ spec/javascripts/support/jasmine_runner.rb | 32 ++++ test/php/AccountTest.php | 37 ----- test/php/BasicTestSuite.php | 30 ---- test/php/CollectionTest.php | 44 ------ test/php/CrawlerTest.php | 17 -- test/php/IndexerTest.php | 18 --- test/php/SearcherTest.php | 17 -- test/php/configuration.php | 10 -- web/configuration.php | 10 -- web/index.html | 15 -- web/token.php | 16 -- 42 files changed, 252 insertions(+), 1416 deletions(-) delete mode 100644 README.md create mode 100644 Rakefile delete mode 100644 cli/clear.php delete mode 100644 cli/configuration.php delete mode 100644 cli/crawl.php delete mode 100644 cli/index.php delete mode 100644 cli/search.php delete mode 100644 cli/setup.php create mode 100644 database/job.sql delete mode 
100644 php/YASE/Account.php delete mode 100644 php/YASE/Collection.php delete mode 100644 php/YASE/CollectionDomain.php delete mode 100644 php/YASE/Configuration.php.example delete mode 100644 php/YASE/Crawler.php delete mode 100644 php/YASE/Document.php delete mode 100644 php/YASE/Encoding.php delete mode 100644 php/YASE/Framework.php delete mode 100644 php/YASE/HTMLRobot.php delete mode 100644 php/YASE/HTTPClient.php delete mode 100644 php/YASE/Indexer.php delete mode 100644 php/YASE/PDFRobot.php delete mode 100644 php/YASE/Searcher.php delete mode 100644 php/YASE/URL.php create mode 100644 spec/javascripts/ApplicationSpec.js create mode 100644 spec/javascripts/CollectionGridSpec.js create mode 100644 spec/javascripts/JobWindowSpec.js create mode 100644 spec/javascripts/LoginWindowSpec.js create mode 100644 spec/javascripts/support/jasmine.yml create mode 100644 spec/javascripts/support/jasmine_config.rb create mode 100644 spec/javascripts/support/jasmine_runner.rb delete mode 100644 test/php/AccountTest.php delete mode 100644 test/php/BasicTestSuite.php delete mode 100644 test/php/CollectionTest.php delete mode 100644 test/php/CrawlerTest.php delete mode 100644 test/php/IndexerTest.php delete mode 100644 test/php/SearcherTest.php delete mode 100644 test/php/configuration.php delete mode 100644 web/configuration.php delete mode 100644 web/index.html delete mode 100644 web/token.php diff --git a/README.md b/README.md deleted file mode 100644 index 10d6947..0000000 --- a/README.md +++ /dev/null @@ -1,3 +0,0 @@ -a search engine framework in php - -Copyright 2011, Jacob Andresen diff --git a/Rakefile b/Rakefile new file mode 100644 index 0000000..d673004 --- /dev/null +++ b/Rakefile @@ -0,0 +1,9 @@ + +begin + require 'jasmine' + load 'jasmine/tasks/jasmine.rake' +rescue LoadError + task :jasmine do + abort "Jasmine is not available. 
In order to run jasmine, you must: (sudo) gem install jasmine" + end +end diff --git a/cli/clear.php b/cli/clear.php deleted file mode 100644 index ec907c7..0000000 --- a/cli/clear.php +++ /dev/null @@ -1,13 +0,0 @@ - -require_once("configuration.php"); -require_once("YASE/Framework.php"); - -mysql_query("delete from collection_domain") or die(mysql_error()); -mysql_query("delete from document") or die(mysql_error()); -mysql_query("delete from collection") or die(mysql_error()); -mysql_query("delete from field") or die(mysql_error()); -mysql_query("delete from token") or die(mysql_error()); -mysql_query("delete from account") or die (mysql_error()); - -?> diff --git a/cli/configuration.php b/cli/configuration.php deleted file mode 100644 index 234bb3a..0000000 --- a/cli/configuration.php +++ /dev/null @@ -1,10 +0,0 @@ - -define("MAX_CONTENT_LENGTH", 2000000); -define("MYSQL_HOST", "127.0.0.1"); -define("MYSQL_USER", "yase"); -define("MYSQL_PASSWORD", "yase"); -define("MYSQL_DATABASE", "yase"); -define("TMP_YASE", "/tmp/yase/"); -define("YASE_WEB", "http://localhost/yase"); -?> diff --git a/cli/crawl.php b/cli/crawl.php deleted file mode 100644 index 78b6bbc..0000000 --- a/cli/crawl.php +++ /dev/null @@ -1,17 +0,0 @@ - -require_once('configuration.php'); -require_once('YASE/Framework.php'); - -if (sizeof($argv) < 3 || $argv[0] == "" || $argv[1] == "") { - print "YASE crawler\r\n"; - print "usage:\r\n"; - print " crawl.php [userName] [password] \r\n"; - exit - 1; -} - -$account = Account::login($argv[1], $argv[2]); - -$crawler = new Crawler($account->collections[0]); -$crawler->start(); -?> diff --git a/cli/index.php b/cli/index.php deleted file mode 100644 index a9ffaf4..0000000 --- a/cli/index.php +++ /dev/null @@ -1,22 +0,0 @@ - -require_once('configuration.php'); -require_once('YASE/Framework.php'); - -if (!isset($argv[1])) { - $argv[1] = ""; -} -$account = $argv[1]; -if ($argv[1] == "") { - print "YASE indexer\r\n"; - print "usage:\r\n"; - print " index.php 
[userName] [password] \r\n"; - exit - 1; -} - -$account = Account::login($argv[1], $argv[2]); -$collection = $account->collections[0]; - -$indexer = new Indexer($collection); -$indexer->start(); -?> diff --git a/cli/search.php b/cli/search.php deleted file mode 100644 index 607856e..0000000 --- a/cli/search.php +++ /dev/null @@ -1,20 +0,0 @@ - -require_once("configuration.php"); -require_once("YASE/Framework.php"); - -if (sizeof($argv) < 4 || $argv[0] == "" || $argv[1] == "") { - print "YASE searcher \r\n"; - print " usage:\r\n"; - print " search.php [userName] [password] [query]\r\n"; - exit - 1; -} - -$account = Account::login($argv[1], $argv[2]); - -print_r ($account->collections[0]); - -$s = new Searcher($account->collections[0]); -print_r($s->search($argv[3],0)); - -?> diff --git a/cli/setup.php b/cli/setup.php deleted file mode 100644 index 095870f..0000000 --- a/cli/setup.php +++ /dev/null @@ -1,27 +0,0 @@ - - -require_once("configuration.php"); -require_once("YASE/Framework.php"); - -if (!isset($argv[1])) { - $argv[1] = ""; -} -if (!isset($argv[2])) { - $argv[2] = ""; -} -if ($argv[1] == "" || $argv[2] == "") { - print "YASE setup \r\n"; - print " usage:\r\n"; - print " setup.php [username] [password] [domain]\r\n"; - exit - 1; -} - -$userName = $argv[1]; -$password = $argv[2]; -$domain = $argv[3]; - -$account = Account::create((object)array("userName" => $userName, "password" => $password, "firstName" => "", "lastName" => "")); -$collection = Collection::create((object)array("accountId" => $account->id, "name" => $domain, "startUrl" => $domain, "pageLimit" => 1500, "levelLimit" => 15)); -$collection->addDomain($userName); -?> diff --git a/database/account.sql b/database/account.sql index 6e3380c..a0b6741 100755 --- a/database/account.sql +++ b/database/account.sql @@ -1,16 +1,18 @@ --- 2011, Jacob Andresen +create sequence account_seq START 1; +create sequence token_seq START 1; + create table account ( - id int NOT NULL primary key auto_increment , - 
username varchar(256) NOT NULL UNIQUE, - password varchar(256) NOT NULL, - first_name varchar(256), - last_name varchar(256) + account_id integer PRIMARY KEY DEFAULT nextval('account_seq'), + username varchar(60) NOT NULL UNIQUE, + password varchar(60) NOT NULL, + first_name varchar(60), + last_name varchar(60) ); create table token ( - id int NOT NULL primary key auto_increment, + token_id integer PRIMARY KEY DEFAULT nextval('token_seq'), value varchar(60), - account_id int NOT NULL, - last_seen date, - FOREIGN KEY (account_id) references account(id) + account_id integer NOT NULL, + last_seen date, + FOREIGN KEY (account_id) references account(account_id) ); diff --git a/database/content.sql b/database/content.sql index 44e2ba8..c61ad57 100755 --- a/database/content.sql +++ b/database/content.sql @@ -1,57 +1,42 @@ --- 2011, Jacob Andresen +create sequence collection_seq START 1; +create sequence collection_domain_seq START 1; +create sequence document_seq START 1; + create table collection ( - id int NOT NULL primary key auto_increment, - account_id int, + collection_id integer PRIMARY KEY DEFAULT nextval('collection_seq'), + account_id integer, name varchar(256), - page_limit int, - level_limit int, - seen_documents int, - indexed_documents int, + page_limit integer, + level_limit integer, + seen_documents integer, + indexed_documents integer, start_url varchar(512), - last_updated datetime, - foreign key(account_id) references account(id) + last_updated date, + foreign key(account_id) references account(account_id) ); create table collection_domain ( - id int NOT NULL primary key auto_increment, - collection_id int, - domain varchar(255), - foreign key(collection_id) references collection(id) + id integer PRIMARY KEY DEFAULT nextval('collection_domain_seq'), + collection_id integer, + domain varchar(255), + foreign key(collection_id) references collection(collection_id) ); create table document ( - id int NOT NULL primary key auto_increment, - collection_id int, 
+ document_id integer PRIMARY KEY DEFAULT nextval('document_seq'), + collection_id integer, url varchar(256), md5 varchar(20), - level int, + level integer, content_type varchar(256), retrieved timestamp, - content LONGTEXT, - FOREIGN KEY(collection_id) references collection(id), - FULLTEXT(content) -) engine=MyISAM; - -create table filter ( - id int NOT NULL primary key auto_increment, - name varchar(64), - path varchar(255), - regex varchar(255) -) engine=MyISAM; + content TEXT, + FOREIGN KEY(collection_id) references collection(collection_id) +); -create table field ( - id int NOT NULL primary key auto_increment, - document_id int, +create table facet ( + facet_id integer PRIMARY KEY DEFAULT nextval('facet_seq'), + document_id integer, name varchar(256), - content LONGTEXT, - foreign key(document_id) references document(id), - FULLTEXT(content) -) engine=MyISAM; - -create table document_field ( - id int NOT NULL primary key auto_increment, - field_id int, - filter_id int, - foreign key(field_id) references field(id), - foreign key(filter_id) references filter(id) -) engine=MyISAM; + content varchar(256) +) diff --git a/database/job.sql b/database/job.sql new file mode 100644 index 0000000..cc6f970 --- /dev/null +++ b/database/job.sql @@ -0,0 +1,20 @@ +create sequence job_seq START 1; +create sequence job_type_seq START 1; + +create table job_type ( + job_type_id integer PRIMARY KEY DEFAULT nextval('job_type_seq'), + name varchar(60) NOT NULL UNIQUE +); + +create table job ( + job_id integer PRIMARY KEY DEFAULT nextval('job_seq'), + name varchar(60) NOT NULL UNIQUE, + collection_id integer NOT NULL, + last_modified timestamp, + last_run timestamp, + start_time timestamp, + stop_time timestamp, + FOREIGN KEY (collection_id) references collection(collection_id) +); + + diff --git a/php/YASE/Account.php b/php/YASE/Account.php deleted file mode 100644 index b153f16..0000000 --- a/php/YASE/Account.php +++ /dev/null @@ -1,109 +0,0 @@ - -class Account -{ - public $id; - public
$userName; - public $password; - public $firstName; - public $lastName; - - public $collections; - - public static function create($data) - { - $SQL = "INSERT INTO account(username, password, first_name, last_name) VALUES('" . $data->userName . "','" . $data->password . "','" . $data->firstName . "','" . $data->lastName . "')"; - mysql_query($SQL) or die("create failed:" . $SQL . mysql_error()); - - $a = new Account(); - $a->id = mysql_insert_id(); - $a->userName = $data->userName; - $a->password = $data->password; - $a->firstName = $data->firstName; - $a->lastName = $data->lastName; - - return $a; - } - - public static function retrieve($data) - { - $SQL = "SELECT id,username,password,first_name,last_name from account where id='" . $data->id . "'"; - $res = mysql_query($SQL) or die ("read failed:" . $SQL . mysql_error()); - $row = mysql_fetch_array($res); - - $a = new Account(); - $a->id = $row[0]; - $a->userName = $row[1]; - $a->password = $row[2]; - $a->firstName = $row[3]; - $a->lastName = $row[4]; - - $a->collections = Collection::retrieve((object)array("accountId" => $a->id)); - - return $a; - } - - public static function update($data) - { - $SQL = "UPDATE account where id=" . $data->id . " set username='" . $data->userName . "',password='" . $data->password . "',first_name='" . $data->firstName . "',last_name='" . $data->lastName . "'"; - mysql_query($SQL) or die ("Account update failed:" . $SQL . mysql_error()); - } - - public static function destroy($id) - { - mysql_query("DELETE FROM account where id=$id"); - } - - public static function login($userName, $password) - { - $res = mysql_query("SELECT id from account where username='" . $userName . "' and password='" . $password . "'") or die(mysql_error()); - $row = mysql_fetch_array($res); - - $id = $row[0]; - - if (isset($id)) { - Account::generateToken($userName, $password); - return (Account::retrieve((object)array("id" => $id))); - } else { - throw (new Exception("login failed for user " . 
$userName)); - } - } - - public static function tokenLogin($token) - { - $sql = "SELECT a.id from account a, token t where t.value='$token' and t.account_id=a.id"; - $res = mysql_query($sql); - - $row = mysql_fetch_array($res); - $id = $row[0]; - - if (isset($id)) { - return '{id:"'.$id.'",token:"'.$token.'"}'; - } - } - - public static function generateToken($userName, $password) - { - $token = md5($userName . $password . rand()); - $sql = "select id from account where username='$userName' and password='$password'"; - $res = mysql_query($sql) or die (" failed logging in"); - $row = mysql_fetch_array($res); - $id = $row['id']; - - $sql = "insert into token(account_id, value) values( '$id', '$token');"; - mysql_query($sql) or die; - return $token; - } - - public static function getToken($userName, $password) - { - $sql = "select a.id,t.value from account a, token t where a.username='$userName' and a.password='$password' and t.account_id=a.id ;"; - - $res = mysql_query($sql) or die (" failed getting token:" . mysql_error()); - $row = mysql_fetch_array($res); - - return $row['value']; - } -} - -?> diff --git a/php/YASE/Collection.php b/php/YASE/Collection.php deleted file mode 100644 index 4850a19..0000000 --- a/php/YASE/Collection.php +++ /dev/null @@ -1,115 +0,0 @@ - -class Collection -{ - public $id; - public $accountId; - public $name; - public $pageLimit; - public $levelLimit; - public $startUrl; - - public $domains; - - public function __construct() - { - $this->domains = array(); - } - - public static function create($data) - { - $c = new Collection(); - - if (!isset($data) || !isset($data->accountId) ){ - throw new Exception("missing data"); - return; - } - - $c->accountId = $data->accountId; - $c->name = $data->name; - - $SQL = "INSERT INTO collection(account_id, name, page_limit, level_limit, start_url) VALUES(" . $data->accountId . ", '" . $data->name . "', " . $data->pageLimit . ", " . $data->levelLimit . ",'" . $data->startUrl . 
"')"; - - mysql_query($SQL) or die ("collection create failed: $SQL" . mysql_error()); - - $c->domains = array(); - $c->id = mysql_insert_id(); - - return ($c); - } - - public static function retrieve($data) - { - if (!isset($data)) { - print "missing data"; - return; - } - if (isset($data->accountId) && $data->accountId!="") { - $SQL = "SELECT id,name,page_limit,level_limit,start_url FROM collection where account_id=" . $data->accountId; - } else { - $SQL = "SELECT id,name,page_limit,level_limit,start_url FROM collection where id=" . $data->id; - } - $res = mysql_query($SQL) or die("collection retrieve failed:" . $SQL . " -> " . mysql_error()); - - $collections = array(); - while ($row = mysql_fetch_row($res)) { - $c = new Collection(); - $c->id = $row[0]; - $c->name = $row[1]; - $c->pageLimit = $row[2]; - $c->levelLImit = $row[3]; - $c->startUrl = $row[4]; - $c->domains = CollectionDomain::retrieve(json_decode('{"collectionId":"' . $c->id . '"}')); - - array_push($collections, $c); - } - return $collections; - } - - public static function update($data) - { - mysql_query("UPDATE collection where id=" . $data->id . " set account_id=" . $data->accountId . ",name='" . $data->name . "', page_limit='" . $data->pageLimit . "', level_limit='" . $data->levelLimit . ")") or die (mysql_error()); - } - - public static function destroy($id) - { - mysql_query("DELETE FROM collection WHERE ID=$id") or die (mysql_error()); - } - - public function addDomain($domain) - { - $d = new CollectionDomain(); - $d->domain = $domain; - $d->collectionId = $this->id; - CollectionDomain::create($d); - $this->domains = CollectionDomain::retrieve(json_decode('{"collectionId":"' . $this->id . 
'"}')); - } - - public function inAllowedDomains($URL) - { - $host = URL::extractHost($URL); - foreach ($this->domains as $d) { - $domain = str_replace("www.", "", $d->domain); - if (strpos($host, $domain) !== false) { - return true; - } - } - return false; - } - - public function getDomainId($url) - { - foreach ($this->domains as $domain) - { - if (URL::inDomain($url, $domain->domain)) { - return ($domain->id); - } - } - } - - public function log($message) - { - print $message . "\r\n"; - } -} -?> diff --git a/php/YASE/CollectionDomain.php b/php/YASE/CollectionDomain.php deleted file mode 100644 index d0d033b..0000000 --- a/php/YASE/CollectionDomain.php +++ /dev/null @@ -1,52 +0,0 @@ - -class CollectionDomain -{ - public $id; - public $collectionId; - public $domain; - - public function __construct() - { - } - - public static function create($data) - { - $d = new CollectionDomain(); - $SQL = "INSERT INTO collection_domain(collection_id,domain) values(" . $data->collectionId . ",'" . $data->domain . "')"; - mysql_query($SQL) or die (mysql_error()); - $d->id = mysql_insert_id(); - $d->collectionId = $data->collectionId; - $d->domain = $data->domain; - return $d; - } - - public static function retrieve($data) - { - if (isset($data->id)) { - $res = mysql_query("SELECT id,domain FROM collection_domain where id=" . $data->id) or die (mysql_error()); - } else { - $res = mysql_query("SELECT id,domain from collection_domain where collection_id=" . $data->collectionId) or die (mysql_error()); - } - - $domains = array(); - while ($row = mysql_fetch_row($res)) { - $d = new CollectionDomain(); - $d->id = $row[0]; - $d->domain = $row[1]; - array_push($domains, $d); - } - return $domains; - } - - public static function update($data) - { - $res = mysql_query("UPDATE collection_domain WHERE id=" . $data->id . " SET collection_id=" . $data->collectionId . " and domain='" . $data->domain . 
"'") or die(mysql_error()); - } - - public static function destroy($data) - { - mysql_query("DELETE FROM collection_domain where id=" . $data->id); - } - -} diff --git a/php/YASE/Configuration.php.example b/php/YASE/Configuration.php.example deleted file mode 100644 index 234bb3a..0000000 --- a/php/YASE/Configuration.php.example +++ /dev/null @@ -1,10 +0,0 @@ - -define("MAX_CONTENT_LENGTH", 2000000); -define("MYSQL_HOST", "127.0.0.1"); -define("MYSQL_USER", "yase"); -define("MYSQL_PASSWORD", "yase"); -define("MYSQL_DATABASE", "yase"); -define("TMP_YASE", "/tmp/yase/"); -define("YASE_WEB", "http://localhost/yase"); -?> diff --git a/php/YASE/Crawler.php b/php/YASE/Crawler.php deleted file mode 100644 index 23f0311..0000000 --- a/php/YASE/Crawler.php +++ /dev/null @@ -1,122 +0,0 @@ - -class Crawler -{ - public $collection; - public $level; - public $processURLs; - public $seenURLs; - public $crawledURLs; - public $httpClient; - - public function __construct($params) - { - $this->collections = Collection::retrieve($params); - $this->collection = $this->collections[0]; - - if (!isset($this->collection)) { - - throw new Exception("failed to find collection for"); - } - - if (isset($params->pageLimit)) { - $this->pageLimit = $params->pageLimit; - } else { - $this->pageLimit = 1500; - } - $this->processURLs = array(); - $this->seenURLS = array(); - $this->foundURLs = array(); - $this->crawledURLs = array(); - $this->startUrl = $this->collection->startUrl; - - $this->httpClient = new HTTPClient(); - } - - public function start() - { - $this->collection->log("page limit:" . $this->pageLimit); - if ($this->shouldCrawl($this->startUrl)) { - mysql_query("delete from document where coolection_id='" . $this->collection->id . 
"'"); - $this->crawl($this->startUrl, 0, $this->startUrl); - } else { - $this->collection->log("failed to start crawl"); - } - } - - public function crawl($url, $level, $parent) - { - if (count($this->crawledURLs) > $this->pageLimit) { - return; - } - - $this->collection->log("crawl - [level:$level] [page:" . count($this->crawledURLs) . "] - $url "); - - $document = $this->httpClient->getDocument($url); - $document->level = $level; - - if ($document->contentType == "application/pdf") { - $p = new PDFRobot($this->collection->accountId); - $document->content = $p->clean($document); - $document->content = htmlentities($document->content, ENT_QUOTES); - - array_push($this->crawledURLs, $url); - return $document->save($this->collection->id); - } else { - - if (!($this->shouldCrawl($url))) { - $this->collection->log("SKIP $url"); - array_push($this->foundURLs, $url); //skip document - return false; - } - - preg_match_all("/\content, $matches); - foreach ($matches[1] as $item) { - $fullUrl = URL::expandUrl($item, $url); - if ($this->shouldCrawl($fullUrl)) { - $link = new Document(); - $link->url = $fullUrl; - $link->level = $level + 1; - array_push($this->foundURLs, $link); - array_push($this->processURLs, $link); - } - } - - $document->content = htmlentities($document->content, ENT_QUOTES); - $document->save($this->collection->id); - array_push($this->crawledURLs, $url); - if (count($this->crawledURLs) > $this->pageLimit) { - $this->collection->log("hit page limit!"); - $this->collection->log("#crawledURLs:" . 
count($this->crawledURLs)); - return; - } - - while ($child = array_shift($this->processURLs)) { - if ($child->url != "") { - if (!in_array($child->url, ($this->crawledURLs))) { - $this->crawl($child->url, ($child->level), $url); - } - } - } - } - } - - private function shouldCrawl($url) - { - if (in_array($url, $this->crawledURLs)) { - return false; - } - if ($this->collection->inAllowedDomains($url) == false) { - return false; - } - - if ($this->level > $this->collection->levelLimit || - count($this->crawledURLs) > $this->collection->pageLimit || - URL::filter($this->collection->getDomainId($url), $url, "crawlerfilter")) { - return false; - } - return true; - } -}; -?> diff --git a/php/YASE/Document.php b/php/YASE/Document.php deleted file mode 100644 index f7ef37a..0000000 --- a/php/YASE/Document.php +++ /dev/null @@ -1,58 +0,0 @@ - -class Document -{ - public $id; - public $collectionId; - public $level; - public $url; - public $title; - public $contentType; - public $content; - - public function __construct() - { - } - - public function save($collectionId) - { - if ($collectionId == "") { - die ("coding error. trying to save to empty collection\r\n"); - } - - if (strlen($this->url) > 1028) { - return false; - } - if (strlen($this->content) > MAX_CONTENT_LENGTH) { - return false; - } - if (strlen($this->content) < 1) { - return false; - } - - $this->url = urlencode($this->url); - $SQL = "INSERT IGNORE into document(collection_id, url, content_type, content, level) values('" . $collectionId . "','" . $this->url . "','" . $this->contentType . "','" . $this->content . "','" . $this->level . "')"; - mysql_query($SQL) or die("SQL error:" . $SQL . " \r\nfailed to insert into document:" . mysql_error()); - return true; - } - - //TODO: rename to hasTextContent? 
- public function shouldCrawl() - { - if ( - ($this->contentType == "application/x-zip") || - ($this->contentType == "application/xml") || - ($this->contentType == "application/json") || - ($this->contentType == "image/jpeg") || - ($this->contentType == "image/jpg") || - ($this->contentType == "image/bmp") || - ($this->contentType == "image/png") || - ($this->contentType == "text/css") || - ($this->contentType == "text/javascript") - ) { - return false; - } - return true; - } -}; -?> diff --git a/php/YASE/Encoding.php b/php/YASE/Encoding.php deleted file mode 100644 index ad180b7..0000000 --- a/php/YASE/Encoding.php +++ /dev/null @@ -1,36 +0,0 @@ - -class Encoding -{ - public static function isUTF8($string) - { - $c = 0; - $b = 0; - $bits = 0; - $len = strlen($string); - for ($i = 0; $i < $len; $i++) - { - $c = ord($string[$i]); - if ($c > 128) { - if ($c >= 254) return false; - elseif ($c >= 252) $bits = 6; - elseif ($c >= 248) $bits = 5; - elseif ($c >= 240) $bits = 4; - elseif ($c >= 224) $bits = 3; - elseif ($c >= 192) $bits = 2; - else return false; - if (($i + $bits) > $len) return false; - while ($bits > 1) { - $i++; - $b = ord($str[$i]); - if ($b < 128 || $b > 191) return false; - $bits--; - } - } - } - return true; - } -} - -; -?> diff --git a/php/YASE/Framework.php b/php/YASE/Framework.php deleted file mode 100644 index 3a7d172..0000000 --- a/php/YASE/Framework.php +++ /dev/null @@ -1,19 +0,0 @@ - -//NOTE: YASE assumes that you have the settings from configuration.php.example -require_once('Account.php'); -require_once('Collection.php'); -require_once('CollectionDomain.php'); -require_once('Encoding.php'); -require_once('URL.php'); -require_once('HTTPClient.php'); -require_once('Document.php'); -require_once('Crawler.php'); -require_once('Indexer.php'); -require_once('Searcher.php'); -require_once('HTMLRobot.php'); -require_once('PDFRobot.php'); - -mysql_connect(MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD) or die(mysql_error()); 
-mysql_select_db(MYSQL_DATABASE) or die(mysql_error()); -?> diff --git a/php/YASE/HTMLRobot.php b/php/YASE/HTMLRobot.php deleted file mode 100644 index 5484872..0000000 --- a/php/YASE/HTMLRobot.php +++ /dev/null @@ -1,61 +0,0 @@ - -class HTMLRobot -{ - public static function clean($html) - { - $html = preg_replace("//is", ' ', $html); - $html = preg_replace("//is", ' ', $html); - $html = preg_replace("/