Skip to content

Commit

Permalink
Fork of code from ScraperWiki at https://classic.scraperwiki.com/scra…
Browse files Browse the repository at this point in the history
  • Loading branch information
khanzanicdecay committed Jun 18, 2014
0 parents commit a33877d
Show file tree
Hide file tree
Showing 2 changed files with 105 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Ignore output of scraper
data.sqlite
103 changes: 103 additions & 0 deletions scraper.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
<?php
//hasta aqui http://www.gsmarena.com/sendo-phones-18.php
require 'scraperwiki/simple_html_dom.php';

class GSMAParser {

var $brands = array();
var $models = 0;
var $html;

function init() {
$this->brands = array();
$this->models = 0;


$this->parseBrands();
echo count( $this->brands) . ' parsed brands'. "\n";

$this->parseModels();
echo $this->models . ' parsed models'. "\n";
}

function parseBrands(){
$html_content = scraperwiki::scrape("http://www.gsmarena.com/makers.php3");
$html = str_get_html($html_content);

$i = 0;
$temp = array();
foreach ($html->find("div.st-text a") as $el) {

if($i % 2 == 0){
$img = $el->find('img',0);
$b['link'] = 'http://www.gsmarena.com/'.$el->href;
$b['img'] = $img->src;
$b['name'] = $img->alt;
$temp = explode('-',$el->href);
$b['id'] = (int) substr($temp[2], 0, -4);

$this->brands[] = $b;

scraperwiki::save_sqlite(array("id"=>$b['id']), $b, "cell_brand");

}

$i++;

}

$html->__destruct();
}

function parseModels(){
$temp = array();
foreach ($this->brands as $b) {

$this->parseModelsPage($b['id'],$b['name'],$b['link']);

}

}

function parseModelsPage($brandId,$brandName,$page){

$html_content = scraperwiki::scrape($page);
$this->html = str_get_html($html_content);

foreach ($this->html->find("div.makers a") as $el) {
$img = $el->find('img',0);
$m['name'] = $brandName . ' ' . $el->find('strong',0)->innertext;
$m['img'] = $img->src;
$m['link'] = 'http://www.gsmarena.com/'.$el->href;
$m['desc'] = $img->title;
$temp = explode('-',$el->href);
$m['id'] = (int) substr($temp[1], 0, -4);
$m['brand_id'] = $brandId;

scraperwiki::save_sqlite(array("id"=>$m['id']), $m, "cell_model");

$this->models++;

}

$pagination = $this->html->find("div.nav-pages",0);

if($pagination){
$nextPageLink = $pagination->lastChild();
if($nextPageLink && $nextPageLink->title=="Next page"){
$this->parseModelsPage($brandId,$brandName,'http://www.gsmarena.com/'.$nextPageLink->href);
}
}

$this->html->__destruct();

}

}

$parser = new GSMAParser();

$parser->init();


?>

0 comments on commit a33877d

Please sign in to comment.