diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..66d464d --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +# Ignore output of scraper +data.sqlite diff --git a/scraper.php b/scraper.php new file mode 100644 index 0000000..856b6da --- /dev/null +++ b/scraper.php @@ -0,0 +1,103 @@ +brands = array(); + $this->models = 0; + + + $this->parseBrands(); + echo count( $this->brands) . ' parsed brands'. "\n"; + + $this->parseModels(); + echo $this->models . ' parsed models'. "\n"; + } + + function parseBrands(){ + $html_content = scraperwiki::scrape("http://www.gsmarena.com/makers.php3"); + $html = str_get_html($html_content); + + $i = 0; + $temp = array(); + foreach ($html->find("div.st-text a") as $el) { + + if($i % 2 == 0){ + $img = $el->find('img',0); + $b['link'] = 'http://www.gsmarena.com/'.$el->href; + $b['img'] = $img->src; + $b['name'] = $img->alt; + $temp = explode('-',$el->href); + $b['id'] = (int) substr($temp[2], 0, -4); + + $this->brands[] = $b; + + scraperwiki::save_sqlite(array("id"=>$b['id']), $b, "cell_brand"); + + } + + $i++; + + } + + $html->__destruct(); + } + + function parseModels(){ + $temp = array(); + foreach ($this->brands as $b) { + + $this->parseModelsPage($b['id'],$b['name'],$b['link']); + + } + + } + + function parseModelsPage($brandId,$brandName,$page){ + + $html_content = scraperwiki::scrape($page); + $this->html = str_get_html($html_content); + + foreach ($this->html->find("div.makers a") as $el) { + $img = $el->find('img',0); + $m['name'] = $brandName . ' ' . $el->find('strong',0)->innertext; + $m['img'] = $img->src; + $m['link'] = 'http://www.gsmarena.com/'.$el->href; + $m['desc'] = $img->title; + $temp = explode('-',$el->href); + $m['id'] = (int) substr($temp[1], 0, -4); + $m['brand_id'] = $brandId; + + scraperwiki::save_sqlite(array("id"=>$m['id']), $m, "cell_model"); + + $this->models++; + + } + + $pagination = $this->html->find("div.nav-pages",0); + + if($pagination){ + $nextPageLink = $pagination->lastChild(); + if($nextPageLink && $nextPageLink->title=="Next page"){ + $this->parseModelsPage($brandId,$brandName,'http://www.gsmarena.com/'.$nextPageLink->href); + } + } + + $this->html->__destruct(); + + } + +} + +$parser = new GSMAParser(); + +$parser->init(); + + +?>