From ffedfe4e0f3c22072548a0ae3dfe1a85bc293083 Mon Sep 17 00:00:00 2001
From: stock
Date: Tue, 13 Jan 2015 15:43:38 +0800
Subject: [PATCH] release 1.2

---
 core/common/page/page.go                   |  27 +++-
 core/common/request/request.go             |  65 ++++++++-
 core/downloader/downloader_http.go         |  34 ++++-
 core/downloader/downloader_test.go         |  10 +-
 core/scheduler/scheduler_simple.go         |  35 +++++
 core/scheduler/scheduler_test.go           |   4 +-
 core/spider/spider.go                      |  63 ++++++---
 example/baidu_baike_page_processor/main.go |  21 ++-
 .../main.go                                | 130 ++++++++++++++++++
 example/sina_stock_json_processor/main.go  |   2 +-
 10 files changed, 353 insertions(+), 38 deletions(-)
 create mode 100644 core/scheduler/scheduler_simple.go
 create mode 100644 example/github_login_profile_page_processor/main.go

diff --git a/core/common/page/page.go b/core/common/page/page.go
index 46793a8..a7115f0 100644
--- a/core/common/page/page.go
+++ b/core/common/page/page.go
@@ -113,9 +113,14 @@ func (this *Page) GetRequest() *request.Request {
 	return this.req
 }
 
+// GetUrlTag returns the tag (name) of the request's url.
+func (this *Page) GetUrlTag() string {
+	return this.req.GetUrlTag()
+}
+
 // AddTargetRequest adds one new Request waiting for crawl.
 func (this *Page) AddTargetRequest(url string, respType string) *Page {
-	this.targetRequests = append(this.targetRequests, request.NewRequest(url, respType))
+	this.targetRequests = append(this.targetRequests, request.NewRequest(url, respType, "", "GET", "", nil, nil, nil))
 	return this
 }
 
@@ -127,6 +132,26 @@ func (this *Page) AddTargetRequests(urls []string, respType string) *Page {
 	return this
 }
 
+// AddTargetRequestWithParams adds one new Request waiting for crawl.
+// The respType is "html" or "json" or "jsonp" or "text".
+// The urltag is a name for marking the url, to distinguish different urls in PageProcesser and Pipeline.
+// The method is POST or GET.
+// The postdata is the http body string.
+// The header is the http header.
+// The cookies are the http cookies.
+func (this *Page) AddTargetRequestWithParams(req *request.Request) *Page {
+	this.targetRequests = append(this.targetRequests, req)
+	return this
+}
+
+// AddTargetRequestsWithParams adds new Requests waiting for crawl.
+func (this *Page) AddTargetRequestsWithParams(reqs []*request.Request) *Page {
+	for _, req := range reqs {
+		this.AddTargetRequestWithParams(req)
+	}
+	return this
+}
+
 // GetTargetRequests returns the target requests that will be put into the Scheduler.
 func (this *Page) GetTargetRequests() []*request.Request {
 	return this.targetRequests
diff --git a/core/common/request/request.go b/core/common/request/request.go
index 55df797..c31bdc2 100644
--- a/core/common/request/request.go
+++ b/core/common/request/request.go
@@ -1,22 +1,79 @@
 // Package request implements the request entity, containing the url and other relevant information.
 package request
 
+import (
+	"net/http"
+)
+
 // Request represents an object waiting to be crawled.
 type Request struct {
-	url string
+	url string
+
+	// Response type: html, json, jsonp, text
 	respType string
+
+	// GET or POST
+	method string
+
+	// POST data
+	postdata string
+
+	// name for marking the url, to distinguish different urls in PageProcesser and Pipeline
+	urltag string
+
+	// http header
+	header http.Header
+
+	// http cookies
+	cookies []*http.Cookie
+
+	// Redirect function for the downloader, used in http.Client.
+	// If CheckRedirect returns an error, the Client's Get
+	// method returns both the previous Response and CheckRedirect's error.
+	// If CheckRedirect returns errors.New("normal"), the error handling after client.Do will ignore the error.
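+	// For example, the github login example in this patch stops the
+	// client from following the redirect after a login POST, so that
+	// the Set-Cookie headers can still be captured:
+	//     func(req *http.Request, via []*http.Request) error {
+	//         return errors.New("normal")
+	//     }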
+	checkRedirect func(req *http.Request, via []*http.Request) error
 }
 
 // NewRequest returns an initialized Request object.
-// The respType is "json" or "html"
-func NewRequest(url string, respType string) *Request {
-	return &Request{url, respType}
+// The respType is json, jsonp, html, text
+/*
+func NewRequestSimple(url string, respType string, urltag string) *Request {
+	return &Request{url:url, respType:respType}
+}
+*/
+
+func NewRequest(url string, respType string, urltag string, method string, postdata string, header http.Header, cookies []*http.Cookie, checkRedirect func(req *http.Request, via []*http.Request) error) *Request {
+	return &Request{url, respType, method, postdata, urltag, header, cookies, checkRedirect}
 }
 
 func (this *Request) GetUrl() string {
 	return this.url
 }
 
+func (this *Request) GetUrlTag() string {
+	return this.urltag
+}
+
+func (this *Request) GetMethod() string {
+	return this.method
+}
+
+func (this *Request) GetPostdata() string {
+	return this.postdata
+}
+
+func (this *Request) GetHeader() http.Header {
+	return this.header
+}
+
+func (this *Request) GetCookies() []*http.Cookie {
+	return this.cookies
+}
+
 func (this *Request) GetResponceType() string {
 	return this.respType
 }
+
+func (this *Request) GetRedirectFunc() func(req *http.Request, via []*http.Request) error {
+	return this.checkRedirect
+}
diff --git a/core/downloader/downloader_http.go b/core/downloader/downloader_http.go
index 2fb511a..61d609a 100644
--- a/core/downloader/downloader_http.go
+++ b/core/downloader/downloader_http.go
@@ -15,9 +15,11 @@ import (
 	"io"
 	"io/ioutil"
 	"net/http"
+	"net/url"
 	// "regexp"
 	// "golang.org/x/net/html"
-	// "strings"
+	"strings"
+	//"fmt"
 )
 
 // The HttpDownloader downloads pages with package net/http.
@@ -183,19 +185,37 @@ func (this *HttpDownloader) changeCharsetEncodingAuto(contentTypeStr string, sor
 
 // downloadFile downloads the target page and converts its charset.
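+// A per-request http.Client is built so that the Request's method, post
+// data, header, cookies and redirect policy are all honored.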
 func (this *HttpDownloader) downloadFile(p *page.Page, req *request.Request) (*page.Page, string) {
 	var err error
-	var url string
-	if url = req.GetUrl(); len(url) == 0 {
+	var urlstr string
+	if urlstr = req.GetUrl(); len(urlstr) == 0 {
 		mlog.LogInst().LogError("url is empty")
 		p.SetStatus(true, "url is empty")
 		return p, ""
 	}
 
+	client := &http.Client{
+		CheckRedirect: req.GetRedirectFunc(),
+	}
+	httpreq, err := http.NewRequest(req.GetMethod(), req.GetUrl(), strings.NewReader(req.GetPostdata()))
+	if header := req.GetHeader(); header != nil {
+		httpreq.Header = req.GetHeader()
+	}
+	if cookies := req.GetCookies(); cookies != nil {
+		for i := range cookies {
+			httpreq.AddCookie(cookies[i])
+		}
+	}
+
 	var resp *http.Response
-	if resp, err = http.Get(url); err != nil {
-		mlog.LogInst().LogError(err.Error())
-		p.SetStatus(true, err.Error())
-		return p, ""
+	if resp, err = client.Do(httpreq); err != nil {
+		if e, ok := err.(*url.Error); ok && e.Err != nil && e.Err.Error() == "normal" {
+			// the "normal" error is returned by the redirect check on
+			// purpose; ignore it and keep the response we already have
+		} else {
+			mlog.LogInst().LogError(err.Error())
+			p.SetStatus(true, err.Error())
+			return p, ""
+		}
 	}
+
 	p.SetHeader(resp.Header)
 	p.SetCookies(resp.Cookies())
diff --git a/core/downloader/downloader_test.go b/core/downloader/downloader_test.go
index 0caa529..3716d64 100644
--- a/core/downloader/downloader_test.go
+++ b/core/downloader/downloader_test.go
@@ -15,10 +15,10 @@ import (
 )
 
 func TestDownloadHtml(t *testing.T) {
-	return
+	//return
 	//request := request.NewRequest("http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&callback=t13975294&id=23521&pagesize=45&dire=f&dpc=1")
 	var req *request.Request
-	req = request.NewRequest("http://live.sina.com.cn/zt/l/v/finance/globalnews1/", "html")
+	req = request.NewRequest("http://live.sina.com.cn/zt/l/v/finance/globalnews1/", "html", "", "GET", "", nil, nil, nil)
 
 	var dl downloader.Downloader
 	dl = downloader.NewHttpDownloader()
@@ -58,9 +58,9 @@ func TestDownloadHtml(t *testing.T) {
 }
 
 func TestDownloadJson(t *testing.T) {
-	return
+	//return
 	var req *request.Request
-	req = request.NewRequest("http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&id=23521&pagesize=4&dire=f&dpc=1", "json")
+	req = request.NewRequest("http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&id=23521&pagesize=4&dire=f&dpc=1", "json", "", "GET", "", nil, nil, nil)
 
 	var dl downloader.Downloader
 	dl = downloader.NewHttpDownloader()
@@ -81,7 +81,7 @@ func TestDownloadJson(t *testing.T) {
 func TestCharSetChange(t *testing.T) {
 	var req *request.Request
 	//req = request.NewRequest("http://stock.finance.sina.com.cn/usstock/api/jsonp.php/t/US_CategoryService.getList?page=1&num=60", "jsonp")
-	req = request.NewRequest("http://soft.chinabyte.com/416/13164916.shtml", "html")
+	req = request.NewRequest("http://soft.chinabyte.com/416/13164916.shtml", "html", "", "GET", "", nil, nil, nil)
 
 	var dl downloader.Downloader
 	dl = downloader.NewHttpDownloader()
diff --git a/core/scheduler/scheduler_simple.go b/core/scheduler/scheduler_simple.go
new file mode 100644
index 0000000..5cbc19d
--- /dev/null
+++ b/core/scheduler/scheduler_simple.go
@@ -0,0 +1,35 @@
+// Copyright 2014 Hu Cong. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This simple scheduler is currently unused by the spider.
+package scheduler
+
+import (
+	"github.com/hu17889/go_spider/core/common/request"
+)
+
+// SimpleScheduler stores pending requests in a buffered channel (capacity 1024).
+type SimpleScheduler struct {
+	queue chan *request.Request
+}
+
+func NewSimpleScheduler() *SimpleScheduler {
+	ch := make(chan *request.Request, 1024)
+	return &SimpleScheduler{ch}
+}
+
+// Push adds one request, blocking when the buffer is full.
+func (this *SimpleScheduler) Push(requ *request.Request) {
+	this.queue <- requ
+}
+
+// Poll returns the next request, or nil if the queue is empty.
+// The length check and the receive are separate operations, so Poll
+// is only safe with a single consumer.
+func (this *SimpleScheduler) Poll() *request.Request {
+	if len(this.queue) == 0 {
+		return nil
+	} else {
+		return <-this.queue
+	}
+}
+
+func (this *SimpleScheduler) Count() int {
+	return len(this.queue)
+}
diff --git a/core/scheduler/scheduler_test.go b/core/scheduler/scheduler_test.go
index 318aa6d..3f9e281 100644
--- a/core/scheduler/scheduler_test.go
+++ b/core/scheduler/scheduler_test.go
@@ -14,7 +14,7 @@ import (
 
 func TestQueueScheduler(t *testing.T) {
 	var r *request.Request
-	r = request.NewRequest("http://baidu.com", "html")
+	r = request.NewRequest("http://baidu.com", "html", "", "GET", "", nil, nil, nil)
 	fmt.Printf("%v\n", r)
 
 	var s *scheduler.QueueScheduler
@@ -37,7 +37,7 @@ func TestQueueScheduler(t *testing.T) {
 
 	// remove duplicate
 	s = scheduler.NewQueueScheduler(true)
-	r2 := request.NewRequest("http://qq.com", "html")
+	r2 := request.NewRequest("http://qq.com", "html", "", "GET", "", nil, nil, nil)
 	s.Push(r)
 	s.Push(r2)
 	s.Push(r)
diff --git a/core/spider/spider.go b/core/spider/spider.go
index da929c5..d94a3a2 100644
--- a/core/spider/spider.go
+++ b/core/spider/spider.go
@@ -12,6 +12,7 @@ import (
 	"github.com/hu17889/go_spider/core/pipeline"
 	"github.com/hu17889/go_spider/core/scheduler"
 	"math/rand"
+	//"net/http"
 	"time"
 	//"fmt"
 )
@@ -71,11 +72,32 @@ func (this *Spider) Taskname() string {
 	return this.taskname
 }
 
-// Deal with one url and return the PageItems
+// Deal with one url and return the PageItems.
 func (this *Spider) Get(url string, respType string) *page_items.PageItems {
-	var urls []string
-	urls = append(urls, url)
-	items := this.GetAll(urls, respType)
+	req := request.NewRequest(url, respType, "", "GET", "", nil, nil, nil)
+	return this.GetByRequest(req)
+}
+
+// Deal with several urls and return the PageItems slice.
+func (this *Spider) GetAll(urls []string, respType string) []*page_items.PageItems {
+	for _, u := range urls {
+		req := request.NewRequest(u, respType, "", "GET", "", nil, nil, nil)
+		this.AddRequest(req)
+	}
+
+	pip := pipeline.NewCollectPipelinePageItems()
+	this.AddPipeline(pip)
+
+	this.Run()
+
+	return pip.GetCollected()
+}
+
+// Deal with one Request, carrying the full request settings, and return the PageItems.
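+// For example (mirroring the baidu example in this patch):
+//     req := request.NewRequest(url, "html", "", "GET", "", nil, nil, nil)
+//     items := sp.GetByRequest(req)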
+func (this *Spider) GetByRequest(req *request.Request) *page_items.PageItems {
+	var reqs []*request.Request
+	reqs = append(reqs, req)
+	items := this.GetAllByRequest(reqs)
 	if len(items) != 0 {
 		return items[0]
 	}
@@ -83,11 +105,11 @@ func (this *Spider) Get(url string, respType string) *page_items.PageItems {
 	return nil
 }
 
 // Deal with several Requests and return the PageItems slice
-func (this *Spider) GetAll(urls []string, respType string) []*page_items.PageItems {
+func (this *Spider) GetAllByRequest(reqs []*request.Request) []*page_items.PageItems {
 	// push url
-	for _, u := range urls {
-		req := request.NewRequest(u, respType)
-		this.addRequest(req)
+	for _, req := range reqs {
+		//req := request.NewRequest(u, respType, urltag, method, postdata, header, cookies)
+		this.AddRequest(req)
 	}
 
 	pip := pipeline.NewCollectPipelinePageItems()
@@ -236,29 +258,38 @@ func (this *Spider) sleep() {
 }
 
 func (this *Spider) AddUrl(url string, respType string) *Spider {
-	req := request.NewRequest(url, respType)
-	this.addRequest(req)
+	req := request.NewRequest(url, respType, "", "GET", "", nil, nil, nil)
+	this.AddRequest(req)
 	return this
 }
 
 func (this *Spider) AddUrls(urls []string, respType string) *Spider {
 	for _, url := range urls {
-		req := request.NewRequest(url, respType)
-		this.addRequest(req)
+		req := request.NewRequest(url, respType, "", "GET", "", nil, nil, nil)
+		this.AddRequest(req)
 	}
 	return this
 }
 
 // add Request to Scheduler
-func (this *Spider) addRequest(req *request.Request) {
+func (this *Spider) AddRequest(req *request.Request) *Spider {
 	if req == nil {
 		mlog.LogInst().LogError("request is nil")
-		return
+		return this
 	} else if req.GetUrl() == "" {
 		mlog.LogInst().LogError("request is empty")
-		return
+		return this
 	}
 	this.pScheduler.Push(req)
+	return this
+}
+
+// AddRequests adds a batch of Requests to the Scheduler.
+func (this *Spider) AddRequests(reqs []*request.Request) *Spider {
+	for _, req := range reqs {
+		this.AddRequest(req)
+	}
+	return this
 }
 
 // core processor
@@ -274,7 +305,7 @@ func (this *Spider) pageProcess(req *request.Request) {
 	this.pPageProcesser.Process(p)
 	for _, req := range p.GetTargetRequests() {
 		//fmt.Printf("%v\n",req)
-		this.addRequest(req)
+		this.AddRequest(req)
 	}
 
 	// output
diff --git a/example/baidu_baike_page_processor/main.go b/example/baidu_baike_page_processor/main.go
index 6b8504e..c861ac4 100644
--- a/example/baidu_baike_page_processor/main.go
+++ b/example/baidu_baike_page_processor/main.go
@@ -4,6 +4,7 @@ package main
 import (
 	"fmt"
 	"github.com/hu17889/go_spider/core/common/page"
+	"github.com/hu17889/go_spider/core/common/request"
 	"github.com/hu17889/go_spider/core/spider"
 	"strings"
 )
@@ -41,7 +42,17 @@ func main() {
 	// PageProcesser ;
 	// task name used in Pipeline for record;
 	sp := spider.NewSpider(NewMyPageProcesser(), "TaskName")
-	pageItems := sp.Get("http://baike.baidu.com/view/1628025.htm?fromtitle=http&fromid=243074&type=syn", "html") // url, html is the responce type ("html" or "json" or "jsonp" or "text")
+	// request.NewRequest Params:
+	// 1. Url.
+	// 2. Response type: "html", "json", "jsonp" or "text".
+	// 3. The urltag is a name for marking the url, to distinguish different urls in PageProcesser and Pipeline.
+	// 4. The method is POST or GET.
+	// 5. The postdata is the body string sent to the server.
+	// 6. The header is the http request header.
+	// 7. Cookies.
+	// 8. Http redirect function.
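+	// A plain GET needs only the url and response type; the remaining
+	// parameters can be left as zero values ("" and nil), as below.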
+	req := request.NewRequest("http://baike.baidu.com/view/1628025.htm?fromtitle=http&fromid=243074&type=syn", "html", "", "GET", "", nil, nil, nil)
+	pageItems := sp.GetByRequest(req)
+	//pageItems := sp.Get("http://baike.baidu.com/view/1628025.htm?fromtitle=http&fromid=243074&type=syn", "html")
 
 	url := pageItems.GetRequest().GetUrl()
 	println("-----------------------------------spider.Get---------------------------------")
@@ -55,7 +66,13 @@ func main() {
 		"http://baike.baidu.com/view/1628025.htm?fromtitle=http&fromid=243074&type=syn",
 		"http://baike.baidu.com/view/383720.htm?fromtitle=html&fromid=97049&type=syn",
 	}
-	pageItemsArr := sp.SetThreadnum(2).GetAll(urls, "html")
+	var reqs []*request.Request
+	for _, url := range urls {
+		req := request.NewRequest(url, "html", "", "GET", "", nil, nil, nil)
+		reqs = append(reqs, req)
+	}
+	pageItemsArr := sp.SetThreadnum(2).GetAllByRequest(reqs)
+	//pageItemsArr := sp.SetThreadnum(2).GetAll(urls, "html")
 	for _, item := range pageItemsArr {
 		url = item.GetRequest().GetUrl()
 		println("url\t:\t" + url)
diff --git a/example/github_login_profile_page_processor/main.go b/example/github_login_profile_page_processor/main.go
new file mode 100644
index 0000000..7d257b0
--- /dev/null
+++ b/example/github_login_profile_page_processor/main.go
@@ -0,0 +1,130 @@
+package main
+
+import (
+	"github.com/PuerkitoBio/goquery"
+	"github.com/hu17889/go_spider/core/common/page"
+	"github.com/hu17889/go_spider/core/common/request"
+	"github.com/hu17889/go_spider/core/pipeline"
+	"github.com/hu17889/go_spider/core/spider"
+	"net/http"
+	"net/url"
+	"strings"
+	// "fmt"
+	"errors"
+)
+
+type MyPageProcesser struct {
+	cookies []*http.Cookie
+}
+
+func NewMyPageProcesser() *MyPageProcesser {
+	return &MyPageProcesser{}
+}
+
+// login is an empty placeholder; the login request is built in main.
+func (this *MyPageProcesser) login(name string, pwd string) {
+}
+
+// Parse the html dom here and record the parse results that we want to save to Page.
+// Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html.
+func (this *MyPageProcesser) Process(p *page.Page) {
+
+	if p.GetUrlTag() == "site_login" {
+		//fmt.Printf("%v\n", p.GetCookies())
+		this.cookies = p.GetCookies()
+		// AddTargetRequestWithParams Params:
+		// 1. Url.
+		// 2. Response type: "html", "json", "jsonp" or "text".
+		// 3. The urltag is a name for marking the url, to distinguish different urls in PageProcesser and Pipeline.
+		// 4. The method is POST or GET.
+		// 5. The postdata is the body string sent to the server.
+		// 6. The header is the http request header.
+		// 7. Cookies.
+		// 8. Http redirect function.
+		if len(this.cookies) != 0 {
+			p.AddField("info", "get cookies success")
+			req := request.NewRequest("http://backadmin.hucong.net/site/index", "html", "site_index", "GET", "", nil, this.cookies, nil)
+			p.AddTargetRequestWithParams(req)
+		} else {
+			p.AddField("info", "get cookies failed")
+		}
+	} else {
+		//fmt.Printf("%v\n", p.GetBodyStr())
+		query := p.GetHtmlParser()
+		pageTitle := query.Find(".page-content .page-title").Text()
+
+		if len(pageTitle) != 0 {
+			p.AddField("page_title", pageTitle)
+			p.AddField("info", "login success")
+		} else {
+			p.AddField("info", "login failed")
+		}
+
+	}
+
+	// NOTE: the code below is unreachable; it is kept only as a reference
+	// for crawling GitHub repository lists.
+	return
+	if !p.IsSucc() {
+		println(p.Errormsg())
+		return
+	}
+
+	query := p.GetHtmlParser()
+	var urls []string
+	query.Find("h3[class='repo-list-name'] a").Each(func(i int, s *goquery.Selection) {
+		href, _ := s.Attr("href")
+		urls = append(urls, "http://github.com/"+href)
+	})
+	// these urls will be saved and crawled by other coroutines.
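+	// AddTargetRequestsWithParams could be used instead if these new
+	// requests needed cookies or headers, as in the login branch above.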
+	p.AddTargetRequests(urls, "html")
+
+	name := query.Find(".entry-title .author").Text()
+	name = strings.Trim(name, " \t\n")
+	repository := query.Find(".entry-title .js-current-repository").Text()
+	repository = strings.Trim(repository, " \t\n")
+	//readme, _ := query.Find("#readme").Html()
+	if name == "" {
+		p.SetSkip(true)
+	}
+	// the entity we want to save by Pipeline
+	p.AddField("author", name)
+	p.AddField("project", repository)
+	//p.AddField("readme", readme)
+}
+
+// myRedirect prevents the redirect from being followed, so the login cookies can be captured.
+// If CheckRedirect returns errors.New("normal"), the error handling after client.Do will ignore the error.
+func myRedirect(req *http.Request, via []*http.Request) error {
+	return errors.New("normal")
+}
+
+func main() {
+
+	// POST data
+	post_arg := url.Values{
+		"name": {"admin"},
+		"pwd":  {"admin"},
+	}
+
+	// http header
+	header := make(http.Header)
+	header.Set("Content-Type", "application/x-www-form-urlencoded")
+
+	// Spider input:
+	// PageProcesser ;
+	// Task name used in Pipeline for record;
+	// request.NewRequest Params:
+	// 1. Url.
+	// 2. Response type: "html", "json", "jsonp" or "text".
+	// 3. The urltag is a name for marking the url, to distinguish different urls in PageProcesser and Pipeline.
+	// 4. The method is POST or GET.
+	// 5. The postdata is the body string sent to the server.
+	// 6. The header is the http request header.
+	// 7. Cookies.
+	// 8. Http redirect function.
+	req := request.NewRequest("http://backadmin.hucong.net/main/user/login", "html", "site_login", "POST", post_arg.Encode(), header, nil, myRedirect)
+
+	spider.NewSpider(NewMyPageProcesser(), "TaskName").
+		AddRequest(req).
+		AddPipeline(pipeline.NewPipelineConsole()). // Print result on screen
+		SetThreadnum(3).                            // Crawl requests with three coroutines
+		Run()
+}
diff --git a/example/sina_stock_json_processor/main.go b/example/sina_stock_json_processor/main.go
index 13f8b5c..ac18062 100644
--- a/example/sina_stock_json_processor/main.go
+++ b/example/sina_stock_json_processor/main.go
@@ -1,4 +1,4 @@
-// The example gets stock newses from site sina.com.
+// The example gets stock news from sina.com (http://live.sina.com.cn/zt/f/v/finance/globalnews1).
 // The spider is a continuous service.
 // The stock api returns a json result.
 // It fetches news at regular intervals set in the config file.