Skip to content

Commit

Permalink
release 1.2
Browse files Browse the repository at this point in the history
  • Loading branch information
stock committed Jan 13, 2015
1 parent 7b30c2b commit ffedfe4
Show file tree
Hide file tree
Showing 10 changed files with 353 additions and 38 deletions.
27 changes: 26 additions & 1 deletion core/common/page/page.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,14 @@ func (this *Page) GetRequest() *request.Request {
return this.req
}

// GetUrlTag returns the url tag of the request held by this page.
// It simply delegates to the request's own GetUrlTag.
func (this *Page) GetUrlTag() string {
return this.req.GetUrlTag()
}

// AddTargetRequest adds one new Request waiting for crawl.
func (this *Page) AddTargetRequest(url string, respType string) *Page {
this.targetRequests = append(this.targetRequests, request.NewRequest(url, respType))
this.targetRequests = append(this.targetRequests, request.NewRequest(url, respType, "", "GET", "", nil, nil, nil))
return this
}

Expand All @@ -127,6 +132,26 @@ func (this *Page) AddTargetRequests(urls []string, respType string) *Page {
return this
}

// AddTargetRequestWithParams adds one caller-built Request waiting for crawl.
// The req is appended to the page's target requests exactly as given, so all
// of its settings are chosen by the caller when the Request is constructed:
// The respType is "html" or "json" or "jsonp" or "text".
// The urltag is a name for marking the url, to distinguish different urls in PageProcesser and Pipeline.
// The method is POST or GET.
// The postdata is the http body string.
// The header is the http header.
// The cookies are the http cookies.
// It returns the page itself so calls can be chained.
func (this *Page) AddTargetRequestWithParams(req *request.Request) *Page {
this.targetRequests = append(this.targetRequests, req)
return this
}

// AddTargetRequestsWithParams adds several caller-built Requests waiting for crawl.
// The requests are appended to the page's target requests in the order given,
// and the page itself is returned so calls can be chained.
func (this *Page) AddTargetRequestsWithParams(reqs []*request.Request) *Page {
	this.targetRequests = append(this.targetRequests, reqs...)
	return this
}

// GetTargetRequests returns the target requests that will put into Scheduler
func (this *Page) GetTargetRequests() []*request.Request {
return this.targetRequests
Expand Down
65 changes: 61 additions & 4 deletions core/common/request/request.go
Original file line number Diff line number Diff line change
@@ -1,22 +1,79 @@
// Package request implements the request entity, which contains the url and other relevant information.
package request

import (
"net/http"
)

// Request represents object waiting for being crawled.
type Request struct {
url string
url string

// Response type: html, json, jsonp or text
respType string

// GET POST
method string

// POST data
postdata string

// name for marking url and distinguish different urls in PageProcesser and Pipeline
urltag string

// http header
header http.Header

// http cookies
cookies []*http.Cookie

// Redirect function for the downloader, used as CheckRedirect in http.Client.
// If CheckRedirect returns an error, the Client's Get
// method returns both the previous Response and CheckRedirect's error.
// If CheckRedirect returns errors.New("normal"), the error handling after client.Do will ignore the error.
checkRedirect func(req *http.Request, via []*http.Request) error
}

// NewRequest returns initialized Request object.
// The respType is "json" or "html"
func NewRequest(url string, respType string) *Request {
return &Request{url, respType}
// The respType is json, jsonp, html, text
/*
func NewRequestSimple(url string, respType string, urltag string) *Request {
return &Request{url:url, respType:respType}
}
*/

// NewRequest returns an initialized Request object.
// The respType is "html", "json", "jsonp" or "text".
// The urltag is a name for marking the url.
// The method is "GET" or "POST" and postdata is the POST body.
// The header and cookies are the http header and http cookies of the request.
// The checkRedirect is the redirect function the downloader hands to its http.Client.
// Callers that do not need a setting pass "" or nil for it.
func NewRequest(url string, respType string, urltag string, method string, postdata string, header http.Header, cookies []*http.Cookie, checkRedirect func(req *http.Request, via []*http.Request) error) *Request {
	return &Request{
		url:           url,
		respType:      respType,
		method:        method,
		postdata:      postdata,
		urltag:        urltag,
		header:        header,
		cookies:       cookies,
		checkRedirect: checkRedirect,
	}
}

// GetUrl returns the url of the request.
func (this *Request) GetUrl() string {
return this.url
}

// GetUrlTag returns the urltag of the request, the name used for marking the url.
func (this *Request) GetUrlTag() string {
return this.urltag
}

// GetMethod returns the http method stored in the request.
func (this *Request) GetMethod() string {
return this.method
}

// GetPostdata returns the POST data stored in the request.
func (this *Request) GetPostdata() string {
return this.postdata
}

// GetHeader returns the http header stored in the request.
// The stored value itself is returned, not a copy.
func (this *Request) GetHeader() http.Header {
return this.header
}

// GetCookies returns the http cookies stored in the request.
// The stored slice itself is returned, not a copy.
func (this *Request) GetCookies() []*http.Cookie {
return this.cookies
}

// GetResponceType returns the response type stored in the request.
func (this *Request) GetResponceType() string {
return this.respType
}

// GetRedirectFunc returns the redirect function stored in the request.
// The result is nil when no redirect function was given.
func (this *Request) GetRedirectFunc() func(req *http.Request, via []*http.Request) error {
return this.checkRedirect
}
34 changes: 27 additions & 7 deletions core/downloader/downloader_http.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,11 @@ import (
"io"
"io/ioutil"
"net/http"
"net/url"
// "regexp"
// "golang.org/x/net/html"
// "strings"
"strings"
//"fmt"
)

// The HttpDownloader download page by package net/http.
Expand Down Expand Up @@ -183,19 +185,37 @@ func (this *HttpDownloader) changeCharsetEncodingAuto(contentTypeStr string, sor
// Download file and change the charset of page charset.
func (this *HttpDownloader) downloadFile(p *page.Page, req *request.Request) (*page.Page, string) {
var err error
var url string
if url = req.GetUrl(); len(url) == 0 {
var urlstr string
if urlstr = req.GetUrl(); len(urlstr) == 0 {
mlog.LogInst().LogError("url is empty")
p.SetStatus(true, "url is empty")
return p, ""
}

client := &http.Client{
CheckRedirect: req.GetRedirectFunc(),
}
httpreq, err := http.NewRequest(req.GetMethod(), req.GetUrl(), strings.NewReader(req.GetPostdata()))
if header := req.GetHeader(); header != nil {
httpreq.Header = req.GetHeader()
}
if cookies := req.GetCookies(); cookies != nil {
for i := range cookies {
httpreq.AddCookie(cookies[i])
}
}

var resp *http.Response
if resp, err = http.Get(url); err != nil {
mlog.LogInst().LogError(err.Error())
p.SetStatus(true, err.Error())
return p, ""
if resp, err = client.Do(httpreq); err != nil {
if e, ok := err.(*url.Error); ok && e.Err != nil && e.Err.Error() == "normal" {
// normal
} else {
mlog.LogInst().LogError(err.Error())
p.SetStatus(true, err.Error())
return p, ""
}
}

p.SetHeader(resp.Header)
p.SetCookies(resp.Cookies())

Expand Down
10 changes: 5 additions & 5 deletions core/downloader/downloader_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ import (
)

func TestDownloadHtml(t *testing.T) {
return
//return
//request := request.NewRequest("http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&callback=t13975294&id=23521&pagesize=45&dire=f&dpc=1")
var req *request.Request
req = request.NewRequest("http://live.sina.com.cn/zt/l/v/finance/globalnews1/", "html")
req = request.NewRequest("http://live.sina.com.cn/zt/l/v/finance/globalnews1/", "html", "", "GET", "", nil, nil, nil)

var dl downloader.Downloader
dl = downloader.NewHttpDownloader()
Expand Down Expand Up @@ -58,9 +58,9 @@ func TestDownloadHtml(t *testing.T) {
}

func TestDownloadJson(t *testing.T) {
return
//return
var req *request.Request
req = request.NewRequest("http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&id=23521&pagesize=4&dire=f&dpc=1", "json")
req = request.NewRequest("http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&id=23521&pagesize=4&dire=f&dpc=1", "json", "", "GET", "", nil, nil, nil)

var dl downloader.Downloader
dl = downloader.NewHttpDownloader()
Expand All @@ -81,7 +81,7 @@ func TestDownloadJson(t *testing.T) {
func TestCharSetChange(t *testing.T) {
var req *request.Request
//req = request.NewRequest("http://stock.finance.sina.com.cn/usstock/api/jsonp.php/t/US_CategoryService.getList?page=1&num=60", "jsonp")
req = request.NewRequest("http://soft.chinabyte.com/416/13164916.shtml", "html")
req = request.NewRequest("http://soft.chinabyte.com/416/13164916.shtml", "html", "", "GET", "", nil, nil, nil)

var dl downloader.Downloader
dl = downloader.NewHttpDownloader()
Expand Down
35 changes: 35 additions & 0 deletions core/scheduler/scheduler_simple.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// Copyright 2014 Hu Cong. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// The package is useless
package scheduler

import (
"github.com/hu17889/go_spider/core/common/request"
)

// SimpleScheduler keeps pending requests in a channel that is used as a queue.
type SimpleScheduler struct {
// queue holds the requests that were pushed and not yet polled.
queue chan *request.Request
}

// NewSimpleScheduler returns a SimpleScheduler whose queue can buffer up to
// 1024 requests.
func NewSimpleScheduler() *SimpleScheduler {
	return &SimpleScheduler{queue: make(chan *request.Request, 1024)}
}

// Push puts requ into the queue.
// It is a plain channel send, so it blocks while the queue's buffer is full.
func (this *SimpleScheduler) Push(requ *request.Request) {
this.queue <- requ
}

// Poll takes the next request out of the queue without blocking.
// It returns nil when the queue is empty.
func (this *SimpleScheduler) Poll() *request.Request {
	// A select with a default case checks and receives in one step. Checking
	// len() first and receiving afterwards could block forever if another
	// goroutine drained the queue between the two operations.
	select {
	case requ := <-this.queue:
		return requ
	default:
		return nil
	}
}

// Count returns the number of requests currently buffered in the queue.
func (this *SimpleScheduler) Count() int {
return len(this.queue)
}
4 changes: 2 additions & 2 deletions core/scheduler/scheduler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ import (

func TestQueueScheduler(t *testing.T) {
var r *request.Request
r = request.NewRequest("http://baidu.com", "html")
r = request.NewRequest("http://baidu.com", "html", "", "GET", make(map[string]string))
fmt.Printf("%v\n", r)

var s *scheduler.QueueScheduler
Expand All @@ -37,7 +37,7 @@ func TestQueueScheduler(t *testing.T) {
// remove duplicate
s = scheduler.NewQueueScheduler(true)

r2 := request.NewRequest("http://qq.com", "html")
r2 := request.NewRequest("http://qq.com", "html", "", "GET", make(map[string]string))
s.Push(r)
s.Push(r2)
s.Push(r)
Expand Down
63 changes: 47 additions & 16 deletions core/spider/spider.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"github.com/hu17889/go_spider/core/pipeline"
"github.com/hu17889/go_spider/core/scheduler"
"math/rand"
//"net/http"
"time"
//"fmt"
)
Expand Down Expand Up @@ -71,23 +72,44 @@ func (this *Spider) Taskname() string {
return this.taskname
}

// Deal with one url and return the PageItems
// Deal with one url and return the PageItems.
func (this *Spider) Get(url string, respType string) *page_items.PageItems {
var urls []string
urls = append(urls, url)
items := this.GetAll(urls, respType)
req := request.NewRequest(url, respType, "", "GET", "", nil, nil, nil)
return this.GetByRequest(req)
}

// GetAll deals with several urls and returns the PageItems slice.
// Every url is added as a GET request with the given respType, a collecting
// pipeline is attached to the spider, the spider is run, and whatever that
// pipeline collected is returned.
func (this *Spider) GetAll(urls []string, respType string) []*page_items.PageItems {
	for i := range urls {
		this.AddRequest(request.NewRequest(urls[i], respType, "", "GET", "", nil, nil, nil))
	}

	collector := pipeline.NewCollectPipelinePageItems()
	this.AddPipeline(collector)
	this.Run()

	return collector.GetCollected()
}

// GetByRequest deals with one caller-built request and returns its PageItems.
// It returns nil when nothing was collected.
func (this *Spider) GetByRequest(req *request.Request) *page_items.PageItems {
	items := this.GetAllByRequest([]*request.Request{req})
	if len(items) == 0 {
		return nil
	}
	return items[0]
}

// Deal with several urls and return the PageItems slice
func (this *Spider) GetAll(urls []string, respType string) []*page_items.PageItems {
func (this *Spider) GetAllByRequest(reqs []*request.Request) []*page_items.PageItems {
// push url
for _, u := range urls {
req := request.NewRequest(u, respType)
this.addRequest(req)
for _, req := range reqs {
//req := request.NewRequest(u, respType, urltag, method, postdata, header, cookies)
this.AddRequest(req)
}

pip := pipeline.NewCollectPipelinePageItems()
Expand Down Expand Up @@ -236,29 +258,38 @@ func (this *Spider) sleep() {
}

func (this *Spider) AddUrl(url string, respType string) *Spider {
req := request.NewRequest(url, respType)
this.addRequest(req)
req := request.NewRequest(url, respType, "", "GET", "", nil, nil, nil)
this.AddRequest(req)
return this
}

func (this *Spider) AddUrls(urls []string, respType string) *Spider {
for _, url := range urls {
req := request.NewRequest(url, respType)
this.addRequest(req)
req := request.NewRequest(url, respType, "", "GET", "", nil, nil, nil)
this.AddRequest(req)
}
return this
}

// add Request to Schedule
func (this *Spider) addRequest(req *request.Request) {
func (this *Spider) AddRequest(req *request.Request) *Spider {
if req == nil {
mlog.LogInst().LogError("request is nil")
return
return this
} else if req.GetUrl() == "" {
mlog.LogInst().LogError("request is empty")
return
return this
}
this.pScheduler.Push(req)
return this
}

// AddRequests adds several requests to the spider, passing each one to
// AddRequest in order, and returns the spider so calls can be chained.
func (this *Spider) AddRequests(reqs []*request.Request) *Spider {
	for i := range reqs {
		this.AddRequest(reqs[i])
	}
	return this
}

// core processer
Expand All @@ -274,7 +305,7 @@ func (this *Spider) pageProcess(req *request.Request) {
this.pPageProcesser.Process(p)
for _, req := range p.GetTargetRequests() {
//fmt.Printf("%v\n",req)
this.addRequest(req)
this.AddRequest(req)
}

// output
Expand Down

0 comments on commit ffedfe4

Please sign in to comment.