/
article.go
135 lines (128 loc) · 3.6 KB
/
article.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
package ptt
import (
"bytes"
"errors"
"fmt"
"regexp"
"strings"
"github.com/PuerkitoBio/goquery"
)
// Comment is a single push (reader reply) scraped from an article page.
//
// NewArticle fills the fields positionally, so their order must not change.
type Comment struct {
	// PushTag is the trimmed text of the push's span.push-tag element.
	// NewArticle counts "推" and "噓" separately; any other value is
	// counted as "other".
	PushTag string

	// PushUserID is the trimmed text of the push's span.push-userid element.
	PushUserID string

	// PushContent is the text of span.push-content with the surrounding
	// colon and whitespace trimmed.
	PushContent string

	// PushIpdatetime is the trimmed text of span.push-ipdatetime
	// (presumably the commenter's IP and/or timestamp as shown on the page).
	PushIpdatetime string
}
// Article is the parsed form of one article page, as produced by NewArticle.
type Article struct {
	// Title, Author and Date are the values of the "標題", "作者" and "時間"
	// metalines of the page, respectively.
	Title  string
	Author string
	Date   string

	// Content is what remains of the main content after the metalines,
	// pushes and footer lines have been stripped.
	Content string

	// Ip is the first dotted-quad address found on the "※ 發信站: " line,
	// or "" when there is none.
	Ip string

	// Comments holds the pushes in page order.
	Comments []Comment

	// All is the total number of pushes (P + B + N).
	All int
	// Count is the net score: P - B.
	Count int
	// P is the number of "推" pushes.
	P int
	// B is the number of "噓" pushes.
	B int
	// N is the number of pushes with any other tag.
	N int
}
var (
	// articleOriginLineRe matches from the "※ 發信站: " marker to the end of
	// that line; the poster's IP, when present, is looked up inside it.
	articleOriginLineRe = regexp.MustCompile("(※ 發信站: ).*")

	// articleIPv4Re matches a dotted-quad (IPv4-looking) address.
	articleIPv4Re = regexp.MustCompile(`[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+`)

	// articleTextEscaper escapes the characters that would be interpreted as
	// markup if plain text were handed to an HTML parser unchanged.
	articleTextEscaper = strings.NewReplacer("&", "&amp;", "<", "&lt;", ">", "&gt;")
)

// NewArticle downloads the page at url and parses it into an Article.
//
// The metalines (author, title, date) and the pushes are extracted and then
// removed from the document; the poster's IP is taken from the
// "※ 發信站: " line; what is left of div#main-content becomes Content.
//
// An error is returned when IsUrlValid rejects url, when GetDocument fails,
// when the page is the "404 - Not Found." page, or when the main content
// cannot be rendered. On error the returned *Article is nil.
func NewArticle(url string) (*Article, error) {
	// errors.New(fmt.Sprintf(...)) is kept (rather than fmt.Errorf) so the
	// messages stay byte-identical and the file's errors import stays in use.
	if ok, err := IsUrlValid(url); !ok {
		return nil, errors.New(fmt.Sprintf("Error: url %s invalid: %v", url, err))
	}

	doc, err := GetDocument(url)
	if err != nil {
		return nil, err
	}

	// The not-found page is recognised by the exact text of div.bbs-content.
	if doc.Find("div.bbs-content").Text() == "404 - Not Found." {
		return nil, errors.New(fmt.Sprintf("Error: url %s not found", url))
	}

	a := &Article{}
	mainContent := doc.Find("div#main-content")

	// Read author/title/date from the metalines, dropping each one from the
	// document afterwards so it does not end up in Content.
	mainContent.Find("div.article-metaline").Each(func(_ int, line *goquery.Selection) {
		value := line.Find("span.article-meta-value").Text()
		switch line.Find("span.article-meta-tag").Text() {
		case "作者":
			a.Author = value
		case "標題":
			a.Title = value
		case "時間":
			a.Date = value
		}
		line.Remove()
	})
	// The right-aligned metaline carries nothing we keep; just drop it.
	mainContent.Find("div.article-metaline-right").Remove()

	// Collect the pushes, tallying them by tag, and strip them from the
	// document as well.
	const blank = " \t\n\r"
	pushes := mainContent.Find("div.push")
	a.Comments = make([]Comment, pushes.Size())
	pushes.Each(func(i int, push *goquery.Selection) {
		c := Comment{
			PushTag:     strings.Trim(push.Find("span.push-tag").Text(), blank),
			PushUserID:  strings.Trim(push.Find("span.push-userid").Text(), blank),
			// The content span starts with ": ", so the colon is trimmed too.
			PushContent:    strings.Trim(push.Find("span.push-content").Text(), ":"+blank),
			PushIpdatetime: strings.Trim(push.Find("span.push-ipdatetime").Text(), blank),
		}
		switch c.PushTag {
		case "推":
			a.P++
		case "噓":
			a.B++
		default:
			a.N++
		}
		a.Comments[i] = c
		push.Remove()
	})
	// All is the total number of pushes; Count is the net score
	// (pushes minus boos).
	a.All = a.P + a.B + a.N
	a.Count = a.P - a.B

	// The IP is the first dotted quad on the "※ 發信站: " line; both lookups
	// yield "" when nothing matches, leaving a.Ip empty.
	rendered, err := mainContent.Html()
	if err != nil {
		return nil, err
	}
	a.Ip = articleIPv4Re.FindString(articleOriginLineRe.FindString(rendered))

	// Flatten the remaining markup: footer elements are removed, every other
	// element is replaced by its own text.
	mainContent.Find("*").Each(func(_ int, s *goquery.Selection) {
		text := s.Text()
		switch {
		case strings.Contains(text, "※ 發信站:"),
			strings.Contains(text, "※ 文章網址:"),
			strings.Contains(text, "※ 編輯:"):
			s.Remove()
		default:
			// Text() is plain text but ReplaceWithHtml parses markup, so
			// escape it; otherwise a literal "<", ">" or "&" in the article
			// would be re-interpreted as a tag or entity.
			s.ReplaceWithHtml(articleTextEscaper.Replace(text))
		}
	})

	content, err := mainContent.Html()
	if err != nil {
		return nil, err
	}
	// Strip the dash separator and surrounding line breaks/tabs.
	a.Content = strings.Trim(content, "-\t\n\r")
	return a, nil
}
// String formats the comment on one line as
//
//	"tag" "user": "content"<TAB>"ipdatetime"
//
// with every field Go-quoted (%q). It is declared on *Comment, so only
// pointers to Comment satisfy fmt.Stringer.
func (c *Comment) String() string {
	const layout = "%q %q: %q\t%q"
	return fmt.Sprintf(layout, c.PushTag, c.PushUserID, c.PushContent, c.PushIpdatetime)
}
// String renders the article as multi-line text:
//
//	"<title>"
//	作者: "<author>", 日期: "<date>"
//	"<content>"
//	來源: "<ip>"
//	推文數: <P>, 噓文數: <B>, 其他: <N>
//	<one line per comment, formatted by (*Comment).String>
//
// Every line, including the last, ends with a newline.
func (a *Article) String() string {
	var buf bytes.Buffer

	fmt.Fprintf(&buf, "%q\n作者: %q, 日期: %q\n", a.Title, a.Author, a.Date)
	fmt.Fprintf(&buf, "%q\n來源: %q\n", a.Content, a.Ip)
	fmt.Fprintf(&buf, "推文數: %v, 噓文數: %v, 其他: %v\n", a.P, a.B, a.N)

	// Iterate by index and call String explicitly: String is declared on
	// *Comment, so handing a Comment value to fmt would bypass it and dump
	// the raw struct instead.
	for i := range a.Comments {
		buf.WriteString(a.Comments[i].String())
		buf.WriteByte('\n')
	}
	return buf.String()
}