-
Notifications
You must be signed in to change notification settings - Fork 3
/
redditFetchGorp.go
295 lines (257 loc) · 8.72 KB
/
redditFetchGorp.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
package main
import (
	"database/sql"
	"errors"
	"fmt"
	"io"
	"log"
	"net/http"
	"strings"
	"unicode"

	"github.com/PuerkitoBio/goquery"
	_ "github.com/go-sql-driver/mysql"
	"github.com/kimxilxyong/gorp"
	"github.com/kimxilxyong/intogooglego/post"
	_ "github.com/lib/pq"
)
// DebugLevel controls how much debug info is printed to stdout
// (0: off, 1: error, 2: warning, 3: info, 4: debug). Default is 3 (info).
var DebugLevel int = 3
// RedditPostScraper fetches the "new" listing of the given subreddit,
// parses all posts out of the returned HTML and stores them in the
// database: posts not yet present are inserted, already-known posts get
// their score updated when it changed. Errors affecting only a single
// post are logged and skipped; any infrastructure error (DB, HTTP,
// parse) aborts the run and is returned.
func RedditPostScraper(sub string) (err error) {
	//drivername := "postgres"
	//dsn := "user=golang password=golang dbname=golang sslmode=disable"
	//dialect := gorp.PostgresDialect{}
	drivername := "mysql"
	dsn := "golang:golang@/golang?parseTime=true"
	dialect := gorp.MySQLDialect{"InnoDB", "UTF8"}

	// connect to db using standard Go database/sql API
	db, err := sql.Open(drivername, dsn)
	if err != nil {
		return errors.New("sql.Open failed: " + err.Error())
	}
	// Open doesn't open a connection. Validate DSN data:
	if err = db.Ping(); err != nil {
		return errors.New("db.Ping failed: " + err.Error())
	}

	// construct a gorp DbMap
	dbmap := &gorp.DbMap{Db: db, Dialect: dialect}
	defer dbmap.Db.Close()
	dbmap.DebugLevel = DebugLevel
	// Will log all SQL statements + args as they are run
	// The first arg is a string prefix to prepend to all log messages
	//dbmap.TraceOn("[gorp]", log.New(os.Stdout, "fetch:", log.Lmicroseconds))

	// register the structs you wish to use with gorp
	// you can also use the shorter dbmap.AddTable() if you
	// don't want to override the table name
	tablename := "posts_reddit_test"

	// SetKeys(true) means we have a auto increment primary key, which
	// will get automatically bound to your struct post-insert
	table := dbmap.AddTableWithName(post.Post{}, tablename)
	table.SetKeys(true, "PID")

	// create the table. in a production system you'd generally
	// use a migration tool, or create the tables via scripts
	if err = dbmap.CreateTablesIfNotExists(); err != nil {
		return errors.New("Create tables failed: " + err.Error())
	}
	// Force create all indexes for this database
	if err = dbmap.CreateIndexes(); err != nil {
		return errors.New("Create indexes failed: " + err.Error())
	}

	// Get data from reddit
	geturl := "http://www.reddit.com/r/" + sub + "/new"
	resp, err := http.Get(geturl)
	if err != nil {
		return errors.New("Failed to http.Get from " + geturl + ": " + err.Error())
	}
	if resp == nil {
		return errors.New("Response from " + geturl + " is nil!")
	}
	if resp.Body == nil {
		return errors.New("Body from " + geturl + " is nil!")
	}
	defer resp.Body.Close()
	if resp.StatusCode != 200 { // 200 = OK
		return fmt.Errorf("Failed to http.Get from %s: Http Status code: %d: Msg: %s", geturl, resp.StatusCode, resp.Status)
	}

	// Create a new post slice and then parse the response body into ps
	ps := make([]post.Post, 0)
	ps, err = ParseHtmlReddit(resp.Body, ps)
	if err != nil {
		return errors.New("Error in RedditParseHtml: " + geturl + ": " + err.Error())
	}

	foundnewposts := false
	updatedposts := 0

	// insert rows - auto increment PKs will be set properly after the insert
	for _, htmlpost := range ps {
		if htmlpost.Err != nil {
			// A broken single post does not abort the whole run
			if DebugLevel > 1 {
				fmt.Println("Single post error in " + geturl + ": " + htmlpost.Err.Error())
			}
			continue
		}
		// Store reddit sub
		htmlpost.PostSub = sub

		// check if post already exists
		var postcount int
		intSelectResult := make([]int, 0)
		postcountsql := "select count(*) from " + dbmap.Dialect.QuoteField(tablename) +
			" where WebPostId = :post_id"
		_, err := dbmap.Select(&intSelectResult, postcountsql, map[string]interface{}{
			"post_id": htmlpost.WebPostId,
		})
		if err != nil {
			return fmt.Errorf("Query: %s failed: %s\n", postcountsql, err.Error())
		}
		if len(intSelectResult) == 0 {
			return fmt.Errorf("Query: %s returned no result\n", postcountsql)
		}
		postcount = intSelectResult[0]

		// DEBUG
		if DebugLevel > 3 {
			fmt.Println("HTMLpost.WebPostId: " + htmlpost.WebPostId)
			fmt.Printf("HTMLpost.Id: %v\n", htmlpost.Id)
			fmt.Printf("DBpost count: %v \n", postcount)
		}

		// New post? then insert
		if postcount == 0 {
			foundnewposts = true
			err = dbmap.Insert(&htmlpost)
			if DebugLevel > 2 {
				// Print out the crawled info
				fmt.Println("----------- INSERT POST START -----------------")
				fmt.Println(htmlpost.String("insert"))
			}
			if err != nil {
				return errors.New("insert into table " + dbmap.Dialect.QuoteField(tablename) + " failed: " + err.Error())
			}
			if DebugLevel > 2 {
				// Print out the end of the crawled info
				fmt.Println("----------- INSERT POST END -------------------")
			}
		} else {
			// Post already exists, do an update if the score changed
			dbposts := make([]post.Post, 0)
			getpostsql := "select * from " + dbmap.Dialect.QuoteField(tablename) + " where WebPostId = :post_id"
			_, err := dbmap.Select(&dbposts, getpostsql, map[string]interface{}{
				"post_id": htmlpost.WebPostId,
			})
			if err != nil {
				// BUGFIX: the original format string had one verb but two
				// arguments, losing the underlying error; report it properly.
				return fmt.Errorf("Getting WebPostId %s from DB failed: %s\n", htmlpost.WebPostId, err.Error())
			}
			var dbpost post.Post
			if len(dbposts) > 0 {
				dbpost = dbposts[0]
			} else {
				return fmt.Errorf("Query: %s returned no result\n", getpostsql)
			}

			// DEBUG
			if DebugLevel > 3 {
				fmt.Printf("DBPOST: %s\n", dbpost.String("dbpost"))
				fmt.Printf("DBpost.Id: %v\n", dbpost.Id)
				fmt.Printf("DBpost.Score: %v\n", dbpost.Score)
			}

			// Only touch the row when the score actually changed
			if htmlpost.Score != dbpost.Score {
				if DebugLevel > 2 {
					// Print out the update info
					fmt.Println("----------- UPDATE POST START -----------------")
					fmt.Println("Title: " + dbpost.Title)
					fmt.Printf("Id: %v\n", dbpost.Id)
					fmt.Printf("From score %d to score %d\n", dbpost.Score, htmlpost.Score)
					fmt.Println("----------- UPDATE POST END -------------------")
				}
				dbpost.Score = htmlpost.Score
				affectedrows, err := dbmap.Update(&dbpost)
				switch {
				case err != nil:
					return errors.New("update table " + tablename + " failed: " + err.Error())
				case affectedrows == 0:
					return fmt.Errorf("update table %s for Id %d did not affect any lines", tablename, dbpost.Id)
				default:
					updatedposts++
				}
			}
		}
	}

	if !foundnewposts {
		if DebugLevel > 2 {
			fmt.Println("No new posts found at " + geturl)
		}
	}
	if updatedposts > 0 {
		if DebugLevel > 2 {
			fmt.Printf("%d posts have been updated from %s\n", updatedposts, geturl)
		}
	}
	return
}
// ParseHtmlReddit parses reddit listing HTML read from r and appends every
// recognized post to ps, returning the grown slice. Errors which affect
// only a single post are stored in that post's Err field; the post is
// still appended. (The parameter was renamed from `io`, which shadowed
// the io package, and the local from `post`, which shadowed the post
// package.)
func ParseHtmlReddit(r io.Reader, ps []post.Post) (psout []post.Post, err error) {
	// Create a goquery document to parse from an io.Reader
	doc, err := goquery.NewDocumentFromReader(r)
	if err != nil {
		return ps, errors.New("Failed to parse HTML: " + err.Error())
	}

	// Find reddit posts = elements with class "thing"
	thing := doc.Find(".thing")
	for iThing := range thing.Nodes {
		// Create a new post struct - if the crawling fails the post will
		// have an Err attached but will be appended to ps nevertheless
		p := post.NewPost()
		p.Site = "reddit"

		// use `singlething` as a selection of one single post
		singlething := thing.Eq(iThing)

		// get the reddit post identifier
		redditPostID, exists := singlething.Attr("data-fullname")
		if !exists {
			singlehtml, _ := singlething.Html()
			p.Err = fmt.Errorf("data-fullname not found in %s", singlehtml)
		} else {
			p.WebPostId = redditPostID
			// find an element with class title and a child with class may-blank
			// and remove CRLF and unnecessary whitespaces
			p.Title = stringMinifier(singlething.Find(".title .may-blank").Text())
			// Get the post user
			p.User = singlething.Find(".author").Text()
			// Get the post url
			p.Url, _ = singlething.Find(".comments.may-blank").Attr("href")
			// Get the post likes score
			p.SetScore(singlething.Find(".score.likes").Text())
			// Get the post date
			redditPostDate, exists := singlething.Find("time").Attr("datetime")
			if !exists {
				singlehtml, _ := singlething.Html()
				p.Err = fmt.Errorf("datetime not found in %s", singlehtml)
			} else {
				p.SetPostDate(redditPostDate)
			}
		}
		ps = append(ps, p)
	}
	return ps, err
}
// stringMinifier collapses every run of consecutive Unicode whitespace
// characters (spaces, tabs, CR/LF, ...) in the input into a single space
// and returns the result. Non-whitespace runes are copied unchanged.
func stringMinifier(in string) (out string) {
	// Use strings.Builder instead of repeated string concatenation:
	// `out += ...` in a loop reallocates and is quadratic in len(in).
	var b strings.Builder
	b.Grow(len(in))
	white := false
	for _, c := range in {
		if unicode.IsSpace(c) {
			// Emit one space at the start of each whitespace run only.
			if !white {
				b.WriteByte(' ')
			}
			white = true
		} else {
			b.WriteRune(c)
			white = false
		}
	}
	return b.String()
}
// main scrapes the "golang" subreddit and terminates the process with a
// non-zero exit code on failure (when debug output is enabled).
func main() {
	if err := RedditPostScraper("golang"); err != nil {
		if DebugLevel > 0 {
			// log.Fatalln calls os.Exit(1); the panic(err) that used to
			// follow it was unreachable dead code and has been removed.
			log.Fatalln("Failed to fetch from sub reddit golang: ", err)
		}
	}
}