-
Notifications
You must be signed in to change notification settings - Fork 1
/
gopherbot.hs
222 lines (196 loc) · 9.18 KB
/
gopherbot.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
{-
Copyright (C) 2005 John Goerzen <jgoerzen@complete.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-}
module Main where
import Config
import Types
import Control.Monad(when, unless)
import Control.Exception(finally, bracket)
import System.Directory
import DB
import Database.HDBC
import Utils
import MissingH.Path.FilePath
import MissingH.Network
import NetClient
import DirParser
import Control.Concurrent
import Data.List
import Control.Exception(bracket_)
import RobotsTxt
import Control.Concurrent
import System.IO
import qualified Data.Map as Map
{- | Main entry point for the program. -}
main = niceSocketsDo $ -- Prepare things for sockets
do setCurrentDirectory baseDir -- chdir to the working dir
l <- newLock -- Global lock for db updates
initdb -- Initialize the database and get a conn
gasupply <- newMVar Map.empty -- Global MVar for current status
runScan gasupply l -- main scanner
{- | Set up all the threads and get them going. -}
runScan gasupply l =
do c <- dbconnect
n <- numToProc c
msg $ (show n) ++ " items to process"
when (n == 0) -- Nothing to do: prime the db
(mapM_ (\g -> updateItem l c g NotVisited "") startingAddresses)
{- Fork off the childthreads. Each one goes into a loop
of waiting for new items to process and processing them. -}
disconnect c
children <- mapM
(\_ -> myForkOS (procLoop l gasupply)) [1..numThreads]
-- This is the thread that displays status updates every so often
--stats <- forkOS (statsthread l)
-- When the main thread exits, so does the program, so
-- we wait for all children before exiting.
waitForChildren children
{- | A simple wrapper around forkOS to notify the main thread when each
individual thread dies. -}
myForkOS :: IO () -> IO (MVar ThreadId)
myForkOS io =
do mvar <- newEmptyMVar
forkIO (action `finally` (myThreadId >>= putMVar mvar))
return mvar
where action = do msg "started."
io
{- | Wait for child threads to die.
This should only happen when there is nothing else to spider. -}
waitForChildren :: [MVar ThreadId] -> IO ()
waitForChildren [] = msg $ "All children died; exiting."
waitForChildren (c:xs) =
do t <- takeMVar c
msg $ " *********** Thread died: " ++ (show t)
waitForChildren xs
{- | Main entry point for each worker thread. We just pop the first item,
then call procLoop'. -}
procLoop lock gasupply =
do bracket dbconnect disconnect (\c -> do
i <- popItem lock gasupply c
procLoop' lock gasupply c i
)
{- | Main worker loop. We receive an item and process it. If it's
Nothing, there is nothing else to do, so the thread shuts down.
Otherwise, call procItem, pop the next, and then call itself. -}
procLoop' lock gasupply c i =
do case i of
Nothing -> msg $ "Exiting"
Just item -> do procItem lock gasupply c item
-- Popping the next item before releasing the current
-- host is a simple form of being nice to remotes
i <- popItem lock gasupply c
procLoop' lock gasupply c i
{- | What happened when we checked the robots.txt file? -}
data RobotStatus = RobotsOK -- ^ Proceed
| RobotsDeny -- ^ Do not download this file
| RobotsError -- ^ Error occured; abort.
{- | Given a 'GAddress' (corresponding to a single item),
check to see if it's OK to download according to robots.txt.
-}
checkRobots :: Lock -> GASupply -> Connection -> GAddress -> IO RobotStatus
checkRobots lock gasupply c ga =
do let fspath = getFSPath garobots
dfe <- doesFileExist fspath
unless (dfe) (procItem lock gasupply c garobots) -- Download file if needed
dfe2 <- doesFileExist fspath -- Do we have it yet?
if dfe2
then -- Yes. Parse it, and see what happened.
do r <- parseRobots fspath
return $ case isURLAllowed r "gopherbot" (path ga) of
True -> RobotsOK
False -> RobotsDeny
else return RobotsError -- No. TCP error occured.
where garobots = ga {path = "robots.txt", dtype = '0'}
{- | Run an IO action, but only if it's OK according to robots.txt. -}
procIfRobotsOK :: Lock -> GASupply -> Connection -> GAddress -> IO () -> IO ()
procIfRobotsOK lock gasupply c item action =
do r <- if (path item /= "robots.txt")
then checkRobots lock gasupply c item
else -- Don't try to check if robots.txt itself is ok
return RobotsOK
case r of
RobotsOK -> action -- OK per robots.txt; run it.
RobotsDeny -> do msg $ "Excluded by robots.txt: " ++
(show item)
updateItem lock c item Excluded ""
RobotsError -> do msg $ "Problem getting robots.txt: " ++
host item
-- Next line not necessary; procItem
-- on robots.txt will have done it
-- already.
--noteErrorOnHost lock c (host item) (show msg)
-- TODO: better crash handling on robots.txt
{- | OK, we have an item. If it's OK according to robots.txt, download
and process it. -}
procItem lock gasupply c item = procIfRobotsOK lock gasupply c item $
do msg $ show item -- Show what we're up to
let fspath = getFSPath item
-- Create the directory for the file to go in, if necessary.
catch (bracket_ (acquire lock)
(release lock)
(createDirectoryIfMissing True
(fst . splitFileName $ fspath)))
(\e -> -- If we got an exception here, note an error for this item
do msg $ "Single-Item Error on " ++ (show item) ++ ": "
++ (show e)
updateItem lock c item ErrorState (show e)
)
fh <- catch ((openFile fspath WriteMode >>= (return . Just)))
(\e -> do msg $ "Single-item error on " ++ (show item) ++ ": " ++
(show e)
updateItem lock c item ErrorState (show e)
return Nothing
)
case fh of
Nothing -> return ()
Just h -> -- Now, download it. If it's a menu
--(item type 1), check it for links
-- (spider it). Error here means a TCP
-- problem, so mark every
-- item on this host as having the error.
catch (do dlItem item h
when (dtype item == '1') (spider lock c fspath)
updateItem lock c item Visited ""
)
(\e -> do msg $ "Error on " ++ (show item)
++ ": " ++ (show e)
noteErrorOnHost lock gasupply c
(host item) (show e)
)
{- | This function is called by procItem whenever it downloads a
menu. This function calles the parser, extracts items, and calles
DB.queueItems to handle them. (Insert into DB if new) -}
spider l c fspath =
do netreferences <- parseGMap fspath
let refs = filter filt netreferences
queueItems l c refs
where filt a = (not ((dtype a) `elem` ['i', '3', '8', '7', '2'])) &&
not (host a `elem` excludeServers)
{- | This thread prints a periodic status update. -}
statsthread :: Lock -> IO ()
statsthread l =
do c <- dbconnect
statsthread' l c
disconnect c
statsthread' l c =
do res <- quickQuery c "SELECT state, COUNT(*) from files group by state order by state" []
let counts = map (\[thiss, thisc] -> (fromSql thiss,
(fromSql thisc)::Integer)) res
let total = sum (map snd counts)
let totaltext = "Total " ++ show total
let statetxts = map (\(h, c) -> h ++ " " ++ show c) counts
let disp = concat . intersperse ", " $ totaltext : statetxts
msg disp
threadDelay (120 * 1000000)
statsthread' l c