Skip to content

Commit

Permalink
Fix some bugs
Browse files Browse the repository at this point in the history
darcs-hash:20051013162638-c2a52-6cf82b8e16c90587ec62de5bb8052a24f88e2596.gz
  • Loading branch information
jgoerzen committed Oct 13, 2005
1 parent 38a3cc5 commit a055872
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 7 deletions.
2 changes: 2 additions & 0 deletions RobotsTxt.hs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

module RobotsTxt where

-- FIXME: should only consider first user-agent match?

import Text.ParserCombinators.Parsec
import Data.Maybe
import MissingH.Str
Expand Down
23 changes: 16 additions & 7 deletions gopherbot.hs
Original file line number Diff line number Diff line change
Expand Up @@ -83,17 +83,23 @@ procLoop' lock gasupply c hl i =
i <- popItem lock gasupply c hl
--delHost hl (host item)
procLoop' lock gasupply c hl i
-- | Result of consulting a host's robots.txt for a particular item.
data RobotStatus = RobotsOK    -- ^ Proceed: robots.txt permits this item
                 | RobotsDeny  -- ^ Stop: robots.txt disallows this item
                 | RobotsError -- ^ Error occurred fetching robots.txt; abort
    deriving (Eq, Show)


-- | Check the robots.txt policy governing the given address.
--
-- The host's robots.txt is fetched (via 'procItem') the first time it is
-- needed and cached on disk; later calls parse the cached copy.  Returns:
--
--   * 'RobotsOK'    -- the path is allowed for user-agent \"gopherbot\"
--   * 'RobotsDeny'  -- the path is disallowed by robots.txt
--   * 'RobotsError' -- robots.txt could not be obtained (the cache file is
--     still absent after the fetch attempt), so callers should treat the
--     host as having connection problems rather than crawl it blindly
checkRobots :: Lock -> Connection -> GAddress -> IO RobotStatus
checkRobots lock c ga =
    do let fspath = getFSPath garobots
       dfe <- doesFileExist fspath
       -- Fetch robots.txt only when no cached copy exists yet.
       unless dfe (procItem lock c garobots)
       -- Re-check: if the file still doesn't exist, the fetch failed.
       dfe2 <- doesFileExist fspath
       if dfe2
          then do r <- parseRobots fspath
                  return $ if isURLAllowed r "gopherbot" (path ga)
                              then RobotsOK
                              else RobotsDeny
          else return RobotsError
    where -- robots.txt selector on the same host, gopher item type '0' (file).
          -- NOTE(review): selector has no leading slash -- presumably matches
          -- this crawler's selector conventions; confirm against getFSPath.
          garobots = ga {path = "robots.txt", dtype = '0'}

-- | Run the given action only if robots.txt permits access to the item.
--
-- On 'RobotsDeny' the item is recorded as 'ErrorState' (excluded, not
-- retried); on 'RobotsError' the whole host is flagged via
-- 'noteErrorOnHost' so we stop hammering a host whose robots.txt we
-- cannot retrieve.
procIfRobotsOK :: Lock -> Connection -> GAddress -> IO () -> IO ()
procIfRobotsOK lock c item action =
    do r <- if path item /= "robots.txt"
               then checkRobots lock c item
               -- Never run the robots check on robots.txt itself, or we
               -- would try to re-process it recursively.
               else return RobotsOK
       case r of
         RobotsOK    -> action
         RobotsDeny  -> do msg $ "Excluded by robots.txt: " ++ show item
                           updateItem lock c item ErrorState
         RobotsError -> do msg $ "Blocking host due to connection problems with robots.txt: " ++ host item
                           noteErrorOnHost lock c (host item)


spider l c fspath =
Expand Down

0 comments on commit a055872

Please sign in to comment.