Permalink
Switch branches/tags
Nothing to show
Find file Copy path
8c44fe7 May 21, 2011
109 lines (93 sloc) 2.56 KB
#!/bin/python
import urllib2
import sys
from BeautifulSoup import BeautifulSoup
def strip_brackets(string):
"""
remove brackets from a string
leave brackets between "<a></a>" tags in place
hihi, this is like an automata
"""
string = "" + str(string)
#print "input: ",string
d = 0
k = 0
out = ''
for i in string:
#check for tag when not in parantheses mode
if d < 1:
if i == '>':
k-=1
if i =="<":
k += 1
#check for parentheses
if k < 1:
if i == '(':
d += 1
if d > 0:
out += ' '
else:
out += i
if i == ')' :
d -= 1
else:
out +=i
#print "output: ",out
return out
class PhilosophyGame():
"""
This class folows a given, or in other case random wikipedia article
and folows it's trace to the philosophy page
each time following the first link in the content, skipping links between parentheses
based on http://imgur.com/mqlKD
depends on beatifulsoup
still has some utf8 problems
"""
def __init__(self,prefix="http://en.wikipedia.org",userAgent='Mozilla/5.0'):
self.opener = urllib2.build_opener()
self.opener.addheaders = [('User-agent', userAgent)]
self.prefix = prefix
def trace(self,article):
"""
trace the first link in each article, that's not between parentheses
"""
print article
#TODO: check for loops?
#and maybe fix them? :p
#currently loops between Phonetic_transcription and International_Phonetic_Alphabet
#TODO: use dynamic computing (caching of results)
resource = self.opener.open(article)
data = resource.read()
resource.close()
soup = BeautifulSoup(data)
for i in soup.find('div',id="bodyContent").findAll({'ul' : True, 'p' : True},recursive=False):
#find first link here that isn't in parenthesis
i = BeautifulSoup(strip_brackets(i))
#print i
for j in i.findAll('a'):
k = 0
for val,att in j.attrs:
if val =="href":
nexturl =att
if val =="title":
next = att
k=1
if k==0: #citations or something, no title, skipp
continue
if next == "Philosophy":
print "You have arrived"
return
else:
if not nexturl.startswith("http://"):
nexturl = self.prefix + nexturl
else: #prefix sometimes switches between wicktionary and wikipedia
self.prefix = nexturl.rpartition("/wiki/")[0]
self.trace(nexturl)
return
if __name__ == "__main__":
game = PhilosophyGame()
if len(sys.argv) == 1 :
game.trace("http://en.wikipedia.org/wiki/Special:Random")
else:
for i in sys.argv[1:]:
game.trace("http://en.wikipedia.org/wiki/" + i)