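# NOTE: the five lines below are page chrome from the code-hosting site this
# file was copied from; they are not part of the program.
#   Switch branches/tags
#   Nothing to show
#   Find file Copy path
#   8c44fe7 May 21, 2011
#   109 lines (93 sloc) 2.56 KB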
import urllib2
import sys
from BeautifulSoup import BeautifulSoup
def strip_brackets(string):
    """Blank out parenthesised text while leaving HTML tag markup intact.

    The input is converted with ``str()`` and scanned one character at a
    time by a small two-counter state machine:

    * while inside ``<`` ... ``>`` (tag markup) every character is copied
      unchanged, so parentheses inside e.g. an ``href`` value survive;
    * outside tag markup, every character from an opening ``(`` up to and
      including its matching ``)`` is replaced by a single space, so the
      result has the same length as the input.  Tags that occur inside
      parentheses are blanked together with the rest of the parenthesised
      text.

    Unbalanced ``)`` or ``>`` characters are copied through and do not
    affect how later text is handled.

    NOTE(review): the copy of this file under review had lost the body of
    the ``'>'`` check and both ``else`` branches; they are reconstructed
    here as the mirror image of the ``'<'`` / ``'('`` handling.

    :param string: any object; its ``str()`` form is processed.
    :return: the processed text as a ``str``.
    """
    text = str(string)
    paren_depth = 0  # nesting depth of "(" ... ")" seen outside tag markup
    tag_depth = 0    # greater than zero while between "<" and ">"
    pieces = []
    for char in text:
        # Tag delimiters only count when we are not inside parentheses.
        if paren_depth < 1:
            # Ignore a stray ">" so the counter can never go negative and
            # hide the next real tag.
            if char == '>' and tag_depth > 0:
                tag_depth -= 1
            if char == '<':
                tag_depth += 1
        if tag_depth < 1:
            if char == '(':
                paren_depth += 1
            if paren_depth > 0:
                pieces.append(' ')
            else:
                pieces.append(char)
            # Only close a parenthesis that was actually opened; a stray
            # ")" must not stop later "(...)" groups from being blanked.
            if char == ')' and paren_depth > 0:
                paren_depth -= 1
        else:
            pieces.append(char)
    return ''.join(pieces)
class PhilosophyGame():
    """Follow first links from a Wikipedia article towards "Philosophy".

    Starting from a given article, each step fetches the page and follows
    the first link in the content that is not between parentheses, until a
    link titled "Philosophy" is found.

    Depends on BeautifulSoup.  Still has some UTF-8 problems.

    NOTE(review): this class was reconstructed from a copy of the file that
    had lost its indentation, its docstring quotes and several statements
    (the page fetch, the step that follows the chosen link, and the source
    named after "based on").  Reconstructed parts are marked below.
    """

    def __init__(self, prefix="", userAgent='Mozilla/5.0'):
        """Create the URL opener used for every page fetch.

        :param prefix: string prepended to links that do not start with
            ``http://`` or ``https://``.
            NOTE(review): presumably meant to be the site root (scheme and
            host); the default is empty in this copy -- confirm.
        :param userAgent: value sent as the ``User-agent`` request header.
        """
        self.opener = urllib2.build_opener()
        self.opener.addheaders = [('User-agent', userAgent)]
        self.prefix = prefix

    def trace(self, article):
        """Print each visited article, following first links until done.

        Stops when a link titled "Philosophy" is reached, when a page has
        no usable link, or when an article comes up a second time (the
        source notes a known loop between Phonetic_transcription and
        International_Phonetic_Alphabet).

        :param article: address of the first article, passed as-is to the
            opener.
        :return: None
        """
        #TODO: use dynamic computing (caching of results)
        visited = set()
        while article is not None:
            if article in visited:
                print("Loop detected at " + article)
                return
            visited.add(article)
            print(article)
            article = self._next_article(article)

    def _next_article(self, article):
        """Return the address of the first non-parenthesised link, or None."""
        # NOTE(review): the fetch expressions were missing from the copy
        # under review; opening `article` directly is an assumption.
        resource = self.opener.open(article)
        try:
            data = resource.read()
        finally:
            resource.close()
        soup = BeautifulSoup(data)
        content = soup.find('div', id="bodyContent")
        if content is None:
            # No content container on this page: nothing to follow.
            return None
        sections = content.findAll({'ul': True, 'p': True}, recursive=False)
        for section in sections:
            #find first link here that isn't in parenthesis
            section = BeautifulSoup(strip_brackets(section))
            for link in section.findAll('a'):
                # Read the attributes fresh for every link so values from an
                # earlier link can never leak into this one.
                attributes = dict(link.attrs)
                target = attributes.get("href")
                title = attributes.get("title")
                if target is None or title is None:
                    #citations or something, no title, skip
                    continue
                if title == "Philosophy":
                    print("You have arrived")
                    return None
                if not target.startswith(("http://", "https://")):
                    target = self.prefix + target
                else:
                    #prefix sometimes switches between wiktionary and wikipedia
                    site, found, _ = target.rpartition("/wiki/")
                    if found:
                        # Keep the old prefix when the link has no "/wiki/"
                        # segment instead of resetting it to "".
                        self.prefix = site
                return target
        return None
# Script entry point: trace every article named on the command line.
if __name__ == "__main__":
    game = PhilosophyGame()
    if len(sys.argv) == 1:
        # NOTE(review): the body of this branch was missing from the copy
        # under review.  The class description mentions starting from a
        # random article when none is given -- confirm the intended
        # behaviour; until then just explain how to call the script.
        sys.stderr.write("usage: %s ARTICLE [ARTICLE ...]\n" % sys.argv[0])
    for article in sys.argv[1:]:
        # NOTE(review): the empty string is kept exactly as found; it
        # presumably once held a base address for articles -- confirm.
        game.trace("" + article)