Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
branch: master
Fetching contributors…

Cannot retrieve contributors at this time

executable file 127 lines (95 sloc) 2.91 kb
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
#!/usr/bin/env python

#
# IANAPP (i am not a python programmer)
#


import cgi
import os
from sgmllib import SGMLParser

from google.appengine.api import urlfetch
from google.appengine.ext import webapp
from google.appengine.ext.webapp.util import run_wsgi_app
from google.appengine.ext import db
from google.appengine.ext.webapp import template
import pprint

class MainPage(webapp.RequestHandler):
    """Handler for '/': renders index.html, resolving ``url`` when supplied."""

    def get(self):
        """Render the index template.

        When the request carries a ``url`` argument, the template context
        receives ``url`` plus either ``link`` (the first short link found,
        or the submitted URL when none is found) or ``error`` (the exception
        raised while resolving it).
        """
        context = {}
        requested = self.request.get('url')

        if requested:
            context['url'] = requested
            try:
                found = RevCanonical().revcanonical(requested)
                # Fall back to the submitted URL when the page offers no
                # short link of its own.
                context['link'] = found[0] if found else requested
            except Exception as e:
                # Hand the failure to the template instead of aborting.
                context['error'] = e

        index_path = os.path.join(os.path.dirname(__file__), 'index.html')
        self.response.out.write(template.render(index_path, context))

    def post(self):
        """Treat a POST exactly like a GET."""
        self.get()


class ApiPage(webapp.RequestHandler):
    """Handler for '/api': writes the resolved URL straight to the response."""

    def get(self):
        """Write the short link for ``url``, or ``url`` itself if none exists.

        Without a ``url`` argument a usage message is written instead. If
        resolving fails, the response is set to status 500 and the exception
        is written as the body.
        """
        target = self.request.get('url')
        if not target:
            self.response.out.write("Takes argument <code>url</code> returns reverse canonicalized URL, if found. Otherwise returns the passed URL.")
            return

        try:
            found = RevCanonical().revcanonical(target)
            if found:
                target = found[0]
            self.response.out.write(target)
        except Exception as e:
            self.error(500)
            self.response.out.write(e)

    def post(self):
        """Intentionally does nothing for POST requests."""
        pass

class RevCanonical:
    """Find the short URLs a page advertises for itself in its <link> tags."""

    # Substrings of a ``rel`` value that mark a link as a short URL on their
    # own. ``rel`` values containing both "alternate" and "short" also count.
    _SHORT_RELS = ('short_url', 'shorter-alternative', 'shortlink')

    def revcanonical(self, url):
        """Fetch ``url`` and return the hrefs of its short-URL links.

        Args:
            url: Address of the page to inspect; may include a ``#fragment``.

        Returns:
            A list of href strings, in document order, each with the fragment
            of ``url`` (if any) appended. Empty when the page declares no
            short link.

        Exceptions raised while fetching or parsing are not caught here.
        """
        resp = urlfetch.fetch(url)
        html = resp.content

        # Everything from the first '#' onward is re-attached to the short
        # URL, so the anchor survives the shortening.
        _, hash_mark, anchor = url.partition('#')
        fragment = hash_mark + anchor

        parser = LinkParser()
        parser.feed(html)

        # Each link is considered once, so a tag that matches on both ``rel``
        # and ``rev`` no longer yields its href twice.
        shorts = [attrs for attrs in parser.links if self._is_short_link(attrs)]
        return self.hrefs(shorts, fragment)

    def _is_short_link(self, attrs):
        """Return True if the attribute pairs of one link mark a short URL."""
        for attr in attrs:
            name, value = attr[0], attr[1]
            if name == 'rel':
                if 'alternate' in value and 'short' in value:
                    return True
                for marker in self._SHORT_RELS:
                    if marker in value:
                        return True
            elif name == 'rev' and 'canonical' in value:
                return True
        return False

    def hrefs(self, links, fragment=''):
        """Return every ``href`` value found in ``links`` with ``fragment`` appended.

        Args:
            links: Iterable of attribute lists, each a sequence of
                ``(name, value)`` pairs.
            fragment: Text appended to each href; defaults to ''.

        Returns:
            A list of strings, one per ``href`` attribute, in input order.
        """
        found = []
        for attrs in links:
            found.extend(attr[1] + fragment for attr in attrs if attr[0] == 'href')
        return found

class LinkParser(SGMLParser):
    """Collect the attribute lists of ``<link>`` tags that carry an ``href``.

    After feeding a document, ``links`` holds one entry per recorded tag;
    each entry is the attribute list handed to ``do_link``, whose items are
    indexed as ``(name, value)``.
    """

    def reset(self):
        """Reset the base parser state and start with an empty ``links`` list."""
        SGMLParser.reset(self)
        self.links = []

    def do_link(self, attrs):
        """Record ``attrs`` when at least one attribute is named ``href``."""
        for attr in attrs:
            if attr[0] == 'href':
                self.links.append(attrs)
                break

    def end_head(self, attrs):
        """Stop further tag processing by calling ``setnomoretags()``.

        NOTE(review): this takes ``attrs`` like a start-tag handler; confirm
        whether sgmllib ever invokes it for ``</head>`` itself.
        """
        self.setnomoretags()

    # The same handler doubles as the <body> start handler, so tag processing
    # stops once the body begins.
    start_body = end_head


# Route table: '/' is handled by MainPage, '/api' by ApiPage.
# NOTE(review): debug=True is presumably a development setting; confirm it is
# intended for the deployed app.
application = webapp.WSGIApplication(
    [
        ('/', MainPage),
        ('/api', ApiPage),
    ],
    debug=True)

def main():
    """Hand the module-level ``application`` to ``run_wsgi_app``."""
    run_wsgi_app(application)


# Script entry point: call main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
  main()
Something went wrong with that request. Please try again.