Browse files

add link grabbing and recurring sending

  • Loading branch information...
1 parent 4da074b commit 1c20675853db9d7b02a1b09de25993daa6145ddb Aleksei Pokrevskiy committed Aug 22, 2011
Showing with 281 additions and 20 deletions.
  1. +71 −0 BaseHTMLProcessor.py
  2. +159 −0 dialect.py
  3. +17 −0 mail.html
  4. +0 −1 mail.txt
  5. +34 −19 sendemail.py
View
71 BaseHTMLProcessor.py
@@ -0,0 +1,71 @@
+from sgmllib import SGMLParser
+import htmlentitydefs
+
class BaseHTMLProcessor(SGMLParser):
    """Pass-through HTML processor that rebuilds the markup it is fed.

    Each handler appends a textual reconstruction of the construct it was
    called for to ``self.pieces``; ``output()`` joins the pieces into one
    string.  Subclasses override individual handlers to alter the document.

    Note that improperly embedded non-HTML code (like client-side
    Javascript) may be parsed incorrectly by the ancestor, causing runtime
    script errors.  All non-HTML code must be enclosed in HTML comment tags
    (<!-- code -->) so that it passes through unaltered via handle_comment.
    """

    def reset(self):
        """Clear the collected pieces, then reset the ancestor parser.

        Extends SGMLParser.reset (which SGMLParser.__init__ calls).
        """
        self.pieces = []
        SGMLParser.reset(self)

    def unknown_starttag(self, tag, attrs):
        """Rebuild a start tag from its name and attributes.

        tag -- tag name, e.g. "pre" for <pre class="screen">
        attrs -- list of (attr, value) tuples, e.g. [("class", "screen")]

        The result is not necessarily byte-identical to the source: every
        attribute value is emitted inside double quotes, whether or not it
        was quoted (or single-quoted) in the source document.
        """
        # A literal double quote inside a value would terminate the
        # attribute early, so it is written as &quot; instead.
        strattrs = "".join(
            ' %s="%s"' % (key, value.replace('"', "&quot;"))
            for key, value in attrs
        )
        self.pieces.append("<%s%s>" % (tag, strattrs))

    def unknown_endtag(self, tag):
        """Rebuild an end tag; for </pre>, tag is "pre"."""
        self.pieces.append("</%s>" % (tag,))

    def handle_charref(self, ref):
        """Rebuild a character reference; for "&#160;", ref is "160"."""
        self.pieces.append("&#%s;" % (ref,))

    def handle_entityref(self, ref):
        """Rebuild an entity reference; for "&copy;", ref is "copy".

        Standard HTML entities are closed with a semicolon; any other name
        is emitted without one.
        """
        self.pieces.append("&%s" % (ref,))
        if ref in htmlentitydefs.entitydefs:
            self.pieces.append(";")

    def handle_data(self, text):
        """Store a block of plain text verbatim.

        Called for text outside of any tag that contains no character or
        entity references.
        """
        self.pieces.append(text)

    def handle_comment(self, text):
        """Rebuild an HTML comment, e.g. <!-- insert Javascript code here -->.

        Client-side code must be wrapped in comments in the source document
        so that it reaches this handler and is passed through undisturbed.
        """
        self.pieces.append("<!--%s-->" % (text,))

    def handle_pi(self, text):
        """Rebuild a processing instruction, e.g. <?instruction>."""
        self.pieces.append("<?%s>" % (text,))

    def handle_decl(self, text):
        """Rebuild a declaration such as the DOCTYPE, e.g.

        <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
            "http://www.w3.org/TR/html4/loose.dtd">
        """
        self.pieces.append("<!%s>" % (text,))

    def output(self):
        """Return processed HTML as a single string."""
        return "".join(self.pieces)
View
159 dialect.py
@@ -0,0 +1,159 @@
+import re
+from BaseHTMLProcessor import BaseHTMLProcessor
+
class Dialectizer(BaseHTMLProcessor):
    """HTML processor that rewrites text with regex rules and collects links.

    Subclasses supply ``subs``; text outside <pre> elements is passed
    through every rule in order.  Absolute http/https targets of <a> tags
    are gathered in ``self.links`` while the document is parsed.
    """

    # Sequence of (pattern, replacement) pairs applied by process();
    # empty here, overridden by each concrete dialect.
    subs = ()

    def reset(self):
        """Reset all per-parse state (extends BaseHTMLProcessor.reset)."""
        # Nesting depth of <pre> elements; text is left untouched while > 0.
        self.verbatim = 0
        # Kept per instance: a class-level list would be shared by every
        # parser and keep growing across successive parses.
        self.links = []
        BaseHTMLProcessor.reset(self)

    def start_a(self, attrs):
        """Record the href of an <a> tag, then emit the tag like any other.

        attrs -- list of (attr, value) tuples for the tag
        """
        # Look href up by name: it need not be the first attribute, and an
        # anchor may have no attributes at all.
        href = dict(attrs).get("href")
        if href and href.startswith(("http:", "https:")):
            self.links.append(href)
        # No other handler writes this tag, so without this call the
        # opening <a ...> would be missing from output().
        self.unknown_starttag("a", attrs)

    def start_pre(self, attrs):
        """Enter verbatim mode for a <pre> tag, then emit the tag as usual."""
        self.verbatim += 1
        self.unknown_starttag("pre", attrs)

    def end_pre(self):
        """Emit the </pre> tag, then leave one level of verbatim mode."""
        self.unknown_endtag("pre")
        self.verbatim -= 1

    def handle_data(self, text):
        """Store text unaltered in verbatim mode, otherwise translated.

        Overrides BaseHTMLProcessor.handle_data.
        """
        self.pieces.append(text if self.verbatim else self.process(text))

    def process(self, text):
        """Return text after applying every rule in ``self.subs`` in order.

        Each rule is a (pattern, replacement) pair handed to re.sub; later
        rules see the output of earlier ones.
        """
        for fromPattern, toPattern in self.subs:
            text = re.sub(fromPattern, toPattern, text)
        return text
+
class ChefDialectizer(Dialectizer):
    """Translate HTML text into Swedish Chef-speak.

    Based on the classic chef.x, copyright (c) 1992, 1993 John Hagerman.
    """

    # Rules run strictly in this order; each sees the previous rule's output.
    subs = (
        # vowels
        (r'a([nu])', r'u\1'),
        (r'A([nu])', r'U\1'),
        (r'a\B', r'e'),
        (r'A\B', r'E'),
        (r'en\b', r'ee'),
        (r'\Bew', r'oo'),
        (r'\Be\b', r'e-a'),
        (r'\be', r'i'),
        (r'\bE', r'I'),
        (r'\Bf', r'ff'),
        (r'\Bir', r'ur'),
        (r'(\w*?)i(\w*?)$', r'\1ee\2'),
        (r'\bow', r'oo'),
        (r'\bo', r'oo'),
        (r'\bO', r'Oo'),
        # "the" and friends
        (r'the', r'zee'),
        (r'The', r'Zee'),
        (r'th\b', r't'),
        (r'\Btion', r'shun'),
        (r'\Bu', r'oo'),
        (r'\BU', r'Oo'),
        # consonants (the w/W rules replace the letter with itself)
        (r'v', r'f'),
        (r'V', r'F'),
        (r'w', r'w'),
        (r'W', r'W'),
        # sentence endings
        (r'([a-z])[.]', r'\1. Bork Bork Bork!'),
    )
+
class FuddDialectizer(Dialectizer):
    """Translate HTML text into Elmer Fudd-speak."""

    # Rules run strictly in this order; each sees the previous rule's output.
    subs = (
        (r'[rl]', r'w'),
        (r'qu', r'qw'),
        (r'th\b', r'f'),
        (r'th', r'd'),
        (r'n[.]', r'n, uh-hah-hah-hah.'),
    )
+
class OldeDialectizer(Dialectizer):
    """Translate HTML text into mock Middle English."""

    # Rules run strictly in this order; each sees the previous rule's output.
    subs = (
        # vowel/consonant reshuffling
        (r'i([bcdfghjklmnpqrstvwxyz])e\b', r'y\1'),
        (r'i([bcdfghjklmnpqrstvwxyz])e', r'y\1\1e'),
        (r'ick\b', r'yk'),
        (r'ia([bcdfghjklmnpqrstvwxyz])', r'e\1e'),
        (r'e[ea]([bcdfghjklmnpqrstvwxyz])', r'e\1e'),
        (r'([bcdfghjklmnpqrstvwxyz])y', r'\1ee'),
        (r'([bcdfghjklmnpqrstvwxyz])er', r'\1re'),
        (r'([aeiou])re\b', r'\1r'),
        (r'ia([bcdfghjklmnpqrstvwxyz])', r'i\1e'),
        (r'tion\b', r'cioun'),
        (r'ion\b', r'ioun'),
        # vowel pairs
        (r'aid', r'ayde'),
        (r'ai', r'ey'),
        (r'ay\b', r'y'),
        (r'ay', r'ey'),
        (r'ant', r'aunt'),
        (r'ea', r'ee'),
        (r'oa', r'oo'),
        (r'ue', r'e'),
        (r'oe', r'o'),
        (r'ou', r'ow'),
        (r'ow', r'ou'),
        (r'\bhe', r'hi'),
        # word endings
        (r've\b', r'veth'),
        (r'se\b', r'e'),
        (r"'s\b", r'es'),
        (r'ic\b', r'ick'),
        (r'ics\b', r'icc'),
        (r'ical\b', r'ick'),
        (r'tle\b', r'til'),
        (r'll\b', r'l'),
        (r'ould\b', r'olde'),
        (r'own\b', r'oune'),
        (r'un\b', r'onne'),
        (r'rry\b', r'rye'),
        (r'est\b', r'este'),
        (r'pt\b', r'pte'),
        (r'th\b', r'the'),
        (r'ch\b', r'che'),
        (r'ss\b', r'sse'),
        (r'([wybdp])\b', r'\1e'),
        (r'([rnt])\b', r'\1\1e'),
        # whole words
        (r'from', r'fro'),
        (r'when', r'whan'),
    )
+
def translate(url, dialectName="chef"):
    """Fetch url, run it through a dialect parser and return its links.

    url -- address of the HTML page to download
    dialectName -- "chef", "fudd" or "olde" (letter case is ignored)

    Returns the parser's ``links`` list.
    Raises ValueError when dialectName is not one of the known dialects.
    """
    import urllib

    dialects = {
        "chef": ChefDialectizer,
        "fudd": FuddDialectizer,
        "olde": OldeDialectizer,
    }
    try:
        parserClass = dialects[dialectName.lower()]
    except KeyError:
        raise ValueError("unknown dialect: %r" % (dialectName,))

    sock = urllib.urlopen(url)
    try:
        htmlSource = sock.read()
    finally:
        # Release the connection even when the download fails midway.
        sock.close()

    parser = parserClass()
    parser.feed(htmlSource)
    parser.close()
    return parser.links
+
def test(url):
    """Fetch url and save the links found on it to "s.html", one per line."""
    outfile = "s.html"
    # translate() returns a list of link strings, which cannot be handed to
    # write() directly; join them into a single newline-separated string.
    links = translate(url)
    fsock = open(outfile, "wb")
    try:
        fsock.write("\n".join(links))
    finally:
        fsock.close()


if __name__ == "__main__":
    test('http://www.aimsworldrunning.org/Calendar.htm')
View
17 mail.html
@@ -0,0 +1,17 @@
+Привет Лёша!<br><br>
+
+Надеюсь, у тебя всё хорошо. Если нет и ты чувствуешь беспокойство и неуверенность, перечитай, пожалуйста, этот текст.<br><br>
+Не переживай, ПРОРВЁМСЯ!
+<hr>
+Лёша для тебя всегда&nbsp;<b>самым важным в жизни был бег</b>&nbsp;и теперь ты поменял весь смысл твоей жизни на&nbsp;<b>жрачку</b><br />
+<br />
+Лёша&nbsp;<b>старт уже совсем скоро</b>, нет времени пассивно отсиживаться и трусливо надеяться что твой организм справится сам, помоги ему!&nbsp;<b>помоги ему чтобы он смог бегать и снова стань счастливым</b><br />
+<br />
+Лёша не забывай ни на секунду&nbsp;<b>что на самом деле важно</b><br />
+<br />
+Лёша помни что сиюминутные удовольствия принесут&nbsp;<b>много боли в будущем и не позволят тебе осуществить свою самую главную мечту!</b><br />
+<br />
+Лёша я в тебя верю!&nbsp;<br />
+<br />
+<b>не забывай свою мечту и не предавай её!</b>
+<hr>
View
1 mail.txt
@@ -1 +0,0 @@
-???????????? ???? <b>????????</b> ?????????????? ?????????????????????????? ???? ??????????. ???????????????? ???????????? ?????? ????????????:
View
53 sendemail.py
@@ -1,27 +1,42 @@
-# Import smtplib for the actual sending function
+# -*- coding:utf-8 -*-
+from datetime import datetime
from email.mime.multipart import MIMEMultipart
+import random
import smtplib
# Import the email modules we'll need
from email.mime.text import MIMEText
+from time import sleep
+from dialect import translate
-# Open a plain text file for reading. For this example, assume that
-# the text file contains only ASCII characters.
-f = open('mail.txt', 'rb')
-html = f.read()
-f.close()
-# Create a text/plain message
-msg = MIMEMultipart('alternative')
-msg['Subject'] = "hello py"
-msg['From'] = 'kilonet@1gb.ru <Aleksei Marathon Mail>'
-msg['To'] = 'kpdpok@gmail.com'
def send(link):
    """Email the contents of mail.html followed by a link to a marathon.

    link -- URL used as the href of the anchor appended to the message body

    Reads mail.html from the current directory, appends the anchor, wraps
    the result in a multipart/alternative message with a single text/html
    part and sends it through the SMTP server configured below.
    """
    import cgi

    f = open('mail.html', 'rb')
    try:
        html = f.read()
    finally:
        f.close()

    msg = MIMEMultipart('alternative')
    msg['Subject'] = "Hello Marathoner!"
    # Display name first, address in angle brackets.
    msg['From'] = 'Aleksei Marathon Mail <kilonet@1gb.ru>'
    msg['To'] = 'kpdpok@gmail.com'

    # The link is scraped from a web page, so escape it (including double
    # quotes) before placing it inside the href attribute.
    html += '<br><br>Вот тебе линк на <a href="%s">марафон</a>' % cgi.escape(link, True)
    # The body is Cyrillic text; declare the charset explicitly instead of
    # relying on the us-ascii default.
    # NOTE(review): assumes mail.html is stored as UTF-8, like this source
    # file -- confirm.
    part1 = MIMEText(html, 'html', 'utf-8')
    msg.attach(part1)

    # NOTE(review): port 465 is conventionally used for SMTP over implicit
    # TLS (smtplib.SMTP_SSL); confirm this server accepts a plain SMTP
    # connection on it.
    s = smtplib.SMTP('smtp-56.1gb.ru', port=465)
    try:
        # NOTE(review): credentials are hard-coded in the source; they
        # should be moved to configuration outside the repository.
        s.login('u268923', '842a19f4')
        s.sendmail('kilonet@1gb.ru', ['kpdpok@gmail.com'], msg.as_string())
    finally:
        # Always end the SMTP session, even when login or sending fails.
        s.quit()
+
if __name__ == '__main__':
    # Run forever: grab the calendar's links, mail one picked at random,
    # then wait a random interval of up to three hours before repeating.
    while True:
        links = translate('http://www.aimsworldrunning.org/Calendar.htm')
        if links:
            # random.choice always picks a valid element; randint(0, len(links))
            # includes its upper bound and could index one past the end.
            link = random.choice(links)
            send(link)
            print('sent on %s' % (str(datetime.utcnow())))
        # With no links there is nothing to send; just wait and retry.
        secs = random.randint(0 * 3600, 3 * 3600)
        sleep(secs)

0 comments on commit 1c20675

Please sign in to comment.