
update reddit links 1 week after posting

jseims committed Nov 30, 2011
1 parent 0215fcc commit e6ef6fcb179ef9fffdf6a8ae3faa88f34a334681
Showing with 96 additions and 3 deletions.
  1. +40 −0 config/10update_reddit
  2. +8 −3 scripts/get_reddit_images.py
  3. +48 −0 scripts/update_reddit_links.py
config/10update_reddit
@@ -0,0 +1,40 @@
+#!/bin/bash
+# updates older images on reddit every hour; a lock file guards against overlapping runs
+# (place in /etc/cron.hourly)
+
+unalias -a
+trap onexit SIGINT SIGSEGV SIGQUIT SIGTERM
+
+prog="update_reddit"
+lock="/tmp/${prog}.lock"
+
+onexit () {
+ rm -f "${lock}"
+ exit
+}
+
+# check if the lock file is in place.
+if [ -f "${lock}" ]; then
+ # a silent exit is better for cron jobs; uncomment the echoes below to debug
+ # echo "$0 Error: Lock file $lock is in place."
+ # echo "Make sure an old instance of this program is not running, remove it and try again."
+ exit
+fi
+date > "${lock}"
+
+#
+# your script goes here
+#
+
+export PYTHONPATH=/data/tristara/config
+/data/tristara/scripts/update_reddit_links.py >> /var/log/tristara 2>&1
+
+#
+# exit your program calling onexit
+#
+
+onexit
+
+
+
+
scripts/get_reddit_images.py
@@ -16,11 +16,14 @@ def crawl_page(url_base, after):
def write_link(v):
    url = v['url']
+    over_18 = 0
+    if v['over_18']:
+        over_18 = 1
    # only store links to actual images (vs. html pages framing images)
    if url.endswith(".jpg") or url.endswith(".png") or url.endswith(".gif"):
        db.query("""
-            INSERT IGNORE INTO reddit (domain, subreddit, id, author, score, thumbnail, permalink, url, title)
-            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)""",
+            INSERT IGNORE INTO reddit (domain, subreddit, id, author, score, thumbnail, permalink, url, title, over_18, created)
+            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
            (v['domain'],
             v['subreddit'],
             v['id'],
@@ -29,7 +32,9 @@ def write_link(v):
             v['thumbnail'],
             v['permalink'],
             v['url'],
-             v['title']))
+             v['title'],
+             over_18,
+             v['created']))
    else:
        print "ignoring url " + url
scripts/update_reddit_links.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+
+import urllib2, json, time
+import db
+
+NUM_LINKS = 10000 # how many links to process
+MIN_AGE = 604800 # seconds in a week
+
+def update_link(permalink, sequence):
+    # record the attempt so a link that keeps failing is eventually skipped
+    db.query("""UPDATE reddit SET update_attempts = update_attempts + 1 WHERE sequence = %s""", [sequence])
+    success = 0
+
+    # fetch the link's current data from reddit's JSON API
+    url = "http://www.reddit.com%s.json" % (str(permalink))
+    print "loading %s" % (url)
+    try:
+        u = urllib2.urlopen(url)
+        data = json.load(u)
+        u.close()
+        data = data[0]["data"]["children"][0]["data"]
+        over_18 = 0
+        if data['over_18']:
+            over_18 = 1
+        db.query("""
+            UPDATE reddit SET score = %s, created = %s, over_18 = %s, updated = 1 WHERE sequence = %s""",
+            (data["score"], data["created"], over_18, sequence))
+        success = 1
+
+    except Exception, e:
+        print "Error loading page %s: %s" % (url, e)
+
+    return success
+
+def main():
+    links = db.query("""SELECT permalink, sequence FROM reddit WHERE updated = 0 AND unix_timestamp() - created > %s AND update_attempts < 3 LIMIT %s;""", [MIN_AGE, NUM_LINKS])
+    success_count = 0
+    for link in links:
+        success_count = success_count + update_link(link["permalink"], link["sequence"])
+        # sleep 2 seconds as per Reddit's crawling TOS
+        time.sleep(2)
+    print "successfully updated %s links out of %s" % (success_count, len(links))
+
+
+
+if __name__ == "__main__":
+    main()
+
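
Both scripts rely on a small db helper that lives outside this commit (in /data/tristara/config, which the cron wrapper puts on PYTHONPATH). A minimal sketch of the interface the calls above assume is shown below; MySQL is inferred from the INSERT IGNORE / unix_timestamp() syntax, and the connection settings are placeholders, not values from the repo:

    # db.py -- sketch of the helper module the scripts import; host, user,
    # password and database name are placeholders
    import MySQLdb
    import MySQLdb.cursors

    _conn = MySQLdb.connect(host="localhost", user="tristara", passwd="",
                            db="tristara", cursorclass=MySQLdb.cursors.DictCursor)

    def query(sql, params=None):
        # execute a statement, commit any writes, and return rows as dicts
        cur = _conn.cursor()
        cur.execute(sql, params)
        _conn.commit()
        rows = cur.fetchall()
        cur.close()
        return rows
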
