Permalink
Browse files

Initial commit

  • Loading branch information...
0 parents commit e1095db13e502cac5292aa98455f36f70ddd4ad7 @johtso committed Apr 12, 2012
@@ -0,0 +1 @@
+*.pyc
@@ -0,0 +1 @@
+scrapy crawl olyplan -a start=20/01/11 -a end=
No changes.
@@ -0,0 +1,26 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/topics/items.html
+
+from scrapy.item import Item, Field
+
class FlexItem(Item):
    """Item that accepts arbitrary field names, creating fields on demand.

    Scrapy's Item raises KeyError when a value is assigned to a field that
    was not declared on the class; this subclass declares the missing field
    on the fly and then stores the value directly.
    """

    def __setitem__(self, key, value):
        # NOTE(review): ``self.fields`` resolves to the CLASS-level fields
        # dict, so fields added here are shared by every instance of this
        # class (and of its subclasses' siblings) and accumulate for the
        # lifetime of the process -- confirm this is intended.
        if key not in self.fields:
            self.fields[key] = Field()

        # Bypass Item.__setitem__'s known-field check.
        self._values[key] = value
+
class AppDataItem(FlexItem):
    """Free-form scraped application data; fields are created as assigned."""
+
class AppIDItem(Item):
    """Item carrying a single application identifier."""

    # The scraped application id.
    appid = Field()
+
class AppDocItem(Item):
    """A document associated with an application.

    Field names mirror the columns written to ``appdocs.csv`` by the
    pipeline: id of the owning application, a description, the document's
    size and format, and the URL it was found at.
    """

    appid = Field()
    desc = Field()
    size = Field()
    format = Field()
    url = Field()
@@ -0,0 +1,8 @@
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import TakeFirst, MapCompose

# BUG FIX: olyplan.items defines AppIDItem, not AppID -- the original
# ``from olyplan.items import AppID`` raised ImportError at startup.
from olyplan.items import AppIDItem


class AppIDLoader(XPathItemLoader):
    """Item loader that produces AppIDItem instances.

    Incoming values for ``search_criteria`` are whitespace-stripped before
    being stored on the item.
    """

    default_item_class = AppIDItem

    # Strip leading/trailing whitespace from each extracted unicode string.
    search_criteria_in = MapCompose(unicode.strip)
@@ -0,0 +1,87 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/topics/item-pipeline.html
+import csv
+
+from scrapy.xlib.pydispatch import dispatcher
+from scrapy import signals
+from scrapy import log
+
+
+from olyplan.items import AppIDItem
+
class OlyPipeline(object):
    """Collects scraped items and writes them out as CSV files.

    AppDocItem rows have a fixed column set and are streamed straight to
    ``appdocs.csv``.  AppDataItem rows have a variable set of fields, so
    they are buffered in memory and only written to ``appdata.csv`` when
    the spider closes and the full set of column names is known.
    """

    # Fixed column order for appdocs.csv (matches AppDocItem's fields).
    APPDOC_FIELDS = ('appid', 'desc', 'size', 'format', 'url')

    def __init__(self):
        self.appdata = []            # buffered AppDataItem dicts
        self.appdata_titles = set()  # union of field names seen so far
        dispatcher.connect(self.spider_closed, signals.spider_closed)

        # BUG FIX: keep a handle on the output file so it can be flushed
        # and closed explicitly when the spider finishes (the original
        # leaked the anonymous file object).
        self._appdoc_file = open('appdocs.csv', 'wb')
        self.appdoc_writer = csv.DictWriter(self._appdoc_file,
                                            self.APPDOC_FIELDS,
                                            restval=None,
                                            extrasaction='raise')
        self._write_header(self.appdoc_writer, self.APPDOC_FIELDS)

    @staticmethod
    def _write_header(writer, fieldnames):
        """Emit a header row (portable to pre-2.7 csv, no writeheader)."""
        writer.writerow(dict((field, field) for field in fieldnames))

    def process_item(self, item, spider):
        """Dispatch on the concrete item class; always return the item."""
        name = item.__class__.__name__

        if name == "AppDocItem":
            self.process_appdoc_item(item)
        elif name == "AppDataItem":
            self.process_appdata_item(item)

        return item

    def spider_closed(self, spider):
        """Flush buffered appdata to ``appdata.csv`` and close both files."""
        spider.log("Writing appdata to file...", level=log.INFO)

        # BUG FIX: sort the names so the column order is deterministic
        # between runs (iterating a bare set() gives arbitrary order).
        fieldnames = sorted(self.appdata_titles)

        appdata_file = open('appdata.csv', 'wb')
        try:
            appdata_writer = csv.DictWriter(appdata_file,
                                            fieldnames,
                                            restval=None,
                                            extrasaction='raise')
            self._write_header(appdata_writer, fieldnames)

            for row in self.appdata:
                appdata_writer.writerow(self.utf_8_encoder(row))
        finally:
            appdata_file.close()

        # All documents have been streamed by now; release the handle.
        self._appdoc_file.close()

        spider.log("Appdata successfully written to file!", level=log.INFO)

    def process_appdata_item(self, item):
        """Buffer the item and record any field names not seen before."""
        data = dict(item)
        self.appdata.append(data)
        self.appdata_titles.update(data.keys())

    def process_appdoc_item(self, item):
        """Write the document row immediately (columns are fixed)."""
        self.appdoc_writer.writerow(self.utf_8_encoder(dict(item)))

    def utf_8_encoder(self, unicode_dict):
        """Return a copy of *unicode_dict* with unicode values UTF-8 encoded.

        Python 2's csv module cannot write unicode directly, so values are
        byte-encoded here; non-unicode values pass through untouched.
        """
        utf8_dict = {}
        for key, value in unicode_dict.items():
            if isinstance(value, unicode):
                utf8_dict[key] = value.encode("utf-8")
            else:
                utf8_dict[key] = value
        return utf8_dict
@@ -0,0 +1,33 @@
+# Scrapy settings for olyplan project
+#
+# For simplicity, this file contains only the most important settings by
+# default. All the other settings are documented here:
+#
+# http://doc.scrapy.org/topics/settings.html
+#
+
BOT_NAME = 'olyplan'
BOT_VERSION = '1.0'

SPIDER_MODULES = ['olyplan.spiders']
NEWSPIDER_MODULE = 'olyplan.spiders'
# NOTE(review): olyplan.items defines FlexItem / AppDataItem / AppIDItem /
# AppDocItem but no ApplicationItem -- confirm which item class this should
# reference.
DEFAULT_ITEM_CLASS = 'olyplan.items.ApplicationItem'

# Masquerade as a desktop Chrome browser rather than the default
# '<botname>/<version>' agent string.
#USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
USER_AGENT = ("Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US)"
              " AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.86 Safari"
              "/534.13")

# BUG FIX: both settings below were misspelled "CONCURRANT", so Scrapy
# silently ignored them and used its default concurrency limits.
#CONCURRENT_SPIDERS = 1
CONCURRENT_REQUESTS_PER_SPIDER = 3
DOWNLOAD_DELAY = 1

LOG_LEVEL = 'WARNING'

#HTTPCACHE_ENABLED = True

# Disable cookie handling for all requests.
DOWNLOADER_MIDDLEWARES = {
    'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware': None,
}

ITEM_PIPELINES = [
    'olyplan.pipelines.OlyPipeline',
]
@@ -0,0 +1,8 @@
+# This package will contain the spiders of your Scrapy project
+#
+# To create the first spider for your project use this command:
+#
+# scrapy genspider myspider myspider-domain.com
+#
+# For more info see:
+# http://doc.scrapy.org/topics/spiders.html
Oops, something went wrong.

0 comments on commit e1095db

Please sign in to comment.