Commit

initial commit
oiwn committed Apr 16, 2012
0 parents commit e095dfb
Showing 9 changed files with 178 additions and 0 deletions.
7 changes: 7 additions & 0 deletions .gitignore
@@ -0,0 +1,7 @@
*.pyc
*.log
*.*~
*.py#
.hg
.hgignore
.git
30 changes: 30 additions & 0 deletions config.py
@@ -0,0 +1,30 @@
# -*- coding: utf-8 -*-
"""
Configuration for the default spider
"""
from models import init_engine
from sqlalchemy.orm import sessionmaker

MAX_THREADS = 3
USE_CACHE = True
# save scraped items or not
SAVE_TO_DB = USE_CACHE
CACHE_DB = 'default_project'

db_engine = init_engine()
Session = sessionmaker(bind=db_engine)

def default_spider_params():
    params = {
        'thread_number': MAX_THREADS,
        'network_try_limit': 20,
        'task_try_limit': 20,
    }
    if USE_CACHE:
        params.update({
            'use_cache': USE_CACHE,
            'cache_db': CACHE_DB,
            'debug_error': True,
        })

    return params
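With the constants above (MAX_THREADS = 3, USE_CACHE = True), default_spider_params() hands each spider constructor roughly this keyword dict, shown here only to illustrate the values in play:

    {
        'thread_number': 3,
        'network_try_limit': 20,
        'task_try_limit': 20,
        'use_cache': True,
        'cache_db': 'default_project',
        'debug_error': True,
    }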
Binary file added data.sqlite
Binary file not shown.
34 changes: 34 additions & 0 deletions models.py
@@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
"""
Models for the default project
"""
import datetime

from sqlalchemy import create_engine
from sqlalchemy.orm import relationship, backref
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import (Column, Integer, Text, String, ForeignKey,
DateTime, PickleType, Table)

Base = declarative_base()

def init_engine():
    db_engine = create_engine(
        'sqlite+pysqlite:///data.sqlite', encoding='utf-8')
    Base.metadata.create_all(db_engine)
    return db_engine


class Item(Base):
    __tablename__ = 'item'
    # pass sqlite_autoincrement as a table option so it actually takes effect
    __table_args__ = {'sqlite_autoincrement': True}

    id = Column(Integer, primary_key=True)

    title = Column(String(160))
    author = Column(String(160))
    description = Column(String(255))
    url = Column(String(160))

    last_update = Column(DateTime, default=datetime.datetime.now)
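A minimal usage sketch for the model, assuming config.py above is importable; the field values are invented for illustration:

    from config import Session
    from models import Item

    session = Session()
    item = Item(title='example-repo', author='example-user',
                description='Just a placeholder record',
                url='https://github.com/example-user/example-repo')
    session.add(item)
    session.commit()
    session.close()

Importing config runs init_engine(), which creates data.sqlite and the item table before the session is used.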

35 changes: 35 additions & 0 deletions spider.py
@@ -0,0 +1,35 @@
# -*- coding: utf-8 -*-
"""
GitHub projects spy
"""
from optparse import OptionParser

from grab import Grab
from grab.spider import Spider, Task
from grab.tools.logs import default_logging

from spiders.explore import ExploreSpider
from spiders.lang_python import LangPythonSpider
from config import default_spider_params, Session

if __name__ == '__main__':
    default_logging()
    parser = OptionParser()

    # command line options
    parser.add_option("-p", "--python", action="store_true",
                      dest="parse_python", default=False)

    options, args = parser.parse_args()

    if options.parse_python:
        print "Scrape Python projects"
        bot = LangPythonSpider(**default_spider_params())
    else:
        print "Scrape trending repositories"
        bot = ExploreSpider(**default_spider_params())

    bot.setup_proxylist('/var/proxylist.txt', 'http', auto_change=True)
    bot.setup_grab(timeout=4096, connect_timeout=10)
    bot.run()
    print bot.render_stats()
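The print statements are Python 2 syntax, so this script targets Python 2. Run python spider.py for the trending spider, or python spider.py -p (alias --python) for the most-watched Python projects; both paths expect a proxy list file at /var/proxylist.txt because of the setup_proxylist() call.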
Empty file added spiders/__init__.py
Empty file.
23 changes: 23 additions & 0 deletions spiders/base.py
@@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
from grab.spider import Spider

from models import Item
from config import Session


class BaseHubSpider(Spider):
    initial_urls = ['http://github.com']

    items_total = 0

    def save(self, data):
        session = Session()

        # insert only if no Item with this title is stored yet
        if not session.query(Item).filter_by(title=data['title']).first():
            obj = Item(**data)
            session.add(obj)
            session.commit()
        session.close()

    def log_progress(self, text):
        self.items_total += 1
        print "Item scraped: %s" % text
25 changes: 25 additions & 0 deletions spiders/explore.py
@@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-
"""
Crawl trending repositories from http://github.com/explore
"""
from grab.spider import Spider

from spiders.base import BaseHubSpider

class ExploreSpider(BaseHubSpider):
    initial_urls = ['http://github.com/explore']

    def task_initial(self, grab, task):
        repos = grab.xpath_list('//ol[@class="ranked-repositories"]/li')
        for repo in repos[:5]:
            data = {
                'author': repo.xpath('./h3/a[1]/text()')[0],
                'title': repo.xpath('./h3/a[2]/text()')[0],
                'url': grab.make_url_absolute(
                    repo.xpath('./h3/a[2]/@href')[0], resolve_base=True),
                'description': repo.xpath(
                    './p[@class="description"]/text()')[0],
            }

            self.save(data)
            self.log_progress(data['author'] + ' / ' + data['title'])
24 changes: 24 additions & 0 deletions spiders/lang_python.py
@@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-
from grab.spider import Spider

from spiders.base import BaseHubSpider

class LangPythonSpider(BaseHubSpider):
    initial_urls = ['https://github.com/languages/Python/most_watched']

    def task_initial(self, grab, task):
        repos = grab.xpath_list(
            '//table[@class="repo"]//tr/td[@class="title"]/..')
        for repo in repos:
            data = {
                'author': repo.xpath('./td[@class="owner"]/a/text()')[0],
                'title': repo.xpath('./td[@class="title"]/a/text()')[0],
                'url': grab.make_url_absolute(
                    repo.xpath('./td[@class="title"]/a/@href')[0],
                    resolve_base=True),
                'description': repo.xpath(
                    './following::tr/td[@class="desc"]/text()')[0],
            }

            self.save(data)
            self.log_progress(data['author'] + ' / ' + data['title'])
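Both spiders hand save() a flat dict that matches the Item columns, along these lines (values invented for illustration):

    {
        'author': 'example-user',
        'title': 'example-repo',
        'url': 'https://github.com/example-user/example-repo',
        'description': 'Short project description scraped from the listing',
    }

save() skips the insert when an Item with the same title already exists, while log_progress() prints the author/title pair and bumps items_total.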
