Skip to content

Commit

Permalink
version 0.5.2
Browse files Browse the repository at this point in the history
  • Loading branch information
jadbin committed Sep 19, 2016
0 parents commit eb67db3
Show file tree
Hide file tree
Showing 76 changed files with 5,310 additions and 0 deletions.
11 changes: 11 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
*.pyc
.cache
*egg-info
build
dist
docs/_build
.coverage
.idea
.data
.log
.pid
23 changes: 23 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Travis CI configuration: run the test suite with coverage on Python 3.5.
language: python
python:
  - 3.5

sudo: false

# Keep pip's download cache between builds.
cache:
  directories:
    - $HOME/.cache/pip

install:
  - pip install -r requirements-ci.txt

script:
  - py.test --cov=xpaw tests

# Upload the coverage report only when the test run succeeded.
after_success:
  - coveralls

# Build only these branches.
branches:
  only:
    - master
    - develop
13 changes: 13 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Copyright 2016 jadbin

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
6 changes: 6 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
include README.rst
include LICENSE
recursive-include bin *.sh
recursive-include conf *.yaml *.sh
recursive-include docs *.rst *.py
recursive-include tests *.py
46 changes: 46 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
XPaw
====

.. image:: https://travis-ci.org/jadbin/xpaw.svg?branch=master
    :target: https://travis-ci.org/jadbin/xpaw

.. image:: https://coveralls.io/repos/jadbin/xpaw/badge.svg?branch=master&service=github
    :target: https://coveralls.io/github/jadbin/xpaw?branch=master

.. image:: https://img.shields.io/badge/license-Apache 2-blue.svg
    :target: https://github.com/jadbin/xpaw/blob/master/LICENSE


Overview
--------

XPaw is a distributed web scraping framework, used to crawl web pages and extract structured data from them.


Requirements
------------

- Python >= 3.5
- `MongoDB`_ as database backend
- `Kafka`_ as request queue
- `aiohttp`_
- `pyyaml`_
- `pymongo`_
- `pykafka`_
- `lxml`_

.. _MongoDB: https://www.mongodb.com/
.. _Kafka: http://kafka.apache.org/
.. _aiohttp: https://pypi.python.org/pypi/aiohttp
.. _pyyaml: https://pypi.python.org/pypi/pyyaml
.. _pymongo: https://pypi.python.org/pypi/pymongo
.. _pykafka: https://pypi.python.org/pypi/pykafka
.. _lxml: https://pypi.python.org/pypi/lxml


Documentation
-------------

Full documentation and usage examples for XPaw can be found on `readthedocs`_.

.. _readthedocs: http://xpaw.readthedocs.org/en/latest/
13 changes: 13 additions & 0 deletions bin/start-agent.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/usr/bin/env bash
#
# Start the xpaw agent in the background via xpaw-daemon.sh.
#
# Usage: start-agent.sh [args...]
#   Any extra arguments are appended to the xpaw-daemon.sh command line.
#
# Reads XPAW_CONF_DIR and XPAW_DATA_DIR, which are expected to be set by
# the xpaw-config.sh file sourced from this script's directory.

# Absolute path of the directory containing this script. "$0" is quoted so
# an install path with spaces is not word-split; "&&" keeps pwd from
# reporting the wrong directory when cd fails.
bin=$(dirname "$0")
bin=$(cd "$bin" && pwd) || exit 1

. "$bin"/xpaw-config.sh

config="$XPAW_CONF_DIR"/agent.yaml
data_dir="$XPAW_DATA_DIR"
logger="$XPAW_CONF_DIR"/logger.yaml

echo "start agent"
# "$@" must be quoted: unquoted, an argument containing whitespace or glob
# characters would be split or expanded before reaching xpaw-daemon.sh.
"$bin"/xpaw-daemon.sh start agent --config "$config" --data-dir "$data_dir" --logger "$logger" "$@"
13 changes: 13 additions & 0 deletions bin/start-fetcher.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/usr/bin/env bash
#
# Start the xpaw fetcher in the background via xpaw-daemon.sh.
#
# Usage: start-fetcher.sh [args...]
#   Any extra arguments are appended to the xpaw-daemon.sh command line.
#
# Reads XPAW_CONF_DIR and XPAW_DATA_DIR, which are expected to be set by
# the xpaw-config.sh file sourced from this script's directory.

# Absolute path of the directory containing this script. "$0" is quoted so
# an install path with spaces is not word-split; "&&" keeps pwd from
# reporting the wrong directory when cd fails.
bin=$(dirname "$0")
bin=$(cd "$bin" && pwd) || exit 1

. "$bin"/xpaw-config.sh

config="$XPAW_CONF_DIR"/fetcher.yaml
data_dir="$XPAW_DATA_DIR"
logger="$XPAW_CONF_DIR"/logger.yaml

echo "start fetcher"
# "$@" must be quoted: unquoted, an argument containing whitespace or glob
# characters would be split or expanded before reaching xpaw-daemon.sh.
"$bin"/xpaw-daemon.sh start fetcher --config "$config" --data-dir "$data_dir" --logger "$logger" "$@"
13 changes: 13 additions & 0 deletions bin/start-master.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/usr/bin/env bash
#
# Start the xpaw master in the background via xpaw-daemon.sh.
#
# Usage: start-master.sh [args...]
#   Any extra arguments are appended to the xpaw-daemon.sh command line.
#
# Reads XPAW_CONF_DIR and XPAW_DATA_DIR, which are expected to be set by
# the xpaw-config.sh file sourced from this script's directory.

# Absolute path of the directory containing this script. "$0" is quoted so
# an install path with spaces is not word-split; "&&" keeps pwd from
# reporting the wrong directory when cd fails.
bin=$(dirname "$0")
bin=$(cd "$bin" && pwd) || exit 1

. "$bin"/xpaw-config.sh

config="$XPAW_CONF_DIR"/master.yaml
data_dir="$XPAW_DATA_DIR"
logger="$XPAW_CONF_DIR"/logger.yaml

echo "start master"
# "$@" must be quoted: unquoted, an argument containing whitespace or glob
# characters would be split or expanded before reaching xpaw-daemon.sh.
"$bin"/xpaw-daemon.sh start master --config "$config" --data-dir "$data_dir" --logger "$logger" "$@"
9 changes: 9 additions & 0 deletions bin/stop-agent.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/usr/bin/env bash
#
# Stop the xpaw agent via xpaw-daemon.sh.
#
# Usage: stop-agent.sh [args...]
#   Any extra arguments are appended to the xpaw-daemon.sh command line.

# Absolute path of the directory containing this script. "$0" is quoted so
# an install path with spaces is not word-split; "&&" keeps pwd from
# reporting the wrong directory when cd fails.
bin=$(dirname "$0")
bin=$(cd "$bin" && pwd) || exit 1

. "$bin"/xpaw-config.sh

echo "stop agent"
# Quoted "$@" forwards every argument as exactly one word.
"$bin"/xpaw-daemon.sh stop agent "$@"
9 changes: 9 additions & 0 deletions bin/stop-fetcher.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/usr/bin/env bash
#
# Stop the xpaw fetcher via xpaw-daemon.sh.
#
# Usage: stop-fetcher.sh [args...]
#   Any extra arguments are appended to the xpaw-daemon.sh command line.

# Absolute path of the directory containing this script. "$0" is quoted so
# an install path with spaces is not word-split; "&&" keeps pwd from
# reporting the wrong directory when cd fails.
bin=$(dirname "$0")
bin=$(cd "$bin" && pwd) || exit 1

. "$bin"/xpaw-config.sh

echo "stop fetcher"
# Quoted "$@" forwards every argument as exactly one word.
"$bin"/xpaw-daemon.sh stop fetcher "$@"
9 changes: 9 additions & 0 deletions bin/stop-master.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/usr/bin/env bash
#
# Stop the xpaw master via xpaw-daemon.sh.
#
# Usage: stop-master.sh [args...]
#   Any extra arguments are appended to the xpaw-daemon.sh command line.

# Absolute path of the directory containing this script. "$0" is quoted so
# an install path with spaces is not word-split; "&&" keeps pwd from
# reporting the wrong directory when cd fails.
bin=$(dirname "$0")
bin=$(cd "$bin" && pwd) || exit 1

. "$bin"/xpaw-config.sh

echo "stop master"
# Quoted "$@" forwards every argument as exactly one word.
"$bin"/xpaw-daemon.sh stop master "$@"
16 changes: 16 additions & 0 deletions bin/xpaw-config.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/usr/bin/env bash
#
# Shared environment setup for the xpaw scripts. Intended to be sourced.
#
# Exports, keeping any non-empty value already present in the environment:
#   XPAW_HOME       parent directory of the directory holding this file
#   XPAW_CONF_DIR   $XPAW_HOME/conf
#   XPAW_DATA_DIR   $XPAW_HOME/.data
#   XPAW_LOG_DIR    $XPAW_HOME/.log
#   XPAW_PID_DIR    $XPAW_HOME/.pid
#   XPAW_ID_STRING  $USER
#
# If $XPAW_CONF_DIR/xpaw-env.sh exists it is sourced after XPAW_HOME and
# XPAW_CONF_DIR are resolved and before the remaining defaults are applied,
# so it can override any of them.
#
# Side effect: sets the (non-exported) variable "bin" in the sourcing shell.

# Locate this file through BASH_SOURCE rather than $0: when a file is
# sourced, $0 still names the sourcing script (or the interactive shell),
# not this file.
bin=$(dirname "${BASH_SOURCE[0]}")
bin=$(cd "$bin" && pwd)

# Assign first and export afterwards so a failing command substitution is
# not hidden behind the exit status of "export".
XPAW_HOME=${XPAW_HOME:-$(cd "$bin"/.. && pwd)}
export XPAW_HOME
export XPAW_CONF_DIR="${XPAW_CONF_DIR:-$XPAW_HOME/conf}"

if [ -f "$XPAW_CONF_DIR"/xpaw-env.sh ]; then
  . "$XPAW_CONF_DIR"/xpaw-env.sh
fi

export XPAW_DATA_DIR="${XPAW_DATA_DIR:-$XPAW_HOME/.data}"
export XPAW_LOG_DIR="${XPAW_LOG_DIR:-$XPAW_HOME/.log}"
export XPAW_PID_DIR="${XPAW_PID_DIR:-$XPAW_HOME/.pid}"
export XPAW_ID_STRING="${XPAW_ID_STRING:-$USER}"
60 changes: 60 additions & 0 deletions bin/xpaw-daemon.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/usr/bin/env bash
#
# Start or stop a single xpaw component as a background process.
#
# Usage: xpaw-daemon.sh (start|stop) <name> [args...]
#
#   start  Runs "$PYTHON -m xpaw start <name> [args...]" under nohup with
#          stdout/stderr redirected to a per-component log file, and
#          records the PID in a per-component pid file.
#   stop   Sends TERM to the recorded PID, waits $stop_timeout seconds, and
#          sends KILL if the process is still alive. Removes the pid file.
#
# Environment: XPAW_LOG_DIR, XPAW_PID_DIR and XPAW_ID_STRING are read after
# sourcing xpaw-config.sh from this script's directory. PYTHON names the
# interpreter; python3 is used when it is unset or empty.
#
# Exit status: 1 on bad usage, when the component is already running, or
# when the started process is no longer alive after 3 seconds.

usage() {
  echo "usage: ${0##*/} (start|stop) <name> [args...]" >&2
}

if [ $# -le 1 ]; then
  usage
  exit 1
fi

bin=$(dirname "$0")
bin=$(cd "$bin" && pwd) || exit 1

. "$bin"/xpaw-config.sh

cmd=$1
shift
name=$1
shift

# mkdir -p is a no-op for directories that already exist.
mkdir -p "$XPAW_LOG_DIR" "$XPAW_PID_DIR" || exit 1

log="$XPAW_LOG_DIR"/xpaw-"$XPAW_ID_STRING"-"$name".log
pid="$XPAW_PID_DIR"/xpaw-"$XPAW_ID_STRING"-"$name".pid
stop_timeout=5

case "$cmd" in
  start)
    # Refuse to start over a live instance: overwriting its pid file would
    # leave the old process running with nothing left to stop it by.
    if [ -f "$pid" ]; then
      running_pid=$(cat "$pid")
      if kill -0 "$running_pid" > /dev/null 2>&1; then
        echo "$name is already running as process $running_pid; stop it first"
        exit 1
      fi
    fi
    # Fall back to python3 when PYTHON is unset or empty (for example when
    # no xpaw-env.sh is present) instead of trying to execute "".
    # Quoted "$@" keeps every forwarded argument as a single word.
    nohup "${PYTHON:-python3}" -m xpaw "$cmd" "$name" "$@" > "$log" 2>&1 < /dev/null &
    daemon_pid=$!
    echo "$daemon_pid" > "$pid"
    # Give the process a moment to fail fast before reporting success.
    sleep 3
    if ! ps -p "$daemon_pid" > /dev/null; then
      echo "fail to start $name"
      # Do not leave a pid file pointing at a process that never came up.
      rm -f "$pid"
      exit 1
    fi
    ;;
  stop)
    if [ -f "$pid" ]; then
      target_pid=$(cat "$pid")
      # kill -0 only probes whether the process exists and can be signalled.
      if kill -0 "$target_pid" > /dev/null 2>&1; then
        echo "kill $target_pid"
        kill "$target_pid"
        sleep "$stop_timeout"
        if kill -0 "$target_pid" > /dev/null 2>&1; then
          echo "$name did not stop gracefully after $stop_timeout seconds: killing with kill -9"
          kill -9 "$target_pid"
        fi
      else
        echo "no $name to stop"
      fi
      rm -f "$pid"
    else
      echo "no $name to stop"
    fi
    ;;
  *)
    usage
    exit 1
    ;;
esac
3 changes: 3 additions & 0 deletions conf/agent.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
agent_server_listen: 0.0.0.0:7340

proxy_checker:
1 change: 1 addition & 0 deletions conf/fetcher.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
master_rpc_addr: {MASTER_IP}:7310
16 changes: 16 additions & 0 deletions conf/logger.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Logging configuration consumed through the --logger option.
# The layout (version / loggers / handlers / formatters) follows the Python
# logging dictionary-configuration schema.
version: 1

loggers:
  xpaw:
    level: INFO
    handlers: [console]

handlers:
  console:
    class: logging.StreamHandler
    formatter: default

formatters:
  default:
    format: "%(asctime)s %(name)s: [%(levelname)s] %(message)s"
    datefmt: "%b/%d/%Y %H:%M:%S"
7 changes: 7 additions & 0 deletions conf/master.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
master_rpc_listen: 0.0.0.0:7310
mqcache_rpc_listen: 0.0.0.0:7311

kafka_addr: {KAFKA_IP}:9092
zookeeper_addr: {ZOOKEEPER_IP}:2181
mongo_addr: mongodb://{MONGO_USER}:{USER_PWD}@{MONGO_IP}:27017
mqcache_rpc_addr: {MASTER_IP}:7311
19 changes: 19 additions & 0 deletions conf/xpaw-env.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env bash
#
# Site-specific environment for xpaw. Edit the right-hand sides below to
# override a setting; as shipped, every XPAW_* line re-exports whatever
# value the variable already has (empty if it has none).

# Python interpreter command.
PYTHON="python3"
export PYTHON

# Directory holding the configuration files.
export XPAW_CONF_DIR="${XPAW_CONF_DIR}"

# Directory holding the data files.
export XPAW_DATA_DIR="${XPAW_DATA_DIR}"

# Directory holding the log files.
export XPAW_LOG_DIR="${XPAW_LOG_DIR}"

# Directory holding the pid files.
export XPAW_PID_DIR="${XPAW_PID_DIR}"

# String identifying this instance.
export XPAW_ID_STRING="${XPAW_ID_STRING}"
Loading

0 comments on commit eb67db3

Please sign in to comment.