
Commit

If --coll and --cdx-server-url are unset, cdx-index-client.py loops over all indexes available at http://index.commoncrawl.org
s-nt-s committed Nov 30, 2016
1 parent 237ef28 commit 586f2ed
Showing 3 changed files with 32 additions and 6 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -18,7 +18,6 @@ It is often good idea to check how big the dataset is:
`./cdx-index-client.py -c CC-MAIN-2015-06 *.io --show-num-pages`

will print the number of pages that will be fetched to get a list of urls in the '*.io' domain.

This will give a relative size of the query. A query with thousands of pages may take a long time!

Then, you might fetch a list of urls from the index which are part of the *.io domain, as follows:
@@ -79,6 +78,7 @@ optional arguments:
--in-order Fetch pages in order (default is to shuffle page list)
```

If --coll and --cdx-server-url are unset, cdx-index-client.py loops over all indexes available at http://index.commoncrawl.org
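
For example (an illustrative invocation, not a line added by this commit): running `./cdx-index-client.py *.io` with neither `-c`/`--coll` nor `--cdx-server-url` set now walks every CC-MAIN index in turn, writing one set of output files per index, instead of defaulting to the CC-MAIN-2015-06 collection.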

## Additional Use Cases

34 changes: 29 additions & 5 deletions cdx-index-client.py
@@ -14,9 +14,16 @@

import logging

from urlparse import urljoin
from bs4 import BeautifulSoup

DEF_API_BASE = 'http://index.commoncrawl.org/'

def get_index_urls(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text,"lxml")
    return [urljoin(url,a.attrs.get("href")+"-index") for a in soup.select("a") if "/CC-MAIN-" in a.attrs.get("href") ]

def get_num_pages(api_url, url, page_size=None):
""" Use the showNumPages query
to get the number of pages in the result set
@@ -195,9 +202,10 @@ def run_workers(num_workers, jobs, shuffle):
        for worker in workers:
            worker.terminate()
            worker.join()
        raise


def main():
def get_args():
url_help = """
url to query in the index:
For prefix, use:
@@ -241,7 +249,7 @@ def main():
help='size of each page in blocks, >=1')

group = parser.add_mutually_exclusive_group()
group.add_argument('-c', '--coll', default='CC-MAIN-2015-06',
group.add_argument('-c', '--coll',
help='The index collection to use')

group.add_argument('--cdx-server-url',
@@ -266,9 +274,10 @@ def main():
parser.add_argument('--in-order', action='store_true',
help='Fetch pages in order (default is to shuffle page list)')

    # Logging
    r = parser.parse_args()
    return parser.parse_args()

def main(r,prefix=None):
    # Logging
    if r.verbose:
        level = logging.DEBUG
    else:
@@ -285,6 +294,8 @@ def main():
    else:
        api_url = DEF_API_BASE + r.coll + '-index'

    logging.debug('Getting Index From ' + api_url)

    logging.debug('Getting Num Pages...')
    num_pages = get_num_pages(api_url, r.url, r.page_size)

@@ -313,6 +324,8 @@ def main():
    else:
        output_prefix = r.output_prefix

    if prefix:
        output_prefix += prefix
    def get_page_job(page):
        job = {}
        job['api_url'] = api_url
@@ -360,4 +373,15 @@ def get_page_job(page):


if __name__ == "__main__":
    main()
    try:
        r = get_args()
        if r.coll or r.cdx_server_url:
            main(r)
        else:
            api_urls=get_index_urls(DEF_API_BASE)
            for api_url in api_urls:
                r.cdx_server_url=api_url
                prefix=(api_url.split('/')[-1])[0:-6]+'-'
                main(r,prefix)
    except KeyboardInterrupt:
        logging.info('Received Ctrl-C, Finish.')
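
To make the new control flow concrete, here is a small standalone sketch of the same idea as `get_index_urls` plus the prefix handling in the new `__main__` block: scrape the collection links from http://index.commoncrawl.org, turn each into a CDX endpoint, and derive a per-index filename prefix. It is an illustration, not code from this commit; the helper name `list_cc_indexes` is made up, and it assumes Python 2 (matching the script's `urlparse` import) with requests, beautifulsoup4 and lxml installed.

```
# Illustrative sketch only (not part of commit 586f2ed). Python 2, like the
# script itself; assumes requests, beautifulsoup4 and lxml are installed.
import requests
from urlparse import urljoin
from bs4 import BeautifulSoup

BASE = 'http://index.commoncrawl.org/'

def list_cc_indexes(base=BASE):
    """Return the CDX endpoint for every CC-MAIN-* collection linked on the page."""
    soup = BeautifulSoup(requests.get(base).text, 'lxml')
    hrefs = [a.attrs.get('href') for a in soup.select('a')]
    return [urljoin(base, h + '-index') for h in hrefs if h and '/CC-MAIN-' in h]

if __name__ == '__main__':
    for api_url in list_cc_indexes():
        # Same string handling as the new __main__ block: keep the last path
        # segment, strip the trailing '-index' (6 characters), append '-'.
        prefix = api_url.split('/')[-1][0:-6] + '-'
        # e.g. '.../CC-MAIN-2016-50-index' gives the prefix 'CC-MAIN-2016-50-'
        print('%s -> %s' % (api_url, prefix))
```

The filter on '/CC-MAIN-' and the '-index' suffix follow the committed get_index_urls; the guard on missing hrefs and the printout exist only for the example.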
2 changes: 2 additions & 0 deletions requirements.txt
@@ -1 +1,3 @@
requests
beautifulsoup
urlparse
