
Commit

If --coll and --cdx-server-url are unset, cdx-index-client.py loops over all indexes available at http://index.commoncrawl.org
s-nt-s committed Nov 30, 2016
1 parent 237ef28 commit 586f2ed
Showing 3 changed files with 32 additions and 6 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -18,7 +18,6 @@ It is often good idea to check how big the dataset is:
`./cdx-index-client.py -c CC-MAIN-2015-06 *.io --show-num-pages`

will print the number of pages that will be fetched to get a list of urls in the '*.io' domain.

This will give a relative size of the query. A query with thousands of pages may take a long time!

Then, you might fetch a list of urls from the index which are part of the *.io domain, as follows:
@@ -79,6 +78,7 @@ optional arguments:
--in-order Fetch pages in order (default is to shuffle page list)
```

If --coll and --cdx-server-url are unset, cdx-index-client.py loops over all indexes available at http://index.commoncrawl.org
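
For example (an illustrative invocation, not a line added by this commit): running `./cdx-index-client.py *.io` with neither `-c`/`--coll` nor `--cdx-server-url` set now walks every CC-MAIN index in turn, writing one set of output files per index, instead of defaulting to the CC-MAIN-2015-06 collection.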

## Additional Use Cases

34 changes: 29 additions & 5 deletions cdx-index-client.py
@@ -14,9 +14,16 @@

import logging

from urlparse import urljoin
from bs4 import BeautifulSoup

DEF_API_BASE = 'http://index.commoncrawl.org/'

def get_index_urls(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text,"lxml")
    return [urljoin(url,a.attrs.get("href")+"-index") for a in soup.select("a") if "/CC-MAIN-" in a.attrs.get("href") ]

def get_num_pages(api_url, url, page_size=None):
""" Use the showNumPages query
to get the number of pages in the result set
@@ -195,9 +202,10 @@ def run_workers(num_workers, jobs, shuffle):
        for worker in workers:
            worker.terminate()
            worker.join()
        raise


def main():
def get_args():
url_help = """
url to query in the index:
For prefix, use:
@@ -241,7 +249,7 @@ def main():
help='size of each page in blocks, >=1')

group = parser.add_mutually_exclusive_group()
group.add_argument('-c', '--coll', default='CC-MAIN-2015-06',
group.add_argument('-c', '--coll',
help='The index collection to use')

group.add_argument('--cdx-server-url',
@@ -266,9 +274,10 @@ def main():
parser.add_argument('--in-order', action='store_true',
help='Fetch pages in order (default is to shuffle page list)')

    # Logging
    r = parser.parse_args()
    return parser.parse_args()

def main(r,prefix=None):
    # Logging
    if r.verbose:
        level = logging.DEBUG
    else:
@@ -285,6 +294,8 @@ def main():
    else:
        api_url = DEF_API_BASE + r.coll + '-index'

    logging.debug('Getting Index From ' + api_url)

    logging.debug('Getting Num Pages...')
    num_pages = get_num_pages(api_url, r.url, r.page_size)

@@ -313,6 +324,8 @@ def main():
    else:
        output_prefix = r.output_prefix

    if prefix:
        output_prefix += prefix
    def get_page_job(page):
        job = {}
        job['api_url'] = api_url
@@ -360,4 +373,15 @@ def get_page_job(page):


if __name__ == "__main__":
    main()
    try:
        r = get_args()
        if r.coll or r.cdx_server_url:
            main(r)
        else:
            api_urls=get_index_urls(DEF_API_BASE)
            for api_url in api_urls:
                r.cdx_server_url=api_url
                prefix=(api_url.split('/')[-1])[0:-6]+'-'
                main(r,prefix)
    except KeyboardInterrupt:
        logging.info('Received Ctrl-C, Finish.')
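
To make the new control flow concrete, here is a small standalone sketch of the same idea as `get_index_urls` plus the prefix handling in the new `__main__` block: scrape the collection links from http://index.commoncrawl.org, turn each into a CDX endpoint, and derive a per-index filename prefix. It is an illustration, not code from this commit; the helper name `list_cc_indexes` is made up, and it assumes Python 2 (matching the script's `urlparse` import) with requests, beautifulsoup4 and lxml installed.

```
# Illustrative sketch only (not part of commit 586f2ed). Python 2, like the
# script itself; assumes requests, beautifulsoup4 and lxml are installed.
import requests
from urlparse import urljoin
from bs4 import BeautifulSoup

BASE = 'http://index.commoncrawl.org/'

def list_cc_indexes(base=BASE):
    """Return the CDX endpoint for every CC-MAIN-* collection linked on the page."""
    soup = BeautifulSoup(requests.get(base).text, 'lxml')
    hrefs = [a.attrs.get('href') for a in soup.select('a')]
    return [urljoin(base, h + '-index') for h in hrefs if h and '/CC-MAIN-' in h]

if __name__ == '__main__':
    for api_url in list_cc_indexes():
        # Same string handling as the new __main__ block: keep the last path
        # segment, strip the trailing '-index' (6 characters), append '-'.
        prefix = api_url.split('/')[-1][0:-6] + '-'
        # e.g. '.../CC-MAIN-2016-50-index' gives the prefix 'CC-MAIN-2016-50-'
        print('%s -> %s' % (api_url, prefix))
```

The filter on '/CC-MAIN-' and the '-index' suffix follow the committed get_index_urls; the guard on missing hrefs and the printout exist only for the example.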
2 changes: 2 additions & 0 deletions requirements.txt
@@ -1 +1,3 @@
requests
beautifulsoup
urlparse
