## Print the directory structure of PCH Daily Routing Snapshots

https://www.pch.net/resources/Routing_Data/

### PCH provides historical data from more than 100 route collectors.  

### The website provides directories with BGP snapshot tables in the MRT format. <br>However, the website structure is not the same for all stored data.  <br>This script lists all the directories available for a specific year. Then, you can <br>use this output to download the MRT files.



In [5]:
import pandas as pd
import os
import sys 
import requests
import pytz

In [6]:
import cfscrape
from bs4 import BeautifulSoup

In [7]:
# Root URL of the IPv4 daily snapshot listing; the crawl cell appends
# "<year>/" to it to reach each year's directory page.
url_base = "http://www.pch.net/resources/Routing_Data/IPv4_daily_snapshots/"

In [8]:
# Years to process, kept as strings because each one is concatenated
# onto url_base to build the year directory URL.
years_list = ["2008","2009"] 

In [9]:
# list the available collectors
def available_collectors(url):
    """Return the collector directory names listed on the page at ``url``.

    The page is fetched through a cfscrape scraper, the HTML table whose id
    is ``resources-table`` is parsed with pandas, and the unique values of
    its ``Name`` column that contain the literal text ``pch.net`` are
    returned. The URL is printed before it is fetched as a progress trace.

    Args:
        url: Address of a directory listing page.

    Returns:
        list: Unique ``Name`` values containing ``pch.net``; empty when the
        table lists no such entry.

    Raises:
        ValueError: If the page has no table with id ``resources-table``.
    """
    # Imported locally so this cell does not depend on the import cell
    # being edited.
    from io import StringIO

    scraper = cfscrape.create_scraper()
    print(url)
    scraped_html = scraper.get(url).content
    soup = BeautifulSoup(scraped_html, 'lxml')
    table = soup.find("table", {"id": "resources-table"})
    if table is None:
        # Fail with a clear message instead of letting pandas try to parse
        # the text "None".
        raise ValueError("no 'resources-table' found at {}".format(url))

    # read_html returns a list of DataFrames, one per <table>; only one
    # table is passed in. The markup is wrapped in StringIO because recent
    # pandas releases deprecate passing literal HTML strings.
    df = pd.read_html(StringIO(str(table)))[0]

    names = df['Name']
    # regex=False: match the literal text "pch.net" (as a regex the dot
    # would match any character). na=False: rows with a missing name are
    # skipped instead of breaking the boolean mask.
    is_collector = names.str.contains('pch.net', regex=False, na=False)
    return names[is_collector].unique().tolist()

In [10]:
# list the month directories available on the page at the given URL
def available_months(url):
    """Return the directory names ending in two digits listed at ``url``.

    The page is fetched through a cfscrape scraper, the HTML table whose id
    is ``resources-table`` is parsed with pandas, and the unique values of
    its ``Name`` column that end in two digits are returned. The crawl cell
    uses these as the month sub-directories of a year page.

    Args:
        url: Address of a directory listing page.

    Returns:
        list: Unique ``Name`` values ending in two digits; empty when the
        table lists no such entry.

    Raises:
        ValueError: If the page has no table with id ``resources-table``.
    """
    # Imported locally so this cell does not depend on the import cell
    # being edited.
    from io import StringIO

    scraper = cfscrape.create_scraper()
    scraped_html = scraper.get(url).content
    soup = BeautifulSoup(scraped_html, 'lxml')
    table = soup.find("table", {"id": "resources-table"})
    if table is None:
        # Fail with a clear message instead of letting pandas try to parse
        # the text "None".
        raise ValueError("no 'resources-table' found at {}".format(url))

    # read_html returns a list of DataFrames, one per <table>; only one
    # table is passed in. The markup is wrapped in StringIO because recent
    # pandas releases deprecate passing literal HTML strings.
    df = pd.read_html(StringIO(str(table)))[0]

    names = df['Name']
    # Raw string: "\d" in a normal string literal is an invalid escape
    # sequence. na=False skips rows with a missing name.
    is_month = names.str.contains(r'\d\d$', regex=True, na=False)
    return names[is_month].unique().tolist()

In [13]:
# Build the flat list of collector directory URLs for every requested year.
# A year page either lists the collectors directly, or lists month
# directories that each list their own collectors.
all_urls = []

for year in years_list:

    print ("---working in year: {} ---".format(year))
    year_url = url_base + year + "/"

    # collectors listed directly on the year page
    collectors = available_collectors(year_url)

    if collectors:
        # year_url already ends with "/", so the name is appended as-is
        all_urls.extend(year_url + name for name in collectors)
        continue

    # old website structure: the year page lists months, and each month
    # page lists the collectors
    month_urls = [year_url + month
                  for month in sorted(available_months(year_url))]

    for month_url in month_urls:
        # month URLs have no trailing slash, so one is added before the
        # collector name
        month_collectors = available_collectors(month_url)
        all_urls.extend(month_url + "/" + name for name in month_collectors)

---working in year: 2008 ---
http://www.pch.net/resources/Routing_Data/IPv4_daily_snapshots/2008/
---working in year: 2009 ---
http://www.pch.net/resources/Routing_Data/IPv4_daily_snapshots/2009/
http://www.pch.net/resources/Routing_Data/IPv4_daily_snapshots/2009/03
http://www.pch.net/resources/Routing_Data/IPv4_daily_snapshots/2009/04
http://www.pch.net/resources/Routing_Data/IPv4_daily_snapshots/2009/05
http://www.pch.net/resources/Routing_Data/IPv4_daily_snapshots/2009/06
http://www.pch.net/resources/Routing_Data/IPv4_daily_snapshots/2009/07
http://www.pch.net/resources/Routing_Data/IPv4_daily_snapshots/2009/08
http://www.pch.net/resources/Routing_Data/IPv4_daily_snapshots/2009/09
http://www.pch.net/resources/Routing_Data/IPv4_daily_snapshots/2009/10
http://www.pch.net/resources/Routing_Data/IPv4_daily_snapshots/2009/11
http://www.pch.net/resources/Routing_Data/IPv4_daily_snapshots/2009/12


In [12]:
all_urls

['http://www.pch.net/resources/Routing_Data/IPv4_daily_snapshots/2008/paix-ny.woodynet.pch.net',
 'http://www.pch.net/resources/Routing_Data/IPv4_daily_snapshots/2008/inxs.woodynet.pch.net',
 'http://www.pch.net/resources/Routing_Data/IPv4_daily_snapshots/2008/equinix-ashburn.woodynet.pch.net',
 'http://www.pch.net/resources/Routing_Data/IPv4_daily_snapshots/2008/npix.woodynet.pch.net',
 'http://www.pch.net/resources/Routing_Data/IPv4_daily_snapshots/2008/wix.woodynet.pch.net',
 'http://www.pch.net/resources/Routing_Data/IPv4_daily_snapshots/2008/route-collector.nbo.pch.net',
 'http://www.pch.net/resources/Routing_Data/IPv4_daily_snapshots/2008/route-collector.mia.pch.net',
 'http://www.pch.net/resources/Routing_Data/IPv4_daily_snapshots/2008/equinix-sin.woodynet.pch.net',
 'http://www.pch.net/resources/Routing_Data/IPv4_daily_snapshots/2008/nspixp2.woodynet.pch.net',
 'http://www.pch.net/resources/Routing_Data/IPv4_daily_snapshots/2008/hkix.woodynet.pch.net',
 'http://www.pch.net/reso