In [3]:
%load_ext autoreload
%autoreload 2

In [30]:
import os
import sys
import time
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from IPython.display import clear_output

class Client:
    """Owns the lifecycle of a single Selenium Chrome browser session.

    At most one browser is held at a time: `create_client` is a no-op when a
    session already exists, and `close_client` releases it so a new one can be
    created afterwards.
    """

    def __init__(self):
        # Active WebDriver instance, or None while no browser is open.
        self.client = None
        # WebDriverWait bound to `client`, or None while no browser is open.
        self.wait_driver = None

    def create_client(self, options=None):
        """Start a Chrome session if none exists and return the driver.

        Args:
            options: Optional ChromeOptions to launch the browser with. When
                omitted (or falsy, e.g. the old ``{}`` default), a default
                ChromeOptions with ``--start-maximized`` is built.

        Returns:
            The active WebDriver instance (the existing one if a session was
            already open).
        """
        if self.client:
            # Return the live driver instead of None so both paths agree.
            return self.client

        if not options:
            options = webdriver.ChromeOptions()
            options.add_argument('--start-maximized')

        # NOTE(review): passing the driver path positionally is the older
        # Selenium calling convention; newer releases expect a Service
        # object -- confirm against the installed Selenium version.
        self.client = webdriver.Chrome(ChromeDriverManager().install(), options=options)
        self.client.implicitly_wait(15)
        self.wait_driver = WebDriverWait(self.client, timeout=10)

        return self.client

    def close_client(self):
        """Quit the browser, if one is open, and forget the session."""
        if not self.client:
            return

        try:
            self.client.quit()
        finally:
            # Drop the references even if quit() fails; otherwise
            # create_client() would keep short-circuiting on a dead driver.
            self.client = None
            self.wait_driver = None

class Scraper(Client):
    """Walks the state dropdown on the FCC data-download page and clicks the
    button on every table row that mentions Cable or Fiber."""

    # Page hosting the state dropdown and the per-state tables.
    DATA_URL = 'https://broadbandmap.fcc.gov/data-download/nationwide-data'

    def get_data(self, start_index=3, stop_index=52, download_pause=2):
        """Select each dropdown entry in turn and click its Cable/Fiber buttons.

        Args:
            start_index: First index of the ``state`` dropdown to select.
                Defaults to 3, the value the notebook was run with.
                NOTE(review): indices 0-2 are skipped by default; confirm
                whether that is intentional or left over from a resumed run.
            stop_index: Exclusive upper bound of dropdown indices to select.
            download_pause: Seconds to sleep after each button click.

        Raises:
            IndexError: If the page contains no element matching the panel
                selector after a dropdown entry is selected.
        """
        # Reuse an open session, otherwise start one.
        cli = self.client or self.create_client()

        cli.get(self.DATA_URL)
        select = Select(cli.find_element(By.ID, 'state'))

        for i in range(start_index, stop_index):
            # Keep the cell output to a single progress line.
            clear_output()
            print(i)

            select.select_by_index(i)

            # Only the first matching panel is inspected.
            panels = cli.find_elements(By.CSS_SELECTOR, 'div.border.rounded.p-3.h-100')
            target = panels[0]

            for row in target.find_elements(By.CSS_SELECTOR, 'tbody tr'):
                if 'Cable' in row.text or 'Fiber' in row.text:
                    row.find_element(By.TAG_NAME, 'button').click()
                    # Presumably gives the triggered download time to start
                    # before the next click -- TODO confirm.
                    time.sleep(download_pause)

    
        

        

In [31]:
# Instantiate the scraper; no browser is opened until create_client() is called.
scrape = Scraper()

In [32]:
# Open the browser, run the download pass, and always close the browser
# afterwards -- even if get_data() raises part-way through the loop.
scrape.create_client()
try:
    scrape.get_data()
finally:
    scrape.close_client()

51


### Extract Geography Zip
One-time extraction of the zip file containing the geography data that corresponds to the provider IDs in each CSV.

It will be used to match the provider IDs in the main data with human-readable locations.

Since there is only one zip file, scripting this wasn't really necessary, but I used it as a test to make sure the extraction code worked before applying it to the other data sets.

In [21]:
from zipfile import ZipFile

# Unpack every zip archive in ../GEO-DATA into that same folder, then delete
# the archive once its contents have been extracted.
directory = os.path.abspath('../GEO-DATA')
d_list = os.listdir(directory)

for d in d_list:
    # Skip anything that is not an archive so the cell can be re-run safely:
    # after a first pass the folder holds the extracted files, and opening
    # those with ZipFile would raise BadZipFile.
    if not d.lower().endswith('.zip'):
        continue

    f_path = os.path.join(directory, d)

    with ZipFile(f_path, 'r') as file:
        file.extractall(path=directory)

    # Only reached when extraction succeeded, so a failed extract keeps the zip.
    os.remove(f_path)

### Create Fips Code JSON
To assist in renaming the files, a FIPS code table was copied from an online source and converted to JSON.

In [68]:
import json
# Read the raw FIPS table: the first two characters of each line are the
# code, the remainder (whitespace-trimmed) is the name it maps to.
with open('../misc/fips.txt') as fips_txt:
    rows = fips_txt.readlines()

obj = {row[:2]: row[2:].strip() for row in rows}

# Persist the lookup as indented JSON for the later cells to load.
with open('../misc/fips.json', 'w') as fips_json:
    fips_json.write(json.dumps(obj, indent=3))

### Extract Cable/Fiber Zips
Extracts each downloaded zip and renames the extracted CSV using the name its FIPS code maps to, plus a suffix indicating whether it holds cable or fiber data.

In [33]:
import json

# Load the FIPS-code -> name lookup written by the previous section.
fip_path = os.path.abspath('../misc/fips.json')

with open(fip_path, 'r') as file:
    obj = json.load(file)

d_path = os.path.abspath('../FCC-DATA')
directory = os.listdir(d_path)

for d in directory:
    if d[-3:] != 'zip':
        continue

    # NOTE(review): assumes the two-digit FIPS code sits at characters 4-5
    # of the downloaded file name -- confirm against the actual file names.
    fip = d[4:6]
    if fip not in obj:
        continue

    # Decide the output suffix; archives that are neither Cable nor Fiber
    # are left untouched. Cable is checked first, as before.
    if 'Cable' in d:
        tech = 'cable'
    elif 'Fiber' in d:
        tech = 'fiber'
    else:
        continue

    f_path = os.path.join(d_path, d)

    with ZipFile(f_path, 'r') as archive:
        members = archive.namelist()
        if not members:
            # Nothing to rename; report it instead of failing on members[0].
            print(f'Skipping empty archive: {d}')
            continue

        # The first member is taken to be the CSV; everything is extracted.
        f_name = members[0]
        archive.extractall(path=d_path)

    # os.replace overwrites an existing target on every platform, so the
    # cell can be re-run without failing on Windows.
    os.replace(
        os.path.join(d_path, f_name),
        os.path.join(d_path, f'{obj[fip]}_{tech}.csv'),
    )
    



### Check to make sure I have all the data
I should have 2 CSVs (one cable, one fiber) for every state, for a total of 51 entries (including DC).

In [None]:
from collections import defaultdict
from pprint import pprint

# Count the CSV files per name prefix (the part of the file name before the
# first underscore), then show the tally and how many distinct prefixes exist.
d_path = os.path.abspath('../FCC-DATA')
directory = os.listdir(d_path)

state_dict = defaultdict(int)

for csv_name in (entry for entry in directory if entry[-3:] == 'csv'):
    # index() raises ValueError when a CSV name has no underscore.
    state_dict[csv_name[:csv_name.index('_')]] += 1

pprint(state_dict)
print(len(state_dict))

### Remove the zip files

In [37]:
# Delete every entry in ../FCC-DATA whose name ends in "zip"; all other
# files are left in place.
d_path = os.path.abspath('../FCC-DATA')
directory = os.listdir(d_path)

for entry in directory:
    if entry[-3:] == 'zip':
        os.remove(f'{d_path}/{entry}')