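In this notebook, we will analyze National Broadband Map (NBM) data and answer the following questions about Broadband Serviceable Locations (BSLs) and Internet service providers (ISPs):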
1. How many unique BSLs are there in the latest version of each major release in Virginia?
2. How many ISPs provide service in Tennessee?
3. How many unserved BSLs are in Virginia in the latest version of each major release? An “unserved” BSL is one that does not have service exceeding 25Mbps download speed and 3Mbps upload speed from any ISP, ignoring ISPs that provide service to the BSL using any form of satellite Internet or *unlicensed* fixed wireless technology. They must also have either residential or “both” service and the service must be classified as “low latency”. Hint: you’ll want to filter based on technology code, download and upload speeds, business/residential code (values R and X), as well as the low latency flag.

In [11]:
# Import dependencies and read files.
from get_parquet  import *
from categorize_bsl import *
from get_df import *
import concurrent.futures
import numpy as np
import pandas as pd
from libnbm import *


Question 1: How many unique BSLs are there in the latest version of each major release in Virginia?

In [13]:
# Each major release date mapped to the minor release date analyzed for it.
major_minor_dict = {
    20220630: 20240510,
    20221231: 20241104,
    20230630: 20250210,
    20231231: 20250201,
    20240630: 20250218,
}
# Keep the parallel lists available under their original names.
majors = list(major_minor_dict.keys())
minors = list(major_minor_dict.values())

# def return_df(major, minor, fips_code):
#     DATA_BASEDIR = '/home/playpen/data/nbm_evolution/data/nbm/bdc_single_file'
#     columns_to_read = ["location_id", "max_advertised_download_speed", 
#                        "max_advertised_upload_speed", "technology", 
#                        "low_latency", "business_residential_code"]
#     table = pyarrow_pq.read_pandas(f"{DATA_BASEDIR}/{major}/{minor}/bdc_{str(fips_code).zfill(2)}_single_nbm.parquet", columns = columns_to_read, use_threads=True)
#     return table.to_pandas()

def return_df(major, minor, fips_code, columns=None):
    """Load one state's single-file NBM parquet for a given release as a DataFrame.

    Args:
        major: Major release identifier; used as the first directory level
            under the data root (e.g. 20220630).
        minor: Minor release identifier; used as the second directory level.
        fips_code: State FIPS code; zero-padded to two characters in the
            file name (e.g. 51 -> ``bdc_51_single_nbm.parquet``).
        columns: Optional list of column names to read. ``None`` (the
            default) reads every column, which is the original behaviour.
            Passing only the needed columns avoids loading the whole file.

    Returns:
        pandas.DataFrame holding the requested columns of the parquet file.
    """
    # NOTE(review): `pyarrow_pq` is not imported by name in this notebook; it
    # presumably arrives through one of the star imports as an alias for
    # `pyarrow.parquet` -- confirm.
    DATA_BASEDIR = '/home/playpen/data/nbm_evolution/data/nbm/bdc_single_file'
    state_code = str(fips_code).zfill(2)
    parquet_path = f"{DATA_BASEDIR}/{major}/{minor}/bdc_{state_code}_single_nbm.parquet"
    table = pyarrow_pq.read_pandas(parquet_path, columns=columns, use_threads=True)
    return table.to_pandas()

def get_bsl_count(major):
    """Report how many distinct BSLs the Virginia file holds for one major release.

    Args:
        major: Major release identifier; its minor release is looked up in
            ``major_minor_dict``.

    Returns:
        A one-sentence summary string containing the major release, the minor
        release and the number of distinct ``location_id`` values.
    """
    minor = major_minor_dict[major]
    # 51 is the FIPS code passed for the Virginia file.
    virginia_df = return_df(major, minor, 51)
    n_unique_locations = virginia_df['location_id'].nunique()
    return f"For major release {major} and minor release {minor}, there are {n_unique_locations} unique BSLs"

# Compute the per-release BSL counts in parallel worker processes.
# executor.map returns a one-shot lazy iterator, so it is turned into a list
# while the pool is still open: worker errors surface here, and `result` can
# be read again by later cells instead of being empty after the first loop.
with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor:
    result = list(executor.map(get_bsl_count, majors))

print("Here are the number of unique BSLs in the latest version of each major release in Virginia: \n")
for line in result:
    print(line)

Here are the number of unique BSLs in the latest version of each major release in Virginia: 

For major release 20220630 and minor release 20240510, there are 2840311 unique BSLs
For major release 20221231 and minor release 20241104, there are 2914429 unique BSLs
For major release 20230630 and minor release 20250210, there are 2920197 unique BSLs
For major release 20231231 and minor release 20250201, there are 2907830 unique BSLs
For major release 20240630 and minor release 20250218, there are 2911038 unique BSLs


Question 2: How many ISPs provide service in Tennessee?

In [15]:
# Major release dates and, position by position, the minor release analyzed for each.
majors, minors = (
    [20220630, 20221231, 20230630, 20231231, 20240630],
    [20240510, 20241104, 20250210, 20250201, 20250218],
)
major_minor_dict = {major: minor for major, minor in zip(majors, minors)}

# def return_df(major, minor, fips_code):
#     DATA_BASEDIR = '/home/playpen/data/nbm_evolution/data/nbm/bdc_single_file'
#     table = pyarrow_pq.read_pandas(f"{DATA_BASEDIR}/{major}/{minor}/bdc_{str(fips_code).zfill(2)}_single_nbm.parquet", use_threads=True)
#     return table.to_pandas()

def count_brand_name(major):
    """Report how many distinct ``brand_name`` values the Tennessee file holds for one major release.

    Args:
        major: Major release identifier; its minor release is looked up in
            ``major_minor_dict``.

    Returns:
        A one-sentence summary string containing the major release, the minor
        release and the number of distinct ``brand_name`` values.
    """
    minor = major_minor_dict[major]
    # 47 is the FIPS code passed for the Tennessee file.
    tennessee_df = return_df(major, minor, 47)
    # NOTE(review): distinct brand names are used as the ISP count; confirm
    # this is the intended ISP key rather than a provider identifier column.
    isp_count = tennessee_df['brand_name'].nunique()
    return f"For major release {major} and minor release {minor}, there are {isp_count} unique ISPs"


# Compute the per-release ISP counts in parallel worker processes.
# executor.map returns a one-shot lazy iterator, so it is turned into a list
# while the pool is still open: worker errors surface here, and `result` can
# be read again by later cells instead of being empty after the first loop.
with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor:
    result = list(executor.map(count_brand_name, majors))

print("Here are the number of unique ISPs in the latest version of each major release in Tennessee: \n")

for line in result:
    print(line)

Here are the number of unique ISPs in the latest version of each major release in Tennessee: 

For major release 20220630 and minor release 20240510, there are 108 unique ISPs
For major release 20221231 and minor release 20241104, there are 109 unique ISPs
For major release 20230630 and minor release 20250210, there are 109 unique ISPs
For major release 20231231 and minor release 20250201, there are 120 unique ISPs
For major release 20240630 and minor release 20250218, there are 116 unique ISPs


Question 3: How many unserved BSLs are in Virginia in the latest version of each major release? An “unserved” BSL is one that does not have service exceeding 25Mbps download speed and 3Mbps upload speed from any ISP, ignoring ISPs that provide service to the BSL using any form of satellite Internet or unlicensed fixed wireless technology. They must also have either residential or “both” service and the service must be classified as “low latency”. Hint: you’ll want to filter based on technology code, download and upload speeds, business/residential code (values R and X), as well as the low latency flag.

In [16]:
# Root directory of the single-file NBM parquet data, followed by the mapping
# from each major release date to the minor release analyzed for it.
DATA_BASEDIR = "/home/playpen/data/nbm_evolution/data/nbm/bdc_single_file"
major_minor_dict = {
    20220630: 20240510,
    20221231: 20241104,
    20230630: 20250210,
    20231231: 20250201,
    20240630: 20250218,
}
# Keep the parallel lists available under their original names.
majors = [major for major in major_minor_dict]
minors = [major_minor_dict[major] for major in majors]

# def return_df(major, minor, fips_code):
#     DATA_BASEDIR = '/home/playpen/data/nbm_evolution/data/nbm/bdc_single_file'
#     columns_to_read = ["location_id", "max_advertised_download_speed", 
#                        "max_advertised_upload_speed", "technology", 
#                        "low_latency", "business_residential_code"]
#     table = pyarrow_pq.read_pandas(f"{DATA_BASEDIR}/{major}/{minor}/bdc_{str(fips_code).zfill(2)}_single_nbm.parquet", columns = columns_to_read, use_threads=True)
#     return table.to_pandas()

# def df_classify_bsl(df):
#     conditions = [
#         (df["technology"].isin(valid_technologies)) & 
#         (df["business_residential_code"].isin(valid_res_codes)) & 
#         (df["low_latency"] == 1) & 
#         (df['max_advertised_download_speed'] >= 100) & 
#         (df['max_advertised_upload_speed'] >= 20),
        
#         (df["technology"].isin(valid_technologies)) & 
#         (df["business_residential_code"].isin(valid_res_codes)) & 
#         (df["low_latency"] == 1) & 
#         (df['max_advertised_download_speed'] >= 25) & 
#         (df['max_advertised_upload_speed'] >= 3)
#     ]

#     choices = ['served', 'underserved']
    
#     df['status'] = np.select(conditions, choices, default='unserved')
#     return df

# def get_best_status(df):
#     df = df_classify_bsl(df)
#     df['status'] = pd.Categorical(df['status'], categories=['served', 'underserved', 'unserved'], ordered=True)
#     best_status_df = df.loc[df.groupby('location_id')['status'].idxmin(), ['location_id', 'status']]
#     return best_status_df

def unserved_per_major(major):
    """Report how many Virginia BSLs are labelled 'unserved' for one major release.

    Args:
        major: Major release identifier; its minor release is looked up in
            ``major_minor_dict``.

    Returns:
        A one-sentence summary string containing the major release, the minor
        release and the number of rows whose ``status`` is ``'unserved'``
        (0 when that label does not occur).
    """
    minor = major_minor_dict[major]
    # 51 is the FIPS code passed for the Virginia file.
    virginia_df = return_df(major, minor, 51)
    # NOTE(review): get_best_status is not defined in this cell; it presumably
    # comes from the star imports and returns one best status per location_id,
    # as the commented-out draft nearby suggests -- confirm.
    best_status_df = get_best_status(virginia_df)
    status_counts = best_status_df['status'].value_counts()
    unserved_count = status_counts.get('unserved', 0)
    return f"For major release {major} and minor release {minor}, there are {unserved_count} unserved BSLs"

# Compute the per-release unserved-BSL counts in parallel worker processes.
# executor.map returns a one-shot lazy iterator, so it is turned into a list
# while the pool is still open: worker errors surface here, and `result` can
# be read again by later cells instead of being empty after the first loop.
with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor:
    result = list(executor.map(unserved_per_major, majors))

print("Here are the number of unserved BSLs in the latest version of each major release in Virginia: \n")

for line in result:
    print(line)

Here are the number of unserved BSLs in the latest version of each major release in Virginia: 

For major release 20220630 and minor release 20240510, there are 367789 unserved BSLs
For major release 20221231 and minor release 20241104, there are 373066 unserved BSLs
For major release 20230630 and minor release 20250210, there are 368998 unserved BSLs
For major release 20231231 and minor release 20250201, there are 305541 unserved BSLs
For major release 20240630 and minor release 20250218, there are 276281 unserved BSLs
